From e698d182421168301d0591bc01afdf2c273e15d8 Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Wed, 9 Oct 2024 17:05:09 +0200
Subject: [PATCH] chore(gpu): automatically generate rust bindings for cuda
 functions, except device.cu

---
 .../workflows/benchmark_gpu_core_crypto.yml | 3 +-
 .github/workflows/benchmark_gpu_integer.yml | 2 +-
 .../benchmark_gpu_integer_2H100_full.yml | 2 +-
 .../workflows/benchmark_gpu_integer_full.yml | 2 +-
 .../benchmark_gpu_integer_multi_bit.yml | 2 +-
 ...chmark_gpu_integer_multi_bit_multi_gpu.yml | 2 +-
 .../benchmark_gpu_integer_multi_gpu_full.yml | 2 +-
 .github/workflows/benchmark_gpu_l40.yml | 2 +-
 .github/workflows/gpu_fast_h100_tests.yml | 2 +-
 .github/workflows/gpu_fast_tests.yml | 2 +-
 .github/workflows/gpu_full_h100_tests.yml | 2 +-
 .../workflows/gpu_full_multi_gpu_tests.yml | 2 +-
 .../gpu_signed_integer_h100_tests.yml | 3 +-
 .../workflows/gpu_signed_integer_tests.yml | 3 +-
 .../gpu_unsigned_integer_h100_tests.yml | 3 +-
 .../workflows/gpu_unsigned_integer_tests.yml | 3 +-
 Makefile | 10 +-
 backends/tfhe-cuda-backend/Cargo.toml | 1 +
 backends/tfhe-cuda-backend/build.rs | 36 +-
 .../cuda/include/ciphertext.h | 13 +-
 .../tfhe-cuda-backend/cuda/include/device.h | 2 +-
 .../include/integer/compression/compression.h | 45 +
 .../compression/compression_utilities.h} | 55 +-
 .../cuda/include/integer/integer.h | 421 +++++
 .../integer_utilities.h} | 596 +-----
 .../cuda/include/keyswitch.h | 25 +-
 .../cuda/include/linear_algebra.h | 40 +-
 .../cuda/include/pbs/pbs_enums.h | 7 +
 .../pbs_multibit_utilities.h} | 62 +-
 .../pbs_utilities.h} | 104 +-
 .../cuda/include/pbs/programmable_bootstrap.h | 86 +
 .../pbs/programmable_bootstrap_multibit.h | 38 +
 .../cuda/src/crypto/ciphertext.cu | 18 +-
 .../cuda/src/crypto/ciphertext.cuh | 10 +-
 .../cuda/src/crypto/keyswitch.cu | 33 +-
 .../cuda/src/crypto/keyswitch.cuh | 32 +-
 backends/tfhe-cuda-backend/cuda/src/device.cu | 2 +-
 .../cuda/src/integer/addition.cu | 21 +-
 .../cuda/src/integer/addition.cuh | 22 +-
 .../cuda/src/integer/bitwise_ops.cu | 20 +-
 .../cuda/src/integer/bitwise_ops.cuh | 20 +-
 .../cuda/src/integer/cmux.cu | 21 +-
 .../cuda/src/integer/cmux.cuh | 24 +-
 .../cuda/src/integer/comparison.cu | 30 +-
 .../cuda/src/integer/comparison.cuh | 100 +-
 .../src/integer/compression/compression.cu | 55 +-
 .../src/integer/compression/compression.cuh | 54 +-
 .../cuda/src/integer/div_rem.cu | 19 +-
 .../cuda/src/integer/div_rem.cuh | 257 +--
 .../cuda/src/integer/integer.cu | 142 +-
 .../cuda/src/integer/integer.cuh | 233 +--
 .../cuda/src/integer/multiplication.cu | 84 +-
 .../cuda/src/integer/multiplication.cuh | 32 +-
 .../cuda/src/integer/negation.cu | 28 +-
 .../cuda/src/integer/negation.cuh | 34 +-
 .../cuda/src/integer/scalar_addition.cu | 12 +-
 .../cuda/src/integer/scalar_addition.cuh | 24 +-
 .../cuda/src/integer/scalar_bitops.cu | 12 +-
 .../cuda/src/integer/scalar_bitops.cuh | 9 +-
 .../cuda/src/integer/scalar_comparison.cu | 20 +-
 .../cuda/src/integer/scalar_comparison.cuh | 86 +-
 .../cuda/src/integer/scalar_mul.cu | 23 +-
 .../cuda/src/integer/scalar_mul.cuh | 23 +-
 .../cuda/src/integer/scalar_rotate.cu | 14 +-
 .../cuda/src/integer/scalar_rotate.cuh | 17 +-
 .../cuda/src/integer/scalar_shifts.cu | 34 +-
 .../cuda/src/integer/scalar_shifts.cuh | 30 +-
 .../cuda/src/integer/shift_and_rotate.cu | 17 +-
 .../cuda/src/integer/shift_and_rotate.cuh | 22 +-
 .../cuda/src/linearalgebra/addition.cu | 36 +-
 .../cuda/src/linearalgebra/addition.cuh | 31 +-
 .../cuda/src/linearalgebra/multiplication.cu | 20 +-
 .../cuda/src/linearalgebra/multiplication.cuh | 17 +-
 .../cuda/src/linearalgebra/negation.cu | 8 +-
 .../cuda/src/linearalgebra/negation.cuh | 4 +-
 .../cuda/src/pbs/bootstrapping_key.cu | 182 +-
 .../cuda/src/pbs/bootstrapping_key.cuh | 177 +-
 .../cuda/src/pbs/programmable_bootstrap.cuh | 32 +-
 .../pbs/programmable_bootstrap_amortized.cu | 10 +-
 .../pbs/programmable_bootstrap_amortized.cuh | 3 +-
 .../pbs/programmable_bootstrap_cg_classic.cuh | 8 +-
 .../programmable_bootstrap_cg_multibit.cuh | 23 +-
 .../src/pbs/programmable_bootstrap_classic.cu | 133 +-
 .../pbs/programmable_bootstrap_classic.cuh | 42 +-
 .../pbs/programmable_bootstrap_multibit.cu | 76 +-
 .../pbs/programmable_bootstrap_multibit.cuh | 30 +-
 .../programmable_bootstrap_tbc_classic.cuh | 8 +-
 .../programmable_bootstrap_tbc_multibit.cuh | 25 +-
 .../cuda/src/polynomial/functions.cuh | 4 +-
 .../cuda/src/utils/helper_multi_gpu.cuh | 31 +-
 .../benchmarks/benchmark_fft.cpp | 1 +
 .../benchmarks/benchmark_pbs.cpp | 21 +-
 .../include/setup_and_teardown.h | 4 +-
 .../tests_and_benchmarks/tests/test_fft.cpp | 3 +-
 .../cuda/tests_and_benchmarks/utils.cpp | 4 +-
 backends/tfhe-cuda-backend/src/bindings.rs | 1684 +++++++++++++++++
 backends/tfhe-cuda-backend/src/cuda_bind.rs | 1170 ------------
 backends/tfhe-cuda-backend/src/ffi.rs | 11 +
 backends/tfhe-cuda-backend/src/lib.rs | 3 +
 backends/tfhe-cuda-backend/wrapper.h | 7 +
 ci/slab.toml | 2 +-
 tfhe/docs/guides/run_on_gpu.md | 1 +
 tfhe/src/core_crypto/gpu/mod.rs | 4 +-
 tfhe/src/integer/gpu/mod.rs | 1 +
 104 files changed, 3900 insertions(+), 3128 deletions(-)
 create mode 100644 backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
 rename backends/tfhe-cuda-backend/cuda/include/{compression.h => integer/compression/compression_utilities.h} (64%)
 create mode 100644 backends/tfhe-cuda-backend/cuda/include/integer/integer.h
 rename backends/tfhe-cuda-backend/cuda/include/{integer.h => integer/integer_utilities.h} (81%)
 create mode 100644 backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h
 rename backends/tfhe-cuda-backend/cuda/include/{programmable_bootstrap_multibit.h => pbs/pbs_multibit_utilities.h} (84%)
 rename backends/tfhe-cuda-backend/cuda/include/{programmable_bootstrap.h => pbs/pbs_utilities.h} (74%)
 create mode 100644 backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
 create mode 100644 backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
 create mode 100644 backends/tfhe-cuda-backend/src/bindings.rs
 create mode 100644 backends/tfhe-cuda-backend/src/ffi.rs
 create mode 100644 backends/tfhe-cuda-backend/wrapper.h
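With this patch, src/bindings.rs is generated by bindgen from wrapper.h instead of being
maintained by hand in cuda_bind.rs. Given the `ctypes_prefix("ffi")` and
`raw_line("use crate::ffi;")` settings in build.rs below, a declaration such as
`cuda_convert_lwe_ciphertext_vector_to_gpu_64` (see ciphertext.h further down) should come
out roughly like the following sketch; it illustrates the expected output shape and is not
a copy of the actual generated file:

    // Sketch of a bindgen-generated declaration in src/bindings.rs.
    // C `void *` maps to `*mut ffi::c_void`, `void const *` to
    // `*const ffi::c_void`, and `uint32_t` to `u32`.
    use crate::ffi;

    extern "C" {
        pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64(
            stream: *mut ffi::c_void,
            gpu_index: u32,
            dest: *mut ffi::c_void,
            src: *const ffi::c_void,
            number_of_cts: u32,
            lwe_dimension: u32,
        );
    }
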
diff --git a/.github/workflows/benchmark_gpu_core_crypto.yml b/.github/workflows/benchmark_gpu_core_crypto.yml
index 70368536d4..71c9677236 100644
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -56,7 +56,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -64,6 +64,7 @@ jobs:
           make -j"$(nproc)"
           sudo make install
 
+
       - name: Checkout tfhe-rs repo with tags
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
         with:
diff --git a/.github/workflows/benchmark_gpu_integer.yml b/.github/workflows/benchmark_gpu_integer.yml
index 4eb8d88fb6..39c2a400bb 100644
--- a/.github/workflows/benchmark_gpu_integer.yml
+++ b/.github/workflows/benchmark_gpu_integer.yml
@@ -59,7 +59,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/benchmark_gpu_integer_2H100_full.yml b/.github/workflows/benchmark_gpu_integer_2H100_full.yml
index 1cd2a8b08e..dcae96defb 100644
--- a/.github/workflows/benchmark_gpu_integer_2H100_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_2H100_full.yml
@@ -63,7 +63,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-         sudo apt install -y checkinstall zlib1g-dev libssl-dev
+         sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
         wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
         tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
         cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/benchmark_gpu_integer_full.yml b/.github/workflows/benchmark_gpu_integer_full.yml
index fb5d6bd041..72fcbfd235 100644
--- a/.github/workflows/benchmark_gpu_integer_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_full.yml
@@ -63,7 +63,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/benchmark_gpu_integer_multi_bit.yml b/.github/workflows/benchmark_gpu_integer_multi_bit.yml
index 22fcf6ae5b..dee5c05dc6 100644
--- a/.github/workflows/benchmark_gpu_integer_multi_bit.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit.yml
@@ -72,7 +72,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml b/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
index 1eb43c7a43..ef3435c247 100644
--- a/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
@@ -73,7 +73,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml b/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
index 8ffac05bb7..4e454e6065 100644
--- a/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
@@ -63,7 +63,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/benchmark_gpu_l40.yml b/.github/workflows/benchmark_gpu_l40.yml
index 1f518bbaaf..c179e5d043 100644
--- a/.github/workflows/benchmark_gpu_l40.yml
+++ b/.github/workflows/benchmark_gpu_l40.yml
@@ -63,7 +63,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/gpu_fast_h100_tests.yml b/.github/workflows/gpu_fast_h100_tests.yml
index 34cf6620f3..690f71dfd2 100644
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -99,7 +99,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/gpu_fast_tests.yml b/.github/workflows/gpu_fast_tests.yml
index 21371925c5..46bfef8003 100644
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -97,7 +97,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/gpu_full_h100_tests.yml b/.github/workflows/gpu_full_h100_tests.yml
index 579b7a10f8..3f30bd6372 100644
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -57,7 +57,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/gpu_full_multi_gpu_tests.yml b/.github/workflows/gpu_full_multi_gpu_tests.yml
index 232670d687..26d5eb73a6 100644
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -99,7 +99,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
diff --git a/.github/workflows/gpu_signed_integer_h100_tests.yml b/.github/workflows/gpu_signed_integer_h100_tests.yml
index e0503b983b..b74199a092 100644
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -100,7 +100,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -108,6 +108,7 @@ jobs:
           make -j"$(nproc)"
           sudo make install
 
+
       - name: Checkout tfhe-rs
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
diff --git a/.github/workflows/gpu_signed_integer_tests.yml b/.github/workflows/gpu_signed_integer_tests.yml
index cfa5960c5c..1aa1f2caad 100644
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -107,7 +107,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -115,6 +115,7 @@ jobs:
           make -j"$(nproc)"
           sudo make install
 
+
       - name: Checkout tfhe-rs
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
         with:
diff --git a/.github/workflows/gpu_unsigned_integer_h100_tests.yml b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
index 4cdb94f48f..ba034f1eb2 100644
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -100,7 +100,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -108,6 +108,7 @@ jobs:
           make -j"$(nproc)"
           sudo make install
 
+
       - name: Checkout tfhe-rs
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
diff --git a/.github/workflows/gpu_unsigned_integer_tests.yml b/.github/workflows/gpu_unsigned_integer_tests.yml
index 886f2ff75a..cc2a0bb6f2 100644
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -107,7 +107,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
           wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
           tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
           cd cmake-${{ env.CMAKE_VERSION }}
@@ -115,6 +115,7 @@ jobs:
           make -j"$(nproc)"
           sudo make install
 
+
       - name: Checkout tfhe-rs
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
diff --git a/Makefile b/Makefile
index 9491768133..cc31c56f11 100644
--- a/Makefile
+++ b/Makefile
@@ -418,6 +418,14 @@ clippy_cuda_backend: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-cuda-backend -- --no-deps -D warnings
 
+.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
+check_rust_bindings_did_not_change:
+	cargo build -p tfhe-cuda-backend && \
+	git diff --quiet HEAD -- backends/tfhe-cuda-backend/src/bindings.rs || \
+	( echo "Generated bindings have changed! Please run 'git add backends/tfhe-cuda-backend/src/bindings.rs' \
+	and commit the changes." && exit 1 )
+
+
 .PHONY: tfhe_lints # Run custom tfhe-rs lints
 tfhe_lints: install_tfhe_lints
 	cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
@@ -1257,7 +1265,7 @@ pcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_te
 	clippy_all tfhe_lints check_compile_tests
 
 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
-pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
+pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu check_rust_bindings_did_not_change
 
 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
 fpcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested clippy_fast \
diff --git a/backends/tfhe-cuda-backend/Cargo.toml b/backends/tfhe-cuda-backend/Cargo.toml
index 1aae9b5541..ee54e28489 100644
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -14,3 +14,4 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
 [build-dependencies]
 cmake = { version = "0.1" }
 pkg-config = { version = "0.3" }
+bindgen = "0.70.1"
diff --git a/backends/tfhe-cuda-backend/build.rs b/backends/tfhe-cuda-backend/build.rs
index 5452b45686..c1d869b619 100644
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -1,5 +1,6 @@
-use std::env;
+use std::path::PathBuf;
 use std::process::Command;
+use std::{env, fs};
 
 fn main() {
     if let Ok(val) = env::var("DOCS_RS") {
@@ -26,6 +27,7 @@ fn main() {
     println!("cargo::rerun-if-changed=cuda/tests_and_benchmarks");
     println!("cargo::rerun-if-changed=cuda/CMakeLists.txt");
     println!("cargo::rerun-if-changed=src");
+
     if env::consts::OS == "linux" {
         let output = Command::new("./get_os_name.sh").output().unwrap();
         let distribution = String::from_utf8(output.stdout).unwrap();
@@ -35,6 +37,7 @@ fn main() {
                 Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
             );
         }
+
        let dest = cmake::build("cuda");
        println!("cargo:rustc-link-search=native={}", dest.display());
        println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
@@ -51,6 +54,37 @@ fn main() {
        println!("cargo:rustc-link-lib=cudart");
        println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
        println!("cargo:rustc-link-lib=stdc++");
+
+        let header_path = "wrapper.h";
+        println!("cargo:rerun-if-changed={}", header_path);
+
+        let out_path = PathBuf::from("src").join("bindings.rs");
+
+        // Check modification times
+        let header_modified = fs::metadata(header_path).unwrap().modified().unwrap();
+        let bindings_modified = if out_path.exists() {
+            fs::metadata(&out_path).unwrap().modified().unwrap()
+        } else {
+            std::time::SystemTime::UNIX_EPOCH // If bindings file doesn't exist, consider it older
+        };
+        // Regenerate bindings only if header has been modified
+        if header_modified > bindings_modified {
+            let bindings = bindgen::Builder::default()
+                .header(header_path)
+                .clang_arg("-x")
+                .clang_arg("c++")
+                .clang_arg("-std=c++17")
+                .clang_arg("-I/usr/include")
+                .clang_arg("-I/usr/local/include")
+                .ctypes_prefix("ffi")
+                .raw_line("use crate::ffi;")
+                .generate()
+                .expect("Unable to generate bindings");
+
+            bindings
+                .write_to_file(&out_path)
+                .expect("Couldn't write bindings!");
+        }
     } else {
         panic!(
             "Error: platform not supported, tfhe-cuda-backend not built (only Linux is supported)"
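The generated declarations refer to a small hand-written `ffi` module (src/ffi.rs, 11 lines
per the diffstat; its contents are not shown in this excerpt). Purely as a guess at its
shape, something like the following would satisfy the `ctypes_prefix("ffi")` and
`raw_line("use crate::ffi;")` settings above:

    // Hypothetical sketch of src/ffi.rs -- the real 11-line file is not
    // shown in this patch excerpt. bindgen's ctypes_prefix("ffi") makes the
    // generated bindings spell C types as ffi::c_void, ffi::c_uint, etc.,
    // so re-exporting the core FFI types under that name is sufficient.
    pub use core::ffi::{c_char, c_int, c_longlong, c_uint, c_void};
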
diff --git a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
index 3b2559eb69..b3978834a9 100644
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -1,25 +1,24 @@
 #ifndef CUDA_CIPHERTEXT_H
 #define CUDA_CIPHERTEXT_H
-#include "device.h"
-#include <cstdint>
+#include "stdint.h"
 extern "C" {
 void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                   uint32_t gpu_index,
-                                                  void *dest, void *src,
+                                                  void *dest, void const *src,
                                                   uint32_t number_of_cts,
                                                   uint32_t lwe_dimension);
 void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
                                                   uint32_t gpu_index,
-                                                  void *dest, void *src,
+                                                  void *dest, void const *src,
                                                   uint32_t number_of_cts,
                                                   uint32_t lwe_dimension);
 void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
-                                 void *lwe_array_out, void *glwe_array_in,
-                                 uint32_t *nth_array, uint32_t num_nths,
+                                 void *lwe_array_out, void const *glwe_array_in,
+                                 uint32_t const *nth_array, uint32_t num_nths,
                                  uint32_t glwe_dimension,
                                  uint32_t polynomial_size);
-};
+}
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index e2c3922020..bcb2c6cbe9 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -42,7 +42,7 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
 void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                               cudaStream_t stream, uint32_t gpu_index);
 
-void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
+void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                   cudaStream_t stream, uint32_t gpu_index);
 
 void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
new file mode 100644
index 0000000000..be42d8a914
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -0,0 +1,45 @@
+#ifndef CUDA_INTEGER_COMPRESSION_H
+#define CUDA_INTEGER_COMPRESSION_H
+
+#include "../../pbs/pbs_enums.h"
+
+extern "C" {
+void scratch_cuda_integer_compress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
+    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
+    bool allocate_gpu_memory);
+
+void scratch_cuda_integer_decompress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
+    uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
+    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t storage_log_modulus, uint32_t body_count,
+    bool allocate_gpu_memory);
+
+void cuda_integer_compress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
+    uint32_t num_nths, int8_t *mem_ptr);
+
+void cuda_integer_decompress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
+    uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr);
+
+void cleanup_cuda_integer_compress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void cleanup_cuda_integer_decompress_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+}
+
+#endif
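The new compression.h shows the scratch/run/cleanup convention used by every entry point in
this backend. A minimal sketch of driving it through the generated bindings follows; every
numeric value is an illustrative placeholder, `PBS_TYPE_MULTI_BIT` assumes bindgen's default
constant naming for the `PBS_TYPE` enum (pbs_enums.h is not shown here), and real call sites
(see tfhe/src/integer/gpu) supply live CUDA streams, keys and ciphertexts:

    // Sketch only: the scratch -> run -> cleanup lifecycle of the
    // compression API, called through the generated bindings.
    use std::ffi::c_void;
    use tfhe_cuda_backend::bindings::*;

    unsafe fn compress_sketch(
        streams: &[*mut c_void],       // one CUDA stream per GPU
        gpu_indexes: &[u32],
        glwe_array_out: *mut c_void,   // device pointer, output GLWEs
        lwe_array_in: *const c_void,   // device pointer, input LWEs
        fp_ksk: &[*mut c_void],        // functional packing keyswitch keys
    ) {
        let gpu_count = gpu_indexes.len() as u32;
        let mut mem: *mut i8 = std::ptr::null_mut();
        // 1) allocate the scratch buffer on the GPU (placeholder parameters)
        scratch_cuda_integer_compress_radix_ciphertext_64(
            streams.as_ptr(), gpu_indexes.as_ptr(), gpu_count, &mut mem,
            /* compression_glwe_dimension */ 1, /* compression_polynomial_size */ 1024,
            /* lwe_dimension */ 839, /* ks_level */ 3, /* ks_base_log */ 4,
            /* num_radix_blocks */ 4, /* message_modulus */ 4, /* carry_modulus */ 4,
            PBS_TYPE_MULTI_BIT, /* lwe_per_glwe */ 1024, /* storage_log_modulus */ 12,
            /* allocate_gpu_memory */ true,
        );
        // 2) run the compression itself
        cuda_integer_compress_radix_ciphertext_64(
            streams.as_ptr(), gpu_indexes.as_ptr(), gpu_count,
            glwe_array_out, lwe_array_in, fp_ksk.as_ptr(), /* num_nths */ 4, mem,
        );
        // 3) release the scratch buffer
        cleanup_cuda_integer_compress_radix_ciphertext_64(
            streams.as_ptr(), gpu_indexes.as_ptr(), gpu_count, &mut mem,
        );
    }
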
diff --git a/backends/tfhe-cuda-backend/cuda/include/compression.h b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
similarity index 64%
rename from backends/tfhe-cuda-backend/cuda/include/compression.h
rename to backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
index 1154b0a46d..acb569e653 100644
--- a/backends/tfhe-cuda-backend/cuda/include/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -1,46 +1,7 @@
-#ifndef CUDA_INTEGER_COMPRESSION_H
-#define CUDA_INTEGER_COMPRESSION_H
-
-#include "integer.h"
-
-extern "C" {
-void scratch_cuda_integer_compress_radix_ciphertext_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
-    bool allocate_gpu_memory);
-
-void scratch_cuda_integer_decompress_radix_ciphertext_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
-    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
-    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
-    bool allocate_gpu_memory);
-
-void cuda_integer_compress_radix_ciphertext_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
-    int8_t *mem_ptr);
-
-void cuda_integer_decompress_radix_ciphertext_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
-    uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);
-
-void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
-                                                       uint32_t *gpu_indexes,
-                                                       uint32_t gpu_count,
-                                                       int8_t **mem_ptr_void);
-
-void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams,
-                                                         uint32_t *gpu_indexes,
-                                                         uint32_t gpu_count,
-                                                         int8_t **mem_ptr_void);
-}
+#ifndef CUDA_INTEGER_COMPRESSION_UTILITIES_H
+#define CUDA_INTEGER_COMPRESSION_UTILITIES_H
+
+#include "../integer_utilities.h"
 
 template <typename Torus> struct int_compression {
   int_radix_params compression_params;
@@ -54,7 +15,7 @@ template <typename Torus> struct int_compression {
   Torus *tmp_lwe;
   Torus *tmp_glwe_array_out;
 
-  int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
+  int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                   uint32_t gpu_count, int_radix_params compression_params,
                   uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                   uint32_t storage_log_modulus, bool allocate_gpu_memory) {
@@ -81,7 +42,7 @@ template <typename Torus> struct int_compression {
                                  num_radix_blocks, true);
     }
   }
-  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
     cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
     cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
@@ -105,7 +66,7 @@ template <typename Torus> struct int_decompression {
 
   int_radix_lut<Torus> *carry_extract_lut;
 
-  int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
+  int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                     uint32_t gpu_count, int_radix_params encryption_params,
                     int_radix_params compression_params,
                     uint32_t num_radix_blocks, uint32_t body_count,
@@ -150,7 +111,7 @@ template <typename Torus> struct int_decompression {
       carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
     }
   }
-  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
     cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
     cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
new file mode 100644
index 0000000000..a9990423fe
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -0,0 +1,421 @@
+#ifndef CUDA_INTEGER_H
+#define CUDA_INTEGER_H
+
+#include "../pbs/pbs_enums.h"
+#include <stdint.h>
+
+enum OUTPUT_CARRY { NONE = 0, GENERATED = 1, PROPAGATED = 2 };
+enum SHIFT_OR_ROTATE_TYPE {
+  LEFT_SHIFT = 0,
+  RIGHT_SHIFT = 1,
+  LEFT_ROTATE = 2,
+  RIGHT_ROTATE = 3
+};
+enum BITOP_TYPE {
+  BITAND = 0,
+  BITOR = 1,
+  BITXOR = 2,
+  SCALAR_BITAND = 3,
+  SCALAR_BITOR = 4,
+  SCALAR_BITXOR = 5,
+};
+
+enum COMPARISON_TYPE {
+  EQ = 0,
+  NE = 1,
+  GT = 2,
+  GE = 3,
+  LT = 4,
+  LE = 5,
+  MAX = 6,
+  MIN = 7,
+};
+
+enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
+
+enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
+
+extern "C" {
+void scratch_cuda_apply_univariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_apply_univariate_lut_kb_64(void *const *streams,
+                                     uint32_t const *gpu_indexes,
+                                     uint32_t gpu_count, void *output_radix_lwe,
+                                     void const *input_radix_lwe,
+                                     int8_t *mem_ptr, void *const *ksks,
+                                     void *const *bsks, uint32_t num_blocks);
+
+void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
+                                             uint32_t const *gpu_indexes,
+                                             uint32_t gpu_count,
+                                             int8_t **mem_ptr_void);
+
+void scratch_cuda_apply_bivariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_apply_bivariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void const *input_radix_lwe_1,
+    void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
+    void *const *bsks, uint32_t num_blocks, uint32_t shift);
+
+void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
+                                            uint32_t const *gpu_indexes,
+                                            uint32_t gpu_count,
+                                            int8_t **mem_ptr_void);
+
+void cuda_apply_many_univariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks,
+    uint32_t num_luts, uint32_t lut_stride);
+
+void scratch_cuda_full_propagation_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_full_propagation_64_inplace(void *const *streams,
+                                      uint32_t const *gpu_indexes,
+                                      uint32_t gpu_count, void *input_blocks,
+                                      int8_t *mem_ptr, void *const *ksks,
+                                      void *const *bsks, uint32_t num_blocks);
+
+void cleanup_cuda_full_propagation(void *const *streams,
+                                   uint32_t const *gpu_indexes,
+                                   uint32_t gpu_count, int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
+    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
+    uint32_t ks_level, uint32_t grouping_factor, uint32_t num_blocks,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_integer_mult_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *radix_lwe_out, void const *radix_lwe_left,
+    void const *radix_lwe_right, void *const *bsks, void *const *ksks,
+    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
+
+void cleanup_cuda_integer_mult(void *const *streams,
+                               uint32_t const *gpu_indexes, uint32_t gpu_count,
+                               int8_t **mem_ptr_void);
+
+void cuda_negate_integer_radix_ciphertext_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus);
+
+void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus);
+
+void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void cleanup_cuda_integer_radix_logical_scalar_shift(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool is_signed, bool allocate_gpu_memory);
+
+void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, void const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
+                                                 uint32_t const *gpu_indexes,
+                                                 uint32_t gpu_count,
+                                                 int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_comparison_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory);
+
+void cuda_comparison_integer_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t lwe_ciphertext_count);
+
+void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_in, void const *scalar_blocks,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks);
+
+void cleanup_cuda_integer_comparison(void *const *streams,
+                                     uint32_t const *gpu_indexes,
+                                     uint32_t gpu_count, int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_bitop_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    BITOP_TYPE op_type, bool allocate_gpu_memory);
+
+void cuda_bitop_integer_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t lwe_ciphertext_count);
+
+void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
+    uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op);
+
+void cleanup_cuda_integer_bitop(void *const *streams,
+                                uint32_t const *gpu_indexes, uint32_t gpu_count,
+                                int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_cmux_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_cmux_integer_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
+    void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t lwe_ciphertext_count);
+
+void cleanup_cuda_integer_radix_cmux(void *const *streams,
+                                     uint32_t const *gpu_indexes,
+                                     uint32_t gpu_count, int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_scalar_rotate_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_radix_scalar_rotate_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
+                                              uint32_t const *gpu_indexes,
+                                              uint32_t gpu_count,
+                                              int8_t **mem_ptr_void);
+
+void scratch_cuda_propagate_single_carry_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_propagate_single_carry_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);
+
+void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_blocks);
+
+void cleanup_cuda_propagate_single_carry(void *const *streams,
+                                         uint32_t const *gpu_indexes,
+                                         uint32_t gpu_count,
+                                         int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks_in_radix);
+
+void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_radix_overflowing_sub_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_integer_radix_overflowing_sub_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
+    void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks_in_radix);
+
+void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
+                                                uint32_t const *gpu_indexes,
+                                                uint32_t gpu_count,
+                                                int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_scalar_mul_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint64_t const *decomposed_scalar,
+    uint64_t const *has_at_least_one_set, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars);
+
+void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
+                                           uint32_t const *gpu_indexes,
+                                           uint32_t gpu_count,
+                                           int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+
+void cuda_integer_div_rem_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *quotient, void *remainder, void const *numerator, void const *divisor,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks_in_radix);
+
+void cleanup_cuda_integer_div_rem(void *const *streams,
+                                  uint32_t const *gpu_indexes,
+                                  uint32_t gpu_count, int8_t **mem_ptr_void);
+
+void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks_in_radix);
+
+void cleanup_signed_overflowing_add_or_sub(void *const *streams,
+                                           uint32_t const *gpu_indexes,
+                                           uint32_t gpu_count,
+                                           int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift);
+
+void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
+                                            uint32_t const *gpu_indexes,
+                                            uint32_t gpu_count, void *lwe_array,
+                                            uint32_t num_blocks,
+                                            uint32_t lwe_size);
+
+} // extern C
+#endif // CUDA_INTEGER_H
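The plain C enums at the top of integer.h (OUTPUT_CARRY, SHIFT_OR_ROTATE_TYPE, BITOP_TYPE,
COMPARISON_TYPE, CMP_ORDERING, SIGNED_OPERATION) also go through bindgen. Assuming its
default, non-rustified enum translation (no enum-style options appear in the build.rs
above), each enum becomes a type alias plus one constant per variant, for example:

    // Assumed shape of the generated enum bindings in src/bindings.rs
    // (bindgen's default translation; illustrative, not the actual file).
    // The constants are passed directly as the COMPARISON_TYPE parameters
    // of functions such as scratch_cuda_integer_radix_comparison_kb_64.
    pub type COMPARISON_TYPE = ffi::c_uint;
    pub const COMPARISON_TYPE_EQ: COMPARISON_TYPE = 0;
    pub const COMPARISON_TYPE_NE: COMPARISON_TYPE = 1;
    pub const COMPARISON_TYPE_GT: COMPARISON_TYPE = 2;
    pub const COMPARISON_TYPE_GE: COMPARISON_TYPE = 3;
    pub const COMPARISON_TYPE_LT: COMPARISON_TYPE = 4;
    pub const COMPARISON_TYPE_LE: COMPARISON_TYPE = 5;
    pub const COMPARISON_TYPE_MAX: COMPARISON_TYPE = 6;
    pub const COMPARISON_TYPE_MIN: COMPARISON_TYPE = 7;
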
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
similarity index 81%
rename from backends/tfhe-cuda-backend/cuda/include/integer.h
rename to backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 0943b10003..6dc85d4225 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -1,411 +1,13 @@
-#ifndef CUDA_INTEGER_H
-#define CUDA_INTEGER_H
+#ifndef CUDA_INTEGER_UTILITIES_H
+#define CUDA_INTEGER_UTILITIES_H
 
+#include "integer.h"
 #include "keyswitch.h"
 #include "pbs/programmable_bootstrap.cuh"
-#include "programmable_bootstrap.h"
-#include "programmable_bootstrap_multibit.h"
 #include
 #include
 #include
 
-enum OUTPUT_CARRY { NONE = 0, GENERATED = 1, PROPAGATED = 2 };
-enum SHIFT_OR_ROTATE_TYPE {
-  LEFT_SHIFT = 0,
-  RIGHT_SHIFT = 1,
-  LEFT_ROTATE = 2,
-  RIGHT_ROTATE = 3
-};
-enum BITOP_TYPE {
-  BITAND = 0,
-  BITOR = 1,
-  BITXOR = 2,
-  SCALAR_BITAND = 3,
-  SCALAR_BITOR = 4,
-  SCALAR_BITXOR = 5,
-};
-
-enum COMPARISON_TYPE {
-  EQ = 0,
-  NE = 1,
-  GT = 2,
-  GE = 3,
-  LT = 4,
-  LE = 5,
-  MAX = 6,
-  MIN = 7,
-};
-
-enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
-
-enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
-
-extern "C" {
-void scratch_cuda_apply_univariate_lut_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
-                                     uint32_t gpu_count, void *output_radix_lwe,
-                                     void *input_radix_lwe, int8_t *mem_ptr,
-                                     void **ksks, void **bsks,
-                                     uint32_t num_blocks);
-
-void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
-                                             uint32_t *gpu_indexes,
-                                             uint32_t gpu_count,
-                                             int8_t **mem_ptr_void);
-
-void scratch_cuda_apply_bivariate_lut_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_apply_bivariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
-                                    uint32_t gpu_count, void *output_radix_lwe,
-                                    void *input_radix_lwe_1,
-                                    void *input_radix_lwe_2, int8_t *mem_ptr,
-                                    void **ksks, void **bsks,
-                                    uint32_t num_blocks, uint32_t shift);
-
-void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
-                                            uint32_t *gpu_indexes,
-                                            uint32_t gpu_count,
-                                            int8_t **mem_ptr_void);
-
-void cuda_apply_many_univariate_lut_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
-    void **bsks, uint32_t num_blocks, uint32_t num_luts, uint32_t lut_stride);
-
-void scratch_cuda_full_propagation_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
-                                      uint32_t gpu_count, void *input_blocks,
-                                      int8_t *mem_ptr, void **ksks, void **bsks,
-                                      uint32_t num_blocks);
-
-void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
-                                   uint32_t gpu_count, int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
-    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
-    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_mult_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
-    void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size,
-    uint32_t num_blocks);
-
-void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
-                               uint32_t gpu_count, int8_t **mem_ptr_void);
-
-void cuda_negate_integer_radix_ciphertext_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus);
-
-void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus);
-
-void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t num_blocks);
-
-void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t num_blocks);
-
-void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
-                                                     uint32_t *gpu_indexes,
-                                                     uint32_t gpu_count,
-                                                     int8_t **mem_ptr_void);
-
-void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void **streams,
-                                                        uint32_t *gpu_indexes,
-                                                        uint32_t gpu_count,
-                                                        int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool is_signed, bool allocate_gpu_memory);
-
-void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    void *lwe_shift, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t num_blocks);
-
-void cleanup_cuda_integer_radix_shift_and_rotate(void **streams,
-                                                 uint32_t *gpu_indexes,
-                                                 uint32_t gpu_count,
-                                                 int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_comparison_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory);
-
-void cuda_comparison_integer_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
-    void **bsks, void **ksks, uint32_t lwe_ciphertext_count);
-
-void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void *lwe_array_in, void *scalar_blocks,
-    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count,
-    uint32_t num_scalar_blocks);
-
-void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
-                                     uint32_t gpu_count, int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_bitop_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    BITOP_TYPE op_type, bool allocate_gpu_memory);
-
-void cuda_bitop_integer_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
-    void **bsks, void **ksks, uint32_t lwe_ciphertext_count);
-
-void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void *lwe_array_input, void *clear_blocks,
-    uint32_t num_clear_blocks, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t lwe_ciphertext_count, BITOP_TYPE op);
-
-void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes,
-                                uint32_t gpu_count, int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_cmux_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_cmux_integer_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
-    void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t lwe_ciphertext_count);
-
-void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes,
-                                     uint32_t gpu_count, int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_scalar_rotate_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_radix_scalar_rotate_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t n, int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks);
-
-void cleanup_cuda_integer_radix_scalar_rotate(void **streams,
-                                              uint32_t *gpu_indexes,
-                                              uint32_t gpu_count,
-                                              int8_t **mem_ptr_void);
-
-void scratch_cuda_propagate_single_carry_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_propagate_single_carry_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t num_blocks);
-
-void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
-    void **ksks, uint32_t num_blocks);
-
-void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
-                                         uint32_t gpu_count,
-                                         int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
-    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix);
-
-void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_radix_overflowing_sub_kb_64(
-    void **stream, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
-
-void cuda_integer_radix_overflowing_sub_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left,
-    void *radix_lwe_right, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t num_blocks_in_radix);
-
-void cleanup_cuda_integer_radix_overflowing_sub(void **streams,
-                                                uint32_t *gpu_indexes,
-                                                uint32_t gpu_count,
-                                                int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_scalar_mul_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set,
-    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_blocks,
-    uint32_t num_scalars);
-
-void cleanup_cuda_integer_radix_scalar_mul(void **streams,
-                                           uint32_t *gpu_indexes,
-                                           uint32_t gpu_count,
-                                           int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks,
uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory); - -void cuda_integer_div_rem_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient, - void *remainder, void *numerator, void *divisor, int8_t *mem_ptr, - void **bsks, void **ksks, uint32_t num_blocks_in_radix); - -void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes, - uint32_t gpu_count, int8_t **mem_ptr_void); - -void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation, - uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, - bool allocate_gpu_memory); - -void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs, - void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr, - void **bsks, void **ksks, uint32_t num_blocks_in_radix); - -void cleanup_signed_overflowing_add_or_sub(void **streams, - uint32_t *gpu_indexes, - uint32_t gpu_count, - int8_t **mem_ptr_void); - -void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, bool allocate_gpu_memory); - -void cuda_integer_compute_prefix_sum_hillis_steele_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr, - void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift); - -void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int8_t **mem_ptr_void); - -void cuda_integer_reverse_blocks_64_inplace(void **streams, - uint32_t *gpu_indexes, - uint32_t gpu_count, void *lwe_array, - uint32_t num_blocks, - uint32_t lwe_size); - -} // extern C - template __global__ void radix_blocks_rotate_right(Torus *dst, Torus *src, uint32_t value, uint32_t blocks_count, @@ -532,7 +134,7 @@ template struct int_radix_lut { std::vector lwe_after_pbs_vec; std::vector lwe_trivial_indexes_vec; - int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes, + int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_luts, uint32_t num_radix_blocks, bool allocate_gpu_memory) { @@ -638,7 +240,7 @@ template struct int_radix_lut { } // constructor to reuse memory - int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes, + int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_luts, uint32_t num_radix_blocks, int_radix_lut *base_lut_object) { @@ -746,7 +348,7 @@ template struct int_radix_lut { } // Broadcast luts from gpu src_gpu_idx to all active gpus - void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes, + void broadcast_lut(cudaStream_t const 
*streams, uint32_t const *gpu_indexes, uint32_t src_gpu_idx) { Torus lut_size = (params.glwe_dimension + 1) * params.polynomial_size; @@ -769,7 +371,7 @@ template struct int_radix_lut { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { for (uint i = 0; i < active_gpu_count; i++) { cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]); @@ -824,10 +426,10 @@ template struct int_bit_extract_luts_buffer { int_radix_lut *lut; // With offset - int_bit_extract_luts_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t bits_per_block, uint32_t final_offset, - uint32_t num_radix_blocks, + int_bit_extract_luts_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t bits_per_block, + uint32_t final_offset, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; @@ -898,16 +500,16 @@ template struct int_bit_extract_luts_buffer { } // Without offset - int_bit_extract_luts_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t bits_per_block, + int_bit_extract_luts_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t bits_per_block, uint32_t num_radix_blocks, bool allocate_gpu_memory) : int_bit_extract_luts_buffer(streams, gpu_indexes, gpu_count, params, bits_per_block, 0, num_radix_blocks, allocate_gpu_memory) {} - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { lut->release(streams, gpu_indexes, gpu_count); delete (lut); @@ -933,8 +535,8 @@ template struct int_shift_and_rotate_buffer { Torus offset; - int_shift_and_rotate_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, + int_shift_and_rotate_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, int_radix_params params, uint32_t num_radix_blocks, @@ -1056,7 +658,7 @@ template struct int_shift_and_rotate_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { cuda_drop_async(tmp_bits, streams[0], gpu_indexes[0]); cuda_drop_async(tmp_shift_bits, streams[0], gpu_indexes[0]); @@ -1085,7 +687,7 @@ template struct int_fullprop_buffer { Torus *tmp_small_lwe_vector; Torus *tmp_big_lwe_vector; - int_fullprop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + int_fullprop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params params, bool allocate_gpu_memory) { this->params = params; @@ -1142,7 +744,7 @@ template struct int_fullprop_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { lut->release(streams, gpu_indexes, 1); @@ -1165,7 +767,7 @@ template struct int_sc_prop_memory { int_radix_params params; - int_sc_prop_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + int_sc_prop_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; @@ -1258,7 +860,7 @@ template 
struct int_sc_prop_memory { message_acc->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]); cuda_drop_async(step_output, streams[0], gpu_indexes[0]); @@ -1285,9 +887,9 @@ template struct int_overflowing_sub_memory { int_radix_params params; - int_overflowing_sub_memory(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t num_radix_blocks, + int_overflowing_sub_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; auto glwe_dimension = params.glwe_dimension; @@ -1379,7 +981,7 @@ template struct int_overflowing_sub_memory { message_acc->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]); cuda_drop_async(step_output, streams[0], gpu_indexes[0]); @@ -1407,7 +1009,8 @@ template struct int_sum_ciphertexts_vec_memory { bool mem_reuse = false; - int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + int_sum_ciphertexts_vec_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, @@ -1460,7 +1063,8 @@ template struct int_sum_ciphertexts_vec_memory { streams[0], gpu_indexes[0]); } - int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + int_sum_ciphertexts_vec_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, @@ -1496,7 +1100,7 @@ template struct int_sum_ciphertexts_vec_memory { streams[0], gpu_indexes[0]); } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { cuda_drop_async(d_smart_copy_in, streams[0], gpu_indexes[0]); cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]); @@ -1523,7 +1127,7 @@ template struct int_mul_memory { int_radix_params params; - int_mul_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + int_mul_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; @@ -1597,7 +1201,7 @@ template struct int_mul_memory { small_lwe_vector); } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { cuda_drop_async(vector_result_sb, streams[0], gpu_indexes[0]); cuda_drop_async(block_mul_res, streams[0], gpu_indexes[0]); @@ -1621,7 +1225,8 @@ template struct int_logical_scalar_shift_buffer { bool reuse_memory = false; - int_logical_scalar_shift_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + int_logical_scalar_shift_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type, int_radix_params params, @@ -1712,13 +1317,11 @@ template struct int_logical_scalar_shift_buffer 
{ } } - int_logical_scalar_shift_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, - SHIFT_OR_ROTATE_TYPE shift_type, - int_radix_params params, - uint32_t num_radix_blocks, - bool allocate_gpu_memory, - Torus *pre_allocated_buffer) { + int_logical_scalar_shift_buffer( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type, + int_radix_params params, uint32_t num_radix_blocks, + bool allocate_gpu_memory, Torus *pre_allocated_buffer) { this->shift_type = shift_type; this->params = params; tmp_rotated = pre_allocated_buffer; @@ -1800,7 +1403,7 @@ template struct int_logical_scalar_shift_buffer { } } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { for (auto &buffer : lut_buffers_bivariate) { buffer->release(streams, gpu_indexes, gpu_count); @@ -1826,8 +1429,9 @@ template struct int_arithmetic_scalar_shift_buffer { cudaStream_t *local_streams_2; uint32_t active_gpu_count; - int_arithmetic_scalar_shift_buffer(cudaStream_t *streams, - uint32_t *gpu_indexes, uint32_t gpu_count, + int_arithmetic_scalar_shift_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type, int_radix_params params, uint32_t num_radix_blocks, @@ -1971,7 +1575,7 @@ template struct int_arithmetic_scalar_shift_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { for (uint j = 0; j < active_gpu_count; j++) { cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]); @@ -2004,9 +1608,10 @@ template struct int_zero_out_if_buffer { cudaStream_t *false_streams; uint32_t active_gpu_count; - int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t num_radix_blocks, bool allocate_gpu_memory) { + int_zero_out_if_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_radix_blocks, + bool allocate_gpu_memory) { this->params = params; active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); @@ -2025,7 +1630,7 @@ template struct int_zero_out_if_buffer { } } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { cuda_drop_async(tmp, streams[0], gpu_indexes[0]); for (uint j = 0; j < active_gpu_count; j++) { @@ -2050,7 +1655,7 @@ template struct int_cmux_buffer { int_radix_params params; - int_cmux_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + int_cmux_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, std::function predicate_lut_f, int_radix_params params, uint32_t num_radix_blocks, @@ -2121,7 +1726,7 @@ template struct int_cmux_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { predicate_lut->release(streams, gpu_indexes, gpu_count); delete predicate_lut; @@ -2152,9 +1757,9 @@ template struct int_are_all_block_true_buffer { // value). 
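// ---------------------------------------------------------------------------
// [Editor's note, not part of the patch] All int_*_buffer structs in this
// header share one lifecycle, and the const-qualification introduced above is
// what it relies on: the caller keeps ownership of the stream and GPU-index
// arrays and passes them read-only to the constructor, to the operation, and
// to release(). A minimal sketch, assuming an already-populated
// int_radix_params `params`, that cuda_create_stream() from device.h returns
// a cudaStream_t, and that COMPARISON_TYPE has an EQ variant:
//
//   cudaStream_t streams[1] = {cuda_create_stream(0)};
//   uint32_t const gpu_indexes[1] = {0};
//   auto *eq_mem = new int_comparison_eq_buffer<uint64_t>(
//       streams, gpu_indexes, /*gpu_count=*/1, COMPARISON_TYPE::EQ, params,
//       /*num_radix_blocks=*/4, /*allocate_gpu_memory=*/true);
//   // ... launch the equality test that reads its LUTs from eq_mem ...
//   eq_mem->release(streams, gpu_indexes, /*gpu_count=*/1);
//   delete eq_mem;
// ---------------------------------------------------------------------------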
std::unordered_map *> is_equal_to_lut_map; - int_are_all_block_true_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, COMPARISON_TYPE op, - int_radix_params params, + int_are_all_block_true_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + COMPARISON_TYPE op, int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; @@ -2174,7 +1779,7 @@ template struct int_are_all_block_true_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { for (auto &lut : is_equal_to_lut_map) { lut.second->release(streams, gpu_indexes, gpu_count); @@ -2197,9 +1802,10 @@ template struct int_comparison_eq_buffer { int_are_all_block_true_buffer *are_all_block_true_buffer; - int_comparison_eq_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, COMPARISON_TYPE op, - int_radix_params params, uint32_t num_radix_blocks, + int_comparison_eq_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + COMPARISON_TYPE op, int_radix_params params, + uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; this->op = op; @@ -2272,7 +1878,7 @@ template struct int_comparison_eq_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { operator_lut->release(streams, gpu_indexes, gpu_count); delete operator_lut; @@ -2298,7 +1904,8 @@ template struct int_tree_sign_reduction_buffer { Torus *tmp_x; Torus *tmp_y; - int_tree_sign_reduction_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + int_tree_sign_reduction_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, std::function operator_f, int_radix_params params, @@ -2340,7 +1947,7 @@ template struct int_tree_sign_reduction_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { tree_inner_leaf_lut->release(streams, gpu_indexes, gpu_count); delete tree_inner_leaf_lut; @@ -2369,9 +1976,10 @@ template struct int_comparison_diff_buffer { Torus *tmp_signs_b; int_radix_lut *reduce_signs_lut; - int_comparison_diff_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, COMPARISON_TYPE op, - int_radix_params params, uint32_t num_radix_blocks, + int_comparison_diff_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + COMPARISON_TYPE op, int_radix_params params, + uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; this->op = op; @@ -2415,7 +2023,7 @@ template struct int_comparison_diff_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { tree_buffer->release(streams, gpu_indexes, gpu_count); delete tree_buffer; @@ -2463,10 +2071,11 @@ template struct int_comparison_buffer { cudaStream_t *msb_streams; uint32_t active_gpu_count; - int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, COMPARISON_TYPE op, - int_radix_params params, uint32_t num_radix_blocks, - bool is_signed, bool allocate_gpu_memory) { + int_comparison_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + 
COMPARISON_TYPE op, int_radix_params params, + uint32_t num_radix_blocks, bool is_signed, + bool allocate_gpu_memory) { this->params = params; this->op = op; this->is_signed = is_signed; @@ -2610,7 +2219,7 @@ template struct int_comparison_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { switch (op) { case COMPARISON_TYPE::MAX: @@ -2701,8 +2310,9 @@ template struct int_div_rem_memory { // allocate and initialize if needed, temporary arrays used to calculate // cuda integer div_rem operation - void init_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, uint32_t num_blocks) { + void init_temporary_buffers(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + uint32_t num_blocks) { uint32_t big_lwe_size = params.big_lwe_dimension + 1; // non boolean temporary arrays, with `num_blocks` blocks @@ -2749,8 +2359,9 @@ template struct int_div_rem_memory { } // initialize lookup tables for div_rem operation - void init_lookup_tables(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, uint32_t num_blocks) { + void init_lookup_tables(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + uint32_t num_blocks) { uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus); // create and generate masking_luts_1[] and masking_lut_2[] @@ -2890,7 +2501,7 @@ template struct int_div_rem_memory { } } - int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes, + int_div_rem_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_blocks, bool allocate_gpu_memory) { active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count); @@ -2930,7 +2541,7 @@ template struct int_div_rem_memory { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus); @@ -3033,9 +2644,9 @@ template struct int_last_block_inner_propagate_memory { int_radix_params params; int_last_block_inner_propagate_memory( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_radix_params params, SIGNED_OPERATION op, uint32_t num_radix_blocks, - bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, SIGNED_OPERATION op, + uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; auto message_modulus = params.message_modulus; @@ -3100,7 +2711,7 @@ template struct int_last_block_inner_propagate_memory { gpu_indexes[0]); } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { last_block_inner_propagation_lut->release(streams, gpu_indexes, gpu_count); delete last_block_inner_propagation_lut; @@ -3114,8 +2725,9 @@ template struct int_resolve_signed_overflow_memory { Torus *x; - int_resolve_signed_overflow_memory(cudaStream_t *streams, - uint32_t *gpu_indexes, uint32_t gpu_count, + int_resolve_signed_overflow_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, bool allocate_gpu_memory) { @@ -3160,7 +2772,7 @@ template struct int_resolve_signed_overflow_memory { 
resolve_overflow_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { resolve_overflow_lut->release(streams, gpu_indexes, gpu_count); delete resolve_overflow_lut; @@ -3190,7 +2802,8 @@ template struct int_signed_overflowing_add_or_sub_memory { // allocate temporary arrays used to calculate // cuda integer signed overflowing add or sub - void allocate_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes, + void allocate_temporary_buffers(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, uint32_t num_blocks) { uint32_t big_lwe_size = params.big_lwe_dimension + 1; @@ -3210,9 +2823,9 @@ template struct int_signed_overflowing_add_or_sub_memory { // constructor without memory reuse int_signed_overflowing_add_or_sub_memory( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op, - bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_blocks, + SIGNED_OPERATION op, bool allocate_gpu_memory) { this->params = params; active_gpu_count = get_active_gpu_count(num_blocks, gpu_count); @@ -3241,7 +2854,7 @@ template struct int_signed_overflowing_add_or_sub_memory { streams, gpu_indexes, gpu_count, params, allocate_gpu_memory); } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { // memory objects for other operations scp_mem->release(streams, gpu_indexes, gpu_count); @@ -3273,7 +2886,7 @@ template struct int_bitop_buffer { int_radix_params params; int_radix_lut *lut; - int_bitop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, + int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, BITOP_TYPE op, int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { @@ -3337,7 +2950,7 @@ template struct int_bitop_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { lut->release(streams, gpu_indexes, gpu_count); delete lut; @@ -3351,9 +2964,10 @@ template struct int_scalar_mul_buffer { Torus *preshifted_buffer; Torus *all_shifted_buffer; - int_scalar_mul_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t num_radix_blocks, bool allocate_gpu_memory) { + int_scalar_mul_buffer(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_radix_blocks, + bool allocate_gpu_memory) { this->params = params; if (allocate_gpu_memory) { @@ -3390,7 +3004,7 @@ template struct int_scalar_mul_buffer { } } - void release(cudaStream_t *streams, uint32_t *gpu_indexes, + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count); delete sum_ciphertexts_vec_mem; @@ -3398,4 +3012,4 @@ template struct int_scalar_mul_buffer { } }; -#endif // CUDA_INTEGER_H +#endif // CUDA_INTEGER_UTILITIES_H diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 9de953d053..00e85e5246 100644 --- 
a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -1,21 +1,23 @@ #ifndef CNCRT_KS_H_ #define CNCRT_KS_H_ -#include +#include extern "C" { void cuda_keyswitch_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, - void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples); + void const *lwe_output_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in, + uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, + uint32_t num_samples); void cuda_keyswitch_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, - void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples); + void const *lwe_output_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in, + uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, + uint32_t num_samples); void scratch_packing_keyswitch_lwe_list_to_glwe_64( void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer, @@ -23,10 +25,11 @@ void scratch_packing_keyswitch_lwe_list_to_glwe_64( bool allocate_gpu_memory); void cuda_packing_keyswitch_lwe_list_to_glwe_64( - void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, - void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension, - uint32_t output_glwe_dimension, uint32_t output_polynomial_size, - uint32_t base_log, uint32_t level_count, uint32_t num_lwes); + void *stream, uint32_t gpu_index, void *glwe_array_out, + void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer, + uint32_t input_lwe_dimension, uint32_t output_glwe_dimension, + uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count, + uint32_t num_lwes); void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream, uint32_t gpu_index, diff --git a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h index 1427622c1d..10c476c12b 100644 --- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h +++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h @@ -1,50 +1,48 @@ #ifndef CUDA_LINALG_H_ #define CUDA_LINALG_H_ -#include "programmable_bootstrap.h" -#include -#include +#include extern "C" { void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_array_in, + void const *lwe_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count); void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_array_in, + void const *lwe_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count); void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_array_in_1, - void *lwe_array_in_2, + void const *lwe_array_in_1, + void const *lwe_array_in_2, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count); void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_array_in_1, - void *lwe_array_in_2, + void const *lwe_array_in_1, + void const *lwe_array_in_2, uint32_t 
input_lwe_dimension,
                                        uint32_t input_lwe_ciphertext_count);
 
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
-    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
-    void *plaintext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *plaintext_array_in,
+    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
 
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
-    void *plaintext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *plaintext_array_in,
+    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
 
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
-    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
-    void *cleartext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *cleartext_array_in,
+    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
 
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
-    void *cleartext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *cleartext_array_in,
+    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
 }
 
 #endif // CUDA_LINALG_H_
diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h
new file mode 100644
index 0000000000..f4317eab65
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h
@@ -0,0 +1,7 @@
+#ifndef CUDA_PBS_ENUMS_H
+#define CUDA_PBS_ENUMS_H
+
+enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
+enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
+
+#endif // CUDA_PBS_ENUMS_H
diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
similarity index 84%
rename from backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
rename to backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
index 51b2a62040..1e109b4409 100644
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -1,38 +1,7 @@
-#ifndef CUDA_MULTI_BIT_H
-#define CUDA_MULTI_BIT_H
+#ifndef CUDA_MULTI_BIT_UTILITIES_H
+#define CUDA_MULTI_BIT_UTILITIES_H
 
-#include "programmable_bootstrap.h"
-#include <cstdint>
-
-extern "C" {
-
-bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples);
-
-void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-    void *stream, uint32_t gpu_index, void *dest, void *src,
-    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
-    uint32_t polynomial_size, uint32_t grouping_factor);
-
-void scratch_cuda_multi_bit_programmable_bootstrap_64(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t
level_count, - uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); - -void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes, - void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, - int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, - uint32_t lut_stride); - -void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, - uint32_t gpu_index, - int8_t **pbs_buffer); -} +#include "pbs_utilities.h" template bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( @@ -53,8 +22,9 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap( template void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -70,8 +40,9 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap( template void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -86,8 +57,9 @@ void scratch_cuda_multi_bit_programmable_bootstrap( template void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -121,6 +93,10 @@ template uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap( uint32_t polynomial_size); +template +uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, + uint32_t polynomial_size); + template struct pbs_buffer { int8_t *d_mem_keybundle = NULL; int8_t *d_mem_acc_step_one = NULL; @@ -288,8 +264,4 @@ template struct pbs_buffer { } }; -template -uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, - uint32_t polynomial_size); - -#endif // CUDA_MULTI_BIT_H +#endif // 
CUDA_MULTI_BIT_UTILITIES_H diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h similarity index 74% rename from backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h rename to backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h index fa7ddffcbc..b5451f5bdd 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h @@ -1,87 +1,10 @@ -#ifndef CUDA_BOOTSTRAP_H -#define CUDA_BOOTSTRAP_H +#ifndef CUDA_BOOTSTRAP_UTILITIES_H +#define CUDA_BOOTSTRAP_UTILITIES_H #include "device.h" -#include - -enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 }; -enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 }; - -extern "C" { -void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, - void *input1, void *input2, void *output, - uint32_t polynomial_size, - uint32_t total_polynomials); - -void cuda_convert_lwe_programmable_bootstrap_key_32( - void *stream, uint32_t gpu_index, void *dest, void *src, - uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, - uint32_t polynomial_size); - -void cuda_convert_lwe_programmable_bootstrap_key_64( - void *stream, uint32_t gpu_index, void *dest, void *src, - uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, - uint32_t polynomial_size); - -void scratch_cuda_programmable_bootstrap_amortized_32( - void *stream, uint32_t gpu_index, int8_t **pbs_buffer, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); - -void scratch_cuda_programmable_bootstrap_amortized_64( - void *stream, uint32_t gpu_index, int8_t **pbs_buffer, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); - -void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes, - void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, - int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples); - -void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes, - void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, - int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples); - -void cleanup_cuda_programmable_bootstrap_amortized(void *stream, - uint32_t gpu_index, - int8_t **pbs_buffer); - -void scratch_cuda_programmable_bootstrap_32( - void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); - -void scratch_cuda_programmable_bootstrap_64( - void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); - -void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes, - void *lwe_array_in, 
void *lwe_input_indexes, void *bootstrapping_key, - int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride); - -void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes, - void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, - int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride); - -void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, - int8_t **pbs_buffer); -} +#include "pbs_enums.h" +#include "vector_types.h" +#include template uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one( @@ -327,8 +250,9 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, @@ -337,8 +261,9 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, @@ -348,8 +273,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, @@ -408,4 +334,4 @@ __device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block( #endif -#endif // CUDA_BOOTSTRAP_H +#endif // CUDA_BOOTSTRAP_UTILITIES_H diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h new file mode 100644 index 0000000000..c90d671fdb --- /dev/null +++ 
b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h @@ -0,0 +1,86 @@ +#ifndef CUDA_BOOTSTRAP_H +#define CUDA_BOOTSTRAP_H + +#include "pbs_enums.h" +#include + +extern "C" { +void cuda_fourier_polynomial_mul(void *stream, uint32_t gpu_index, + void const *input1, void const *input2, + void *output, uint32_t polynomial_size, + uint32_t total_polynomials); + +void cuda_convert_lwe_programmable_bootstrap_key_32( + void *stream, uint32_t gpu_index, void *dest, void const *src, + uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size); + +void cuda_convert_lwe_programmable_bootstrap_key_64( + void *stream, uint32_t gpu_index, void *dest, void const *src, + uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size); + +void scratch_cuda_programmable_bootstrap_amortized_32( + void *stream, uint32_t gpu_index, int8_t **pbs_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); + +void scratch_cuda_programmable_bootstrap_amortized_64( + void *stream, uint32_t gpu_index, int8_t **pbs_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); + +void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + uint32_t num_samples); + +void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + uint32_t num_samples); + +void cleanup_cuda_programmable_bootstrap_amortized(void *stream, + uint32_t gpu_index, + int8_t **pbs_buffer); + +void scratch_cuda_programmable_bootstrap_32( + void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); + +void scratch_cuda_programmable_bootstrap_64( + void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); + +void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride); + +void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const 
*lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride); + +void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, + int8_t **pbs_buffer); +} +#endif // CUDA_BOOTSTRAP_H diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h new file mode 100644 index 0000000000..fceac32e97 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h @@ -0,0 +1,38 @@ +#ifndef CUDA_MULTI_BIT_H +#define CUDA_MULTI_BIT_H + +#include "pbs_enums.h" +#include "stdint.h" + +extern "C" { + +bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit( + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t num_samples); + +void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( + void *stream, uint32_t gpu_index, void *dest, void const *src, + uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size, uint32_t grouping_factor); + +void scratch_cuda_multi_bit_programmable_bootstrap_64( + void *stream, uint32_t gpu_index, int8_t **pbs_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); + +void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t lut_stride); + +void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, + uint32_t gpu_index, + int8_t **pbs_buffer); +} + +#endif // CUDA_MULTI_BIT_H diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu index a898eabd6b..90d1ca35f3 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu @@ -22,8 +22,8 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream, } void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index, - void *lwe_array_out, void *glwe_array_in, - uint32_t *nth_array, uint32_t num_nths, + void *lwe_array_out, void const *glwe_array_in, + uint32_t const *nth_array, uint32_t num_nths, uint32_t glwe_dimension, uint32_t polynomial_size) { @@ -31,43 +31,43 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index, case 256: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, + (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths, glwe_dimension); break; case 512: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, + (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths, glwe_dimension); break; case 1024: host_sample_extract>( static_cast(stream), 
gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, + (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths, glwe_dimension); break; case 2048: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, + (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths, glwe_dimension); break; case 4096: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, + (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths, glwe_dimension); break; case 8192: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, + (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths, glwe_dimension); break; case 16384: host_sample_extract>( static_cast(stream), gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths, + (uint64_t const *)glwe_array_in, (uint32_t const *)nth_array, num_nths, glwe_dimension); break; default: diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh index 203499d89a..22e93c7539 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh @@ -27,8 +27,9 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream, } template -__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in, - uint32_t *nth_array, uint32_t glwe_dimension) { +__global__ void sample_extract(Torus *lwe_array_out, Torus const *glwe_array_in, + uint32_t const *nth_array, + uint32_t glwe_dimension) { const int input_id = blockIdx.x; @@ -50,8 +51,9 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in, template __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index, - Torus *lwe_array_out, Torus *glwe_array_in, - uint32_t *nth_array, uint32_t num_nths, + Torus *lwe_array_out, + Torus const *glwe_array_in, + uint32_t const *nth_array, uint32_t num_nths, uint32_t glwe_dimension) { cudaSetDevice(gpu_index); diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index 20897b141b..ec1f6bf022 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -37,16 +37,18 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( */ void cuda_keyswitch_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, - void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples) { + void const *lwe_output_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in, + uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, + uint32_t num_samples) { host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), - static_cast(lwe_output_indexes), - static_cast(lwe_array_in), - static_cast(lwe_input_indexes), static_cast(ksk), - lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); + static_cast(lwe_output_indexes), + 
+      static_cast<const uint64_t *>(lwe_array_in),
+      static_cast<const uint64_t *>(lwe_input_indexes),
+      static_cast<const uint64_t *>(ksk), lwe_dimension_in, lwe_dimension_out,
+      base_log, level_count, num_samples);
 }
 
 void scratch_packing_keyswitch_lwe_list_to_glwe_64(
@@ -61,18 +63,19 @@ void scratch_packing_keyswitch_lwe_list_to_glwe_64(
  * ciphertexts.
  */
 void cuda_packing_keyswitch_lwe_list_to_glwe_64(
-    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
-    void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
-    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
+    void *stream, uint32_t gpu_index, void *glwe_array_out,
+    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
+    uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
+    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_lwes) {
 
   host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(glwe_array_out),
-      static_cast<uint64_t *>(lwe_array_in),
-      static_cast<uint64_t *>(fp_ksk_array), fp_ks_buffer, input_lwe_dimension,
-      output_glwe_dimension, output_polynomial_size, base_log, level_count,
-      num_lwes);
+      static_cast<const uint64_t *>(lwe_array_in),
+      static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
+      input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
+      base_log, level_count, num_lwes);
 }
 
 void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
index abf3e71804..15dc599dfb 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -101,9 +101,10 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
 template <typename Torus>
 __host__ void host_keyswitch_lwe_ciphertext_vector(
     cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    Torus const *lwe_output_indexes, Torus const *lwe_array_in,
+    Torus const *lwe_input_indexes, Torus const *ksk, uint32_t lwe_dimension_in,
+    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples) {
 
   cudaSetDevice(gpu_index);
 
@@ -124,13 +125,13 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
 }
 
 template <typename Torus>
-void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
-                             uint32_t gpu_count,
+void execute_keyswitch_async(cudaStream_t const *streams,
+                             uint32_t const *gpu_indexes, uint32_t gpu_count,
                              const LweArrayVariant<Torus> &lwe_array_out,
                              const LweArrayVariant<Torus> &lwe_output_indexes,
                              const LweArrayVariant<Torus> &lwe_array_in,
                              const LweArrayVariant<Torus> &lwe_input_indexes,
-                             Torus **ksks, uint32_t lwe_dimension_in,
+                             Torus *const *ksks, uint32_t lwe_dimension_in,
                              uint32_t lwe_dimension_out, uint32_t base_log,
                              uint32_t level_count, uint32_t num_samples) {
 
@@ -176,9 +177,9 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
 // different thread blocks at the x-axis to work on that input.
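A note on the two const shapes used above: inputs such as lwe_array_in become pointer-to-const data (void const * / Torus const *), while arrays of handles such as streams, gpu_indexes and ksks become const arrays of still-mutable pointers (void *const * / Torus *const *). In C and C++ a T ** argument converts implicitly to T *const * but not to T const **, so this choice keeps existing call sites cast-free. It also matches what a Rust caller produces naturally; a minimal sketch (the aliases and variables are illustrative only):

    use std::ffi::c_void;

    // `void *const *streams` / `Torus *const *ksks`: a read-only array whose
    // entries are still-mutable pointers.
    type HandleArray = *const *mut c_void;

    // `void const *lwe_array_in`: the pointed-to ciphertext data is read-only.
    type ConstData = *const c_void;

    fn main() {
        let streams: Vec<*mut c_void> = Vec::new();
        // A shared slice of raw pointers lends out as *const *mut with no cast:
        let _streams_arg: HandleArray = streams.as_ptr();

        let lwe_in: Vec<u64> = Vec::new();
        let _lwe_arg: ConstData = lwe_in.as_ptr() as ConstData;
    }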
 template <typename Torus>
 __device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
-    Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count) {
+    Torus *glwe_out, Torus const *lwe_in, Torus const *fp_ksk,
+    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
 
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   size_t glwe_size = (glwe_dimension + 1);
@@ -225,12 +226,11 @@ __device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
 // Assumes there are (glwe_dimension+1) * polynomial_size threads split through
 // different thread blocks at the x-axis to work on that input.
 template <typename Torus>
-__global__ void
-packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
-                                   Torus *fp_ksk, uint32_t lwe_dimension_in,
-                                   uint32_t glwe_dimension,
-                                   uint32_t polynomial_size, uint32_t base_log,
-                                   uint32_t level_count, Torus *d_mem) {
+__global__ void packing_keyswitch_lwe_list_to_glwe(
+    Torus *glwe_array_out, Torus const *lwe_array_in, Torus const *fp_ksk,
+    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    Torus *d_mem) {
 
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
@@ -276,7 +276,7 @@ __global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
 template <typename Torus>
 __host__ void host_packing_keyswitch_lwe_list_to_glwe(
     cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
-    Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer,
+    Torus const *lwe_array_in, Torus const *fp_ksk_array, int8_t *fp_ks_buffer,
     uint32_t lwe_dimension_in, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
     uint32_t num_lwes) {
diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
index 4baeb7be01..5177db7216 100644
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -113,7 +113,7 @@ void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
 }
 
 /// Copy memory within a GPU asynchronously
-void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
+void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                   cudaStream_t stream, uint32_t gpu_index) {
   if (size == 0)
     return;
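The new const on the src argument of cuda_memcpy_async_gpu_to_gpu mirrors the CUDA runtime itself, where cudaMemcpyAsync takes const void *src, and it is what turns the src pointer into a *const on the Rust side. A hand-written sketch of the corresponding declaration, under the assumption that cudaStream_t crosses the FFI as an opaque pointer:

    use std::ffi::c_void;

    extern "C" {
        pub fn cuda_memcpy_async_gpu_to_gpu(
            dest: *mut c_void,
            src: *const c_void, // read-only after this change
            size: u64,
            stream: *mut c_void, // cudaStream_t as an opaque handle (assumption)
            gpu_index: u32,
        );
    }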
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
index 6bc5b58df5..2ae72ad2a6 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
@@ -1,8 +1,8 @@
 #include "integer/addition.cuh"
 
 void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
     uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
@@ -23,9 +23,10 @@
 }
 
 void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
-    void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
-    void **bsks, void **ksks, uint32_t num_blocks) {
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks) {
   auto mem = (int_signed_overflowing_add_or_sub_memory<uint64_t> *)mem_ptr;
   SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
@@ -33,13 +34,13 @@ void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
 
   host_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lhs), static_cast<uint64_t *>(rhs),
-      static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t **)(ksks), mem,
-      num_blocks);
+      static_cast<uint64_t *>(lhs), static_cast<const uint64_t *>(rhs),
+      static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t *const *)(ksks),
+      mem, num_blocks);
 }
 
-void cleanup_signed_overflowing_add_or_sub(void **streams,
-                                           uint32_t *gpu_indexes,
+void cleanup_signed_overflowing_add_or_sub(void *const *streams,
+                                           uint32_t const *gpu_indexes,
                                            uint32_t gpu_count,
                                            int8_t **mem_ptr_void) {
   int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr =
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
index daf90239d3..6699ec0db2 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -3,13 +3,13 @@
 
 #include "crypto/keyswitch.cuh"
 #include "device.h"
-#include "integer.h"
 #include "integer/comparison.cuh"
 #include "integer/integer.cuh"
+#include "integer/integer_utilities.h"
 #include "integer/negation.cuh"
 #include "integer/scalar_shifts.cuh"
 #include "linear_algebra.h"
-#include "programmable_bootstrap.h"
+#include "pbs/programmable_bootstrap.h"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include
@@ -20,10 +20,11 @@
 
 template <typename Torus>
 void host_resolve_signed_overflow(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *result, Torus *last_block_inner_propagation,
-    Torus *last_block_input_carry, Torus *last_block_output_carry,
-    int_resolve_signed_overflow_memory<Torus> *mem, void **bsks, Torus **ksks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *result, Torus *last_block_inner_propagation,
+    Torus const *last_block_input_carry, Torus *last_block_output_carry,
+    int_resolve_signed_overflow_memory<Torus> *mem, void *const *bsks,
+    Torus *const *ksks) {
 
   auto x = mem->x;
@@ -53,7 +54,8 @@ void host_resolve_signed_overflow(
 
 template <typename Torus>
 __host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count,
     int_signed_overflowing_add_or_sub_memory<Torus> **mem_ptr,
     uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params,
     bool allocate_gpu_memory) {
@@ -69,9 +71,9 @@ __host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
  */
 template <typename Torus>
 __host__ void host_integer_signed_overflowing_add_or_sub_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lhs, Torus *rhs, Torus *overflowed, SIGNED_OPERATION op, void **bsks,
-    uint64_t **ksks,
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lhs, Torus const
*rhs, Torus *overflowed, + SIGNED_OPERATION op, void *const *bsks, uint64_t *const *ksks, int_signed_overflowing_add_or_sub_memory *mem_ptr, uint32_t num_blocks) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu index 6330db7130..fccd936952 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu @@ -1,8 +1,8 @@ #include "integer/bitwise_ops.cuh" void scratch_cuda_integer_radix_bitop_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t glwe_dimension, uint32_t polynomial_size, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t lwe_ciphertext_count, @@ -21,21 +21,23 @@ void scratch_cuda_integer_radix_bitop_kb_64( } void cuda_bitop_integer_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr, - void **bsks, void **ksks, uint32_t lwe_ciphertext_count) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2, + int8_t *mem_ptr, void *const *bsks, void *const *ksks, + uint32_t lwe_ciphertext_count) { host_integer_radix_bitop_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), - static_cast(lwe_array_1), - static_cast(lwe_array_2), + static_cast(lwe_array_1), + static_cast(lwe_array_2), (int_bitop_buffer *)mem_ptr, bsks, (uint64_t **)(ksks), lwe_ciphertext_count); } -void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes, - uint32_t gpu_count, int8_t **mem_ptr_void) { +void cleanup_cuda_integer_bitop(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void) { int_bitop_buffer *mem_ptr = (int_bitop_buffer *)(*mem_ptr_void); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh index 6569ff2d18..abbb8f9047 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh @@ -4,7 +4,7 @@ #include "crypto/keyswitch.cuh" #include "device.h" #include "integer.cuh" -#include "integer.h" +#include "integer/integer_utilities.h" #include "pbs/programmable_bootstrap_classic.cuh" #include "pbs/programmable_bootstrap_multibit.cuh" #include "polynomial/functions.cuh" @@ -12,12 +12,11 @@ #include template -__host__ void -host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_array_1, Torus *lwe_array_2, - int_bitop_buffer *mem_ptr, void **bsks, - Torus **ksks, uint32_t num_radix_blocks) { +__host__ void host_integer_radix_bitop_kb( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1, + Torus const *lwe_array_2, int_bitop_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) { auto lut = mem_ptr->lut; @@ -28,9 +27,10 @@ host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes, template __host__ void 
scratch_cuda_integer_radix_bitop_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_bitop_buffer **mem_ptr, uint32_t num_radix_blocks, - int_radix_params params, BITOP_TYPE op, bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_bitop_buffer **mem_ptr, + uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op, + bool allocate_gpu_memory) { *mem_ptr = new int_bitop_buffer(streams, gpu_indexes, gpu_count, op, params, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu index 68a501eac6..bf2e7eeafe 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu @@ -1,8 +1,8 @@ #include "integer/cmux.cuh" void scratch_cuda_integer_radix_cmux_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t glwe_dimension, uint32_t polynomial_size, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t lwe_ciphertext_count, @@ -24,23 +24,24 @@ void scratch_cuda_integer_radix_cmux_kb_64( } void cuda_cmux_integer_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void *lwe_condition, void *lwe_array_true, - void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks, - uint32_t lwe_ciphertext_count) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true, + void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t lwe_ciphertext_count) { host_integer_radix_cmux_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), - static_cast(lwe_condition), - static_cast(lwe_array_true), - static_cast(lwe_array_false), + static_cast(lwe_condition), + static_cast(lwe_array_true), + static_cast(lwe_array_false), (int_cmux_buffer *)mem_ptr, bsks, (uint64_t **)(ksks), lwe_ciphertext_count); } -void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes, +void cleanup_cuda_integer_radix_cmux(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index 9b72407f17..aea747c785 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -4,12 +4,13 @@ #include "integer.cuh" template -__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_array_input, Torus *lwe_condition, +__host__ void zero_out_if(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + Torus *lwe_array_out, Torus const *lwe_array_input, + Torus const *lwe_condition, int_zero_out_if_buffer *mem_ptr, - int_radix_lut *predicate, void **bsks, - Torus **ksks, uint32_t num_radix_blocks) { + int_radix_lut *predicate, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks) { cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; @@ -42,10 +43,11 @@ __host__ void 
zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes, template __host__ void host_integer_radix_cmux_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true, - Torus *lwe_array_false, int_cmux_buffer *mem_ptr, void **bsks, - Torus **ksks, uint32_t num_radix_blocks) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_condition, + Torus const *lwe_array_true, Torus const *lwe_array_false, + int_cmux_buffer *mem_ptr, void *const *bsks, Torus *const *ksks, + uint32_t num_radix_blocks) { auto params = mem_ptr->params; @@ -89,8 +91,8 @@ __host__ void host_integer_radix_cmux_kb( template __host__ void scratch_cuda_integer_radix_cmux_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_cmux_buffer **mem_ptr, + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_cmux_buffer **mem_ptr, std::function predicate_lut_f, uint32_t num_radix_blocks, int_radix_params params, bool allocate_gpu_memory) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu index 7f877722bf..528fd54bec 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu @@ -1,8 +1,8 @@ #include "integer/comparison.cuh" void scratch_cuda_integer_radix_comparison_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t glwe_dimension, uint32_t polynomial_size, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks, @@ -37,9 +37,10 @@ void scratch_cuda_integer_radix_comparison_kb_64( } void cuda_comparison_integer_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr, - void **bsks, void **ksks, uint32_t num_radix_blocks) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2, + int8_t *mem_ptr, void *const *bsks, void *const *ksks, + uint32_t num_radix_blocks) { int_comparison_buffer *buffer = (int_comparison_buffer *)mem_ptr; @@ -49,9 +50,9 @@ void cuda_comparison_integer_radix_ciphertext_kb_64( host_integer_radix_equality_check_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), - static_cast(lwe_array_1), - static_cast(lwe_array_2), buffer, bsks, (uint64_t **)(ksks), - num_radix_blocks); + static_cast(lwe_array_1), + static_cast(lwe_array_2), buffer, bsks, + (uint64_t **)(ksks), num_radix_blocks); break; case GT: case GE: @@ -60,8 +61,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64( host_integer_radix_difference_check_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), - static_cast(lwe_array_1), - static_cast(lwe_array_2), buffer, + static_cast(lwe_array_1), + static_cast(lwe_array_2), buffer, buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks), num_radix_blocks); break; @@ -70,16 +71,17 @@ void cuda_comparison_integer_radix_ciphertext_kb_64( host_integer_radix_maxmin_kb( 
(cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), - static_cast(lwe_array_1), - static_cast(lwe_array_2), buffer, bsks, (uint64_t **)(ksks), - num_radix_blocks); + static_cast(lwe_array_1), + static_cast(lwe_array_2), buffer, bsks, + (uint64_t **)(ksks), num_radix_blocks); break; default: PANIC("Cuda error: integer operation not supported") } } -void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes, +void cleanup_cuda_integer_comparison(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index 2b1235d87c..e3ced3ded2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -4,8 +4,8 @@ #include "crypto/keyswitch.cuh" #include "device.h" #include "integer.cuh" -#include "integer.h" #include "integer/cmux.cuh" +#include "integer/integer_utilities.h" #include "integer/negation.cuh" #include "integer/scalar_addition.cuh" #include "pbs/programmable_bootstrap_classic.cuh" @@ -16,9 +16,9 @@ // lwe_dimension + 1 threads // todo: This kernel MUST be refactored to a binary reduction template -__global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block, - uint32_t lwe_dimension, - uint32_t num_blocks) { +__global__ void +device_accumulate_all_blocks(Torus *output, Torus const *input_block, + uint32_t lwe_dimension, uint32_t num_blocks) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < lwe_dimension + 1) { auto block = &input_block[idx]; @@ -34,7 +34,7 @@ __global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block, template __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index, - Torus *output, Torus *input, + Torus *output, Torus const *input, uint32_t lwe_dimension, uint32_t num_radix_blocks) { @@ -57,10 +57,10 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index, */ template __host__ void are_all_comparisons_block_true( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_in, - int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, - uint32_t num_radix_blocks) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, + int_comparison_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks) { auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; @@ -159,10 +159,10 @@ __host__ void are_all_comparisons_block_true( */ template __host__ void is_at_least_one_comparisons_block_true( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_in, - int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, - uint32_t num_radix_blocks) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, + int_comparison_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks) { auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; @@ -239,10 +239,11 @@ __host__ void is_at_least_one_comparisons_block_true( // are_all_comparisons_block_true template __host__ void host_compare_with_zero_equality( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - 
Torus *lwe_array_out, Torus *lwe_array_in, - int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, - int32_t num_radix_blocks, int_radix_lut *zero_comparison) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in, + int_comparison_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks, int32_t num_radix_blocks, + int_radix_lut *zero_comparison) { auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; @@ -301,10 +302,10 @@ __host__ void host_compare_with_zero_equality( template __host__ void host_integer_radix_equality_check_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, - int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, - uint32_t num_radix_blocks) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1, + Torus const *lwe_array_2, int_comparison_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) { auto eq_buffer = mem_ptr->eq_buffer; @@ -325,12 +326,11 @@ __host__ void host_integer_radix_equality_check_kb( } template -__host__ void -compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_array_left, Torus *lwe_array_right, - int_comparison_buffer *mem_ptr, void **bsks, - Torus **ksks, uint32_t num_radix_blocks) { +__host__ void compare_radix_blocks_kb( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left, + Torus const *lwe_array_right, int_comparison_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) { auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; @@ -374,13 +374,12 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // (inferior, equal, superior) to one single shortint block containing the // final sign template -__host__ void -tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_block_comparisons, - int_tree_sign_reduction_buffer *tree_buffer, - std::function sign_handler_f, void **bsks, - Torus **ksks, uint32_t num_radix_blocks) { +__host__ void tree_sign_reduction( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_block_comparisons, + int_tree_sign_reduction_buffer *tree_buffer, + std::function sign_handler_f, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks) { auto params = tree_buffer->params; auto big_lwe_dimension = params.big_lwe_dimension; @@ -462,11 +461,11 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, template __host__ void host_integer_radix_difference_check_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right, - int_comparison_buffer *mem_ptr, - std::function reduction_lut_f, void **bsks, Torus **ksks, - uint32_t num_radix_blocks) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left, + Torus const *lwe_array_right, int_comparison_buffer *mem_ptr, + std::function reduction_lut_f, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks) { auto diff_buffer = 
mem_ptr->diff_buffer; @@ -477,8 +476,8 @@ __host__ void host_integer_radix_difference_check_kb( auto carry_modulus = params.carry_modulus; uint32_t packed_num_radix_blocks = num_radix_blocks; - auto lhs = lwe_array_left; - auto rhs = lwe_array_right; + Torus *lhs = (Torus *)lwe_array_left; + Torus *rhs = (Torus *)lwe_array_right; if (carry_modulus >= message_modulus) { // Packing is possible // Pack inputs @@ -586,10 +585,10 @@ __host__ void host_integer_radix_difference_check_kb( template __host__ void scratch_cuda_integer_radix_comparison_check_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_comparison_buffer **mem_ptr, uint32_t num_radix_blocks, - int_radix_params params, COMPARISON_TYPE op, bool is_signed, - bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_comparison_buffer **mem_ptr, + uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op, + bool is_signed, bool allocate_gpu_memory) { *mem_ptr = new int_comparison_buffer(streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, @@ -597,12 +596,11 @@ __host__ void scratch_cuda_integer_radix_comparison_check_kb( } template -__host__ void -host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_array_left, Torus *lwe_array_right, - int_comparison_buffer *mem_ptr, void **bsks, - Torus **ksks, uint32_t total_num_radix_blocks) { +__host__ void host_integer_radix_maxmin_kb( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_left, + Torus const *lwe_array_right, int_comparison_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks) { // Compute the sign host_integer_radix_difference_check_kb( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu index a40aedd7cb..ac570db934 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu @@ -1,11 +1,12 @@ #include "compression.cuh" void scratch_cuda_integer_compress_radix_ciphertext_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, - uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t compression_glwe_dimension, + uint32_t compression_polynomial_size, uint32_t lwe_dimension, + uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + uint32_t lwe_per_glwe, uint32_t storage_log_modulus, bool allocate_gpu_memory) { int_radix_params compression_params( @@ -21,12 +22,13 @@ void scratch_cuda_integer_compress_radix_ciphertext_64( allocate_gpu_memory); } void scratch_cuda_integer_decompress_radix_ciphertext_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size, - uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, - uint32_t 
lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t encryption_glwe_dimension, + uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension, + uint32_t compression_polynomial_size, uint32_t lwe_dimension, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory) { // Decompression doesn't keyswitch, so big and small dimensions are the same @@ -47,32 +49,31 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64( allocate_gpu_memory); } void cuda_integer_compress_radix_ciphertext_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths, - int8_t *mem_ptr) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk, + uint32_t num_nths, int8_t *mem_ptr) { host_integer_compress( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(glwe_array_out), - static_cast(lwe_array_in), (uint64_t **)(fp_ksk), num_nths, - (int_compression *)mem_ptr); + static_cast(lwe_array_in), (uint64_t *const *)(fp_ksk), + num_nths, (int_compression *)mem_ptr); } void cuda_integer_decompress_radix_ciphertext_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void *glwe_in, uint32_t *indexes_array, - uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array, + uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr) { host_integer_decompress( (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array_out), static_cast(glwe_in), - indexes_array, indexes_array_size, bsks, - (int_decompression *)mem_ptr); + static_cast(lwe_array_out), + static_cast(glwe_in), indexes_array, indexes_array_size, + bsks, (int_decompression *)mem_ptr); } -void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams, - uint32_t *gpu_indexes, - uint32_t gpu_count, - int8_t **mem_ptr_void) { +void cleanup_cuda_integer_compress_radix_ciphertext_64( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void) { int_compression *mem_ptr = (int_compression *)(*mem_ptr_void); @@ -80,7 +81,7 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams, } void cleanup_cuda_integer_decompress_radix_ciphertext_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { int_decompression *mem_ptr = diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index d6c179bb7a..88d598fef6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -2,9 +2,10 @@ #define CUDA_INTEGER_COMPRESSION_CUH #include "ciphertext.h" -#include "compression.h" #include "crypto/keyswitch.cuh" 
#include "device.h" +#include "integer/compression/compression.h" +#include "integer/compression/compression_utilities.h" #include "integer/integer.cuh" #include "linearalgebra/multiplication.cuh" #include "polynomial/functions.cuh" @@ -77,11 +78,12 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index, } template -__host__ void host_integer_compress(cudaStream_t *streams, - uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *glwe_array_out, Torus *lwe_array_in, - Torus **fp_ksk, uint32_t num_radix_blocks, - int_compression *mem_ptr) { +__host__ void +host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *glwe_array_out, + Torus const *lwe_array_in, Torus *const *fp_ksk, + uint32_t num_radix_blocks, + int_compression *mem_ptr) { auto compression_params = mem_ptr->compression_params; auto input_lwe_dimension = compression_params.small_lwe_dimension; @@ -138,9 +140,9 @@ __host__ void host_integer_compress(cudaStream_t *streams, } template -__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index, - uint32_t log_modulus, uint32_t input_len, - uint32_t initial_out_len) { +__global__ void extract(Torus *glwe_array_out, Torus const *array_in, + uint32_t index, uint32_t log_modulus, + uint32_t input_len, uint32_t initial_out_len) { auto nbits = sizeof(Torus) * 8; auto i = threadIdx.x + blockIdx.x * blockDim.x; @@ -176,7 +178,7 @@ __global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index, /// Extracts the glwe_index-nth GLWE ciphertext template __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index, - Torus *glwe_array_out, Torus *array_in, + Torus *glwe_array_out, Torus const *array_in, uint32_t glwe_index, int_decompression *mem_ptr) { if (array_in == glwe_array_out) @@ -219,15 +221,14 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index, } template -__host__ void -host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *d_lwe_array_out, - Torus *d_packed_glwe_in, uint32_t *h_indexes_array, - uint32_t indexes_array_size, void **d_bsks, - int_decompression *h_mem_ptr) { +__host__ void host_integer_decompress( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *d_lwe_array_out, Torus const *d_packed_glwe_in, + uint32_t const *h_indexes_array, uint32_t indexes_array_size, + void *const *d_bsks, int_decompression *h_mem_ptr) { auto d_indexes_array = h_mem_ptr->tmp_indexes_array; - cuda_memcpy_async_to_gpu(d_indexes_array, h_indexes_array, + cuda_memcpy_async_to_gpu(d_indexes_array, (void *)h_indexes_array, indexes_array_size * sizeof(uint32_t), streams[0], gpu_indexes[0]); @@ -355,10 +356,11 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, template __host__ void scratch_cuda_compress_integer_radix_ciphertext( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_compression **mem_ptr, uint32_t num_radix_blocks, - int_radix_params compression_params, uint32_t lwe_per_glwe, - uint32_t storage_log_modulus, bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_compression **mem_ptr, + uint32_t num_radix_blocks, int_radix_params compression_params, + uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + bool allocate_gpu_memory) { *mem_ptr = new int_compression( streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks, @@ -367,11 +369,11 @@ __host__ void 
scratch_cuda_compress_integer_radix_ciphertext( template __host__ void scratch_cuda_integer_decompress_radix_ciphertext( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_decompression **mem_ptr, uint32_t num_radix_blocks, - uint32_t body_count, int_radix_params encryption_params, - int_radix_params compression_params, uint32_t storage_log_modulus, - bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_decompression **mem_ptr, + uint32_t num_radix_blocks, uint32_t body_count, + int_radix_params encryption_params, int_radix_params compression_params, + uint32_t storage_log_modulus, bool allocate_gpu_memory) { *mem_ptr = new int_decompression( streams, gpu_indexes, gpu_count, encryption_params, compression_params, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu index ecd31991d4..c83d252186 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu @@ -1,8 +1,8 @@ #include "integer/div_rem.cuh" void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t glwe_dimension, uint32_t polynomial_size, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, @@ -20,20 +20,23 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( } void cuda_integer_div_rem_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient, - void *remainder, void *numerator, void *divisor, int8_t *mem_ptr, - void **bsks, void **ksks, uint32_t num_blocks) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *quotient, void *remainder, void const *numerator, void const *divisor, + int8_t *mem_ptr, void *const *bsks, void *const *ksks, + uint32_t num_blocks) { auto mem = (int_div_rem_memory *)mem_ptr; host_integer_div_rem_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(quotient), static_cast(remainder), - static_cast(numerator), static_cast(divisor), - bsks, (uint64_t **)(ksks), mem, num_blocks); + static_cast(numerator), + static_cast(divisor), bsks, (uint64_t **)(ksks), mem, + num_blocks); } -void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes, +void cleanup_cuda_integer_div_rem(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { int_div_rem_memory *mem_ptr = (int_div_rem_memory *)(*mem_ptr_void); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index 9e68cb408d..4bd9933f2c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -3,13 +3,13 @@ #include "crypto/keyswitch.cuh" #include "device.h" -#include "integer.h" #include "integer/comparison.cuh" #include "integer/integer.cuh" +#include "integer/integer_utilities.h" #include "integer/negation.cuh" #include "integer/scalar_shifts.cuh" #include "linear_algebra.h" -#include "programmable_bootstrap.h" +#include "pbs/programmable_bootstrap.h" #include 
"utils/helper.cuh" #include "utils/kernel_dimensions.cuh" #include @@ -160,21 +160,23 @@ template struct lwe_ciphertext_list { template __host__ void scratch_cuda_integer_div_rem_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_div_rem_memory **mem_ptr, uint32_t num_blocks, - int_radix_params params, bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_div_rem_memory **mem_ptr, + uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) { *mem_ptr = new int_div_rem_memory( streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory); } template -__host__ void -host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *quotient, Torus *remainder, - Torus *numerator, Torus *divisor, void **bsks, - uint64_t **ksks, int_div_rem_memory *mem_ptr, - uint32_t num_blocks) { +__host__ void host_integer_div_rem_kb(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *quotient, + Torus *remainder, Torus const *numerator, + Torus const *divisor, void *const *bsks, + uint64_t *const *ksks, + int_div_rem_memory *mem_ptr, + uint32_t num_blocks) { auto radix_params = mem_ptr->params; @@ -222,8 +224,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, lwe_ciphertext_list cleaned_merged_interesting_remainder( mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks); - numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, streams[0], - gpu_indexes[0]); + numerator_block_stack.clone_from((Torus *)numerator, 0, num_blocks - 1, + streams[0], gpu_indexes[0]); remainder1.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]); remainder2.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]); @@ -245,9 +247,9 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, streams[0], gpu_indexes[0]); interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block, streams[0], gpu_indexes[0]); - interesting_divisor.clone_from(divisor, 0, last_non_trivial_block, + interesting_divisor.clone_from((Torus *)divisor, 0, last_non_trivial_block, streams[0], gpu_indexes[0]); - divisor_ms_blocks.clone_from(divisor, + divisor_ms_blocks.clone_from((Torus *)divisor, (msb_bit_set + 1) / num_bits_in_message, num_blocks - 1, streams[0], gpu_indexes[0]); @@ -256,65 +258,67 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // msb_bit_set) the split versions share some bits they should not. So we do // one PBS on the last block of the interesting_divisor, and first block of // divisor_ms_blocks to trim out bits which should not be there - auto trim_last_interesting_divisor_bits = - [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - if ((msb_bit_set + 1) % num_bits_in_message == 0) { - return; - } - // The last block of the interesting part of the remainder - // can contain bits which we should not account for - // we have to zero them out. 
- - // Where the msb is set in the block - uint32_t pos_in_block = msb_bit_set % num_bits_in_message; - - // e.g 2 bits in message: - // if pos_in_block is 0, then we want to keep only first bit (right - // shift - // mask by 1) if pos_in_block is 1, then we want to keep the two - // bits - // (right shift mask by 0) - uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1); - - // Create mask of 1s on the message part, 0s in the carries - uint32_t full_message_mask = message_modulus - 1; - - // Shift the mask so that we will only keep bits we should - uint32_t shifted_mask = full_message_mask >> shift_amount; - - integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, interesting_divisor.last_block(), - interesting_divisor.last_block(), bsks, ksks, 1, - mem_ptr->masking_luts_1[shifted_mask]); - }; // trim_last_interesting_divisor_bits - - auto trim_first_divisor_ms_bits = - [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - if (divisor_ms_blocks.is_empty() || - ((msb_bit_set + 1) % num_bits_in_message) == 0) { - return; - } - // Where the msb is set in the block - uint32_t pos_in_block = msb_bit_set % num_bits_in_message; - - // e.g 2 bits in message: - // if pos_in_block is 0, then we want to discard the first bit (left - // shift mask by 1) if pos_in_block is 1, then we want to discard the - // two bits (left shift mask by 2) let shift_amount = - // num_bits_in_message - pos_in_block - uint32_t shift_amount = pos_in_block + 1; - uint32_t full_message_mask = message_modulus - 1; - uint32_t shifted_mask = full_message_mask << shift_amount; - - // Keep the mask within the range of message bits, so that - // the estimated degree of the output is < msg_modulus - shifted_mask = shifted_mask & full_message_mask; - - integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(), - divisor_ms_blocks.first_block(), bsks, ksks, 1, - mem_ptr->masking_luts_2[shifted_mask]); - }; // trim_first_divisor_ms_bits + auto trim_last_interesting_divisor_bits = [&](cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count) { + if ((msb_bit_set + 1) % num_bits_in_message == 0) { + return; + } + // The last block of the interesting part of the remainder + // can contain bits which we should not account for + // we have to zero them out. 
+ + // Where the msb is set in the block + uint32_t pos_in_block = msb_bit_set % num_bits_in_message; + + // e.g 2 bits in message: + // if pos_in_block is 0, then we want to keep only first bit (right + // shift + // mask by 1) if pos_in_block is 1, then we want to keep the two + // bits + // (right shift mask by 0) + uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1); + + // Create mask of 1s on the message part, 0s in the carries + uint32_t full_message_mask = message_modulus - 1; + + // Shift the mask so that we will only keep bits we should + uint32_t shifted_mask = full_message_mask >> shift_amount; + + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, interesting_divisor.last_block(), + interesting_divisor.last_block(), bsks, ksks, 1, + mem_ptr->masking_luts_1[shifted_mask]); + }; // trim_last_interesting_divisor_bits + + auto trim_first_divisor_ms_bits = [&](cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count) { + if (divisor_ms_blocks.is_empty() || + ((msb_bit_set + 1) % num_bits_in_message) == 0) { + return; + } + // Where the msb is set in the block + uint32_t pos_in_block = msb_bit_set % num_bits_in_message; + + // e.g 2 bits in message: + // if pos_in_block is 0, then we want to discard the first bit (left + // shift mask by 1) if pos_in_block is 1, then we want to discard the + // two bits (left shift mask by 2) let shift_amount = + // num_bits_in_message - pos_in_block + uint32_t shift_amount = pos_in_block + 1; + uint32_t full_message_mask = message_modulus - 1; + uint32_t shifted_mask = full_message_mask << shift_amount; + + // Keep the mask within the range of message bits, so that + // the estimated degree of the output is < msg_modulus + shifted_mask = shifted_mask & full_message_mask; + + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(), + divisor_ms_blocks.first_block(), bsks, ksks, 1, + mem_ptr->masking_luts_2[shifted_mask]); + }; // trim_first_divisor_ms_bits // This does // R := R << 1; R(0) := N(i) @@ -325,48 +329,50 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // However, to keep the remainder clean (noise wise), what we do is that we // put the remainder block from which we need to extract the bit, as the LSB // of the Remainder, so that left shifting will pull the bit we need. 
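To make the two masks above concrete: for the common 2-bit message space (message_modulus = 4, so num_bits_in_message = 2), the sketch below simply replays the shift arithmetic from the comments; it is a standalone illustration, not code from this patch. Note that both lambdas return early when (msb_bit_set + 1) is a multiple of num_bits_in_message, so for 2-bit messages only pos_in_block = 0 actually reaches the mask computation.

    fn main() {
        let message_modulus: u32 = 4; // 2-bit messages
        let num_bits_in_message: u32 = 2;
        let full_message_mask: u32 = message_modulus - 1; // 0b11

        for pos_in_block in 0..num_bits_in_message {
            // trim_last_interesting_divisor_bits: keep the bits up to and
            // including the msb of the divisor.
            let keep = full_message_mask >> (num_bits_in_message - (pos_in_block + 1));
            // trim_first_divisor_ms_bits: discard the bits up to and including
            // the msb, then clamp back into the message range.
            let drop = (full_message_mask << (pos_in_block + 1)) & full_message_mask;
            // pos 0 -> keep = 0b01, drop = 0b10; pos 1 -> keep = 0b11, drop = 0b00
            println!("pos_in_block={pos_in_block}: keep=0b{keep:02b}, drop=0b{drop:02b}");
        }
    }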
- auto left_shift_interesting_remainder1 = - [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - numerator_block_1.clone_from( - numerator_block_stack, numerator_block_stack.len - 1, - numerator_block_stack.len - 1, streams[0], gpu_indexes[0]); - numerator_block_stack.pop(); - interesting_remainder1.insert(0, numerator_block_1.first_block(), - streams[0], gpu_indexes[0]); - - host_integer_radix_logical_scalar_shift_kb_inplace( - streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1, - mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len); - - tmp_radix.clone_from(interesting_remainder1, 0, - interesting_remainder1.len - 1, streams[0], - gpu_indexes[0]); - - host_radix_blocks_rotate_left( - streams, gpu_indexes, gpu_count, interesting_remainder1.data, - tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size); - - numerator_block_1.clone_from( - interesting_remainder1, interesting_remainder1.len - 1, - interesting_remainder1.len - 1, streams[0], gpu_indexes[0]); - - interesting_remainder1.pop(); - - if (pos_in_block != 0) { - // We have not yet extracted all the bits from this numerator - // so, we put it back on the front so that it gets taken next - // iteration - numerator_block_stack.push(numerator_block_1.first_block(), - streams[0], gpu_indexes[0]); - } - }; // left_shift_interesting_remainder1 - - auto left_shift_interesting_remainder2 = - [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - host_integer_radix_logical_scalar_shift_kb_inplace( - streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1, - mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len); - }; // left_shift_interesting_remainder2 + auto left_shift_interesting_remainder1 = [&](cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count) { + numerator_block_1.clone_from( + numerator_block_stack, numerator_block_stack.len - 1, + numerator_block_stack.len - 1, streams[0], gpu_indexes[0]); + numerator_block_stack.pop(); + interesting_remainder1.insert(0, numerator_block_1.first_block(), + streams[0], gpu_indexes[0]); + + host_integer_radix_logical_scalar_shift_kb_inplace( + streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1, + mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len); + + tmp_radix.clone_from(interesting_remainder1, 0, + interesting_remainder1.len - 1, streams[0], + gpu_indexes[0]); + + host_radix_blocks_rotate_left( + streams, gpu_indexes, gpu_count, interesting_remainder1.data, + tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size); + + numerator_block_1.clone_from( + interesting_remainder1, interesting_remainder1.len - 1, + interesting_remainder1.len - 1, streams[0], gpu_indexes[0]); + + interesting_remainder1.pop(); + + if (pos_in_block != 0) { + // We have not yet extracted all the bits from this numerator + // so, we put it back on the front so that it gets taken next + // iteration + numerator_block_stack.push(numerator_block_1.first_block(), streams[0], + gpu_indexes[0]); + } + }; // left_shift_interesting_remainder1 + + auto left_shift_interesting_remainder2 = [&](cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count) { + host_integer_radix_logical_scalar_shift_kb_inplace( + streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1, + mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len); + }; // left_shift_interesting_remainder2 for (uint j = 0; j < gpu_count; j++) { cuda_synchronize_stream(streams[j], gpu_indexes[j]); @@ -416,7 
+422,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // fills: // `new_remainder` - radix ciphertext // `subtraction_overflowed` - single ciphertext - auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes, + auto do_overflowing_sub = [&](cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count) { host_integer_overflowing_sub_kb( streams, gpu_indexes, gpu_count, new_remainder.data, @@ -427,8 +434,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // fills: // `at_least_one_upper_block_is_non_zero` - single ciphertext - auto check_divisor_upper_blocks = [&](cudaStream_t *streams, - uint32_t *gpu_indexes, + auto check_divisor_upper_blocks = [&](cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count) { auto &trivial_blocks = divisor_ms_blocks; if (trivial_blocks.is_empty()) { @@ -459,7 +466,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // fills: // `cleaned_merged_interesting_remainder` - radix ciphertext auto create_clean_version_of_merged_remainder = - [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { + [&](cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, cleaned_merged_interesting_remainder.data, @@ -498,7 +506,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, streams[0], gpu_indexes[0]); auto conditionally_zero_out_merged_interesting_remainder = - [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { + [&](cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, cleaned_merged_interesting_remainder.data, @@ -510,7 +519,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, }; auto conditionally_zero_out_merged_new_remainder = - [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { + [&](cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, new_remainder.data, new_remainder.data, overflow_sum_radix.data, bsks, ksks, @@ -518,7 +528,8 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor); }; - auto set_quotient_bit = [&](cudaStream_t *streams, uint32_t *gpu_indexes, + auto set_quotient_bit = [&](cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count) { integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, did_not_overflow.data, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu index d141f6cf1c..53b1366c37 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu @@ -1,10 +1,11 @@ #include "integer/integer.cuh" #include -void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes, +void cuda_full_propagation_64_inplace(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, void *input_blocks, - int8_t *mem_ptr, void **ksks, void **bsks, - uint32_t num_blocks) { + int8_t *mem_ptr, void *const *ksks, + void *const *bsks, uint32_t num_blocks) { int_fullprop_buffer *buffer = (int_fullprop_buffer *)mem_ptr; @@ -16,11 +17,12 @@ 
void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes, } void scratch_cuda_full_propagation_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, - uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, glwe_dimension * polynomial_size, lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log, @@ -31,7 +33,8 @@ void scratch_cuda_full_propagation_64( (int_fullprop_buffer **)mem_ptr, params, allocate_gpu_memory); } -void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes, +void cleanup_cuda_full_propagation(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { int_fullprop_buffer *mem_ptr = @@ -41,8 +44,8 @@ void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes, } void scratch_cuda_propagate_single_carry_kb_64_inplace( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t glwe_dimension, uint32_t polynomial_size, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, @@ -60,9 +63,9 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace( } void cuda_propagate_single_carry_kb_64_inplace( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks, - uint32_t num_blocks) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t num_blocks) { host_propagate_single_carry( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), static_cast(carry_out), @@ -71,9 +74,9 @@ void cuda_propagate_single_carry_kb_64_inplace( } void cuda_propagate_single_carry_get_input_carries_kb_64_inplace( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks, - void **ksks, uint32_t num_blocks) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr, + void *const *bsks, void *const *ksks, uint32_t num_blocks) { host_propagate_single_carry( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), static_cast(carry_out), @@ -82,7 +85,8 @@ void cuda_propagate_single_carry_get_input_carries_kb_64_inplace( num_blocks); } -void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes, +void cleanup_cuda_propagate_single_carry(void *const *streams, + 
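As the functions above show, each operation keeps the backend's three-phase convention: a scratch_* call allocates temporaries behind an opaque int8_t*, the *_inplace call does the work, and a cleanup_* call releases the buffers. A hedged usage sketch, assuming the declarations are reachable via "integer/integer.h" (assumed include path) and that streams, gpu_indexes, d_blocks, ksks and bsks are valid; all numeric parameter values are placeholders:

#include <cstdint>
#include "integer/integer.h" // assumed include path for these declarations

void full_propagation_example(void *const *streams,
                              std::uint32_t const *gpu_indexes,
                              void *d_blocks, void *const *ksks,
                              void *const *bsks) {
  int8_t *mem = nullptr;
  // Phase 1: allocate scratch space (parameter values illustrative only).
  scratch_cuda_full_propagation_64(
      streams, gpu_indexes, /*gpu_count=*/1, &mem, /*lwe_dimension=*/742,
      /*glwe_dimension=*/1, /*polynomial_size=*/2048, /*ks_level=*/5,
      /*ks_base_log=*/3, /*pbs_level=*/1, /*pbs_base_log=*/23,
      /*grouping_factor=*/0, /*message_modulus=*/4, /*carry_modulus=*/4,
      CLASSICAL, // PBS_TYPE enumerator, assumed
      /*allocate_gpu_memory=*/true);
  // Phase 2: run the operation in place on the radix blocks.
  cuda_full_propagation_64_inplace(streams, gpu_indexes, 1, d_blocks, mem,
                                   ksks, bsks, /*num_blocks=*/8);
  // Phase 3: release the scratch buffers.
  cleanup_cuda_full_propagation(streams, gpu_indexes, 1, &mem);
}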
uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { int_sc_prop_memory *mem_ptr = @@ -91,12 +95,13 @@ void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes, } void scratch_cuda_apply_univariate_lut_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, bool allocate_gpu_memory) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_radix_blocks, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, glwe_dimension * polynomial_size, lwe_dimension, @@ -105,26 +110,28 @@ void scratch_cuda_apply_univariate_lut_kb_64( scratch_cuda_apply_univariate_lut_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, - (int_radix_lut **)mem_ptr, static_cast(input_lut), - num_radix_blocks, params, allocate_gpu_memory); + (int_radix_lut **)mem_ptr, + static_cast(input_lut), num_radix_blocks, params, + allocate_gpu_memory); } -void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes, +void cuda_apply_univariate_lut_kb_64(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, void *output_radix_lwe, - void *input_radix_lwe, int8_t *mem_ptr, - void **ksks, void **bsks, - uint32_t num_blocks) { + void const *input_radix_lwe, + int8_t *mem_ptr, void *const *ksks, + void *const *bsks, uint32_t num_blocks) { host_apply_univariate_lut_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(output_radix_lwe), - static_cast(input_radix_lwe), + static_cast(input_radix_lwe), (int_radix_lut *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks); } -void cleanup_cuda_apply_univariate_lut_kb_64(void **streams, - uint32_t *gpu_indexes, +void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { int_radix_lut *mem_ptr = (int_radix_lut *)(*mem_ptr_void); @@ -132,25 +139,27 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void **streams, } void cuda_apply_many_univariate_lut_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks, - void **bsks, uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr, + void *const *ksks, void *const *bsks, uint32_t num_blocks, + uint32_t lut_count, uint32_t lut_stride) { host_apply_many_univariate_lut_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(output_radix_lwe), - static_cast(input_radix_lwe), + static_cast(input_radix_lwe), (int_radix_lut *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks, lut_count, lut_stride); } void scratch_cuda_apply_bivariate_lut_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - void *input_lut, 
uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, bool allocate_gpu_memory) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, void *input_lut, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_radix_blocks, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, glwe_dimension * polynomial_size, lwe_dimension, @@ -163,24 +172,23 @@ void scratch_cuda_apply_bivariate_lut_kb_64( num_radix_blocks, params, allocate_gpu_memory); } -void cuda_apply_bivariate_lut_kb_64(void **streams, uint32_t *gpu_indexes, - uint32_t gpu_count, void *output_radix_lwe, - void *input_radix_lwe_1, - void *input_radix_lwe_2, int8_t *mem_ptr, - void **ksks, void **bsks, - uint32_t num_blocks, uint32_t shift) { +void cuda_apply_bivariate_lut_kb_64( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *output_radix_lwe, void const *input_radix_lwe_1, + void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks, + void *const *bsks, uint32_t num_blocks, uint32_t shift) { host_apply_bivariate_lut_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(output_radix_lwe), - static_cast(input_radix_lwe_1), - static_cast(input_radix_lwe_2), + static_cast(input_radix_lwe_1), + static_cast(input_radix_lwe_2), (int_radix_lut *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks, shift); } -void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams, - uint32_t *gpu_indexes, +void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { int_radix_lut *mem_ptr = (int_radix_lut *)(*mem_ptr_void); @@ -188,12 +196,13 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams, } void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus, - PBS_TYPE pbs_type, bool allocate_gpu_memory) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_radix_blocks, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, glwe_dimension * polynomial_size, lwe_dimension, @@ -202,14 +211,15 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( scratch_cuda_apply_bivariate_lut_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, - (int_radix_lut **)mem_ptr, static_cast(input_lut), - num_radix_blocks, params, allocate_gpu_memory); + (int_radix_lut 
**)mem_ptr, + static_cast(input_lut), num_radix_blocks, params, + allocate_gpu_memory); } void cuda_integer_compute_prefix_sum_hillis_steele_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr, - void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift) { + void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift) { int_radix_params params = ((int_radix_lut *)mem_ptr)->params; @@ -222,14 +232,14 @@ void cuda_integer_compute_prefix_sum_hillis_steele_64( } void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { int_radix_lut *mem_ptr = (int_radix_lut *)(*mem_ptr_void); mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } -void cuda_integer_reverse_blocks_64_inplace(void **streams, - uint32_t *gpu_indexes, +void cuda_integer_reverse_blocks_64_inplace(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, void *lwe_array, uint32_t num_blocks, uint32_t lwe_size) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index cd0f5b0b6a..8560b94c8d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -4,12 +4,12 @@ #include "crypto/keyswitch.cuh" #include "device.h" #include "helper_multi_gpu.h" -#include "integer.h" +#include "integer/integer_utilities.h" #include "integer/scalar_addition.cuh" #include "linear_algebra.h" #include "linearalgebra/addition.cuh" +#include "pbs/programmable_bootstrap.h" #include "polynomial/functions.cuh" -#include "programmable_bootstrap.h" #include "utils/helper.cuh" #include "utils/helper_multi_gpu.cuh" #include "utils/kernel_dimensions.cuh" @@ -69,10 +69,10 @@ __global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value, // one block is responsible to process single lwe ciphertext template __host__ void -host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *dst, Torus *src, - uint32_t value, uint32_t blocks_count, - uint32_t lwe_size) { +host_radix_blocks_rotate_right(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + Torus *dst, Torus *src, uint32_t value, + uint32_t blocks_count, uint32_t lwe_size) { if (src == dst) { PANIC("Cuda error (blocks_rotate_right): the source and destination " "pointers should be different"); @@ -86,10 +86,10 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes, // calculation is not inplace, so `dst` and `src` must not be the same template __host__ void -host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *dst, Torus *src, - uint32_t value, uint32_t blocks_count, - uint32_t lwe_size) { +host_radix_blocks_rotate_left(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + Torus *dst, Torus *src, uint32_t value, + uint32_t blocks_count, uint32_t lwe_size) { if (src == dst) { PANIC("Cuda error (blocks_rotate_left): the source and destination " "pointers should be different"); @@ -119,9 +119,9 @@ __global__ void radix_blocks_reverse_lwe_inplace(Torus *src, template __host__ void 
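cuda_integer_compute_prefix_sum_hillis_steele_64 is named after the classic Hillis-Steele inclusive scan: log2(n) sweeps in which element i combines with element i - 2^d. Here the combine step is a bivariate LUT over ciphertext blocks; a plain-integer CUDA sketch of the same access pattern (illustrative only, single block, power-of-two n):

#include <cassert>
#include <cuda_runtime.h>

// Hillis-Steele inclusive scan: one thread per element, in place.
__global__ void hillis_steele_scan(int *data, int n) {
  int tid = threadIdx.x;
  for (int offset = 1; offset < n; offset <<= 1) {
    int v = (tid >= offset) ? data[tid - offset] : 0;
    __syncthreads(); // all reads of this sweep finish before any write
    if (tid >= offset)
      data[tid] += v;
    __syncthreads(); // writes visible before the next sweep reads
  }
}

int main() {
  const int n = 8;
  int *d;
  cudaMallocManaged(&d, n * sizeof(int));
  for (int i = 0; i < n; ++i)
    d[i] = 1;
  hillis_steele_scan<<<1, n>>>(d, n);
  cudaDeviceSynchronize();
  for (int i = 0; i < n; ++i)
    assert(d[i] == i + 1); // prefix sums of all-ones: 1, 2, ..., n
  cudaFree(d);
}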
-host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, - Torus *src, uint32_t blocks_count, - uint32_t lwe_size) { +host_radix_blocks_reverse_inplace(cudaStream_t const *streams, + uint32_t const *gpu_indexes, Torus *src, + uint32_t blocks_count, uint32_t lwe_size) { cudaSetDevice(gpu_indexes[0]); int num_blocks = blocks_count / 2, num_threads = 1024; radix_blocks_reverse_lwe_inplace @@ -131,10 +131,11 @@ host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, // polynomial_size threads template __global__ void -device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out, - Torus *lwe_array_1, Torus *lwe_array_2, - Torus *lwe_indexes_in, uint32_t lwe_dimension, - uint32_t shift, uint32_t num_blocks) { +device_pack_bivariate_blocks(Torus *lwe_array_out, Torus const *lwe_indexes_out, + Torus const *lwe_array_1, Torus const *lwe_array_2, + Torus const *lwe_indexes_in, + uint32_t lwe_dimension, uint32_t shift, + uint32_t num_blocks) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < num_blocks * (lwe_dimension + 1)) { @@ -151,13 +152,13 @@ device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out, * becomes out = m1 * shift + m2 */ template -__host__ void pack_bivariate_blocks(cudaStream_t *streams, - uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, - Torus *lwe_indexes_out, Torus *lwe_array_1, - Torus *lwe_array_2, Torus *lwe_indexes_in, - uint32_t lwe_dimension, uint32_t shift, - uint32_t num_radix_blocks) { +__host__ void +pack_bivariate_blocks(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, + Torus const *lwe_indexes_out, Torus const *lwe_array_1, + Torus const *lwe_array_2, Torus const *lwe_indexes_in, + uint32_t lwe_dimension, uint32_t shift, + uint32_t num_radix_blocks) { cudaSetDevice(gpu_indexes[0]); // Left message is shifted @@ -173,9 +174,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams, template __host__ void integer_radix_apply_univariate_lookup_table_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks, - uint32_t num_radix_blocks, int_radix_lut *lut) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in, + void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks, + int_radix_lut *lut) { // apply_lookup_table auto params = lut->params; auto pbs_type = params.pbs_type; @@ -202,10 +204,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); if (active_gpu_count == 1) { execute_keyswitch_async(streams, gpu_indexes, 1, lwe_after_ks_vec[0], - lwe_trivial_indexes_vec[0], lwe_array_in, - lut->lwe_indexes_in, ksks, big_lwe_dimension, - small_lwe_dimension, ks_base_log, ks_level, - num_radix_blocks); + lwe_trivial_indexes_vec[0], + (Torus *)lwe_array_in, lut->lwe_indexes_in, + ksks, big_lwe_dimension, small_lwe_dimension, + ks_base_log, ks_level, num_radix_blocks); /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE /// dimension to a big LWE dimension @@ -259,10 +261,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( template __host__ void integer_radix_apply_many_univariate_lookup_table_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, 
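pack_bivariate_blocks implements the out = m1 * shift + m2 trick described in its comment: two message blocks are folded into one value so that a single accumulator can evaluate any function of the pair. A clear-value check of why the packing is lossless, assuming the usual choice shift == message_modulus:

#include <cassert>
#include <cstdint>

int main() {
  const std::uint64_t message_modulus = 4; // 2-bit messages (assumed)
  const std::uint64_t shift = message_modulus;
  for (std::uint64_t m1 = 0; m1 < message_modulus; ++m1)
    for (std::uint64_t m2 = 0; m2 < message_modulus; ++m2) {
      std::uint64_t packed = m1 * shift + m2;
      // Both operands are recoverable, so a univariate LUT over `packed`
      // can implement any bivariate function f(m1, m2).
      assert(packed / shift == m1 && packed % shift == m2);
    }
}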
Torus **ksks, - uint32_t num_radix_blocks, int_radix_lut *lut, uint32_t lut_count, - uint32_t lut_stride) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in, + void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks, + int_radix_lut *lut, uint32_t lut_count, uint32_t lut_stride) { // apply_lookup_table auto params = lut->params; auto pbs_type = params.pbs_type; @@ -286,10 +288,10 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); if (active_gpu_count == 1) { execute_keyswitch_async(streams, gpu_indexes, 1, lwe_after_ks_vec[0], - lwe_trivial_indexes_vec[0], lwe_array_in, - lut->lwe_indexes_in, ksks, big_lwe_dimension, - small_lwe_dimension, ks_base_log, ks_level, - num_radix_blocks); + lwe_trivial_indexes_vec[0], + (Torus *)lwe_array_in, lut->lwe_indexes_in, + ksks, big_lwe_dimension, small_lwe_dimension, + ks_base_log, ks_level, num_radix_blocks); /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE /// dimension to a big LWE dimension @@ -343,10 +345,10 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( template __host__ void integer_radix_apply_bivariate_lookup_table_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void **bsks, - Torus **ksks, uint32_t num_radix_blocks, int_radix_lut *lut, - uint32_t shift) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1, + Torus const *lwe_array_2, void *const *bsks, Torus *const *ksks, + uint32_t num_radix_blocks, int_radix_lut *lut, uint32_t shift) { auto params = lut->params; auto pbs_type = params.pbs_type; @@ -612,9 +614,10 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, template void scratch_cuda_propagate_single_carry_kb_inplace( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_sc_prop_memory **mem_ptr, uint32_t num_radix_blocks, - int_radix_params params, bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_sc_prop_memory **mem_ptr, + uint32_t num_radix_blocks, int_radix_params params, + bool allocate_gpu_memory) { *mem_ptr = new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, @@ -623,10 +626,10 @@ void scratch_cuda_propagate_single_carry_kb_inplace( template void host_compute_prefix_sum_hillis_steele( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *step_output, Torus *generates_or_propagates, int_radix_params params, - int_radix_lut *luts, void **bsks, Torus **ksks, - uint32_t num_blocks) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *step_output, Torus *generates_or_propagates, + int_radix_params params, int_radix_lut *luts, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks) { auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; @@ -659,11 +662,13 @@ void host_compute_prefix_sum_hillis_steele( } template -void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, +void host_propagate_single_carry(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *lwe_array, Torus *carry_out, Torus *input_carries, - int_sc_prop_memory *mem, void **bsks, - Torus **ksks, uint32_t 
num_blocks) { + int_sc_prop_memory *mem, + void *const *bsks, Torus *const *ksks, + uint32_t num_blocks) { auto params = mem->params; if (params.message_modulus == 2) PANIC("Cuda error: single carry propagation is not supported for 1 bit " @@ -700,7 +705,7 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, gpu_indexes[0]); if (input_carries != nullptr) { - cuda_memcpy_async_gpu_to_gpu(input_carries, step_output, + cuda_memcpy_async_gpu_to_gpu((void *)input_carries, step_output, big_lwe_size_bytes * num_blocks, streams[0], gpu_indexes[0]); } @@ -716,10 +721,10 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, template void host_generate_last_block_inner_propagation( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *last_block_inner_propagation, Torus *lhs, Torus *rhs, - int_last_block_inner_propagate_memory *mem, void **bsks, - Torus **ksks) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *last_block_inner_propagation, Torus const *lhs, + Torus const *rhs, int_last_block_inner_propagate_memory *mem, + void *const *bsks, Torus *const *ksks) { integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, last_block_inner_propagation, lhs, rhs, @@ -728,11 +733,12 @@ void host_generate_last_block_inner_propagation( } template -void host_propagate_single_sub_borrow(cudaStream_t *streams, - uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *overflowed, Torus *lwe_array, +void host_propagate_single_sub_borrow(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *overflowed, + Torus *lwe_array, int_overflowing_sub_memory *mem, - void **bsks, Torus **ksks, + void *const *bsks, Torus *const *ksks, uint32_t num_blocks) { auto params = mem->params; auto glwe_dimension = params.glwe_dimension; @@ -784,10 +790,11 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams, * have size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus) */ template -void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, +void host_full_propagate_inplace(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *input_blocks, int_fullprop_buffer *mem_ptr, - Torus **ksks, void **bsks, + Torus *const *ksks, void *const *bsks, uint32_t num_blocks) { auto params = mem_ptr->lut->params; @@ -821,14 +828,14 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, params.polynomial_size, params.pbs_base_log, params.pbs_level, params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride); - cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector, - big_lwe_size * sizeof(Torus), streams[0], - gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + (void *)cur_input_block, mem_ptr->tmp_big_lwe_vector, + big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); if (i < num_blocks - 1) { auto next_input_block = &input_blocks[(i + 1) * big_lwe_size]; host_addition(streams[0], gpu_indexes[0], next_input_block, - next_input_block, + (Torus const *)next_input_block, &mem_ptr->tmp_big_lwe_vector[big_lwe_size], params.big_lwe_dimension, 1); } @@ -836,7 +843,8 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, } template -void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes, +void scratch_cuda_full_propagation(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, 
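host_propagate_single_carry computes, per block, whether it generates or propagates a carry, then resolves every input carry with the Hillis-Steele scan above. A clear-value model of that resolution (the 0/1/2 encoding is illustrative; the GPU code evaluates the same combine with LUTs on ciphertexts):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

enum : std::uint64_t { NONE = 0, PROPAGATE = 1, GENERATE = 2 };

// Carry-lookahead combine: keep the more significant state unless it
// propagates, in which case the less significant state decides.
static std::uint64_t combine(std::uint64_t less, std::uint64_t more) {
  return more == PROPAGATE ? less : more;
}

int main() {
  // Blocks, least significant first.
  std::vector<std::uint64_t> gp = {GENERATE, PROPAGATE, PROPAGATE, NONE};
  std::vector<std::uint64_t> prefix(gp);
  for (std::size_t i = 1; i < prefix.size(); ++i)
    prefix[i] = combine(prefix[i - 1], prefix[i]);
  // Block i+1 receives an input carry iff prefix[i] == GENERATE, so the
  // carry born in block 0 reaches blocks 1 and 2 and dies at block 3.
  assert(prefix[0] == GENERATE && prefix[1] == GENERATE &&
         prefix[2] == GENERATE && prefix[3] == NONE);
}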
int_fullprop_buffer **mem_ptr, int_radix_params params, @@ -849,14 +857,16 @@ void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes, // (lwe_dimension+1) threads // (num_radix_blocks / 2) thread blocks template -__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in, +__global__ void device_pack_blocks(Torus *lwe_array_out, + Torus const *lwe_array_in, uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t factor) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < (lwe_dimension + 1)) { for (int bid = 0; bid < (num_radix_blocks / 2); bid++) { - Torus *lsb_block = lwe_array_in + (2 * bid) * (lwe_dimension + 1); + Torus *lsb_block = + (Torus *)lwe_array_in + (2 * bid) * (lwe_dimension + 1); Torus *msb_block = lsb_block + (lwe_dimension + 1); Torus *packed_block = lwe_array_out + bid * (lwe_dimension + 1); @@ -867,7 +877,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in, if (num_radix_blocks % 2 == 1) { // We couldn't host_pack the last block, so we just copy it Torus *lsb_block = - lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1); + (Torus *)lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1); Torus *last_block = lwe_array_out + (num_radix_blocks / 2) * (lwe_dimension + 1); @@ -885,7 +895,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in, // Expects the carry buffer to be empty template __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index, - Torus *lwe_array_out, Torus *lwe_array_in, + Torus *lwe_array_out, Torus const *lwe_array_in, uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t factor) { if (num_radix_blocks == 0) @@ -900,7 +910,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index, template __global__ void -device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input, +device_create_trivial_radix(Torus *lwe_array, Torus const *scalar_input, int32_t num_blocks, uint32_t lwe_dimension, uint64_t delta) { int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -915,7 +925,7 @@ device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input, template __host__ void create_trivial_radix(cudaStream_t stream, uint32_t gpu_index, - Torus *lwe_array_out, Torus *scalar_array, + Torus *lwe_array_out, Torus const *scalar_array, uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t num_scalar_blocks, uint64_t message_modulus, uint64_t carry_modulus) { @@ -951,9 +961,10 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index, * * (lwe_dimension+1) * sizeeof(Torus) bytes */ template -__host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_array_in, void **bsks, Torus **ksks, +__host__ void extract_n_bits(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + Torus *lwe_array_out, Torus *lwe_array_in, + void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks, uint32_t bits_per_block, int_bit_extract_luts_buffer *bit_extract) { @@ -964,11 +975,11 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes, template __host__ void -reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *signs_array_out, Torus *signs_array_in, +reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *signs_array_out, Torus *signs_array_in, int_comparison_buffer *mem_ptr, - std::function sign_handler_f, void **bsks, - Torus 
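device_create_trivial_radix places each clear scalar block directly in the LWE body as m * delta with a zero mask. A clear-value sketch of that encoding; the delta formula below (one padding bit, delta = 2^63 / (message_modulus * carry_modulus)) is an assumption for illustration, not taken from this patch:

#include <cassert>
#include <cstdint>

int main() {
  const std::uint64_t message_modulus = 4, carry_modulus = 4;
  const std::uint64_t delta =
      (1ULL << 63) / (message_modulus * carry_modulus); // assumed encoding
  std::uint64_t m = 3;                  // one radix block in [0, msg)
  std::uint64_t body = m * delta;       // trivial ciphertext: mask = 0
  std::uint64_t decoded = (body + delta / 2) / delta; // round to the grid
  assert(decoded == m);
}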
**ksks, uint32_t num_sign_blocks) { + std::function sign_handler_f, void *const *bsks, + Torus *const *ksks, uint32_t num_sign_blocks) { auto diff_buffer = mem_ptr->diff_buffer; @@ -1064,27 +1075,29 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, template void scratch_cuda_apply_univariate_lut_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_radix_lut **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks, - int_radix_params params, bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_lut **mem_ptr, Torus const *input_lut, + uint32_t num_radix_blocks, int_radix_params params, + bool allocate_gpu_memory) { *mem_ptr = new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks, allocate_gpu_memory); // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU // 0 - cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut, - (params.glwe_dimension + 1) * - params.polynomial_size * sizeof(Torus), - streams[0], gpu_indexes[0]); + cuda_memcpy_async_to_gpu( + (*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut, + (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus), + streams[0], gpu_indexes[0]); (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); } template -void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes, +void host_apply_univariate_lut_kb(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *radix_lwe_out, - Torus *radix_lwe_in, - int_radix_lut *mem, Torus **ksks, - void **bsks, uint32_t num_blocks) { + Torus const *radix_lwe_in, + int_radix_lut *mem, Torus *const *ksks, + void *const *bsks, uint32_t num_blocks) { integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks, @@ -1093,10 +1106,10 @@ void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes, template void host_apply_many_univariate_lut_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *radix_lwe_out, Torus *radix_lwe_in, int_radix_lut *mem, - Torus **ksks, void **bsks, uint32_t num_blocks, uint32_t lut_count, - uint32_t lut_stride) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *radix_lwe_out, Torus const *radix_lwe_in, + int_radix_lut *mem, Torus *const *ksks, void *const *bsks, + uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) { integer_radix_apply_many_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks, @@ -1105,28 +1118,28 @@ void host_apply_many_univariate_lut_kb( template void scratch_cuda_apply_bivariate_lut_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_radix_lut **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks, - int_radix_params params, bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_lut **mem_ptr, Torus const *input_lut, + uint32_t num_radix_blocks, int_radix_params params, + bool allocate_gpu_memory) { *mem_ptr = new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks, allocate_gpu_memory); // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU // 0 - cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut, - (params.glwe_dimension + 1) * - 
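The copy above lands on GPU 0 only; broadcast_lut then replicates it to the other active GPUs. A generic sketch of that upload-once-then-broadcast pattern (this is not broadcast_lut's actual implementation, just the idea, and lut_on_gpu is a hypothetical array of per-device allocations of lut_bytes each):

#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

void broadcast_from_gpu0(void *const *lut_on_gpu,
                         std::uint32_t const *gpu_indexes,
                         std::uint32_t gpu_count, std::size_t lut_bytes,
                         cudaStream_t const *streams) {
  // Peer copies fan the GPU-0 buffer out to every other device.
  for (std::uint32_t g = 1; g < gpu_count; ++g)
    cudaMemcpyPeerAsync(lut_on_gpu[g], gpu_indexes[g], lut_on_gpu[0],
                        gpu_indexes[0], lut_bytes, streams[g]);
}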
params.polynomial_size * sizeof(Torus), - streams[0], gpu_indexes[0]); + cuda_memcpy_async_to_gpu( + (*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut, + (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus), + streams[0], gpu_indexes[0]); (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); } template -void host_apply_bivariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *radix_lwe_out, - Torus *radix_lwe_in_1, Torus *radix_lwe_in_2, - int_radix_lut *mem, Torus **ksks, - void **bsks, uint32_t num_blocks, - uint32_t shift) { +void host_apply_bivariate_lut_kb( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *radix_lwe_out, Torus const *radix_lwe_in_1, + Torus const *radix_lwe_in_2, int_radix_lut *mem, Torus *const *ksks, + void *const *bsks, uint32_t num_blocks, uint32_t shift) { integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in_1, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu index 1007ed7e80..e866f0c84f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu @@ -66,12 +66,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in, * the integer radix multiplication in keyswitch->bootstrap order. */ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension, - uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log, - uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level, - uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type, - bool allocate_gpu_memory) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus, + uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log, + uint32_t ks_level, uint32_t grouping_factor, uint32_t num_radix_blocks, + PBS_TYPE pbs_type, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, polynomial_size * glwe_dimension, lwe_dimension, @@ -87,7 +87,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( case 8192: case 16384: scratch_cuda_integer_mult_radix_ciphertext_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (cudaStream_t const *)(streams), gpu_indexes, gpu_count, (int_mul_memory **)mem_ptr, num_radix_blocks, params, allocate_gpu_memory); break; @@ -125,67 +125,67 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( * - 'pbs_type' selects which PBS implementation should be used */ void cuda_integer_mult_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right, - void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size, - uint32_t num_blocks) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *radix_lwe_out, void const *radix_lwe_left, + void const *radix_lwe_right, void *const *bsks, void *const *ksks, + int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) { switch (polynomial_size) { case 256: host_integer_mult_radix_kb>( 
(cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - (int_mul_memory *)mem_ptr, num_blocks); + static_cast(radix_lwe_left), + static_cast(radix_lwe_right), bsks, + (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 512: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - (int_mul_memory *)mem_ptr, num_blocks); + static_cast(radix_lwe_left), + static_cast(radix_lwe_right), bsks, + (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 1024: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - (int_mul_memory *)mem_ptr, num_blocks); + static_cast(radix_lwe_left), + static_cast(radix_lwe_right), bsks, + (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 2048: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - (int_mul_memory *)mem_ptr, num_blocks); + static_cast(radix_lwe_left), + static_cast(radix_lwe_right), bsks, + (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 4096: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - (int_mul_memory *)mem_ptr, num_blocks); + static_cast(radix_lwe_left), + static_cast(radix_lwe_right), bsks, + (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 8192: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - (int_mul_memory *)mem_ptr, num_blocks); + static_cast(radix_lwe_left), + static_cast(radix_lwe_right), bsks, + (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; case 16384: host_integer_mult_radix_kb>( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - (int_mul_memory *)mem_ptr, num_blocks); + static_cast(radix_lwe_left), + static_cast(radix_lwe_right), bsks, + (uint64_t **)(ksks), (int_mul_memory *)mem_ptr, num_blocks); break; default: PANIC("Cuda error (integer multiplication): unsupported polynomial size. 
" @@ -193,8 +193,9 @@ void cuda_integer_mult_radix_ciphertext_kb_64( } } -void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes, - uint32_t gpu_count, int8_t **mem_ptr_void) { +void cleanup_cuda_integer_mult(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void) { int_mul_memory *mem_ptr = (int_mul_memory *)(*mem_ptr_void); @@ -203,10 +204,10 @@ void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes, } void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension, - uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, - uint32_t pbs_base_log, uint32_t grouping_factor, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { @@ -222,9 +223,10 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( } void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec, - int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) { + int8_t *mem_ptr, void *const *bsks, void *const *ksks, + uint32_t num_blocks_in_radix) { auto mem = (int_sum_ciphertexts_vec_memory *)mem_ptr; @@ -298,7 +300,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( } void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { int_sum_ciphertexts_vec_memory *mem_ptr = (int_sum_ciphertexts_vec_memory *)(*mem_ptr_void); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 7bd838217f..0ebf410125 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -9,10 +9,10 @@ #include "crypto/keyswitch.cuh" #include "device.h" #include "helper_multi_gpu.h" -#include "integer.h" #include "integer/integer.cuh" +#include "integer/integer_utilities.h" #include "linear_algebra.h" -#include "programmable_bootstrap.h" +#include "pbs/programmable_bootstrap.h" #include "utils/helper.cuh" #include "utils/helper_multi_gpu.cuh" #include "utils/kernel_dimensions.cuh" @@ -43,8 +43,8 @@ __global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out, template __global__ void -all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext, - Torus *msb_ciphertext, Torus *radix_lwe_right, +all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext, + Torus *msb_ciphertext, Torus const *radix_lwe_right, Torus *lsb_rhs, Torus *msb_rhs, int num_blocks) { size_t block_id = blockIdx.x; @@ -170,8 +170,8 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks, } template __host__ void 
scratch_cuda_integer_partial_sum_ciphertexts_vec_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_sum_ciphertexts_vec_memory **mem_ptr, + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_sum_ciphertexts_vec_memory **mem_ptr, uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec, int_radix_params params, bool allocate_gpu_memory) { @@ -182,9 +182,10 @@ __host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb( template __host__ void host_integer_partial_sum_ciphertexts_vec_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks, - uint64_t **ksks, int_sum_ciphertexts_vec_memory *mem_ptr, + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *radix_lwe_out, Torus *terms, int *terms_degree, + void *const *bsks, uint64_t *const *ksks, + int_sum_ciphertexts_vec_memory *mem_ptr, uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec, int_radix_lut *reused_lut) { @@ -450,9 +451,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( template __host__ void host_integer_mult_radix_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - uint64_t *radix_lwe_out, uint64_t *radix_lwe_left, - uint64_t *radix_lwe_right, void **bsks, uint64_t **ksks, + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left, + uint64_t const *radix_lwe_right, void *const *bsks, uint64_t *const *ksks, int_mul_memory *mem_ptr, uint32_t num_blocks) { auto glwe_dimension = mem_ptr->params.glwe_dimension; @@ -569,9 +570,10 @@ __host__ void host_integer_mult_radix_kb( template __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_mul_memory **mem_ptr, uint32_t num_radix_blocks, - int_radix_params params, bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_mul_memory **mem_ptr, + uint32_t num_radix_blocks, int_radix_params params, + bool allocate_gpu_memory) { *mem_ptr = new int_mul_memory(streams, gpu_indexes, gpu_count, params, num_radix_blocks, allocate_gpu_memory); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu index 67ae11be96..e55ea9e912 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu @@ -1,21 +1,21 @@ #include "integer/negation.cuh" void cuda_negate_integer_radix_ciphertext_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension, + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count, uint32_t message_modulus, uint32_t carry_modulus) { host_integer_radix_negation( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), - static_cast(lwe_array_in), lwe_dimension, + static_cast(lwe_array_in), lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus); } void scratch_cuda_integer_radix_overflowing_sub_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, - uint32_t glwe_dimension, uint32_t polynomial_size, + void *const *streams, uint32_t const *gpu_indexes, uint32_t 
gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, @@ -33,10 +33,10 @@ void scratch_cuda_integer_radix_overflowing_sub_kb_64( } void cuda_integer_radix_overflowing_sub_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left, - void *radix_lwe_right, int8_t *mem_ptr, void **bsks, void **ksks, - uint32_t num_blocks) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left, + void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t num_blocks) { auto mem = (int_overflowing_sub_memory *)mem_ptr; @@ -44,13 +44,13 @@ void cuda_integer_radix_overflowing_sub_kb_64( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(radix_lwe_out), static_cast(radix_lwe_overflowed), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), mem, - num_blocks); + static_cast(radix_lwe_left), + static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), + mem, num_blocks); } -void cleanup_cuda_integer_radix_overflowing_sub(void **streams, - uint32_t *gpu_indexes, +void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void) { int_overflowing_sub_memory *mem_ptr = diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh index eb8307c93a..6eda409df9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh @@ -8,10 +8,10 @@ #include "crypto/keyswitch.cuh" #include "device.h" -#include "integer.h" #include "integer/integer.cuh" +#include "integer/integer_utilities.h" #include "linear_algebra.h" -#include "programmable_bootstrap.h" +#include "pbs/programmable_bootstrap.h" #include "utils/helper.cuh" #include "utils/kernel_dimensions.cuh" #include @@ -23,9 +23,9 @@ template __global__ void -device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks, - uint64_t lwe_dimension, uint64_t message_modulus, - uint64_t delta) { +device_integer_radix_negation(Torus *output, Torus const *input, + int32_t num_blocks, uint64_t lwe_dimension, + uint64_t message_modulus, uint64_t delta) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < lwe_dimension + 1) { bool is_body = (tid == lwe_dimension); @@ -54,12 +54,11 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks, } template -__host__ void -host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *output, Torus *input, - uint32_t lwe_dimension, - uint32_t input_lwe_ciphertext_count, - uint64_t message_modulus, uint64_t carry_modulus) { +__host__ void host_integer_radix_negation( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *output, Torus const *input, + uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count, + uint64_t message_modulus, uint64_t carry_modulus) { cudaSetDevice(gpu_indexes[0]); // lwe_size includes the presence of the body @@ -85,9 +84,9 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t 
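cuda_integer_radix_overflowing_sub_kb_64 returns the wrapping difference plus a separate overflow ciphertext. A clear-value model of the unsigned contract (block structure elided; 8 bits stand in for the full radix width):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned bits = 8; // e.g. four 2-bit message blocks
  const std::uint32_t modulus = 1u << bits;
  std::uint32_t lhs = 13, rhs = 200;
  std::uint32_t diff = (lhs - rhs) & (modulus - 1); // wrapping result
  bool overflowed = lhs < rhs; // a borrow out of the top block
  assert(diff == (13 + modulus - 200) % modulus);
  assert(overflowed);
}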
*gpu_indexes, template __host__ void scratch_cuda_integer_overflowing_sub_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - int_overflowing_sub_memory **mem_ptr, uint32_t num_blocks, - int_radix_params params, bool allocate_gpu_memory) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_overflowing_sub_memory **mem_ptr, + uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) { *mem_ptr = new int_overflowing_sub_memory( streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory); @@ -95,9 +94,10 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb( template __host__ void host_integer_overflowing_sub_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left, - Torus *radix_lwe_right, void **bsks, uint64_t **ksks, + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *radix_lwe_out, Torus *radix_lwe_overflowed, + Torus const *radix_lwe_left, Torus const *radix_lwe_right, + void *const *bsks, uint64_t *const *ksks, int_overflowing_sub_memory *mem_ptr, uint32_t num_blocks) { auto radix_params = mem_ptr->params; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu index 153a6e5e86..cae83b55bd 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu @@ -1,12 +1,14 @@ #include "integer/scalar_addition.cuh" void cuda_scalar_addition_integer_radix_ciphertext_64_inplace( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count, - uint32_t message_modulus, uint32_t carry_modulus) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array, void const *scalar_input, uint32_t lwe_dimension, + uint32_t lwe_ciphertext_count, uint32_t message_modulus, + uint32_t carry_modulus) { host_integer_radix_scalar_addition_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), static_cast(scalar_input), - lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus); + static_cast(lwe_array), + static_cast(scalar_input), lwe_dimension, + lwe_ciphertext_count, message_modulus, carry_modulus); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh index a7130b8681..4b1eae222e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh @@ -7,13 +7,13 @@ #endif #include "device.h" -#include "integer.h" +#include "integer/integer_utilities.h" #include "utils/kernel_dimensions.cuh" #include template __global__ void device_integer_radix_scalar_addition_inplace( - Torus *lwe_array, Torus *scalar_input, int32_t num_blocks, + Torus *lwe_array, Torus const *scalar_input, int32_t num_blocks, uint32_t lwe_dimension, uint64_t delta) { int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -25,10 +25,10 @@ __global__ void device_integer_radix_scalar_addition_inplace( template __host__ void host_integer_radix_scalar_addition_inplace( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension, - uint32_t input_lwe_ciphertext_count, 
uint32_t message_modulus, - uint32_t carry_modulus) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input, + uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count, + uint32_t message_modulus, uint32_t carry_modulus) { cudaSetDevice(gpu_indexes[0]); // Create a 1-dimensional grid of threads @@ -64,8 +64,8 @@ __global__ void device_integer_radix_add_scalar_one_inplace( template __host__ void host_integer_radix_add_scalar_one_inplace( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array, uint32_t lwe_dimension, + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count, uint32_t message_modulus, uint32_t carry_modulus) { cudaSetDevice(gpu_indexes[0]); @@ -104,10 +104,10 @@ __global__ void device_integer_radix_scalar_subtraction_inplace( template __host__ void host_integer_radix_scalar_subtraction_inplace( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension, - uint32_t input_lwe_ciphertext_count, uint32_t message_modulus, - uint32_t carry_modulus) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, Torus *scalar_input, + uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count, + uint32_t message_modulus, uint32_t carry_modulus) { cudaSetDevice(gpu_indexes[0]); // Create a 1-dimensional grid of threads diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu index 2a94cc99b1..1e992a3f61 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu @@ -1,16 +1,16 @@ #include "integer/scalar_bitops.cuh" void cuda_scalar_bitop_integer_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void *lwe_array_input, void *clear_blocks, - uint32_t num_clear_blocks, int8_t *mem_ptr, void **bsks, void **ksks, - uint32_t lwe_ciphertext_count, BITOP_TYPE op) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks, + uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op) { host_integer_radix_scalar_bitop_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), - static_cast(lwe_array_input), - static_cast(clear_blocks), num_clear_blocks, + static_cast(lwe_array_input), + static_cast(clear_blocks), num_clear_blocks, (int_bitop_buffer *)mem_ptr, bsks, (uint64_t **)(ksks), lwe_ciphertext_count, op); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh index ac2ab7fb34..6338a6dced 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh @@ -6,10 +6,11 @@ template __host__ void host_integer_radix_scalar_bitop_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_input, Torus *clear_blocks, - uint32_t num_clear_blocks, int_bitop_buffer *mem_ptr, void **bsks, - Torus **ksks, uint32_t num_radix_blocks, BITOP_TYPE op) { + cudaStream_t const 
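host_integer_radix_scalar_addition_inplace never touches the mask: adding a clear scalar to an LWE ciphertext only requires body += scalar * delta, which is why the kernel can run over the blocks in place. A clear-value sketch (delta formula assumed, as earlier):

#include <cassert>
#include <cstdint>

int main() {
  const std::uint64_t delta = (1ULL << 63) / 16; // msg = carry = 4, assumed
  std::uint64_t body = 1 * delta;   // body of a (trivial) encryption of 1
  std::uint64_t scalar = 2;
  body += scalar * delta;           // clear-scalar addition, mask untouched
  assert(body / delta == 3);
}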
*streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_input, + Torus const *clear_blocks, uint32_t num_clear_blocks, + int_bitop_buffer *mem_ptr, void *const *bsks, Torus *const *ksks, + uint32_t num_radix_blocks, BITOP_TYPE op) { auto lut = mem_ptr->lut; auto params = lut->params; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu index 9334be96ea..8293ce9c1f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu @@ -1,10 +1,10 @@ #include "integer/scalar_comparison.cuh" void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void *lwe_array_in, void *scalar_blocks, - int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count, - uint32_t num_scalar_blocks) { + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void const *lwe_array_in, void const *scalar_blocks, + int8_t *mem_ptr, void *const *bsks, void *const *ksks, + uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) { int_comparison_buffer *buffer = (int_comparison_buffer *)mem_ptr; @@ -14,8 +14,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( host_integer_radix_scalar_equality_check_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(scalar_blocks), buffer, bsks, + static_cast(lwe_array_in), + static_cast(scalar_blocks), buffer, bsks, (uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks); break; case GT: @@ -25,8 +25,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( host_integer_radix_scalar_difference_check_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(scalar_blocks), buffer, + static_cast(lwe_array_in), + static_cast(scalar_blocks), buffer, buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks); break; @@ -35,8 +35,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( host_integer_radix_scalar_maxmin_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(scalar_blocks), buffer, bsks, + static_cast(lwe_array_in), + static_cast(scalar_blocks), buffer, bsks, (uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks); break; default: diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index a8cd292e7a..85e8942c39 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -5,10 +5,10 @@ template __host__ void scalar_compare_radix_blocks_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, - int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, - uint32_t num_radix_blocks) { + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in, + Torus *scalar_blocks, int_comparison_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) { if (num_radix_blocks == 0) return; @@ 
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
index a8cd292e7a..85e8942c39 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -5,10 +5,10 @@

 template <typename Torus>
 __host__ void scalar_compare_radix_blocks_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
-    uint32_t num_radix_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
+    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
+    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
   if (num_radix_blocks == 0)
     return;
@@ -57,11 +57,12 @@ __host__ void scalar_compare_radix_blocks_kb(

 template <typename Torus>
 __host__ void integer_radix_unsigned_scalar_difference_check_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
-    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
+    Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
+    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
+    Torus *const *ksks, uint32_t total_num_radix_blocks,
+    uint32_t total_num_scalar_blocks) {

   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
@@ -243,11 +244,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

 template <typename Torus>
 __host__ void integer_radix_signed_scalar_difference_check_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
-    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
+    Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
+    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
+    Torus *const *ksks, uint32_t total_num_radix_blocks,
+    uint32_t total_num_scalar_blocks) {

   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
@@ -287,7 +289,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     host_compare_with_zero_equality(
         streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
         mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
-    Torus *sign_block =
+    Torus const *sign_block =
         lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;

     auto sign_bit_pos = (int)std::log2(message_modulus) - 1;
@@ -426,7 +428,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
           lut_f);
       signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-      Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
+      Torus const *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
       integer_radix_apply_bivariate_lookup_table_kb(
           msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
           are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
@@ -476,9 +478,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
       scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
                                      lwe_array_ct_out, lhs, rhs, mem_ptr,
                                      bsks, ksks, num_lsb_radix_blocks);
-      Torus *encrypted_sign_block =
+      Torus const *encrypted_sign_block =
           lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
-      Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);
+      Torus const *scalar_sign_block =
+          scalar_blocks + (total_num_scalar_blocks - 1);

       auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
       create_trivial_radix(
@@ -505,10 +508,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

 template <typename Torus>
 __host__ void integer_radix_signed_scalar_maxmin_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
-    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
+    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
+    void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks,
+    uint32_t total_num_scalar_blocks) {
   auto params = mem_ptr->params;

   // Calculates the difference sign between the ciphertext and the scalar
@@ -541,11 +545,12 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(

 template <typename Torus>
 __host__ void host_integer_radix_scalar_difference_check_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
-    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
+    Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
+    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
+    Torus *const *ksks, uint32_t total_num_radix_blocks,
+    uint32_t total_num_scalar_blocks) {

   if (mem_ptr->is_signed) {
     // is signed and scalar is positive
@@ -563,10 +568,11 @@ __host__ void host_integer_radix_scalar_difference_check_kb(

 template <typename Torus>
 __host__ void host_integer_radix_signed_scalar_maxmin_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
-    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
+    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
+    void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks,
+    uint32_t total_num_scalar_blocks) {

   if (mem_ptr->is_signed) {
     // is signed and scalar is positive
@@ -582,10 +588,11 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb(

 template <typename Torus>
 __host__ void host_integer_radix_scalar_maxmin_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
-    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
+    Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
+    void *const *bsks, Torus *const *ksks, uint32_t total_num_radix_blocks,
+    uint32_t total_num_scalar_blocks) {

   auto params = mem_ptr->params;
@@ -619,10 +626,11 @@ __host__ void host_integer_radix_scalar_maxmin_kb(

 template <typename Torus>
 __host__ void host_integer_radix_scalar_equality_check_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
-    uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
+    Torus const *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
+    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
+    uint32_t num_scalar_blocks) {

   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
index c17d585194..7cbfd566d9 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
@@ -1,12 +1,12 @@
 #include "integer/scalar_mul.cuh"

 void scratch_cuda_integer_scalar_mul_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
@@ -20,9 +20,10 @@ void scratch_cuda_integer_scalar_mul_kb_64(
 }

 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set, int8_t *mem,
-    void **bsks, void **ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint64_t const *decomposed_scalar,
+    uint64_t const *has_at_least_one_set, int8_t *mem, void *const *bsks,
+    void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
     uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars) {

   switch (polynomial_size) {
@@ -86,8 +87,8 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
   }
 }

-void cleanup_cuda_integer_radix_scalar_mul(void **streams,
-                                           uint32_t *gpu_indexes,
+void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
+                                           uint32_t const *gpu_indexes,
                                            uint32_t gpu_count,
                                            int8_t **mem_ptr_void) {
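The difference-check helpers above reduce a scalar comparison to the sign of the block-wise difference, then post-process that sign through the `std::function<Torus(Torus)> sign_handler_f` callback. A clear-value sketch of how such a handler turns a sign into a boolean block — the sign encoding here is invented for illustration; the real encoding lives in the comparison LUTs:

    #include <cstdint>
    #include <functional>

    using Torus = uint64_t;

    // Hypothetical encoding: 0 = equal, 1 = greater, 2 = smaller.
    int main() {
      std::function<Torus(Torus)> is_ge = [](Torus sign) -> Torus {
        return (sign == 0 || sign == 1) ? 1 : 0; // x >= scalar
      };
      std::function<Torus(Torus)> is_lt = [](Torus sign) -> Torus {
        return sign == 2 ? 1 : 0;                // x < scalar
      };
      return static_cast<int>(is_ge(1) + is_lt(1)); // 1 + 0
    }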
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
index 667e67da8c..37a51006ae 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -7,7 +7,7 @@
 #endif

 #include "device.h"
-#include "integer.h"
+#include "integer/integer_utilities.h"
 #include "multiplication.cuh"
 #include "scalar_shifts.cuh"
 #include "utils/kernel_dimensions.cuh"
@@ -29,9 +29,10 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,

 template <typename T>
 __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    int_scalar_mul_buffer<T> **mem_ptr, uint32_t num_radix_blocks,
-    int_radix_params params, bool allocate_gpu_memory) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params,
+    bool allocate_gpu_memory) {

   *mem_ptr =
       new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
@@ -40,11 +41,11 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(

 template <typename T>
 __host__ void host_integer_scalar_mul_radix(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    T *lwe_array, T *decomposed_scalar, T *has_at_least_one_set,
-    int_scalar_mul_buffer<T> *mem, void **bsks, T **ksks,
-    uint32_t input_lwe_dimension, uint32_t message_modulus,
-    uint32_t num_radix_blocks, uint32_t num_scalars) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, T *lwe_array, T const *decomposed_scalar,
+    T const *has_at_least_one_set, int_scalar_mul_buffer<T> *mem,
+    void *const *bsks, T *const *ksks, uint32_t input_lwe_dimension,
+    uint32_t message_modulus, uint32_t num_radix_blocks, uint32_t num_scalars) {

   if (num_radix_blocks == 0 || num_scalars == 0)
     return;
@@ -121,8 +122,8 @@ __host__ void host_integer_scalar_mul_radix(
 // Small scalar_mul is used in shift/rotate
 template <typename T>
 __host__ void host_integer_small_scalar_mul_radix(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    T *output_lwe_array, T *input_lwe_array, T scalar,
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, T *output_lwe_array, T *input_lwe_array, T scalar,
     uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

   cudaSetDevice(gpu_indexes[0]);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
index 51f5da598b..c7ba624bf3 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
@@ -1,8 +1,8 @@
 #include "scalar_rotate.cuh"

 void scratch_cuda_integer_radix_scalar_rotate_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
     uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -21,9 +21,9 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
 }

 void cuda_integer_radix_scalar_rotate_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t n, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t num_blocks) {
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {

   host_integer_radix_scalar_rotate_kb_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -32,8 +32,8 @@ void cuda_integer_radix_scalar_rotate_kb_64_inplace(
       (uint64_t **)(ksks), num_blocks);
 }

-void cleanup_cuda_integer_radix_scalar_rotate(void **streams,
-                                              uint32_t *gpu_indexes,
+void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
+                                              uint32_t const *gpu_indexes,
                                               uint32_t gpu_count,
                                               int8_t **mem_ptr_void) {
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
index 02a0a77a40..82124cc763 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -4,7 +4,7 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer.cuh"
-#include "integer.h"
+#include "integer/integer_utilities.h"
 #include "pbs/programmable_bootstrap_classic.cuh"
 #include "pbs/programmable_bootstrap_multibit.cuh"
 #include "types/complex/operations.cuh"
@@ -13,10 +13,10 @@

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
-    int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params,
+    SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

   *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
       streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
@@ -25,9 +25,10 @@ __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(

 template <typename Torus>
 __host__ void host_integer_radix_scalar_rotate_kb_inplace(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array, uint32_t n, int_logical_scalar_shift_buffer<Torus> *mem,
-    void **bsks, Torus **ksks, uint32_t num_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array, uint32_t n,
+    int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
+    Torus *const *ksks, uint32_t num_blocks) {

   auto params = mem->params;
   auto glwe_dimension = params.glwe_dimension;
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
index ea1e226cf8..afbc3bd120 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
@@ -1,8 +1,8 @@
 #include "scalar_shifts.cuh"

 void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
     uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -25,9 +25,9 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
 /// the application of a PBS onto the rotated blocks up to num_blocks -
 /// rotations - 1. The remaining blocks are padded with zeros.
 void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t num_blocks) {
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {

   host_integer_radix_logical_scalar_shift_kb_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -37,8 +37,8 @@ void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
 }

 void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
     uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -64,9 +64,9 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
 /// block, which is copied onto all remaining blocks instead of padding with
 /// zeros as would be done in the logical shift.
 void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t num_blocks) {
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {

   host_integer_radix_arithmetic_scalar_shift_kb_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -75,10 +75,9 @@ void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
       (uint64_t **)(ksks), num_blocks);
 }

-void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
-                                                     uint32_t *gpu_indexes,
-                                                     uint32_t gpu_count,
-                                                     int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_radix_logical_scalar_shift(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {

   int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
       (int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
@@ -86,10 +85,9 @@ void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
   mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

-void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void **streams,
-                                                        uint32_t *gpu_indexes,
-                                                        uint32_t gpu_count,
-                                                        int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {

   int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
       (int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
index 6555fad9f1..1111f510e3 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -4,7 +4,7 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer.cuh"
-#include "integer.h"
+#include "integer/integer_utilities.h"
 #include "pbs/programmable_bootstrap_classic.cuh"
 #include "pbs/programmable_bootstrap_multibit.cuh"
 #include "types/complex/operations.cuh"
@@ -13,10 +13,10 @@

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
-    int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params,
+    SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

   *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
       streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
@@ -25,10 +25,10 @@ __host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(

 template <typename Torus>
 __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array, uint32_t shift,
-    int_logical_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
-    uint32_t num_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array, uint32_t shift,
+    int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
+    Torus *const *ksks, uint32_t num_blocks) {

   auto params = mem->params;
   auto glwe_dimension = params.glwe_dimension;
@@ -116,8 +116,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
     uint32_t num_radix_blocks, int_radix_params params,
     SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

@@ -128,10 +128,10 @@ __host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(

 template <typename Torus>
 __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array, uint32_t shift,
-    int_arithmetic_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
-    uint32_t num_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array, uint32_t shift,
+    int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
+    Torus *const *ksks, uint32_t num_blocks) {

   auto params = mem->params;
   auto glwe_dimension = params.glwe_dimension;
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
index bf2ba84d10..01feaa0b27 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
@@ -1,8 +1,8 @@
 #include "shift_and_rotate.cuh"

 void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
     uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
@@ -21,19 +21,20 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
 }

 void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    void *lwe_shift, int8_t *mem_ptr, void **bsks, void **ksks,
-    uint32_t num_blocks) {
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array, void const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {

   host_integer_radix_shift_and_rotate_kb_inplace(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_shift),
+      static_cast<uint64_t *>(lwe_array),
+      static_cast<const uint64_t *>(lwe_shift),
       (int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
       (uint64_t **)(ksks), num_blocks);
 }

-void cleanup_cuda_integer_radix_shift_and_rotate(void **streams,
-                                                 uint32_t *gpu_indexes,
+void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
+                                                 uint32_t const *gpu_indexes,
                                                  uint32_t gpu_count,
                                                  int8_t **mem_ptr_void) {

   int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
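The two shift flavors edited above differ only in how vacated blocks are filled: logical shifts pad with zeros, arithmetic shifts replicate the sign block (via a PBS rather than a plain copy). The clear-value analogue, as a minimal sketch:

    #include <cstdint>

    // Sign-extending on two's-complement targets, mirroring the arithmetic
    // shift's "copy the sign block onto the remaining blocks" behavior.
    int64_t arithmetic_shift_right(int64_t x, uint32_t shift) {
      return x >> shift;
    }

    // Zero-filling, mirroring the logical shift's zero padding.
    uint64_t logical_shift_right(uint64_t x, uint32_t shift) {
      return x >> shift;
    }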
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
index 92f61b830f..60ec4a1ab1 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -4,7 +4,7 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer.cuh"
-#include "integer.h"
+#include "integer/integer_utilities.h"
 #include "pbs/programmable_bootstrap_classic.cuh"
 #include "pbs/programmable_bootstrap_multibit.cuh"
 #include "scalar_mul.cuh"
@@ -14,10 +14,10 @@

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    int_shift_and_rotate_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
-    int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
-    bool allocate_gpu_memory) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_shift_and_rotate_buffer<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params,
+    SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
   *mem_ptr = new int_shift_and_rotate_buffer<Torus>(
       streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
       num_radix_blocks, allocate_gpu_memory);
@@ -25,9 +25,10 @@ __host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(

 template <typename Torus>
 __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array, Torus *lwe_shift, int_shift_and_rotate_buffer<Torus> *mem,
-    void **bsks, Torus **ksks, uint32_t num_radix_blocks) {
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array, Torus const *lwe_shift,
+    int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
+    Torus *const *ksks, uint32_t num_radix_blocks) {
   uint32_t bits_per_block = std::log2(mem->params.message_modulus);
   uint32_t total_nb_bits = bits_per_block * num_radix_blocks;
   if (total_nb_bits == 0)
@@ -60,8 +61,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
   // Extracts bits and put them in the bit index 2 (=> bit number 3)
   // so that it is already aligned to the correct position of the cmux input
   // and we reduce noise growth
-  extract_n_bits(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
-                 bsks, ksks, 1, max_num_bits_that_tell_shift,
+  extract_n_bits(streams, gpu_indexes, gpu_count, shift_bits,
+                 (Torus *)lwe_shift, bsks, ksks, 1,
+                 max_num_bits_that_tell_shift,
                  mem->bit_extract_luts_with_offset_2);

   // If signed, do an "arithmetic shift" by padding with the sign bit
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
index 17c06326f6..d3f47ad263 100644
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -6,15 +6,15 @@
 */
 void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                        void *lwe_array_out,
-                                       void *lwe_array_in_1,
-                                       void *lwe_array_in_2,
+                                       void const *lwe_array_in_1,
+                                       void const *lwe_array_in_2,
                                        uint32_t input_lwe_dimension,
                                        uint32_t input_lwe_ciphertext_count) {

   host_addition(static_cast<cudaStream_t>(stream), gpu_index,
                 static_cast<uint32_t *>(lwe_array_out),
-                static_cast<uint32_t *>(lwe_array_in_1),
-                static_cast<uint32_t *>(lwe_array_in_2),
+                static_cast<const uint32_t *>(lwe_array_in_1),
+                static_cast<const uint32_t *>(lwe_array_in_2),
                 input_lwe_dimension, input_lwe_ciphertext_count);
 }

@@ -46,15 +46,15 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
 */
 void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                        void *lwe_array_out,
-                                       void *lwe_array_in_1,
-                                       void *lwe_array_in_2,
+                                       void const *lwe_array_in_1,
+                                       void const *lwe_array_in_2,
                                        uint32_t input_lwe_dimension,
                                        uint32_t input_lwe_ciphertext_count) {

   host_addition(static_cast<cudaStream_t>(stream), gpu_index,
                 static_cast<uint64_t *>(lwe_array_out),
-                static_cast<uint64_t *>(lwe_array_in_1),
-                static_cast<uint64_t *>(lwe_array_in_2),
+                static_cast<const uint64_t *>(lwe_array_in_1),
+                static_cast<const uint64_t *>(lwe_array_in_2),
                 input_lwe_dimension, input_lwe_ciphertext_count);
 }
 /*
@@ -62,15 +62,15 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
 * plaintext vector. See the equivalent operation on u64 data for more details.
 */
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
-    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
-    void *plaintext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count) {
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *plaintext_array_in,
+    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

   host_addition_plaintext(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint32_t *>(lwe_array_out),
-      static_cast<uint32_t *>(lwe_array_in),
-      static_cast<uint32_t *>(plaintext_array_in), input_lwe_dimension,
+      static_cast<const uint32_t *>(lwe_array_in),
+      static_cast<const uint32_t *>(plaintext_array_in), input_lwe_dimension,
       input_lwe_ciphertext_count);
 }
 /*
@@ -102,14 +102,14 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
 * performs the operation on the GPU.
 */
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
-    void *plaintext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count) {
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *plaintext_array_in,
+    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

   host_addition_plaintext(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
-      static_cast<uint64_t *>(lwe_array_in),
-      static_cast<uint64_t *>(plaintext_array_in), input_lwe_dimension,
+      static_cast<const uint64_t *>(lwe_array_in),
+      static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
       input_lwe_ciphertext_count);
 }
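The addition entry points above all funnel into an elementwise kernel over torus coefficients. A self-contained sketch of that pattern, under the assumption of one thread per coefficient (the real kernels additionally account for the LWE body via input_lwe_dimension):

    #include <cstdint>
    #include <cuda_runtime.h>

    template <typename T>
    __global__ void coefficientwise_add(T *out, T const *in1, T const *in2,
                                        uint32_t num_entries) {
      uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
      if (index < num_entries)
        out[index] = in1[index] + in2[index]; // wrap-around is torus addition
    }

    void launch_add(cudaStream_t stream, uint64_t *out, uint64_t const *in1,
                    uint64_t const *in2, uint32_t num_entries) {
      int block = 256;
      int grid = (num_entries + block - 1) / block;
      coefficientwise_add<<<grid, block, 0, stream>>>(out, in1, in2,
                                                      num_entries);
    }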
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
index a587e5a134..29e1f62689 100644
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -13,9 +13,9 @@
 #include

 template <typename T>
-__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
-                                   uint32_t input_lwe_dimension,
-                                   uint32_t num_entries) {
+__global__ void
+plaintext_addition(T *output, T const *lwe_input, T const *plaintext_input,
+                   uint32_t input_lwe_dimension, uint32_t num_entries) {

   int tid = threadIdx.x;
   int plaintext_index = blockIdx.x * blockDim.x + tid;
@@ -30,7 +30,7 @@ __global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
 template <typename T>
 __host__ void
 host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
-                        T *lwe_input, T *plaintext_input,
+                        T const *lwe_input, T const *plaintext_input,
                         uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

   cudaSetDevice(gpu_index);
@@ -49,7 +49,7 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
 }

 template <typename T>
-__global__ void addition(T *output, T *input_1, T *input_2,
+__global__ void addition(T *output, T const *input_1, T const *input_2,
                          uint32_t num_entries) {

   int tid = threadIdx.x;
@@ -63,7 +63,7 @@ __global__ void addition(T *output, T *input_1, T *input_2,
 // Coefficient-wise addition
 template <typename T>
 __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
-                            T *input_1, T *input_2,
+                            T const *input_1, T const *input_2,
                             uint32_t input_lwe_dimension,
                             uint32_t input_lwe_ciphertext_count) {

@@ -83,7 +83,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
 }

 template <typename T>
-__global__ void subtraction(T *output, T *input_1, T *input_2,
+__global__ void subtraction(T *output, T const *input_1, T const *input_2,
                             uint32_t num_entries) {

   int tid = threadIdx.x;
@@ -97,7 +97,7 @@ __global__ void subtraction(T *output, T *input_1, T *input_2,
 // Coefficient-wise subtraction
 template <typename T>
 __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
-                               T *output, T *input_1, T *input_2,
+                               T *output, T const *input_1, T const *input_2,
                                uint32_t input_lwe_dimension,
                                uint32_t input_lwe_ciphertext_count) {

@@ -157,9 +157,11 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
 }

 template <typename T>
-__global__ void unchecked_sub_with_correcting_term(
-    T *output, T *input_1, T *input_2, uint32_t num_entries, uint32_t lwe_size,
-    uint32_t message_modulus, uint32_t carry_modulus, uint32_t degree) {
+__global__ void
+unchecked_sub_with_correcting_term(T *output, T const *input_1,
+                                   T const *input_2, uint32_t num_entries,
+                                   uint32_t lwe_size, uint32_t message_modulus,
+                                   uint32_t carry_modulus, uint32_t degree) {
   uint32_t msg_mod = message_modulus;
   uint64_t z = max((uint64_t)ceil(degree / msg_mod), (uint64_t)1);
   z *= msg_mod;
@@ -178,9 +180,10 @@ __global__ void unchecked_sub_with_correcting_term(
 }

 template <typename T>
 __host__ void host_unchecked_sub_with_correcting_term(
-    cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus, uint32_t degree) {
+    cudaStream_t stream, uint32_t gpu_index, T *output, T const *input_1,
+    T const *input_2, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus, uint32_t degree) {

   cudaSetDevice(gpu_index);
   // lwe_size includes the presence of the body
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
index 1c424b336c..8cb4a5a520 100644
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
@@ -5,15 +5,15 @@
 * cleartext vector. See the equivalent operation on u64 data for more details.
 */
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
-    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
-    void *cleartext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count) {
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *cleartext_array_in,
+    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

   host_cleartext_vec_multiplication(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint32_t *>(lwe_array_out),
-      static_cast<uint32_t *>(lwe_array_in),
-      static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
+      static_cast<const uint32_t *>(lwe_array_in),
+      static_cast<const uint32_t *>(cleartext_array_in), input_lwe_dimension,
       input_lwe_ciphertext_count);
 }
 /*
@@ -45,14 +45,14 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
 * function that performs the operation on the GPU.
 */
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
-    void *cleartext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count) {
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *cleartext_array_in,
+    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

   host_cleartext_vec_multiplication(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
-      static_cast<uint64_t *>(lwe_array_in),
-      static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
+      static_cast<const uint64_t *>(lwe_array_in),
+      static_cast<const uint64_t *>(cleartext_array_in), input_lwe_dimension,
       input_lwe_ciphertext_count);
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
index 9f9d396ed4..1288deb51b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -14,8 +14,8 @@
 #include

 template <typename T>
-__global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
-                                             T *cleartext_input,
+__global__ void cleartext_vec_multiplication(T *output, T const *lwe_input,
+                                             T const *cleartext_input,
                                              uint32_t input_lwe_dimension,
                                              uint32_t num_entries) {

@@ -29,11 +29,10 @@ __global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
 }

 template <typename T>
-__host__ void
-host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
-                                  T *output, T *lwe_input, T *cleartext_input,
-                                  uint32_t input_lwe_dimension,
-                                  uint32_t input_lwe_ciphertext_count) {
+__host__ void host_cleartext_vec_multiplication(
+    cudaStream_t stream, uint32_t gpu_index, T *output, T const *lwe_input,
+    T const *cleartext_input, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count) {

   cudaSetDevice(gpu_index);
   // lwe_size includes the presence of the body
@@ -53,7 +52,7 @@ host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,

 template <typename T>
 __global__ void
-cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
+cleartext_multiplication(T *output, T const *lwe_input, T cleartext_input,
                          uint32_t input_lwe_dimension, uint32_t num_entries) {

   int tid = threadIdx.x;
@@ -67,7 +66,7 @@ cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
 template <typename T>
 __host__ void
 host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
-                              T *output, T *lwe_input, T cleartext_input,
+                              T *output, T const *lwe_input, T cleartext_input,
                               uint32_t input_lwe_dimension,
                               uint32_t input_lwe_ciphertext_count) {
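The cleartext-vector multiply scales every coefficient of ciphertext i by cleartext i. A minimal sketch of that kernel, assuming the standard flat layout of lwe_dimension + 1 coefficients (mask plus body) per ciphertext — the index arithmetic here is my reading of the layout, not copied from the backend:

    #include <cstdint>
    #include <cuda_runtime.h>

    template <typename T>
    __global__ void scale_by_cleartext_vec(T *output, T const *lwe_input,
                                           T const *cleartext_input,
                                           uint32_t input_lwe_dimension,
                                           uint32_t num_entries) {
      uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
      if (index < num_entries) {
        // All coefficients of ciphertext i share the same cleartext factor.
        uint32_t cleartext_index = index / (input_lwe_dimension + 1);
        output[index] = lwe_input[index] * cleartext_input[cleartext_index];
      }
    }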
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
index a1465e390f..9601d97082 100644
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
@@ -6,13 +6,13 @@
 */
 void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                           void *lwe_array_out,
-                                          void *lwe_array_in,
+                                          void const *lwe_array_in,
                                           uint32_t input_lwe_dimension,
                                           uint32_t input_lwe_ciphertext_count) {

   host_negation(static_cast<cudaStream_t>(stream), gpu_index,
                 static_cast<uint32_t *>(lwe_array_out),
-                static_cast<uint32_t *>(lwe_array_in),
+                static_cast<const uint32_t *>(lwe_array_in),
                 input_lwe_dimension, input_lwe_ciphertext_count);
 }

@@ -40,12 +40,12 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
 */
 void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                           void *lwe_array_out,
-                                          void *lwe_array_in,
+                                          void const *lwe_array_in,
                                           uint32_t input_lwe_dimension,
                                           uint32_t input_lwe_ciphertext_count) {

   host_negation(static_cast<cudaStream_t>(stream), gpu_index,
                 static_cast<uint64_t *>(lwe_array_out),
-                static_cast<uint64_t *>(lwe_array_in),
+                static_cast<const uint64_t *>(lwe_array_in),
                 input_lwe_dimension, input_lwe_ciphertext_count);
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
index e8c4ba0a69..98a886cce2 100644
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
@@ -11,7 +11,7 @@
 #include "linear_algebra.h"

 template <typename T>
-__global__ void negation(T *output, T *input, uint32_t num_entries) {
+__global__ void negation(T *output, T const *input, uint32_t num_entries) {

   int tid = threadIdx.x;
   int index = blockIdx.x * blockDim.x + tid;
@@ -23,7 +23,7 @@ __global__ void negation(T *output, T *input, uint32_t num_entries) {

 template <typename T>
 __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
-                            T *input, uint32_t input_lwe_dimension,
+                            T const *input, uint32_t input_lwe_dimension,
                             uint32_t input_lwe_ciphertext_count) {

   cudaSetDevice(gpu_index);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
index abea6deba8..e018934cde 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
@@ -1,29 +1,29 @@
 #include "bootstrapping_key.cuh"

 void cuda_convert_lwe_programmable_bootstrap_key_32(
-    void *stream, uint32_t gpu_index, void *dest, void *src,
+    void *stream, uint32_t gpu_index, void *dest, void const *src,
     uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
     uint32_t polynomial_size) {
   uint32_t total_polynomials =
       input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
   cuda_convert_lwe_programmable_bootstrap_key(
       static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
-      (int32_t *)src, polynomial_size, total_polynomials);
+      (const int32_t *)src, polynomial_size, total_polynomials);
 }

 void cuda_convert_lwe_programmable_bootstrap_key_64(
-    void *stream, uint32_t gpu_index, void *dest, void *src,
+    void *stream, uint32_t gpu_index, void *dest, void const *src,
     uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
     uint32_t polynomial_size) {
   uint32_t total_polynomials =
       input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
   cuda_convert_lwe_programmable_bootstrap_key(
       static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
-      (int64_t *)src, polynomial_size, total_polynomials);
+      (const int64_t *)src, polynomial_size, total_polynomials);
 }

 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-    void *stream, uint32_t gpu_index, void *dest, void *src,
+    void *stream, uint32_t gpu_index, void *dest, void const *src,
     uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
     uint32_t polynomial_size, uint32_t grouping_factor) {
   uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
@@ -89,3 +89,175 @@ template __device__ const double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
     const double2 *ptr, int g, int i, int k, int level,
     uint32_t grouping_factor, uint32_t polynomial_size,
     uint32_t glwe_dimension, uint32_t level_count);
+
+void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
+                                 void const *_input1, void const *_input2,
+                                 void *_output, uint32_t polynomial_size,
+                                 uint32_t total_polynomials) {
+
+  auto stream = static_cast<cudaStream_t>(stream_v);
+  cudaSetDevice(gpu_index);
+  auto input1 = (double2 *)_input1;
+  auto input2 = (double2 *)_input2;
+  auto output = (double2 *)_output;
+
+  size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;
+
+  int gridSize = total_polynomials;
+  int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
+
+  double2 *buffer;
+  switch (polynomial_size) {
+  case 256:
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
+    }
+    break;
+  case 512:
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
+    }
+    break;
+  case 1024:
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
+    }
+    break;
+  case 2048:
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
+    }
+    break;
+  case 4096:
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
+    }
+    break;
+  case 8192:
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
+    }
+    break;
+  case 16384:
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
+                           FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
+    }
+    break;
+  default:
+    break;
+  }
+  cuda_drop_async(buffer, stream, gpu_index);
+}
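Each case above makes the same FULLSM/NOSM decision: if the FFT working set fits in shared memory, opt in to a larger dynamic shared-memory limit and prefer shared cache; otherwise fall back to a global scratch buffer. A stand-alone sketch of that decision, with a hypothetical kernel (only the FULLSM path launches here; the caller supplies the global-buffer fallback):

    #include <cuda_runtime.h>

    __global__ void demo_kernel(float *data, int n) {
      extern __shared__ float smem[];
      int i = threadIdx.x;
      if (i < n) {
        smem[i] = data[i] * 2.0f; // stage through dynamic shared memory
        data[i] = smem[i];
      }
    }

    // Returns true when the kernel could be launched with `bytes` of dynamic
    // shared memory; on false the caller takes the NOSM-style fallback.
    bool try_launch_fullsm(cudaStream_t stream, float *data, int n,
                           size_t bytes) {
      int device = 0, max_optin = 0;
      cudaGetDevice(&device);
      cudaDeviceGetAttribute(&max_optin,
                             cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
      if (bytes > (size_t)max_optin)
        return false;
      cudaFuncSetAttribute(demo_kernel,
                           cudaFuncAttributeMaxDynamicSharedMemorySize,
                           (int)bytes);
      demo_kernel<<<1, n, bytes, stream>>>(data, n);
      return true;
    }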
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
index 13f878f055..38f2d60e1c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -3,9 +3,9 @@

 #include "device.h"
 #include "fft/bnsmfft.cuh"
+#include "pbs/programmable_bootstrap.h"
+#include "pbs/programmable_bootstrap_multibit.h"
 #include "polynomial/parameters.cuh"
-#include "programmable_bootstrap.h"
-#include "programmable_bootstrap_multibit.h"
 #include
 #include
@@ -75,7 +75,7 @@ __device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(

 template <typename ST>
 void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
                                                  uint32_t gpu_index,
-                                                 double2 *dest, ST *src,
+                                                 double2 *dest, ST const *src,
                                                  uint32_t polynomial_size,
                                                  uint32_t total_polynomials) {
   cudaSetDevice(gpu_index);
@@ -249,175 +249,4 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
   cudaFreeHost(h_bsk);
 }

-void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
-                                 void *_input1, void *_input2, void *_output,
-                                 uint32_t polynomial_size,
-                                 uint32_t total_polynomials) {
-
-  cudaSetDevice(gpu_index);
-  auto input1 = (double2 *)_input1;
-  auto input2 = (double2 *)_input2;
-  auto output = (double2 *)_output;
-
-  size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;
-
-  int gridSize = total_polynomials;
-  int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
-
-  double2 *buffer;
-  switch (polynomial_size) {
-  case 256:
-    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
-                                                                output, buffer);
-    } else {
-      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream, gpu_index);
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
-    }
-    break;
-  case 512:
-    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
-                                                                output, buffer);
-    } else {
-      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream, gpu_index);
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
-    }
-    break;
-  case 1024:
-    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
-                                                                output, buffer);
-    } else {
-      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream, gpu_index);
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
-    }
-    break;
-  case 2048:
-    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
-                                                                output, buffer);
-    } else {
-      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream, gpu_index);
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
-    }
-    break;
-  case 4096:
-    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
-                                                                output, buffer);
-    } else {
-      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream, gpu_index);
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
-    }
-    break;
-  case 8192:
-    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
-                                                                output, buffer);
-    } else {
-      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream, gpu_index);
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
-    }
-    break;
-  case 16384:
-    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
-                           FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
-                                                                output, buffer);
-    } else {
-      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream, gpu_index);
-      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
-    }
-    break;
-  default:
-    break;
-  }
-  cuda_drop_async(buffer, stream, gpu_index);
-}
-
 #endif // CNCRT_BSK_H
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index 459a496d11..95a7a78c14 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -1,12 +1,12 @@
 #ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
 #define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH

+#include "bootstrapping_key.cuh"
 #include "cooperative_groups.h"
 #include "device.h"
 #include "fft/bnsmfft.cuh"
 #include "helper_multi_gpu.h"
-#include "programmable_bootstrap.h"
-#include "programmable_bootstrap_multibit.h"
+#include "pbs/programmable_bootstrap_multibit.h"

 using namespace cooperative_groups;
 namespace cg = cooperative_groups;
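The execute_pbs_async hunk that follows routes a single template front-end to the 32-bit or 64-bit PBS back-end by switching on sizeof(Torus). A minimal sketch of that dispatch idiom (the function and strings are illustrative only):

    #include <cstdint>

    template <typename Torus> const char *pbs_backend_name() {
      switch (sizeof(Torus)) {
      case sizeof(uint32_t):
        return "32-bit PBS backend";
      case sizeof(uint64_t):
        return "64-bit PBS backend";
      default:
        return "unsupported torus width";
      }
    }

    int main() { return pbs_backend_name<uint64_t>()[0] == '6' ? 0 : 1; }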
@@ -117,18 +117,22 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
 }

 template <typename Torus>
-void execute_pbs_async(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    const LweArrayVariant<Torus> &lwe_array_out,
-    const LweArrayVariant<Torus> &lwe_output_indexes,
-    std::vector<Torus *> lut_vec, std::vector<Torus *> lut_indexes_vec,
-    const LweArrayVariant<Torus> &lwe_array_in,
-    const LweArrayVariant<Torus> &lwe_input_indexes, void **bootstrapping_keys,
-    std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, uint32_t lut_count,
-    uint32_t lut_stride) {
+void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+                       uint32_t gpu_count,
+                       const LweArrayVariant<Torus> &lwe_array_out,
+                       const LweArrayVariant<Torus> &lwe_output_indexes,
+                       const std::vector<Torus *> lut_vec,
+                       const std::vector<Torus *> lut_indexes_vec,
+                       const LweArrayVariant<Torus> &lwe_array_in,
+                       const LweArrayVariant<Torus> &lwe_input_indexes,
+                       void *const *bootstrapping_keys,
+                       std::vector<int8_t *> pbs_buffer,
+                       uint32_t glwe_dimension, uint32_t lwe_dimension,
+                       uint32_t polynomial_size, uint32_t base_log,
+                       uint32_t level_count, uint32_t grouping_factor,
+                       uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
+                       uint32_t lut_count, uint32_t lut_stride) {
+
   switch (sizeof(Torus)) {
   case sizeof(uint32_t):
     // 32 bits
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
index 86ac8dee5d..35c2f19e8b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
@@ -126,8 +126,9 @@ void scratch_cuda_programmable_bootstrap_amortized_64(
 */
 void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
     uint32_t num_samples) {
@@ -264,8 +265,9 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
 */
 void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
     uint32_t num_samples) {
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
index 6a60a0f6d1..10cf9c27b0 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -6,15 +6,16 @@
 #include <cuda_runtime.h>
 #endif

+#include "bootstrapping_key.cuh"
 #include "crypto/gadget.cuh"
 #include "crypto/torus.cuh"
 #include "device.h"
 #include "fft/bnsmfft.cuh"
 #include "fft/twiddles.cuh"
+#include "pbs/programmable_bootstrap.h"
 #include "polynomial/functions.cuh"
 #include "polynomial/parameters.cuh"
"polynomial/polynomial_math.cuh" -#include "programmable_bootstrap.h" #include "types/complex/operations.cuh" template diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index f999d89cc7..87660a5977 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -12,10 +12,11 @@ #include "device.h" #include "fft/bnsmfft.cuh" #include "fft/twiddles.cuh" +#include "pbs/pbs_utilities.h" +#include "pbs/programmable_bootstrap.h" #include "polynomial/parameters.cuh" #include "polynomial/polynomial_math.cuh" #include "programmable_bootstrap.cuh" -#include "programmable_bootstrap.h" #include "types/complex/operations.cuh" using namespace cooperative_groups; @@ -228,8 +229,9 @@ __host__ void scratch_programmable_bootstrap_cg( template __host__ void host_programmable_bootstrap_cg( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index a932f2e857..b8ddc8fdc7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -8,11 +8,12 @@ #include "device.h" #include "fft/bnsmfft.cuh" #include "fft/twiddles.cuh" +#include "pbs/pbs_multibit_utilities.h" +#include "pbs/programmable_bootstrap.h" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include "polynomial/polynomial_math.cuh" #include "programmable_bootstrap.cuh" -#include "programmable_bootstrap.h" #include "programmable_bootstrap_multibit.cuh" #include "types/complex/operations.cuh" #include @@ -285,13 +286,14 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( template __host__ void execute_cg_external_product_loop( - cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, - Torus *lwe_array_out, Torus *lwe_output_indexes, - pbs_buffer *buffer, uint32_t num_samples, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) { + cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus *lwe_array_out, + Torus const *lwe_output_indexes, pbs_buffer *buffer, + uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, + uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count, + uint32_t lut_stride) { auto lwe_chunk_size = buffer->lwe_chunk_size; uint64_t 
full_dm = @@ -369,8 +371,9 @@ __host__ void execute_cg_external_product_loop( template __host__ void host_cg_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index bfdb550911..cd6275122a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -118,8 +118,9 @@ void scratch_cuda_programmable_bootstrap_tbc( template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, @@ -374,8 +375,9 @@ void scratch_cuda_programmable_bootstrap_64( template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, @@ -448,8 +450,9 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, @@ -523,8 +526,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( */ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes, - void *lwe_array_in, void *lwe_input_indexes, void 
*bootstrapping_key,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
     int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
     uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
@@ -540,12 +544,12 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
 #if CUDA_ARCH >= 900
     cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
         stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
-        static_cast<uint32_t *>(lwe_output_indexes),
-        static_cast<uint32_t *>(lut_vector),
-        static_cast<uint32_t *>(lut_vector_indexes),
-        static_cast<uint32_t *>(lwe_array_in),
-        static_cast<uint32_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        static_cast<const uint32_t *>(lwe_output_indexes),
+        static_cast<const uint32_t *>(lut_vector),
+        static_cast<const uint32_t *>(lut_vector_indexes),
+        static_cast<const uint32_t *>(lwe_array_in),
+        static_cast<const uint32_t *>(lwe_input_indexes),
+        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
         lut_count, lut_stride);
     break;
@@ -555,24 +559,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
   case CG:
     cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
         stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
-        static_cast<uint32_t *>(lwe_output_indexes),
-        static_cast<uint32_t *>(lut_vector),
-        static_cast<uint32_t *>(lut_vector_indexes),
-        static_cast<uint32_t *>(lwe_array_in),
-        static_cast<uint32_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        static_cast<const uint32_t *>(lwe_output_indexes),
+        static_cast<const uint32_t *>(lut_vector),
+        static_cast<const uint32_t *>(lut_vector_indexes),
+        static_cast<const uint32_t *>(lwe_array_in),
+        static_cast<const uint32_t *>(lwe_input_indexes),
+        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
         lut_count, lut_stride);
     break;
   case DEFAULT:
     cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
         stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
-        static_cast<uint32_t *>(lwe_output_indexes),
-        static_cast<uint32_t *>(lut_vector),
-        static_cast<uint32_t *>(lut_vector_indexes),
-        static_cast<uint32_t *>(lwe_array_in),
-        static_cast<uint32_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        static_cast<const uint32_t *>(lwe_output_indexes),
+        static_cast<const uint32_t *>(lut_vector),
+        static_cast<const uint32_t *>(lut_vector_indexes),
+        static_cast<const uint32_t *>(lwe_array_in),
+        static_cast<const uint32_t *>(lwe_input_indexes),
+        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
         lut_count, lut_stride);
     break;
@@ -644,8 +648,9 @@
  */
 void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
     int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
     uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
@@ -660,12 +665,12 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
 #if (CUDA_ARCH >= 900)
     cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
         stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
-        static_cast<uint64_t *>(lwe_output_indexes),
-        static_cast<uint64_t *>(lut_vector),
-        static_cast<uint64_t *>(lut_vector_indexes),
-
        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
+        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(lwe_input_indexes),
+        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
         lut_count, lut_stride);
     break;
@@ -675,24 +680,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
   case PBS_VARIANT::CG:
     cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
         stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
-        static_cast<uint64_t *>(lwe_output_indexes),
-        static_cast<uint64_t *>(lut_vector),
-        static_cast<uint64_t *>(lut_vector_indexes),
-        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
+        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(lwe_input_indexes),
+        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
         lut_count, lut_stride);
     break;
   case PBS_VARIANT::DEFAULT:
     cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
         stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
-        static_cast<uint64_t *>(lwe_output_indexes),
-        static_cast<uint64_t *>(lut_vector),
-        static_cast<uint64_t *>(lut_vector_indexes),
-        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
+        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(lwe_input_indexes),
+        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
         lut_count, lut_stride);
     break;
@@ -717,9 +722,9 @@ template bool has_support_to_cuda_programmable_bootstrap_cg(
 template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
     void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
-    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
-    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
-    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
+    uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
+    uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
+    uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
     pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -727,9 +732,9 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
-    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
-    uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
-    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
+    uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
+    uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
+    uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
     pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
@@ -748,9 +753,9 @@ template void scratch_cuda_programmable_bootstrap(
 template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
     void *stream, uint32_t gpu_index, uint32_t
*lwe_array_out, - uint32_t *lwe_output_indexes, uint32_t *lut_vector, - uint32_t *lut_vector_indexes, uint32_t *lwe_array_in, - uint32_t *lwe_input_indexes, double2 *bootstrapping_key, + uint32_t const *lwe_output_indexes, uint32_t const *lut_vector, + uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in, + uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, @@ -758,9 +763,9 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, - uint32_t *lwe_output_indexes, uint32_t *lut_vector, - uint32_t *lut_vector_indexes, uint32_t *lwe_array_in, - uint32_t *lwe_input_indexes, double2 *bootstrapping_key, + uint32_t const *lwe_output_indexes, uint32_t const *lut_vector, + uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in, + uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, @@ -787,18 +792,18 @@ template bool has_support_to_cuda_programmable_bootstrap_tbc( #if CUDA_ARCH >= 900 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, - uint32_t *lwe_output_indexes, uint32_t *lut_vector, - uint32_t *lut_vector_indexes, uint32_t *lwe_array_in, - uint32_t *lwe_input_indexes, double2 *bootstrapping_key, + uint32_t const *lwe_output_indexes, uint32_t const *lut_vector, + uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in, + uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride); template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, - uint64_t *lwe_output_indexes, uint64_t *lut_vector, - uint64_t *lut_vector_indexes, uint64_t *lwe_array_in, - uint64_t *lwe_input_indexes, double2 *bootstrapping_key, + uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, + uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, + uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 6c2c368d46..6836ecca13 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -11,9 +11,10 @@ #include "device.h" #include "fft/bnsmfft.cuh" #include "fft/twiddles.cuh" +#include "pbs/pbs_utilities.h" +#include "pbs/programmable_bootstrap.h" #include "polynomial/parameters.cuh" #include "polynomial/polynomial_math.cuh" -#include "programmable_bootstrap.h" #include "types/complex/operations.cuh" template @@ -363,16 +364,15 @@ __host__ 
void scratch_programmable_bootstrap( } template -__host__ void -execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, double2 *bootstrapping_key, - Torus *global_accumulator, double2 *global_accumulator_fft, - uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm, - uint64_t full_sm, uint64_t full_dm) { +__host__ void execute_step_one( + cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, double2 const *bootstrapping_key, + Torus *global_accumulator, double2 *global_accumulator_fft, + uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm, + uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { int max_shared_memory = cuda_get_max_shared_memory(0); cudaSetDevice(gpu_index); @@ -407,13 +407,14 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, template __host__ void execute_step_two( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - double2 *bootstrapping_key, Torus *global_accumulator, - double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, int lwe_iteration, - uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm, - uint64_t full_dm, uint32_t lut_count, uint32_t lut_stride) { + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, double2 const *bootstrapping_key, + Torus *global_accumulator, double2 *global_accumulator_fft, + uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm, + uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm, uint32_t lut_count, + uint32_t lut_stride) { int max_shared_memory = cuda_get_max_shared_memory(0); cudaSetDevice(gpu_index); @@ -450,8 +451,9 @@ __host__ void execute_step_two( template __host__ void host_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 6c979b2ca6..6c7418cada 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ 
b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -1,7 +1,7 @@ #include "../polynomial/parameters.cuh" +#include "pbs/programmable_bootstrap_multibit.h" #include "programmable_bootstrap_cg_multibit.cuh" #include "programmable_bootstrap_multibit.cuh" -#include "programmable_bootstrap_multibit.h" #if (CUDA_ARCH >= 900) #include "programmable_bootstrap_tbc_multibit.cuh" @@ -61,8 +61,9 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( template void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -138,8 +139,9 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( template void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -214,8 +216,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, - void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes, - void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t lut_count, @@ -229,12 +232,12 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( #if CUDA_ARCH >= 900 cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), - static_cast(lwe_output_indexes), - static_cast(lut_vector), - static_cast(lut_vector_indexes), - static_cast(lwe_array_in), - static_cast(lwe_input_indexes), - static_cast(bootstrapping_key), buffer, lwe_dimension, + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, num_samples, lut_count, lut_stride); break; @@ -244,24 +247,24 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( case PBS_VARIANT::CG: 
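        // [Editor's note - illustrative sketch, not part of the upstream
        // patch] Each branch below re-types the C-ABI `void const *`
        // arguments before handing them to the templated CUDA host code;
        // this is the pattern the const-qualification in this commit exists
        // to support, so the generated Rust bindings can expose
        // `*const ffi::c_void` parameters. With hypothetical names:
        //
        //   extern "C" void wrapper_64(void *out, void const *in) {
        //     auto t_out = static_cast<uint64_t *>(out);     // only `out` is written
        //     auto t_in = static_cast<const uint64_t *>(in); // inputs stay read-only
        //     host_impl<uint64_t>(t_out, t_in);
        //   }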
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), - static_cast(lwe_output_indexes), - static_cast(lut_vector), - static_cast(lut_vector_indexes), - static_cast(lwe_array_in), - static_cast(lwe_input_indexes), - static_cast(bootstrapping_key), buffer, lwe_dimension, + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, num_samples, lut_count, lut_stride); break; case PBS_VARIANT::DEFAULT: cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), - static_cast(lwe_output_indexes), - static_cast(lut_vector), - static_cast(lut_vector_indexes), - static_cast(lwe_array_in), - static_cast(lwe_input_indexes), - static_cast(bootstrapping_key), buffer, lwe_dimension, + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, num_samples, lut_count, lut_stride); break; @@ -493,9 +496,9 @@ template void scratch_cuda_multi_bit_programmable_bootstrap( template void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, - uint64_t *lwe_output_indexes, uint64_t *lut_vector, - uint64_t *lut_vector_indexes, uint64_t *lwe_array_in, - uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key, + uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, + uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, + uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -510,9 +513,9 @@ template void scratch_cuda_cg_multi_bit_programmable_bootstrap( template void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, - uint64_t *lwe_output_indexes, uint64_t *lut_vector, - uint64_t *lut_vector_indexes, uint64_t *lwe_array_in, - uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key, + uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, + uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, + uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -582,8 +585,9 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap( template void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t 
polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -678,9 +682,9 @@ template void scratch_cuda_tbc_multi_bit_programmable_bootstrap( template void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, - uint64_t *lwe_output_indexes, uint64_t *lut_vector, - uint64_t *lut_vector_indexes, uint64_t *lwe_array_in, - uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key, + uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, + uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, + uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index c0128138d0..450e1a7a16 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -8,12 +8,13 @@ #include "device.h" #include "fft/bnsmfft.cuh" #include "fft/twiddles.cuh" +#include "pbs/pbs_multibit_utilities.h" +#include "pbs/programmable_bootstrap.h" +#include "pbs/programmable_bootstrap_multibit.h" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include "polynomial/polynomial_math.cuh" -#include "programmable_bootstrap.h" #include "programmable_bootstrap_cg_classic.cuh" -#include "programmable_bootstrap_multibit.h" #include "types/complex/operations.cuh" #include @@ -489,8 +490,8 @@ __host__ void scratch_multi_bit_programmable_bootstrap( template __host__ void execute_compute_keybundle( - cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_in, - Torus *lwe_input_indexes, Torus *bootstrapping_key, + cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) { @@ -537,12 +538,14 @@ __host__ void execute_compute_keybundle( } template -__host__ void execute_step_one( - cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, - pbs_buffer *buffer, uint32_t num_samples, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) { +__host__ void +execute_step_one(cudaStream_t stream, uint32_t gpu_index, + Torus const *lut_vector, Torus const *lut_vector_indexes, + Torus const *lwe_array_in, Torus const *lwe_input_indexes, + pbs_buffer *buffer, uint32_t num_samples, + uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t j, uint32_t lwe_offset) { uint64_t full_sm_accumulate_step_one = get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one( @@ -593,7 +596,7 @@ __host__ void execute_step_one( template __host__ void execute_step_two( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, pbs_buffer *buffer, + Torus const *lwe_output_indexes, pbs_buffer *buffer, 
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count, uint32_t j, uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) { @@ -637,8 +640,9 @@ __host__ void execute_step_two( template __host__ void host_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index d898451cdb..910a74c18e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -12,10 +12,11 @@ #include "device.h" #include "fft/bnsmfft.cuh" #include "fft/twiddles.cuh" +#include "pbs/pbs_utilities.h" +#include "pbs/programmable_bootstrap.h" #include "polynomial/parameters.cuh" #include "polynomial/polynomial_math.cuh" #include "programmable_bootstrap.cuh" -#include "programmable_bootstrap.h" #include "types/complex/operations.cuh" using namespace cooperative_groups; @@ -253,8 +254,9 @@ __host__ void scratch_programmable_bootstrap_tbc( template __host__ void host_programmable_bootstrap_tbc( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 5834f779e9..ac5fd8051b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -8,12 +8,13 @@ #include "device.h" #include "fft/bnsmfft.cuh" #include "fft/twiddles.cuh" +#include "pbs/pbs_multibit_utilities.h" +#include "pbs/programmable_bootstrap.h" +#include "pbs/programmable_bootstrap_multibit.cuh" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include "polynomial/polynomial_math.cuh" #include "programmable_bootstrap.cuh" -#include "programmable_bootstrap.h" -#include "programmable_bootstrap_multibit.cuh" #include "types/complex/operations.cuh" #include @@ -290,13 +291,14 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( template __host__ void execute_tbc_external_product_loop( - cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, Torus 
*lwe_input_indexes, - Torus *lwe_array_out, Torus *lwe_output_indexes, - pbs_buffer *buffer, uint32_t num_samples, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) { + cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus *lwe_array_out, + Torus const *lwe_output_indexes, pbs_buffer *buffer, + uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, + uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count, + uint32_t lut_stride) { auto lwe_chunk_size = buffer->lwe_chunk_size; auto supports_dsm = @@ -393,8 +395,9 @@ __host__ void execute_tbc_external_product_loop( template __host__ void host_tbc_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key, + Torus const *lwe_output_indexes, Torus const *lut_vector, + Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh index 9c609341a0..f1da499b1c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh @@ -188,7 +188,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result, // Extracts the body of the nth-LWE in a GLWE. template -__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe, +__device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe, uint32_t glwe_dimension, uint32_t nth = 0) { // Set first coefficient of the glwe as the body of the LWE sample lwe_array_out[glwe_dimension * params::degree] = @@ -197,7 +197,7 @@ __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe, // Extracts the mask from the nth-LWE in a GLWE. 
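// [Editor's note - illustrative addition, not from the patch] With `glwe`
// now a `Torus const *`, both sample-extract helpers advertise that only
// `lwe_array_out` is written. A hypothetical kernel wrapping them, where
// `params` stands for the degree parameter class used across this backend
// (exact template parameters assumed):
//
//   template <typename Torus, class params>
//   __global__ void extract_nth_lwe(Torus *lwe_out, const Torus *glwe_in,
//                                   uint32_t glwe_dimension, uint32_t nth) {
//     sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
//     sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
//   }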
template -__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe, +__device__ void sample_extract_mask(Torus *lwe_array_out, Torus const *glwe, uint32_t glwe_dimension = 1, uint32_t nth = 0) { for (int z = 0; z < glwe_dimension; z++) { diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh index 08cb9dfe99..eb09d17e52 100644 --- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh @@ -5,7 +5,8 @@ /// Initialize same-size arrays on all active gpus template -void multi_gpu_alloc_array_async(cudaStream_t *streams, uint32_t *gpu_indexes, +void multi_gpu_alloc_array_async(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, std::vector &dest, uint32_t elements_per_gpu) { @@ -18,9 +19,10 @@ void multi_gpu_alloc_array_async(cudaStream_t *streams, uint32_t *gpu_indexes, } /// Copy an array residing on one GPU to all active gpus template -void multi_gpu_copy_array_async(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, std::vector &dest, - Torus *src, uint32_t elements_per_gpu) { +void multi_gpu_copy_array_async(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + std::vector &dest, Torus const *src, + uint32_t elements_per_gpu) { dest.resize(gpu_count); for (uint i = 0; i < gpu_count; i++) { cuda_memcpy_async_gpu_to_gpu(dest[i], src, elements_per_gpu * sizeof(Torus), @@ -31,9 +33,10 @@ void multi_gpu_copy_array_async(cudaStream_t *streams, uint32_t *gpu_indexes, /// Initializes also the related indexing and initializes it to the trivial /// index template -void multi_gpu_alloc_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, std::vector &dest, - uint32_t num_inputs, uint32_t lwe_size) { +void multi_gpu_alloc_lwe_async(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + std::vector &dest, uint32_t num_inputs, + uint32_t lwe_size) { dest.resize(gpu_count); for (uint i = 0; i < gpu_count; i++) { auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); @@ -48,9 +51,10 @@ void multi_gpu_alloc_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, /// The input indexing logic is given by an index array. 
/// The output indexing is always the trivial one template -void multi_gpu_scatter_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, +void multi_gpu_scatter_lwe_async(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, std::vector &dest, - Torus *src, Torus *h_src_indexes, + Torus const *src, Torus const *h_src_indexes, bool is_trivial_index, uint32_t num_inputs, uint32_t lwe_size) { @@ -88,9 +92,9 @@ void multi_gpu_scatter_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, /// dest_indexes /// The input indexing should be the trivial one template -void multi_gpu_gather_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *dest, - const std::vector &src, +void multi_gpu_gather_lwe_async(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + Torus *dest, const std::vector &src, Torus *h_dest_indexes, bool is_trivial_index, uint32_t num_inputs, uint32_t lwe_size) { @@ -123,7 +127,8 @@ void multi_gpu_gather_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, } template -void multi_gpu_release_async(cudaStream_t *streams, uint32_t *gpu_indexes, +void multi_gpu_release_async(cudaStream_t const *streams, + uint32_t const *gpu_indexes, std::vector &vec) { for (uint i = 0; i < vec.size(); i++) diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_fft.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_fft.cpp index 963a9f3c64..98d44a38e6 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_fft.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_fft.cpp @@ -1,3 +1,4 @@ +#include "pbs/pbs_utilities.h" #include #include #include diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp index 315dbe6a8c..ae6ec59f31 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp @@ -1,9 +1,8 @@ +#include "pbs/pbs_multibit_utilities.h" +#include "pbs/pbs_utilities.h" #include #include #include -#include -#include -#include #include typedef struct { @@ -50,7 +49,6 @@ class MultiBitBootstrap_u64 : public benchmark::Fixture { uint64_t *d_lut_pbs_indexes; uint64_t *d_lwe_ct_in_array; uint64_t *d_lwe_ct_out_array; - uint64_t *lwe_ct_out_array; uint64_t *d_lwe_input_indexes; uint64_t *d_lwe_output_indexes; int8_t *buffer; @@ -215,12 +213,15 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) for (auto _ : st) { // Execute PBS cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( - stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes, - d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, - d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, - lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, - pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count, - lut_stride); + stream, gpu_index, d_lwe_ct_out_array, + (const uint64_t *)d_lwe_output_indexes, + (const uint64_t *)d_lut_pbs_identity, + (const uint64_t *)d_lut_pbs_indexes, + (const uint64_t *)d_lwe_ct_in_array, + (const uint64_t *)d_lwe_input_indexes, (const uint64_t *)d_bsk, + (pbs_buffer *)buffer, lwe_dimension, + glwe_dimension, polynomial_size, grouping_factor, pbs_base_log, + pbs_level, input_lwe_ciphertext_count, lut_count, lut_stride); 
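      // [Editor's note - not part of the patch] The explicit
      // `(const uint64_t *)` casts above are for clarity only: a plain
      // `uint64_t *` converts implicitly to `const uint64_t *`, so the
      // benchmark would also compile passing the members directly, e.g.
      //
      //   const uint64_t *in = d_lwe_ct_in_array; // implicit qualification conversion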
cuda_synchronize_stream(stream, gpu_index); } diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/include/setup_and_teardown.h b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/include/setup_and_teardown.h index 6c808b8a3b..183a2dc044 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/include/setup_and_teardown.h +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/include/setup_and_teardown.h @@ -1,10 +1,10 @@ #ifndef SETUP_AND_TEARDOWN_H #define SETUP_AND_TEARDOWN_H +#include "pbs/programmable_bootstrap.h" +#include "pbs/programmable_bootstrap_multibit.h" #include #include -#include -#include #include void programmable_bootstrap_classical_setup( diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_fft.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_fft.cpp index a2b11dcc34..7231ad9d33 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_fft.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_fft.cpp @@ -1,9 +1,10 @@ +#include "pbs/pbs_utilities.h" +#include "pbs/programmable_bootstrap.h" #include "utils.h" #include "gtest/gtest.h" #include #include #include -#include #include #include #include diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/utils.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/utils.cpp index 259267f65e..608cfc895c 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/utils.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/utils.cpp @@ -1,6 +1,6 @@ #include -#include -#include +#include "pbs/programmable_bootstrap.h" +#include "pbs/programmable_bootstrap_multibit.h" #include #include #include diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs new file mode 100644 index 0000000000..927849bc03 --- /dev/null +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -0,0 +1,1684 @@ +/* automatically generated by rust-bindgen 0.70.1 */ + +use crate::ffi; + +pub const _STDINT_H: u32 = 1; +pub const _FEATURES_H: u32 = 1; +pub const _ISOC95_SOURCE: u32 = 1; +pub const _ISOC99_SOURCE: u32 = 1; +pub const _ISOC11_SOURCE: u32 = 1; +pub const _ISOC2X_SOURCE: u32 = 1; +pub const _POSIX_SOURCE: u32 = 1; +pub const _POSIX_C_SOURCE: u32 = 200809; +pub const _XOPEN_SOURCE: u32 = 700; +pub const _XOPEN_SOURCE_EXTENDED: u32 = 1; +pub const _LARGEFILE64_SOURCE: u32 = 1; +pub const _DEFAULT_SOURCE: u32 = 1; +pub const _ATFILE_SOURCE: u32 = 1; +pub const _DYNAMIC_STACK_SIZE_SOURCE: u32 = 1; +pub const __GLIBC_USE_ISOC2X: u32 = 1; +pub const __USE_ISOC11: u32 = 1; +pub const __USE_ISOC99: u32 = 1; +pub const __USE_ISOC95: u32 = 1; +pub const __USE_ISOCXX11: u32 = 1; +pub const __USE_POSIX: u32 = 1; +pub const __USE_POSIX2: u32 = 1; +pub const __USE_POSIX199309: u32 = 1; +pub const __USE_POSIX199506: u32 = 1; +pub const __USE_XOPEN2K: u32 = 1; +pub const __USE_XOPEN2K8: u32 = 1; +pub const __USE_XOPEN: u32 = 1; +pub const __USE_XOPEN_EXTENDED: u32 = 1; +pub const __USE_UNIX98: u32 = 1; +pub const _LARGEFILE_SOURCE: u32 = 1; +pub const __USE_XOPEN2K8XSI: u32 = 1; +pub const __USE_XOPEN2KXSI: u32 = 1; +pub const __USE_LARGEFILE: u32 = 1; +pub const __USE_LARGEFILE64: u32 = 1; +pub const __WORDSIZE: u32 = 64; +pub const __WORDSIZE_TIME64_COMPAT32: u32 = 1; +pub const __SYSCALL_WORDSIZE: u32 = 64; +pub const __TIMESIZE: u32 = 64; +pub const __USE_MISC: u32 = 1; +pub const __USE_ATFILE: u32 = 1; +pub const __USE_DYNAMIC_STACK_SIZE: u32 = 1; +pub const __USE_GNU: u32 = 1; 
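// [Editor's note - not emitted by rust-bindgen] The surrounding `_STDINT_H`,
// `__USE_*`, and limit constants are glibc/stdint feature macros that bindgen
// re-exports because the C headers it consumes include <stdint.h>; only the
// `extern "C"` declarations further down are used by the crate. A hypothetical
// call through one of the generated bindings, shown only to illustrate how the
// `*const`/`*mut` split mirrors the C `const` qualifiers (all pointers must be
// valid CUDA resources allocated by the caller):
//
//     unsafe {
//         cuda_glwe_sample_extract_64(stream, gpu_index, lwe_array_out,
//                                     glwe_array_in, nth_array, num_nths,
//                                     glwe_dimension, polynomial_size);
//     }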
+pub const __USE_FORTIFY_LEVEL: u32 = 0; +pub const __GLIBC_USE_DEPRECATED_GETS: u32 = 0; +pub const __GLIBC_USE_DEPRECATED_SCANF: u32 = 0; +pub const _STDC_PREDEF_H: u32 = 1; +pub const __STDC_IEC_559__: u32 = 1; +pub const __STDC_IEC_60559_BFP__: u32 = 201404; +pub const __STDC_IEC_559_COMPLEX__: u32 = 1; +pub const __STDC_IEC_60559_COMPLEX__: u32 = 201404; +pub const __STDC_ISO_10646__: u32 = 201706; +pub const __GNU_LIBRARY__: u32 = 6; +pub const __GLIBC__: u32 = 2; +pub const __GLIBC_MINOR__: u32 = 35; +pub const _SYS_CDEFS_H: u32 = 1; +pub const __glibc_c99_flexarr_available: u32 = 1; +pub const __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI: u32 = 0; +pub const __HAVE_GENERIC_SELECTION: u32 = 0; +pub const __GLIBC_USE_LIB_EXT2: u32 = 1; +pub const __GLIBC_USE_IEC_60559_BFP_EXT: u32 = 1; +pub const __GLIBC_USE_IEC_60559_BFP_EXT_C2X: u32 = 1; +pub const __GLIBC_USE_IEC_60559_EXT: u32 = 1; +pub const __GLIBC_USE_IEC_60559_FUNCS_EXT: u32 = 1; +pub const __GLIBC_USE_IEC_60559_FUNCS_EXT_C2X: u32 = 1; +pub const __GLIBC_USE_IEC_60559_TYPES_EXT: u32 = 1; +pub const _BITS_TYPES_H: u32 = 1; +pub const _BITS_TYPESIZES_H: u32 = 1; +pub const __OFF_T_MATCHES_OFF64_T: u32 = 1; +pub const __INO_T_MATCHES_INO64_T: u32 = 1; +pub const __RLIM_T_MATCHES_RLIM64_T: u32 = 1; +pub const __STATFS_MATCHES_STATFS64: u32 = 1; +pub const __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64: u32 = 1; +pub const __FD_SETSIZE: u32 = 1024; +pub const _BITS_TIME64_H: u32 = 1; +pub const _BITS_WCHAR_H: u32 = 1; +pub const _BITS_STDINT_INTN_H: u32 = 1; +pub const _BITS_STDINT_UINTN_H: u32 = 1; +pub const INT8_MIN: i32 = -128; +pub const INT16_MIN: i32 = -32768; +pub const INT32_MIN: i32 = -2147483648; +pub const INT8_MAX: u32 = 127; +pub const INT16_MAX: u32 = 32767; +pub const INT32_MAX: u32 = 2147483647; +pub const UINT8_MAX: u32 = 255; +pub const UINT16_MAX: u32 = 65535; +pub const UINT32_MAX: u32 = 4294967295; +pub const INT_LEAST8_MIN: i32 = -128; +pub const INT_LEAST16_MIN: i32 = -32768; +pub const INT_LEAST32_MIN: i32 = -2147483648; +pub const INT_LEAST8_MAX: u32 = 127; +pub const INT_LEAST16_MAX: u32 = 32767; +pub const INT_LEAST32_MAX: u32 = 2147483647; +pub const UINT_LEAST8_MAX: u32 = 255; +pub const UINT_LEAST16_MAX: u32 = 65535; +pub const UINT_LEAST32_MAX: u32 = 4294967295; +pub const INT_FAST8_MIN: i32 = -128; +pub const INT_FAST16_MIN: i64 = -9223372036854775808; +pub const INT_FAST32_MIN: i64 = -9223372036854775808; +pub const INT_FAST8_MAX: u32 = 127; +pub const INT_FAST16_MAX: u64 = 9223372036854775807; +pub const INT_FAST32_MAX: u64 = 9223372036854775807; +pub const UINT_FAST8_MAX: u32 = 255; +pub const UINT_FAST16_MAX: i32 = -1; +pub const UINT_FAST32_MAX: i32 = -1; +pub const INTPTR_MIN: i64 = -9223372036854775808; +pub const INTPTR_MAX: u64 = 9223372036854775807; +pub const UINTPTR_MAX: i32 = -1; +pub const PTRDIFF_MIN: i64 = -9223372036854775808; +pub const PTRDIFF_MAX: u64 = 9223372036854775807; +pub const SIG_ATOMIC_MIN: i32 = -2147483648; +pub const SIG_ATOMIC_MAX: u32 = 2147483647; +pub const SIZE_MAX: i32 = -1; +pub const WINT_MIN: u32 = 0; +pub const WINT_MAX: u32 = 4294967295; +pub const INT8_WIDTH: u32 = 8; +pub const UINT8_WIDTH: u32 = 8; +pub const INT16_WIDTH: u32 = 16; +pub const UINT16_WIDTH: u32 = 16; +pub const INT32_WIDTH: u32 = 32; +pub const UINT32_WIDTH: u32 = 32; +pub const INT64_WIDTH: u32 = 64; +pub const UINT64_WIDTH: u32 = 64; +pub const INT_LEAST8_WIDTH: u32 = 8; +pub const UINT_LEAST8_WIDTH: u32 = 8; +pub const INT_LEAST16_WIDTH: u32 = 16; +pub const UINT_LEAST16_WIDTH: u32 = 16; +pub const 
INT_LEAST32_WIDTH: u32 = 32; +pub const UINT_LEAST32_WIDTH: u32 = 32; +pub const INT_LEAST64_WIDTH: u32 = 64; +pub const UINT_LEAST64_WIDTH: u32 = 64; +pub const INT_FAST8_WIDTH: u32 = 8; +pub const UINT_FAST8_WIDTH: u32 = 8; +pub const INT_FAST16_WIDTH: u32 = 64; +pub const UINT_FAST16_WIDTH: u32 = 64; +pub const INT_FAST32_WIDTH: u32 = 64; +pub const UINT_FAST32_WIDTH: u32 = 64; +pub const INT_FAST64_WIDTH: u32 = 64; +pub const UINT_FAST64_WIDTH: u32 = 64; +pub const INTPTR_WIDTH: u32 = 64; +pub const UINTPTR_WIDTH: u32 = 64; +pub const INTMAX_WIDTH: u32 = 64; +pub const UINTMAX_WIDTH: u32 = 64; +pub const PTRDIFF_WIDTH: u32 = 64; +pub const SIG_ATOMIC_WIDTH: u32 = 32; +pub const SIZE_WIDTH: u32 = 64; +pub const WCHAR_WIDTH: u32 = 32; +pub const WINT_WIDTH: u32 = 32; +pub type __u_char = ffi::c_uchar; +pub type __u_short = ffi::c_ushort; +pub type __u_int = ffi::c_uint; +pub type __u_long = ffi::c_ulong; +pub type __int8_t = ffi::c_schar; +pub type __uint8_t = ffi::c_uchar; +pub type __int16_t = ffi::c_short; +pub type __uint16_t = ffi::c_ushort; +pub type __int32_t = ffi::c_int; +pub type __uint32_t = ffi::c_uint; +pub type __int64_t = ffi::c_long; +pub type __uint64_t = ffi::c_ulong; +pub type __int_least8_t = __int8_t; +pub type __uint_least8_t = __uint8_t; +pub type __int_least16_t = __int16_t; +pub type __uint_least16_t = __uint16_t; +pub type __int_least32_t = __int32_t; +pub type __uint_least32_t = __uint32_t; +pub type __int_least64_t = __int64_t; +pub type __uint_least64_t = __uint64_t; +pub type __quad_t = ffi::c_long; +pub type __u_quad_t = ffi::c_ulong; +pub type __intmax_t = ffi::c_long; +pub type __uintmax_t = ffi::c_ulong; +pub type __dev_t = ffi::c_ulong; +pub type __uid_t = ffi::c_uint; +pub type __gid_t = ffi::c_uint; +pub type __ino_t = ffi::c_ulong; +pub type __ino64_t = ffi::c_ulong; +pub type __mode_t = ffi::c_uint; +pub type __nlink_t = ffi::c_ulong; +pub type __off_t = ffi::c_long; +pub type __off64_t = ffi::c_long; +pub type __pid_t = ffi::c_int; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct __fsid_t { + pub __val: [ffi::c_int; 2usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of __fsid_t"][::std::mem::size_of::<__fsid_t>() - 8usize]; + ["Alignment of __fsid_t"][::std::mem::align_of::<__fsid_t>() - 4usize]; + ["Offset of field: __fsid_t::__val"][::std::mem::offset_of!(__fsid_t, __val) - 0usize]; +}; +pub type __clock_t = ffi::c_long; +pub type __rlim_t = ffi::c_ulong; +pub type __rlim64_t = ffi::c_ulong; +pub type __id_t = ffi::c_uint; +pub type __time_t = ffi::c_long; +pub type __useconds_t = ffi::c_uint; +pub type __suseconds_t = ffi::c_long; +pub type __suseconds64_t = ffi::c_long; +pub type __daddr_t = ffi::c_int; +pub type __key_t = ffi::c_int; +pub type __clockid_t = ffi::c_int; +pub type __timer_t = *mut ffi::c_void; +pub type __blksize_t = ffi::c_long; +pub type __blkcnt_t = ffi::c_long; +pub type __blkcnt64_t = ffi::c_long; +pub type __fsblkcnt_t = ffi::c_ulong; +pub type __fsblkcnt64_t = ffi::c_ulong; +pub type __fsfilcnt_t = ffi::c_ulong; +pub type __fsfilcnt64_t = ffi::c_ulong; +pub type __fsword_t = ffi::c_long; +pub type __ssize_t = ffi::c_long; +pub type __syscall_slong_t = ffi::c_long; +pub type __syscall_ulong_t = ffi::c_ulong; +pub type __loff_t = __off64_t; +pub type __caddr_t = *mut ffi::c_char; +pub type __intptr_t = ffi::c_long; +pub type __socklen_t = ffi::c_uint; +pub type __sig_atomic_t = ffi::c_int; +pub type int_least8_t = __int_least8_t; +pub type int_least16_t = 
__int_least16_t; +pub type int_least32_t = __int_least32_t; +pub type int_least64_t = __int_least64_t; +pub type uint_least8_t = __uint_least8_t; +pub type uint_least16_t = __uint_least16_t; +pub type uint_least32_t = __uint_least32_t; +pub type uint_least64_t = __uint_least64_t; +pub type int_fast8_t = ffi::c_schar; +pub type int_fast16_t = ffi::c_long; +pub type int_fast32_t = ffi::c_long; +pub type int_fast64_t = ffi::c_long; +pub type uint_fast8_t = ffi::c_uchar; +pub type uint_fast16_t = ffi::c_ulong; +pub type uint_fast32_t = ffi::c_ulong; +pub type uint_fast64_t = ffi::c_ulong; +pub type intmax_t = __intmax_t; +pub type uintmax_t = __uintmax_t; +extern "C" { + pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64( + stream: *mut ffi::c_void, + gpu_index: u32, + dest: *mut ffi::c_void, + src: *const ffi::c_void, + number_of_cts: u32, + lwe_dimension: u32, + ); +} +extern "C" { + pub fn cuda_convert_lwe_ciphertext_vector_to_cpu_64( + stream: *mut ffi::c_void, + gpu_index: u32, + dest: *mut ffi::c_void, + src: *const ffi::c_void, + number_of_cts: u32, + lwe_dimension: u32, + ); +} +extern "C" { + pub fn cuda_glwe_sample_extract_64( + stream: *mut ffi::c_void, + gpu_index: u32, + lwe_array_out: *mut ffi::c_void, + glwe_array_in: *const ffi::c_void, + nth_array: *const u32, + num_nths: u32, + glwe_dimension: u32, + polynomial_size: u32, + ); +} +pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0; +pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1; +pub type PBS_TYPE = ffi::c_uint; +pub const PBS_VARIANT_DEFAULT: PBS_VARIANT = 0; +pub const PBS_VARIANT_CG: PBS_VARIANT = 1; +pub const PBS_VARIANT_TBC: PBS_VARIANT = 2; +pub type PBS_VARIANT = ffi::c_uint; +extern "C" { + pub fn scratch_cuda_integer_compress_radix_ciphertext_64( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + compression_glwe_dimension: u32, + compression_polynomial_size: u32, + lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + num_radix_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + lwe_per_glwe: u32, + storage_log_modulus: u32, + allocate_gpu_memory: bool, + ); +} +extern "C" { + pub fn scratch_cuda_integer_decompress_radix_ciphertext_64( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + encryption_glwe_dimension: u32, + encryption_polynomial_size: u32, + compression_glwe_dimension: u32, + compression_polynomial_size: u32, + lwe_dimension: u32, + pbs_level: u32, + pbs_base_log: u32, + num_radix_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + storage_log_modulus: u32, + body_count: u32, + allocate_gpu_memory: bool, + ); +} +extern "C" { + pub fn cuda_integer_compress_radix_ciphertext_64( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + glwe_array_out: *mut ffi::c_void, + lwe_array_in: *const ffi::c_void, + fp_ksk: *const *mut ffi::c_void, + num_nths: u32, + mem_ptr: *mut i8, + ); +} +extern "C" { + pub fn cuda_integer_decompress_radix_ciphertext_64( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array_out: *mut ffi::c_void, + glwe_in: *const ffi::c_void, + indexes_array: *const u32, + indexes_array_size: u32, + bsks: *const *mut ffi::c_void, + mem_ptr: *mut i8, + ); +} +extern "C" { + pub fn cleanup_cuda_integer_compress_radix_ciphertext_64( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr_void: *mut *mut i8, + ); +} +extern "C" { + 
+    pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+pub const OUTPUT_CARRY_NONE: OUTPUT_CARRY = 0;
+pub const OUTPUT_CARRY_GENERATED: OUTPUT_CARRY = 1;
+pub const OUTPUT_CARRY_PROPAGATED: OUTPUT_CARRY = 2;
+pub type OUTPUT_CARRY = ffi::c_uint;
+pub const SHIFT_OR_ROTATE_TYPE_LEFT_SHIFT: SHIFT_OR_ROTATE_TYPE = 0;
+pub const SHIFT_OR_ROTATE_TYPE_RIGHT_SHIFT: SHIFT_OR_ROTATE_TYPE = 1;
+pub const SHIFT_OR_ROTATE_TYPE_LEFT_ROTATE: SHIFT_OR_ROTATE_TYPE = 2;
+pub const SHIFT_OR_ROTATE_TYPE_RIGHT_ROTATE: SHIFT_OR_ROTATE_TYPE = 3;
+pub type SHIFT_OR_ROTATE_TYPE = ffi::c_uint;
+pub const BITOP_TYPE_BITAND: BITOP_TYPE = 0;
+pub const BITOP_TYPE_BITOR: BITOP_TYPE = 1;
+pub const BITOP_TYPE_BITXOR: BITOP_TYPE = 2;
+pub const BITOP_TYPE_SCALAR_BITAND: BITOP_TYPE = 3;
+pub const BITOP_TYPE_SCALAR_BITOR: BITOP_TYPE = 4;
+pub const BITOP_TYPE_SCALAR_BITXOR: BITOP_TYPE = 5;
+pub type BITOP_TYPE = ffi::c_uint;
+pub const COMPARISON_TYPE_EQ: COMPARISON_TYPE = 0;
+pub const COMPARISON_TYPE_NE: COMPARISON_TYPE = 1;
+pub const COMPARISON_TYPE_GT: COMPARISON_TYPE = 2;
+pub const COMPARISON_TYPE_GE: COMPARISON_TYPE = 3;
+pub const COMPARISON_TYPE_LT: COMPARISON_TYPE = 4;
+pub const COMPARISON_TYPE_LE: COMPARISON_TYPE = 5;
+pub const COMPARISON_TYPE_MAX: COMPARISON_TYPE = 6;
+pub const COMPARISON_TYPE_MIN: COMPARISON_TYPE = 7;
+pub type COMPARISON_TYPE = ffi::c_uint;
+pub const CMP_ORDERING_IS_INFERIOR: CMP_ORDERING = 0;
+pub const CMP_ORDERING_IS_EQUAL: CMP_ORDERING = 1;
+pub const CMP_ORDERING_IS_SUPERIOR: CMP_ORDERING = 2;
+pub type CMP_ORDERING = ffi::c_uint;
+pub const SIGNED_OPERATION_ADDITION: SIGNED_OPERATION = 1;
+pub const SIGNED_OPERATION_SUBTRACTION: SIGNED_OPERATION = -1;
+pub type SIGNED_OPERATION = ffi::c_int;
+extern "C" {
+    pub fn scratch_cuda_apply_univariate_lut_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        input_lut: *const ffi::c_void,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        input_lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_apply_univariate_lut_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        output_radix_lwe: *mut ffi::c_void,
+        input_radix_lwe: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        ksks: *const *mut ffi::c_void,
+        bsks: *const *mut ffi::c_void,
+        num_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_apply_univariate_lut_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_apply_bivariate_lut_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        input_lut: *const ffi::c_void,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        input_lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_apply_bivariate_lut_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        output_radix_lwe: *mut ffi::c_void,
+        input_radix_lwe_1: *const ffi::c_void,
+        input_radix_lwe_2: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        ksks: *const *mut ffi::c_void,
+        bsks: *const *mut ffi::c_void,
+        num_blocks: u32,
+        shift: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_apply_bivariate_lut_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn cuda_apply_many_univariate_lut_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        output_radix_lwe: *mut ffi::c_void,
+        input_radix_lwe: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        ksks: *const *mut ffi::c_void,
+        bsks: *const *mut ffi::c_void,
+        num_blocks: u32,
+        num_luts: u32,
+        lut_stride: u32,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_full_propagation_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_full_propagation_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        input_blocks: *mut ffi::c_void,
+        mem_ptr: *mut i8,
+        ksks: *const *mut ffi::c_void,
+        bsks: *const *mut ffi::c_void,
+        num_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_full_propagation(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        message_modulus: u32,
+        carry_modulus: u32,
+        glwe_dimension: u32,
+        lwe_dimension: u32,
+        polynomial_size: u32,
+        pbs_base_log: u32,
+        pbs_level: u32,
+        ks_base_log: u32,
+        ks_level: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_mult_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        radix_lwe_out: *mut ffi::c_void,
+        radix_lwe_left: *const ffi::c_void,
+        radix_lwe_right: *const ffi::c_void,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        mem_ptr: *mut i8,
+        polynomial_size: u32,
+        num_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_mult(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn cuda_negate_integer_radix_ciphertext_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_dimension: u32,
+        lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut ffi::c_void,
+        scalar_input: *const ffi::c_void,
+        lwe_dimension: u32,
+        lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        shift_type: SHIFT_OR_ROTATE_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut ffi::c_void,
+        shift: u32,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        shift_type: SHIFT_OR_ROTATE_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut ffi::c_void,
+        shift: u32,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_radix_logical_scalar_shift(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_radix_arithmetic_scalar_shift(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_radix_shift_and_rotate_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        shift_type: SHIFT_OR_ROTATE_TYPE,
+        is_signed: bool,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_radix_shift_and_rotate_kb_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut ffi::c_void,
+        lwe_shift: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_radix_shift_and_rotate(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_radix_comparison_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        op_type: COMPARISON_TYPE,
+        is_signed: bool,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_comparison_integer_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_1: *const ffi::c_void,
+        lwe_array_2: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        scalar_blocks: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        lwe_ciphertext_count: u32,
+        num_scalar_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_comparison(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_radix_bitop_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        op_type: BITOP_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_bitop_integer_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_1: *const ffi::c_void,
+        lwe_array_2: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_input: *const ffi::c_void,
+        clear_blocks: *const ffi::c_void,
+        num_clear_blocks: u32,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        lwe_ciphertext_count: u32,
+        op: BITOP_TYPE,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_bitop(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_radix_cmux_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_cmux_integer_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_condition: *const ffi::c_void,
+        lwe_array_true: *const ffi::c_void,
+        lwe_array_false: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_radix_cmux(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        shift_type: SHIFT_OR_ROTATE_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut ffi::c_void,
+        n: u32,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_radix_scalar_rotate(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_propagate_single_carry_kb_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_propagate_single_carry_kb_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut ffi::c_void,
+        carry_out: *mut ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut ffi::c_void,
+        carry_out: *mut ffi::c_void,
+        input_carries: *mut ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_propagate_single_carry(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks_in_radix: u32,
+        max_num_radix_in_vec: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        radix_lwe_out: *mut ffi::c_void,
+        radix_lwe_vec: *mut ffi::c_void,
+        num_radix_in_vec: u32,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks_in_radix: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_radix_overflowing_sub_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_radix_overflowing_sub_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        radix_lwe_out: *mut ffi::c_void,
+        radix_lwe_overflowed: *mut ffi::c_void,
+        radix_lwe_left: *const ffi::c_void,
+        radix_lwe_right: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks_in_radix: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_radix_overflowing_sub(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_scalar_mul_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut ffi::c_void,
+        decomposed_scalar: *const u64,
+        has_at_least_one_set: *const u64,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        lwe_dimension: u32,
+        polynomial_size: u32,
+        message_modulus: u32,
+        num_blocks: u32,
+        num_scalars: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_radix_scalar_mul(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_div_rem_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        quotient: *mut ffi::c_void,
+        remainder: *mut ffi::c_void,
+        numerator: *const ffi::c_void,
+        divisor: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks_in_radix: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_div_rem(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
+        signed_operation: i8,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lhs: *mut ffi::c_void,
+        rhs: *const ffi::c_void,
+        overflowed: *mut ffi::c_void,
+        signed_operation: i8,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_blocks_in_radix: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_signed_overflowing_add_or_sub(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        input_lut: *const ffi::c_void,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_radix_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_compute_prefix_sum_hillis_steele_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        output_radix_lwe: *mut ffi::c_void,
+        generates_or_propagates: *mut ffi::c_void,
+        mem_ptr: *mut i8,
+        ksks: *const *mut ffi::c_void,
+        bsks: *const *mut ffi::c_void,
+        num_blocks: u32,
+        shift: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_reverse_blocks_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut ffi::c_void,
+        num_blocks: u32,
+        lwe_size: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_keyswitch_lwe_ciphertext_vector_32(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        ksk: *const ffi::c_void,
+        lwe_dimension_in: u32,
+        lwe_dimension_out: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_keyswitch_lwe_ciphertext_vector_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        ksk: *const ffi::c_void,
+        lwe_dimension_in: u32,
+        lwe_dimension_out: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+    );
+}
+extern "C" {
+    pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        fp_ks_buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        num_lwes: u32,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        glwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        fp_ksk_array: *const ffi::c_void,
+        fp_ks_buffer: *mut i8,
+        input_lwe_dimension: u32,
+        output_glwe_dimension: u32,
+        output_polynomial_size: u32,
+        base_log: u32,
+        level_count: u32,
+        num_lwes: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_packing_keyswitch_lwe_list_to_glwe(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        fp_ks_buffer: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn cuda_negate_lwe_ciphertext_vector_32(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_negate_lwe_ciphertext_vector_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_add_lwe_ciphertext_vector_32(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in_1: *const ffi::c_void,
+        lwe_array_in_2: *const ffi::c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_add_lwe_ciphertext_vector_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in_1: *const ffi::c_void,
+        lwe_array_in_2: *const ffi::c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        plaintext_array_in: *const ffi::c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        plaintext_array_in: *const ffi::c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        cleartext_array_in: *const ffi::c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        cleartext_array_in: *const ffi::c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_fourier_polynomial_mul(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        input1: *const ffi::c_void,
+        input2: *const ffi::c_void,
+        output: *mut ffi::c_void,
+        polynomial_size: u32,
+        total_polynomials: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_convert_lwe_programmable_bootstrap_key_32(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        dest: *mut ffi::c_void,
+        src: *const ffi::c_void,
+        input_lwe_dim: u32,
+        glwe_dim: u32,
+        level_count: u32,
+        polynomial_size: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_convert_lwe_programmable_bootstrap_key_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        dest: *mut ffi::c_void,
+        src: *const ffi::c_void,
+        input_lwe_dim: u32,
+        glwe_dim: u32,
+        level_count: u32,
+        polynomial_size: u32,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_programmable_bootstrap_amortized_32(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_programmable_bootstrap_amortized_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lut_vector: *const ffi::c_void,
+        lut_vector_indexes: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        bootstrapping_key: *const ffi::c_void,
+        pbs_buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lut_vector: *const ffi::c_void,
+        lut_vector_indexes: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        bootstrapping_key: *const ffi::c_void,
+        pbs_buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_programmable_bootstrap_amortized(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_programmable_bootstrap_32(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        level_count: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_programmable_bootstrap_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        level_count: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lut_vector: *const ffi::c_void,
+        lut_vector_indexes: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        bootstrapping_key: *const ffi::c_void,
+        buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+        lut_count: u32,
+        lut_stride: u32,
+    );
+}
+extern "C" {
+    pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lut_vector: *const ffi::c_void,
+        lut_vector_indexes: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        bootstrapping_key: *const ffi::c_void,
+        buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+        lut_count: u32,
+        lut_stride: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_programmable_bootstrap(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        level_count: u32,
+        num_samples: u32,
+    ) -> bool;
+}
+extern "C" {
+    pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        dest: *mut ffi::c_void,
+        src: *const ffi::c_void,
+        input_lwe_dim: u32,
+        glwe_dim: u32,
+        level_count: u32,
+        polynomial_size: u32,
+        grouping_factor: u32,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        level_count: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lut_vector: *const ffi::c_void,
+        lut_vector_indexes: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        bootstrapping_key: *const ffi::c_void,
+        buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        grouping_factor: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+        lut_count: u32,
+        lut_stride: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_multi_bit_programmable_bootstrap(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+    );
+}
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index fa6e82335d..3c359de5ea 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -57,1176 +57,6 @@ extern "C" {
 
     pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
 
-    pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        dest: *mut c_void,
-        src: *mut c_void,
-        number_of_cts: u32,
-        lwe_dimension: u32,
-    );
-
-    pub fn cuda_convert_lwe_ciphertext_vector_to_cpu_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        dest: *mut c_void,
-        src: *mut c_void,
-        number_of_cts: u32,
-        lwe_dimension: u32,
-    );
-
-    pub fn cuda_glwe_sample_extract_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        glwe_array_in: *const c_void,
-        nth_array: *const u32,
-        num_glwes: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-    );
-
-    pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        compression_glwe_dimension: u32,
-        compression_polynomial_size: u32,
-        lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        num_radix_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        lwe_per_glwe: u32,
-        storage_log_modulus: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        encryption_glwe_dimension: u32,
-        encryption_polynomial_size: u32,
-        compression_glwe_dimension: u32,
-        compression_polynomial_size: u32,
-        lwe_dimension: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        num_radix_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        storage_log_modulus: u32,
-        bodies_count: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_compress_radix_ciphertext_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        glwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        fp_ksk: *const *mut c_void,
-        num_nths: u32,
-        mem_ptr: *mut i8,
-    );
-
-    pub fn cuda_integer_decompress_radix_ciphertext_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array_out: *mut c_void,
-        glwe_in: *const c_void,
-        indexes_array: *const u32,
-        indexes_array_size: u32,
-        bsks: *const *mut c_void,
-        mem_ptr: *mut i8,
-    );
-
-    pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
     pub fn cuda_setup_multi_gpu() -> i32;
 
-    pub fn scratch_cuda_apply_univariate_lut_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        input_lut: *const c_void,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        input_lwe_ciphertext_count: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_apply_univariate_lut_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        output_radix_lwe: *mut c_void,
-        input_radix_lwe: *const c_void,
-        mem_ptr: *mut i8,
-        ksks: *const *mut c_void,
-        bsks: *const *mut c_void,
-        num_blocks: u32,
-    );
-
-    pub fn cuda_apply_many_univariate_lut_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        output_radix_lwe: *mut c_void,
-        input_radix_lwe: *const c_void,
-        mem_ptr: *mut i8,
-        ksks: *const *mut c_void,
-        bsks: *const *mut c_void,
-        num_blocks: u32,
-        num_luts: u32,
-        lut_stride: u32,
-    );
-
-    pub fn cleanup_cuda_apply_univariate_lut_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_apply_bivariate_lut_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        input_lut: *const c_void,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        input_lwe_ciphertext_count: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_apply_bivariate_lut_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        output_radix_lwe: *mut c_void,
-        input_radix_lwe_1: *const c_void,
-        input_radix_lwe_2: *const c_void,
-        mem_ptr: *mut i8,
-        ksks: *const *mut c_void,
-        bsks: *const *mut c_void,
-        num_blocks: u32,
-        shift: u32,
-    );
-
-    pub fn cleanup_cuda_apply_bivariate_lut_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_full_propagation_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_full_propagation_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        input_blocks: *mut c_void,
-        mem_ptr: *mut i8,
-        ksks: *const *mut c_void,
-        bsks: *const *mut c_void,
-        num_blocks: u32,
-    );
-
-    pub fn cleanup_cuda_full_propagation(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        message_modulus: u32,
-        carry_modulus: u32,
-        glwe_dimension: u32,
-        lwe_dimension: u32,
-        polynomial_size: u32,
-        pbs_base_log: u32,
-        pbs_level: u32,
-        ks_base_log: u32,
-        ks_level: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_mult_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        radix_lwe_out: *mut c_void,
-        radix_lwe_left: *const c_void,
-        radix_lwe_right: *const c_void,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        mem_ptr: *mut i8,
-        polynomial_size: u32,
-        num_blocks: u32,
-    );
-
-    pub fn cleanup_cuda_integer_mult(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn cuda_negate_integer_radix_ciphertext_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        lwe_dimension: u32,
-        lwe_ciphertext_count: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-    );
-
-    pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        scalar_input: *const c_void,
-        lwe_dimension: u32,
-        lwe_ciphertext_count: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-    );
-
-    pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        shift_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        shift: u32,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks: u32,
-    );
-
-    pub fn scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        shift_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        shift: u32,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks: u32,
-    );
-
-    pub fn cleanup_cuda_integer_radix_logical_scalar_shift(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn cleanup_cuda_integer_radix_arithmetic_scalar_shift(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_radix_shift_and_rotate_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        shift_type: u32,
-        is_signed: bool,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_radix_shift_and_rotate_kb_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        lwe_shift: *const c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks: u32,
-    );
-
-    pub fn cleanup_cuda_integer_radix_shift_and_rotate(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_radix_comparison_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        lwe_ciphertext_count: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        op_type: u32,
-        is_signed: bool,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_comparison_integer_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_1: *const c_void,
-        lwe_array_2: *const c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        lwe_ciphertext_count: u32,
-    );
-
-    pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        scalar_blocks: *const c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        lwe_ciphertext_count: u32,
-        num_scalar_blocks: u32,
-    );
-
-    pub fn cleanup_cuda_integer_comparison(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_radix_bitop_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        lwe_ciphertext_count: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        op_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_bitop_integer_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_1: *const c_void,
-        lwe_array_2: *const c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        lwe_ciphertext_count: u32,
-    );
-
-    pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_input: *const c_void,
-        clear_blocks: *const c_void,
-        num_clear_blocks: u32,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        lwe_ciphertext_count: u32,
-        op: u32,
-    );
-
-    pub fn cleanup_cuda_integer_bitop(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_radix_cmux_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        lwe_ciphertext_count: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_cmux_integer_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array_out: *mut c_void,
-        lwe_condition: *const c_void,
-        lwe_array_true: *const c_void,
-        lwe_array_false: *const c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        lwe_ciphertext_count: u32,
-    );
-
-    pub fn cleanup_cuda_integer_radix_cmux(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        shift_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        n: u32,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks: u32,
-    );
-
-    pub fn cleanup_cuda_integer_radix_scalar_rotate(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_propagate_single_carry_kb_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_propagate_single_carry_kb_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        carry_out: *mut c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks: u32,
-    );
-
-    pub fn cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        carry_out: *mut c_void,
-        input_carries: *mut c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks: u32,
-    );
-
-    pub fn cleanup_cuda_propagate_single_carry(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks_in_radix: u32,
-        max_num_radix_in_vec: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        radix_lwe_out: *mut c_void,
-        radix_lwe_vec: *mut c_void,
-        num_radix_in_vec: u32,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks_in_radix: u32,
-    );
-
-    pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_radix_overflowing_sub_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_radix_overflowing_sub_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        radix_lwe_out: *mut c_void,
-        radix_lwe_overflowed: *mut c_void,
-        radix_lwe_left: *const c_void,
-        radix_lwe_right: *const c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks_in_radix: u32,
-    );
-
-    pub fn cleanup_cuda_integer_radix_overflowing_sub(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_scalar_mul_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        decomposed_scalar: *const u64,
-        has_at_least_one_set: *const u64,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        lwe_dimension: u32,
-        polynomial_size: u32,
-        message_modulus: u32,
-        num_blocks: u32,
-        num_scalars: u32,
-    );
-
-    pub fn cleanup_cuda_integer_radix_scalar_mul(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_div_rem_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        quotient: *mut c_void,
-        remainder: *mut c_void,
-        numerator: *const c_void,
-        divisor: *const c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks_in_radix: u32,
-    );
-
-    pub fn cleanup_cuda_integer_div_rem(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        signed_operation: i8,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lhs: *mut c_void,
-        rhs: *const c_void,
-        overflowed: *mut c_void,
-        signed_operation: i8,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks_in_radix: u32,
-    );
-
-    pub fn cleanup_signed_overflowing_add_or_sub(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        input_lut: *const c_void,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_radix_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_compute_prefix_sum_hillis_steele_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        output_radix_lwe: *mut c_void,
-        generates_or_propagates: *mut c_void,
-        mem_ptr: *mut i8,
-        ksks: *const *mut c_void,
-        bsks: *const *mut c_void,
-        num_blocks: u32,
-        shift: u32,
-    );
-
-    pub fn cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn cuda_integer_reverse_blocks_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        num_blocks: u32,
-        lwe_size: u32,
-    );
-
-    pub fn cuda_keyswitch_lwe_ciphertext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_output_indexes: *const c_void,
-        lwe_array_in: *const c_void,
-        lwe_input_indexes: *const c_void,
-        ksk: *const c_void,
-        lwe_dimension_in: u32,
-        lwe_dimension_out: u32,
-        base_log: u32,
-        level_count: u32,
-        num_samples: u32,
-    );
-
-    pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        fp_ks_buffer: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        num_lwes: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        glwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        fp_ksk_array: *const c_void,
-        fp_ks_buffer: *mut i8,
-        input_lwe_dimension: u32,
-        output_glwe_dimension: u32,
-        output_polynomial_size: u32,
-        base_log: u32,
-        level_count: u32,
-        num_lwes: u32,
-    );
-
-    pub fn cleanup_packing_keyswitch_lwe_list_to_glwe(
-        stream: *mut c_void,
-        gpu_index: u32,
-        fp_ks_buffer: *mut *mut i8,
-    );
-
-    pub fn cuda_negate_lwe_ciphertext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        input_lwe_dimension: u32,
-        input_lwe_ciphertext_count: u32,
-    );
-
-    pub fn cuda_add_lwe_ciphertext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_in_1: *const c_void,
-        lwe_array_in_2: *const c_void,
-        input_lwe_dimension: u32,
-        input_lwe_ciphertext_count: u32,
-    );
-
-    pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        plaintext_array_in: *const c_void,
-        input_lwe_dimension: u32,
-        input_lwe_ciphertext_count: u32,
-    );
-
-    pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        cleartext_array_in: *const c_void,
-        input_lwe_dimension: u32,
-        input_lwe_ciphertext_count: u32,
-    );
-
-    pub fn cuda_fourier_polynomial_mul(
-        stream: *mut c_void,
-        gpu_index: u32,
-        input1: *const c_void,
-        input2: *const c_void,
-        output: *mut c_void,
-        polynomial_size: u32,
-        total_polynomials: u32,
-    );
-
-    pub fn cuda_convert_lwe_programmable_bootstrap_key_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        dest: *mut c_void,
-        src: *const c_void,
-        input_lwe_dim: u32,
-        glwe_dim: u32,
-        level_count: u32,
-        polynomial_size: u32,
-    );
-
-    pub fn scratch_cuda_programmable_bootstrap_amortized_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        input_lwe_ciphertext_count: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_output_indexes: *const c_void,
-        lut_vector: *const c_void,
-        lut_vector_indexes: *const c_void,
-        lwe_array_in: *const c_void,
-        lwe_input_indexes: *const c_void,
-        bootstrapping_key: *const c_void,
-        pbs_buffer: *mut i8,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        base_log: u32,
-        level_count: u32,
-        num_samples: u32,
-    );
-
-    pub fn cleanup_cuda_programmable_bootstrap_amortized(
-        stream: *mut c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_programmable_bootstrap_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        level_count: u32,
-        input_lwe_ciphertext_count: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_output_indexes: *const c_void,
-        lut_vector: *const c_void,
-        lut_vector_indexes: *const c_void,
-        lwe_array_in: *const c_void,
-        lwe_input_indexes: *const c_void,
-        bootstrapping_key: *const c_void,
-        buffer: *mut i8,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        base_log: u32,
-        level_count: u32,
-        num_samples: u32,
-        lut_count: u32,
-        lut_stride: u32,
-    );
-
-    pub fn cleanup_cuda_programmable_bootstrap(
-        stream: *mut c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-    );
-
-    pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        dest: *mut c_void,
-        src: *const c_void,
-        input_lwe_dim: u32,
-        glwe_dim: u32,
-        level_count: u32,
-        polynomial_size: u32,
-        grouping_factor: u32,
diff --git a/backends/tfhe-cuda-backend/src/ffi.rs b/backends/tfhe-cuda-backend/src/ffi.rs
new file mode 100644
index 0000000000..6333a1db58
--- /dev/null
+++ b/backends/tfhe-cuda-backend/src/ffi.rs
@@ -0,0 +1,11 @@
+#![allow(warnings)]
+pub type c_void = std::ffi::c_void;
+pub type c_uint = std::ffi::c_uint;
+pub type c_uchar = std::ffi::c_uchar;
+pub type c_ushort = std::ffi::c_ushort;
+pub type c_ulong = std::ffi::c_ulong;
+pub type c_schar = std::ffi::c_schar;
+pub type c_int = std::ffi::c_int;
+pub type c_short = std::ffi::c_short;
+pub type c_long = std::ffi::c_long;
+pub type c_char = std::ffi::c_char;
diff --git a/backends/tfhe-cuda-backend/src/lib.rs b/backends/tfhe-cuda-backend/src/lib.rs
index 26b2e30eb6..5d2a1ea3c2 100644
--- a/backends/tfhe-cuda-backend/src/lib.rs
+++ b/backends/tfhe-cuda-backend/src/lib.rs
@@ -1 +1,4 @@
+#[allow(warnings)]
+pub mod bindings;
 pub mod cuda_bind;
+pub mod ffi;
diff --git a/backends/tfhe-cuda-backend/wrapper.h b/backends/tfhe-cuda-backend/wrapper.h
new file mode 100644
index 0000000000..1946762897
--- /dev/null
+++ b/backends/tfhe-cuda-backend/wrapper.h
@@ -0,0 +1,7 @@
+#include "cuda/include/ciphertext.h"
+#include "cuda/include/integer/compression/compression.h"
+#include "cuda/include/integer/integer.h"
+#include "cuda/include/keyswitch.h"
+#include "cuda/include/linear_algebra.h"
+#include "cuda/include/pbs/programmable_bootstrap.h"
+#include "cuda/include/pbs/programmable_bootstrap_multibit.h"
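`wrapper.h` gives bindgen a single root over the public CUDA headers, and `ffi.rs` re-exports the `std::ffi` C types so generated code can reach them through one prefix. A sketch of how a build script can wire the two together; this is an illustration under stated assumptions (the allowlist pattern and output path are guesses), not the actual `build.rs` changes from this patch:

```rust
// build.rs (illustrative sketch, not the patch's real build script).
fn main() {
    let bindings = bindgen::Builder::default()
        // The single-include root added by this patch.
        .header("wrapper.h")
        // Resolve c_void, c_int, ... through the new `ffi` module.
        .ctypes_prefix("crate::ffi")
        // Assumed allowlist: bind only the backend's entry points.
        .allowlist_function("(cuda|scratch|cleanup).*")
        .generate()
        .expect("failed to generate CUDA bindings");
    bindings
        .write_to_file("src/bindings.rs")
        .expect("failed to write bindings.rs");
    println!("cargo:rerun-if-changed=wrapper.h");
}
```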
diff --git a/ci/slab.toml b/ci/slab.toml
index 1eb803c46d..a7eb7ef879 100644
--- a/ci/slab.toml
+++ b/ci/slab.toml
@@ -27,7 +27,7 @@ instance_type = "hpc7a.96xlarge"
 # Profile used to build CUDA code without the need to get p-like instance.
 [backend.aws.gpu-build]
 region = "us-east-1"
-image_id = "ami-06b3d61f41bf8350a"
+image_id = "ami-06a04649d895d10e0"
 instance_type = "m6i.4xlarge"
 
 [backend.hyperstack.gpu-test]
diff --git a/tfhe/docs/guides/run_on_gpu.md b/tfhe/docs/guides/run_on_gpu.md
index a687a652d3..395121eb8e 100644
--- a/tfhe/docs/guides/run_on_gpu.md
+++ b/tfhe/docs/guides/run_on_gpu.md
@@ -10,6 +10,7 @@ This guide explains how to update your existing program to leverage GPU accelera
 * Compute Capability >= 3.0
 * [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
 * [cmake](https://cmake.org/) >= 3.24
+* libclang >= 9.0, to match Rust bindgen [requirements](https://rust-lang.github.io/rust-bindgen/requirements.html)
 * Rust version - check this [page](rust\_configuration.md)
 
 ## Importing to your project
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 19868235e5..b25cde608f 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -12,7 +12,9 @@ use crate::core_crypto::prelude::{
 pub use algorithms::*;
 pub use entities::*;
 use std::ffi::c_void;
-pub(crate) use tfhe_cuda_backend::cuda_bind::*;
+use tfhe_cuda_backend::bindings::*;
+use tfhe_cuda_backend::cuda_bind::*;
+
 #[derive(Debug)]
 pub struct CudaStreams {
     pub ptr: Vec<*mut c_void>,
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index e9004b45a2..9699bde614 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -15,6 +15,7 @@ use crate::shortint::{CarryModulus, MessageModulus};
 pub use server_key::CudaServerKey;
 
 use std::cmp::min;
+use tfhe_cuda_backend::bindings::*;
 use tfhe_cuda_backend::cuda_bind::*;
 
 #[repr(u32)]
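The `bindings::*` and `cuda_bind::*` globs can coexist because the generated declarations keep the exact signatures of the hand-written ones they replace, so call sites compile unchanged. Reconstructed from a declaration removed above, this is roughly what bindgen plausibly emits for one function; the exact output depends on the bindgen version and is not copied from the patch:

```rust
// Plausible shape of a generated declaration in src/bindings.rs. The C
// types resolve through `crate::ffi` (courtesy of the ctypes prefix),
// while fixed-width integers map to plain Rust integer types.
extern "C" {
    pub fn cuda_negate_lwe_ciphertext_vector_64(
        stream: *mut crate::ffi::c_void,
        gpu_index: u32,
        lwe_array_out: *mut crate::ffi::c_void,
        lwe_array_in: *const crate::ffi::c_void,
        input_lwe_dimension: u32,
        input_lwe_ciphertext_count: u32,
    );
}
```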