Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Al/remove omp div #1464

Merged
Merged 4 commits on Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 8 additions & 21 deletions backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>
Expand Down Expand Up @@ -110,26 +109,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}

#pragma omp parallel sections
{
// generate input_carries and output_carry
#pragma omp section
{
host_propagate_single_carry(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
}

// generate generate_last_block_inner_propagation
#pragma omp section
{
host_generate_last_block_inner_propagation(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
&rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem,
bsks, ksks);
}
}
host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
result, output_carry, input_carries,
mem_ptr->scp_mem, bsks, ksks, num_blocks);
host_generate_last_block_inner_propagation(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
&rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
ksks);

for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
Expand Down
28 changes: 8 additions & 20 deletions backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#define CUDA_INTEGER_CMUX_CUH

#include "integer.cuh"
#include <omp.h>

template <typename Torus>
__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
Expand Down Expand Up @@ -57,25 +56,14 @@ __host__ void host_integer_radix_cmux_kb(
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}

#pragma omp parallel sections
{
// Both sections may be executed in parallel
#pragma omp section
{
auto mem_true = mem_ptr->zero_if_true_buffer;
zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsks, ksks,
num_radix_blocks);
}
#pragma omp section
{
auto mem_false = mem_ptr->zero_if_false_buffer;
zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
lwe_array_false, lwe_condition, mem_false,
mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
}
}
auto mem_true = mem_ptr->zero_if_true_buffer;
zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
auto mem_false = mem_ptr->zero_if_false_buffer;
zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
bsks, ksks, num_radix_blocks);
for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
}
Expand Down
119 changes: 34 additions & 85 deletions backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>
Expand Down Expand Up @@ -372,34 +371,18 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
#pragma omp parallel sections
{
#pragma omp section
{
// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
gpu_count);
}
#pragma omp section
{
// divisor_ms_blocks
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes,
gpu_count);
}
#pragma omp section
{
// interesting_remainder1
// numerator_block_stack
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
gpu_count);
}
#pragma omp section
{
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
gpu_count);
}
}
// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
gpu_count);
// divisor_ms_blocks
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
// interesting_remainder1
// numerator_block_stack
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
gpu_count);
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
Expand Down Expand Up @@ -489,27 +472,14 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
#pragma omp parallel sections
{
#pragma omp section
{
// new_remainder
// subtraction_overflowed
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
}
#pragma omp section
{
// at_least_one_upper_block_is_non_zero
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes,
gpu_count);
}
#pragma omp section
{
// cleaned_merged_interesting_remainder
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
gpu_indexes, gpu_count);
}
}
// new_remainder
// subtraction_overflowed
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
// at_least_one_upper_block_is_non_zero
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
// cleaned_merged_interesting_remainder
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
Expand Down Expand Up @@ -567,26 +537,14 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
#pragma omp parallel sections
{
#pragma omp section
{
// cleaned_merged_interesting_remainder
conditionally_zero_out_merged_interesting_remainder(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
}
#pragma omp section
{
// new_remainder
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
gpu_indexes, gpu_count);
}
#pragma omp section
{
// quotient
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
}
}
// cleaned_merged_interesting_remainder
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
gpu_indexes, gpu_count);
// new_remainder
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
gpu_indexes, gpu_count);
// quotient
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
Expand All @@ -613,21 +571,12 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
#pragma omp parallel sections
{
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
}
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
}
}
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
ksks, num_blocks, mem_ptr->message_extract_lut_2);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
Expand Down
Loading
Loading