fix(compiler): [GPU backend] fix memory (de-)allocation.
antoniupop committed Apr 24, 2024
1 parent 2ddd117 commit 41af3b7
Showing 1 changed file with 35 additions and 10 deletions.
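
In short, this commit splits the DFG's deferred-free mechanism in two: buffers obtained with malloc are still released with free, while page-locked buffers obtained through cudaHostAlloc must be released with cudaFreeHost, so they are tracked in a separate list. A minimal standalone sketch of that pattern (hypothetical names, not the actual GPU_DFG implementation):

```cpp
// Sketch only: two deferred-free lists, one per allocation kind. Buffers are
// registered while asynchronous streams may still read them and are released
// later, at a synchronization point.
#include <cstdlib>
#include <list>
#include <mutex>
#include <cuda_runtime.h>

struct DeferredFreeList {
  // Producers call this instead of free()/cudaFreeHost() right away.
  void register_allocation(void *p, bool gpu_pinned) {
    std::lock_guard<std::mutex> guard(mutex);
    if (gpu_pinned)
      pinned.push_back(p);   // came from cudaHostAlloc
    else
      pageable.push_back(p); // came from malloc
  }
  // Called once every stream that may still read these buffers has been
  // synchronized.
  void flush() {
    std::lock_guard<std::mutex> guard(mutex);
    for (void *p : pageable)
      free(p);
    pageable.clear();
    for (void *p : pinned)
      cudaFreeHost(p);
    pinned.clear();
  }

private:
  std::list<void *> pageable;
  std::list<void *> pinned;
  std::mutex mutex;
};
```

In the hunks shown below, every registration passes false and the cudaHostAlloc calls remain commented out, so the new to_free_list_gpu path is not yet exercised by these call sites.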
45 changes: 35 additions & 10 deletions compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -193,15 +193,21 @@ struct GPU_DFG {
     free_stream_order_dependent_data();
   }
   inline void register_stream(Stream *s) { streams.push_back(s); }
-  inline void register_stream_order_dependent_allocation(void *p) {
+  inline void register_stream_order_dependent_allocation(void *p, bool gpu) {
     std::lock_guard<std::mutex> guard(free_list_guard);
-    to_free_list.push_back(p);
+    if (gpu)
+      to_free_list_gpu.push_back(p);
+    else
+      to_free_list.push_back(p);
   }
   inline void free_stream_order_dependent_data() {
     std::lock_guard<std::mutex> guard(free_list_guard);
     for (auto p : to_free_list)
       free(p);
     to_free_list.clear();
+    for (auto p : to_free_list_gpu)
+      cudaFreeHost(p);
+    to_free_list_gpu.clear();
   }
   inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
                                 uint32_t polynomial_size,
@@ -230,6 +236,7 @@ struct GPU_DFG {
 
 private:
   std::list<void *> to_free_list;
+  std::list<void *> to_free_list_gpu;
   std::mutex free_list_guard;
   std::list<Stream *> streams;
   PBS_buffer *pbs_buffer;
@@ -417,7 +424,8 @@ struct Dependence {
       if (immediate)
         free(host_data.allocated);
       else
-        dfg->register_stream_order_dependent_allocation(host_data.allocated);
+        dfg->register_stream_order_dependent_allocation(host_data.allocated,
+                                                        false);
     }
     for (auto c : chunks)
       c->free_data(dfg, immediate);
@@ -432,6 +440,8 @@ struct Dependence {
     if (onHostReady)
       return;
     if (host_data.allocated == nullptr) {
+      // cudaHostAlloc(&host_data.aligned, data_size, cudaHostAllocDefault);
+      // host_data.allocated = host_data.aligned;
       host_data.allocated = host_data.aligned = (uint64_t *)malloc(data_size);
       hostAllocated = true;
     }
@@ -695,6 +705,8 @@ struct Stream {
           std::ceil((double)gpu_chunk_size / max_samples_per_chunk);
       num_chunks = num_cores * scale_factor;
       num_gpu_chunks = num_devices * scale_factor;
+      std::cout << "SCALE " << scale_factor << " - num_gpu_chunks "
+                << num_gpu_chunks << "\n";
     }
   } else {
     num_chunks = std::min(num_cores, num_samples);
@@ -999,14 +1011,20 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
 void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
                                       uint64_t *out_ptr) {
   assert(p->output_size.val == p->glwe_dim.val * p->poly_size.val + 1);
+  if (!p->output_streams[0]->need_new_gen(chunk_id))
+    return;
 
   Dependence *idep1 = p->input_streams[1]->get(host_location, chunk_id);
   MemRef2 &mtlu = idep1->host_data;
   uint32_t num_lut_vectors = mtlu.sizes[0];
   uint64_t glwe_ct_len =
       p->poly_size.val * (p->glwe_dim.val + 1) * num_lut_vectors;
   uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t);
-  uint64_t *glwe_ct = (uint64_t *)malloc(glwe_ct_size);
+  uint64_t *glwe_ct;
+  // if (loc == host_location)
+  glwe_ct = (uint64_t *)malloc(glwe_ct_size);
+  // else
+  // cudaHostAlloc(&glwe_ct, glwe_ct_size, cudaHostAllocDefault);
   auto tlu = mtlu.aligned + mtlu.offset;
   // Glwe trivial encryption
   size_t pos = 0, postlu = 0;
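
The commented-out branches above keep open an alternative where the GLWE scratch buffer would be allocated as page-locked (pinned) host memory rather than with malloc. As a generic CUDA illustration (not code from this file; the function and buffer names below are made up), pinned buffers allow host-to-device copies to be issued asynchronously on a stream, but must be released with cudaFreeHost:

```cpp
// Generic pinned-memory lifecycle: cudaHostAlloc -> cudaMemcpyAsync ->
// synchronize -> cudaFreeHost. Plain free() must not be used on such buffers.
#include <cstdint>
#include <cuda_runtime.h>

void pinned_upload_example(cudaStream_t stream, size_t n) {
  uint64_t *host_buf = nullptr;
  uint64_t *device_buf = nullptr;
  cudaHostAlloc((void **)&host_buf, n * sizeof(uint64_t), cudaHostAllocDefault);
  cudaMalloc((void **)&device_buf, n * sizeof(uint64_t));

  // ... fill host_buf with the data to upload ...

  // The copy returns immediately; the stream now owns the transfer.
  cudaMemcpyAsync(device_buf, host_buf, n * sizeof(uint64_t),
                  cudaMemcpyHostToDevice, stream);

  // host_buf may only be released once the stream is done with it.
  cudaStreamSynchronize(stream);
  cudaFreeHost(host_buf);
  cudaFree(device_buf);
}
```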
@@ -1030,7 +1048,12 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
   // Move test vector indexes to the GPU, the test vector indexes is set of 0
   uint32_t lwe_idx = 0,
            test_vector_idxes_size = num_samples * sizeof(uint64_t);
-  uint64_t *test_vector_idxes = (uint64_t *)malloc(test_vector_idxes_size);
+  uint64_t *test_vector_idxes;
+  // if (loc == host_location)
+  test_vector_idxes = (uint64_t *)malloc(test_vector_idxes_size);
+  // else
+  // cudaHostAlloc(&test_vector_idxes, test_vector_idxes_size,
+  // cudaHostAllocDefault);
   if (lut_indexes.size() == 1) {
     memset((void *)test_vector_idxes, lut_indexes[0], test_vector_idxes_size);
   } else {
@@ -1068,6 +1091,8 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
       Dependence *dep =
           new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
       free(glwe_ct);
+      free(test_vector_idxes);
+      // cudaFreeHost(glwe_ct);
       return dep;
     } else {
       // Schedule the bootstrap kernel on the GPU
@@ -1096,8 +1121,9 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
       // As streams are not synchronized, we can only free this vector
       // after a later synchronization point where we are guaranteed that
       // this vector is no longer needed.
-      p->dfg->register_stream_order_dependent_allocation(test_vector_idxes);
-      p->dfg->register_stream_order_dependent_allocation(glwe_ct);
+      p->dfg->register_stream_order_dependent_allocation(test_vector_idxes,
+                                                         false);
+      p->dfg->register_stream_order_dependent_allocation(glwe_ct, false);
       return dep;
     }
   };
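
The registrations above defer the release of test_vector_idxes and glwe_ct because the bootstrap kernel scheduled on the GPU stream may still read them. A usage sketch of that discipline (assuming the GPU_DFG declaration from this file is in scope; the helper name is hypothetical):

```cpp
#include <cuda_runtime.h>

// Sketch: buffers handed to asynchronous work are registered, not freed, and
// are reclaimed only after the consuming stream has been synchronized.
void schedule_then_reclaim(GPU_DFG *dfg, cudaStream_t stream, void *scratch) {
  // Asynchronous work reading `scratch` has already been enqueued on `stream`,
  // so freeing it here would race with the device-side copies.
  dfg->register_stream_order_dependent_allocation(scratch, /*gpu=*/false);

  // ... further chunks may be scheduled on this or other streams ...

  // At a later synchronization point, every registered buffer is released:
  // malloc'ed ones with free(), pinned ones with cudaFreeHost().
  cudaStreamSynchronize(stream);
  dfg->free_stream_order_dependent_data();
}
```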
@@ -1115,9 +1141,8 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
 
   cudaStream_t *cstream = (cudaStream_t *)p->dfg->get_gpu_stream(loc);
   Dependence *idep0 = p->input_streams[0]->get(loc, chunk_id);
-  if (p->output_streams[0]->need_new_gen(chunk_id))
-    p->output_streams[0]->put(
-        sched(idep0, idep1, glwe_ct, lut_indexes, cstream, loc), chunk_id);
+  p->output_streams[0]->put(
+      sched(idep0, idep1, glwe_ct, lut_indexes, cstream, loc), chunk_id);
 }
 
 void memref_add_lwe_ciphertexts_u64_process(Process *p, int32_t loc,
