fix(compiler): [GPU backend] fix memory (de-)allocation.
antoniupop committed Apr 24, 2024
1 parent 2ddd117 commit 41af3b7
Showing 1 changed file with 35 additions and 10 deletions.
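
In short, this commit splits the DFG's deferred-free mechanism in two: buffers obtained with malloc are still released with free, while page-locked buffers obtained through cudaHostAlloc must be released with cudaFreeHost, so they are tracked in a separate list. A minimal standalone sketch of that pattern (hypothetical names, not the actual GPU_DFG implementation):

```cpp
// Sketch only: two deferred-free lists, one per allocation kind. Buffers are
// registered while asynchronous streams may still read them and are released
// later, at a synchronization point.
#include <cstdlib>
#include <list>
#include <mutex>
#include <cuda_runtime.h>

struct DeferredFreeList {
  // Producers call this instead of free()/cudaFreeHost() right away.
  void register_allocation(void *p, bool gpu_pinned) {
    std::lock_guard<std::mutex> guard(mutex);
    if (gpu_pinned)
      pinned.push_back(p);   // came from cudaHostAlloc
    else
      pageable.push_back(p); // came from malloc
  }
  // Called once every stream that may still read these buffers has been
  // synchronized.
  void flush() {
    std::lock_guard<std::mutex> guard(mutex);
    for (void *p : pageable)
      free(p);
    pageable.clear();
    for (void *p : pinned)
      cudaFreeHost(p);
    pinned.clear();
  }

private:
  std::list<void *> pageable;
  std::list<void *> pinned;
  std::mutex mutex;
};
```

In the hunks shown below, every registration passes false and the cudaHostAlloc calls remain commented out, so the new to_free_list_gpu path is not yet exercised by these call sites.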
45 changes: 35 additions & 10 deletions compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -193,15 +193,21 @@ struct GPU_DFG {
     free_stream_order_dependent_data();
   }
   inline void register_stream(Stream *s) { streams.push_back(s); }
-  inline void register_stream_order_dependent_allocation(void *p) {
+  inline void register_stream_order_dependent_allocation(void *p, bool gpu) {
     std::lock_guard<std::mutex> guard(free_list_guard);
-    to_free_list.push_back(p);
+    if (gpu)
+      to_free_list_gpu.push_back(p);
+    else
+      to_free_list.push_back(p);
   }
   inline void free_stream_order_dependent_data() {
     std::lock_guard<std::mutex> guard(free_list_guard);
     for (auto p : to_free_list)
       free(p);
     to_free_list.clear();
+    for (auto p : to_free_list_gpu)
+      cudaFreeHost(p);
+    to_free_list_gpu.clear();
   }
   inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
                                 uint32_t polynomial_size,
@@ -230,6 +236,7 @@ struct GPU_DFG {
 
 private:
   std::list<void *> to_free_list;
+  std::list<void *> to_free_list_gpu;
   std::mutex free_list_guard;
   std::list<Stream *> streams;
   PBS_buffer *pbs_buffer;
@@ -417,7 +424,8 @@ struct Dependence {
       if (immediate)
         free(host_data.allocated);
       else
-        dfg->register_stream_order_dependent_allocation(host_data.allocated);
+        dfg->register_stream_order_dependent_allocation(host_data.allocated,
+                                                        false);
     }
     for (auto c : chunks)
       c->free_data(dfg, immediate);
@@ -432,6 +440,8 @@ struct Dependence {
     if (onHostReady)
       return;
     if (host_data.allocated == nullptr) {
+      // cudaHostAlloc(&host_data.aligned, data_size, cudaHostAllocDefault);
+      // host_data.allocated = host_data.aligned;
       host_data.allocated = host_data.aligned = (uint64_t *)malloc(data_size);
       hostAllocated = true;
     }
@@ -695,6 +705,8 @@ struct Stream {
           std::ceil((double)gpu_chunk_size / max_samples_per_chunk);
       num_chunks = num_cores * scale_factor;
       num_gpu_chunks = num_devices * scale_factor;
+      std::cout << "SCALE " << scale_factor << " - num_gpu_chunks "
+                << num_gpu_chunks << "\n";
     }
   } else {
     num_chunks = std::min(num_cores, num_samples);
@@ -999,14 +1011,20 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
 void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
                                       uint64_t *out_ptr) {
   assert(p->output_size.val == p->glwe_dim.val * p->poly_size.val + 1);
+  if (!p->output_streams[0]->need_new_gen(chunk_id))
+    return;
 
   Dependence *idep1 = p->input_streams[1]->get(host_location, chunk_id);
   MemRef2 &mtlu = idep1->host_data;
   uint32_t num_lut_vectors = mtlu.sizes[0];
   uint64_t glwe_ct_len =
       p->poly_size.val * (p->glwe_dim.val + 1) * num_lut_vectors;
   uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t);
-  uint64_t *glwe_ct = (uint64_t *)malloc(glwe_ct_size);
+  uint64_t *glwe_ct;
+  // if (loc == host_location)
+  glwe_ct = (uint64_t *)malloc(glwe_ct_size);
+  // else
+  // cudaHostAlloc(&glwe_ct, glwe_ct_size, cudaHostAllocDefault);
   auto tlu = mtlu.aligned + mtlu.offset;
   // Glwe trivial encryption
   size_t pos = 0, postlu = 0;
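
The commented-out branches above keep open an alternative where the GLWE scratch buffer would be allocated as page-locked (pinned) host memory rather than with malloc. As a generic CUDA illustration (not code from this file; the function and buffer names below are made up), pinned buffers allow host-to-device copies to be issued asynchronously on a stream, but must be released with cudaFreeHost:

```cpp
// Generic pinned-memory lifecycle: cudaHostAlloc -> cudaMemcpyAsync ->
// synchronize -> cudaFreeHost. Plain free() must not be used on such buffers.
#include <cstdint>
#include <cuda_runtime.h>

void pinned_upload_example(cudaStream_t stream, size_t n) {
  uint64_t *host_buf = nullptr;
  uint64_t *device_buf = nullptr;
  cudaHostAlloc((void **)&host_buf, n * sizeof(uint64_t), cudaHostAllocDefault);
  cudaMalloc((void **)&device_buf, n * sizeof(uint64_t));

  // ... fill host_buf with the data to upload ...

  // The copy returns immediately; the stream now owns the transfer.
  cudaMemcpyAsync(device_buf, host_buf, n * sizeof(uint64_t),
                  cudaMemcpyHostToDevice, stream);

  // host_buf may only be released once the stream is done with it.
  cudaStreamSynchronize(stream);
  cudaFreeHost(host_buf);
  cudaFree(device_buf);
}
```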
@@ -1030,7 +1048,12 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
   // Move test vector indexes to the GPU, the test vector indexes is set of 0
   uint32_t lwe_idx = 0,
            test_vector_idxes_size = num_samples * sizeof(uint64_t);
-  uint64_t *test_vector_idxes = (uint64_t *)malloc(test_vector_idxes_size);
+  uint64_t *test_vector_idxes;
+  // if (loc == host_location)
+  test_vector_idxes = (uint64_t *)malloc(test_vector_idxes_size);
+  // else
+  // cudaHostAlloc(&test_vector_idxes, test_vector_idxes_size,
+  // cudaHostAllocDefault);
   if (lut_indexes.size() == 1) {
     memset((void *)test_vector_idxes, lut_indexes[0], test_vector_idxes_size);
   } else {
@@ -1068,6 +1091,8 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
       Dependence *dep =
           new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
       free(glwe_ct);
+      free(test_vector_idxes);
+      // cudaFreeHost(glwe_ct);
       return dep;
     } else {
       // Schedule the bootstrap kernel on the GPU
@@ -1096,8 +1121,9 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
       // As streams are not synchronized, we can only free this vector
       // after a later synchronization point where we are guaranteed that
       // this vector is no longer needed.
-      p->dfg->register_stream_order_dependent_allocation(test_vector_idxes);
-      p->dfg->register_stream_order_dependent_allocation(glwe_ct);
+      p->dfg->register_stream_order_dependent_allocation(test_vector_idxes,
+                                                         false);
+      p->dfg->register_stream_order_dependent_allocation(glwe_ct, false);
       return dep;
     }
   };
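
The registrations above defer the release of test_vector_idxes and glwe_ct because the bootstrap kernel scheduled on the GPU stream may still read them. A usage sketch of that discipline (assuming the GPU_DFG declaration from this file is in scope; the helper name is hypothetical):

```cpp
#include <cuda_runtime.h>

// Sketch: buffers handed to asynchronous work are registered, not freed, and
// are reclaimed only after the consuming stream has been synchronized.
void schedule_then_reclaim(GPU_DFG *dfg, cudaStream_t stream, void *scratch) {
  // Asynchronous work reading `scratch` has already been enqueued on `stream`,
  // so freeing it here would race with the device-side copies.
  dfg->register_stream_order_dependent_allocation(scratch, /*gpu=*/false);

  // ... further chunks may be scheduled on this or other streams ...

  // At a later synchronization point, every registered buffer is released:
  // malloc'ed ones with free(), pinned ones with cudaFreeHost().
  cudaStreamSynchronize(stream);
  dfg->free_stream_order_dependent_data();
}
```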
@@ -1115,9 +1141,8 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
 
   cudaStream_t *cstream = (cudaStream_t *)p->dfg->get_gpu_stream(loc);
   Dependence *idep0 = p->input_streams[0]->get(loc, chunk_id);
-  if (p->output_streams[0]->need_new_gen(chunk_id))
-    p->output_streams[0]->put(
-        sched(idep0, idep1, glwe_ct, lut_indexes, cstream, loc), chunk_id);
+  p->output_streams[0]->put(
+      sched(idep0, idep1, glwe_ct, lut_indexes, cstream, loc), chunk_id);
 }
 
 void memref_add_lwe_ciphertexts_u64_process(Process *p, int32_t loc,
