diff --git a/lincs/liblincs/learning/mrsort-by-weights-profiles-breed/improve-profiles/accuracy-heuristic-on-gpu.cu b/lincs/liblincs/learning/mrsort-by-weights-profiles-breed/improve-profiles/accuracy-heuristic-on-gpu.cu index 1f861f40..d6811caf 100644 --- a/lincs/liblincs/learning/mrsort-by-weights-profiles-breed/improve-profiles/accuracy-heuristic-on-gpu.cu +++ b/lincs/liblincs/learning/mrsort-by-weights-profiles-breed/improve-profiles/accuracy-heuristic-on-gpu.cu @@ -391,8 +391,17 @@ void ImproveProfilesWithAccuracyHeuristicOnGpu::improve_model_profile( std::uniform_real_distribution(0, 1)(host_learning_data.urbgs[model_index])); check_last_cuda_error_sync_stream(cudaStreamDefault); - // @todo(Project management, soon) Double-check and document why we don't need [model_index] here - copy(gpu_learning_data.profiles[criterion_index][profile_index], host_learning_data.profiles[criterion_index][profile_index]); + // @todo(Performance, later) Can we group this copying somehow? + // Currently we copy just one float from device memory to host memory + // (because just one float is potentially modified by 'apply_best_move__kernel', + // and we need it back on the host for the next iteration) + + // Lov-e-CUDA doesn't provide a way to copy scalars, so we're back to the basics, using cudaMemcpy directly and doing pointer arithmetic. + check_cuda_error(cudaMemcpy( + host_learning_data.profiles[criterion_index][profile_index].data() + model_index, + gpu_learning_data.profiles[criterion_index][profile_index].data() + model_index, + 1 * sizeof(float), + cudaMemcpyDeviceToHost)); } } // namespace lincs