From 61ee4f22770d737b50db6e6f1631aada410aac27 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 5 Jan 2024 00:15:39 -0500 Subject: [PATCH] fix segfault in ~Region (#3108) When debuging #3103, I notice a segfault in `~Region`, preventing the actual error message thrown. `_norm_copy_coord_gpu` replaces `boxt` and `rec_boxt` of `Region` with GPU pointers, runs `deepmd::copy_coord_gpu`, and finnally recover the original pointer that can be deleted. However, `deepmd::copy_coord_gpu` might throw CUDA errors for any reason, and the pointers are not recovered. `~Region` tries to delete a pointer that it doesn't own, causing the segfault. The CUDA error message is not visible due to segfault. The segfault in #2895 may be also caused by it. This PR adds a new constructor to `Region` to accept the external pointers. `~Region` will delete `boxt` and`rec_boxt` only when the pointer is not external. We still need to figure out the reason for the error of `copy_coord_gpu` behind the segfault. --------- Signed-off-by: Jinzhe Zeng --- source/lib/include/region.h | 4 ++ source/lib/src/region.cc | 14 +++++- source/lib/tests/test_coord.cc | 56 +++------------------- source/lib/tests/test_simulation_region.cc | 8 +--- source/op/prod_env_mat_multi_device.cc | 8 +--- source/tests/test_auto_batch_size.py | 4 +- 6 files changed, 28 insertions(+), 66 deletions(-) diff --git a/source/lib/include/region.h b/source/lib/include/region.h index 2f6dbbf4e0..ee11b5b8ac 100644 --- a/source/lib/include/region.h +++ b/source/lib/include/region.h @@ -8,7 +8,11 @@ struct Region { FPTYPE* boxt; FPTYPE* rec_boxt; Region(); + Region(FPTYPE* extern_boxt, FPTYPE* extern_rec_boxt); ~Region(); + + private: + bool self_allocated; }; template diff --git a/source/lib/src/region.cc b/source/lib/src/region.cc index 36a739d90a..6c5f5493e6 100644 --- a/source/lib/src/region.cc +++ b/source/lib/src/region.cc @@ -14,12 +14,22 @@ template Region::Region() { boxt = new FPTYPE[BOXT_DIM]; rec_boxt = new FPTYPE[BOXT_DIM]; + self_allocated = true; +} + +template +Region::Region(FPTYPE* extern_boxt, FPTYPE* extern_rec_boxt) { + boxt = extern_boxt; + rec_boxt = extern_rec_boxt; + self_allocated = false; } template Region::~Region() { - delete[] boxt; - delete[] rec_boxt; + if (self_allocated) { + delete[] boxt; + delete[] rec_boxt; + } } template struct deepmd::Region; diff --git a/source/lib/tests/test_coord.cc b/source/lib/tests/test_coord.cc index af320ca3f7..c939dd6fa6 100644 --- a/source/lib/tests/test_coord.cc +++ b/source/lib/tests/test_coord.cc @@ -62,9 +62,6 @@ TEST_F(TestNormCoord, cpu_case2) { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM TEST_F(TestNormCoord, gpu_case0) { deepmd::Region region; - deepmd::Region region_dev; - double* new_boxt = region_dev.boxt; - double* new_rec_boxt = region_dev.rec_boxt; init_region_cpu(region, &boxt[0]); std::vector box_info; box_info.resize(18); @@ -75,11 +72,8 @@ TEST_F(TestNormCoord, gpu_case0) { std::vector out_c(r0); deepmd::malloc_device_memory_sync(box_info_dev, box_info); deepmd::malloc_device_memory_sync(out_c_dev, out_c); - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; + deepmd::Region region_dev(box_info_dev, box_info_dev + 9); deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev); - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); deepmd::delete_device_memory(box_info_dev); deepmd::delete_device_memory(out_c_dev); @@ -90,9 +84,6 @@ TEST_F(TestNormCoord, gpu_case0) { TEST_F(TestNormCoord, gpu_case1) { deepmd::Region region; - deepmd::Region region_dev; - double* new_boxt = region_dev.boxt; - double* new_rec_boxt = region_dev.rec_boxt; init_region_cpu(region, &boxt[0]); std::vector box_info; box_info.resize(18); @@ -103,11 +94,8 @@ TEST_F(TestNormCoord, gpu_case1) { std::vector out_c(r1); deepmd::malloc_device_memory_sync(box_info_dev, box_info); deepmd::malloc_device_memory_sync(out_c_dev, out_c); - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; + deepmd::Region region_dev(box_info_dev, box_info_dev + 9); deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev); - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); deepmd::delete_device_memory(box_info_dev); deepmd::delete_device_memory(out_c_dev); @@ -118,9 +106,6 @@ TEST_F(TestNormCoord, gpu_case1) { TEST_F(TestNormCoord, gpu_case2) { deepmd::Region region; - deepmd::Region region_dev; - double* new_boxt = region_dev.boxt; - double* new_rec_boxt = region_dev.rec_boxt; init_region_cpu(region, &boxt[0]); std::vector box_info; box_info.resize(18); @@ -131,11 +116,8 @@ TEST_F(TestNormCoord, gpu_case2) { std::vector out_c(r2); deepmd::malloc_device_memory_sync(box_info_dev, box_info); deepmd::malloc_device_memory_sync(out_c_dev, out_c); - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; + deepmd::Region region_dev(box_info_dev, box_info_dev + 9); deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev); - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); deepmd::delete_device_memory(box_info_dev); deepmd::delete_device_memory(out_c_dev); @@ -298,9 +280,6 @@ TEST_F(TestCopyCoord, gpu) { std::vector cell_info; cell_info.resize(23); deepmd::Region region; - deepmd::Region region_dev; - double* new_boxt = region_dev.boxt; - double* new_rec_boxt = region_dev.rec_boxt; init_region_cpu(region, &boxt[0]); deepmd::compute_cell_info(&cell_info[0], rc, region); std::vector box_info; @@ -325,14 +304,11 @@ TEST_F(TestCopyCoord, gpu) { int_data_dev, nloc * 3 + loc_cellnum + total_cellnum * 3 + total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 + nloc); - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; + deepmd::Region region_dev(box_info_dev, box_info_dev + 9); int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev, in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev, region_dev); - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); deepmd::memcpy_device_to_host(out_t_dev, out_t); deepmd::memcpy_device_to_host(mapping_dev, mapping); @@ -373,9 +349,6 @@ TEST_F(TestCopyCoord, gpu_lessmem) { std::vector cell_info; cell_info.resize(23); deepmd::Region region; - deepmd::Region region_dev; - double* new_boxt = region_dev.boxt; - double* new_rec_boxt = region_dev.rec_boxt; init_region_cpu(region, &boxt[0]); deepmd::compute_cell_info(&cell_info[0], rc, region); std::vector box_info; @@ -400,14 +373,11 @@ TEST_F(TestCopyCoord, gpu_lessmem) { int_data_dev, nloc * 3 + loc_cellnum + total_cellnum * 3 + total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 + nloc); - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; + deepmd::Region region_dev(box_info_dev, box_info_dev + 9); int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev, in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev, region_dev); - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); deepmd::memcpy_device_to_host(out_t_dev, out_t); deepmd::memcpy_device_to_host(mapping_dev, mapping); @@ -544,9 +514,6 @@ TEST_F(TestCopyCoordMoreCell, gpu) { std::vector cell_info; cell_info.resize(23); deepmd::Region region; - deepmd::Region region_dev; - double* new_boxt = region_dev.boxt; - double* new_rec_boxt = region_dev.rec_boxt; init_region_cpu(region, &boxt[0]); deepmd::compute_cell_info(&cell_info[0], rc, region); std::vector box_info; @@ -571,14 +538,11 @@ TEST_F(TestCopyCoordMoreCell, gpu) { int_data_dev, nloc * 3 + loc_cellnum + total_cellnum * 3 + total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 + nloc); - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; + deepmd::Region region_dev(box_info_dev, box_info_dev + 9); int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev, in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev, region_dev); - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); deepmd::memcpy_device_to_host(out_t_dev, out_t); deepmd::memcpy_device_to_host(mapping_dev, mapping); @@ -619,9 +583,6 @@ TEST_F(TestCopyCoordMoreCell, gpu_lessmem) { std::vector cell_info; cell_info.resize(23); deepmd::Region region; - deepmd::Region region_dev; - double* new_boxt = region_dev.boxt; - double* new_rec_boxt = region_dev.rec_boxt; init_region_cpu(region, &boxt[0]); deepmd::compute_cell_info(&cell_info[0], rc, region); std::vector box_info; @@ -646,14 +607,11 @@ TEST_F(TestCopyCoordMoreCell, gpu_lessmem) { int_data_dev, nloc * 3 + loc_cellnum + total_cellnum * 3 + total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 + nloc); - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; + deepmd::Region region_dev(box_info_dev, box_info_dev + 9); int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev, in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev, region_dev); - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); deepmd::memcpy_device_to_host(out_t_dev, out_t); deepmd::memcpy_device_to_host(mapping_dev, mapping); diff --git a/source/lib/tests/test_simulation_region.cc b/source/lib/tests/test_simulation_region.cc index 98da9ec350..5f64d3f531 100644 --- a/source/lib/tests/test_simulation_region.cc +++ b/source/lib/tests/test_simulation_region.cc @@ -77,9 +77,6 @@ TEST_F(TestRegion, cpu) { TEST_F(TestRegion, gpu) { // check rec_box deepmd::Region region; - deepmd::Region region_dev; - double* new_boxt = region_dev.boxt; - double* new_rec_boxt = region_dev.rec_boxt; double *boxt_dev = NULL, *rec_boxt_dev = NULL; double *ref_rp_dev = NULL, *ref_ri_dev = NULL; init_region_cpu(region, &ref_boxt[0]); @@ -90,8 +87,7 @@ TEST_F(TestRegion, gpu) { deepmd::malloc_device_memory_sync(rec_boxt_dev, region.rec_boxt, 9); deepmd::malloc_device_memory_sync(ref_rp_dev, ref_rp); deepmd::malloc_device_memory_sync(ref_ri_dev, ref_ri); - region_dev.boxt = boxt_dev; - region_dev.rec_boxt = rec_boxt_dev; + deepmd::Region region_dev(boxt_dev, rec_boxt_dev); // check volume double vol[1]; double* vol_dev = NULL; @@ -141,8 +137,6 @@ TEST_F(TestRegion, gpu) { deepmd::delete_device_memory(rp2_dev); deepmd::delete_device_memory(rp_dev); deepmd::delete_device_memory(ri2_dev); - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 048237e042..22654b5f3a 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -1496,11 +1496,7 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, int* int_data_dev = cell_info_dev + 23; deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); - deepmd::Region region_dev; - FPTYPE* new_boxt = region_dev.boxt; - FPTYPE* new_rec_boxt = region_dev.rec_boxt; - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; + deepmd::Region region_dev(box_info_dev, box_info_dev + 9); deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev); int tt; for (tt = 0; tt < max_cpy_trial; ++tt) { @@ -1531,8 +1527,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, } } } - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; return (tt != max_cpy_trial); } diff --git a/source/tests/test_auto_batch_size.py b/source/tests/test_auto_batch_size.py index 93a96c9c29..5a349f70b9 100644 --- a/source/tests/test_auto_batch_size.py +++ b/source/tests/test_auto_batch_size.py @@ -45,8 +45,10 @@ def test_execute_oom_gpu(self, mock_is_gpu_available): self.assertEqual(result.shape, (256, 2)) @unittest.mock.patch("tensorflow.compat.v1.test.is_gpu_available") - def test_execute_oom_cpu(self, mock_is_gpu_available): + @unittest.mock.patch("tensorflow.compat.v1.config.experimental.get_visible_devices") + def test_execute_oom_cpu(self, mock_is_gpu_available, mock_get_visible_devices): mock_is_gpu_available.return_value = False + mock_get_visible_devices.return_value = [] # initial batch size 256 = 128 * 2, nb is always 128 auto_batch_size = AutoBatchSize(256, 2.0) nb, result = auto_batch_size.execute(self.oom, 1, 2)