From 7c95fc71e30a2c73102e23147128b5c43b500e78 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 28 Nov 2022 13:34:22 -0500 Subject: [PATCH] Hommexx: Limit HIP team size in debug builds. Also clean up the team_num_threads_vectors routine to consolidate the rules. Fixes SCREAM issue 2058. [BFB] --- .../homme/src/share/cxx/ExecSpaceDefs.cpp | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/components/homme/src/share/cxx/ExecSpaceDefs.cpp b/components/homme/src/share/cxx/ExecSpaceDefs.cpp index 671917f5c767..784d37b65d20 100644 --- a/components/homme/src/share/cxx/ExecSpaceDefs.cpp +++ b/components/homme/src/share/cxx/ExecSpaceDefs.cpp @@ -173,39 +173,31 @@ team_num_threads_vectors (const int num_parallel_iterations, // fewer than 4 warps/thread block limits the thread occupancy to that // number/4. That seems to be in Cuda specs, but I don't know of a function // that provides this number. Use a configuration option that defaults to 4. - const int min_num_warps = HOMMEXX_CUDA_MIN_WARP_PER_TEAM; + int min_num_warps = HOMMEXX_CUDA_MIN_WARP_PER_TEAM; + + int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; +#ifdef KOKKOS_ENABLE_DEBUG + // In debug builds, team size must be smaller because of Kokkos-side data. + max_num_warps = std::min(max_num_warps, 8); +#endif + #ifdef KOKKOS_ENABLE_CUDA const int num_warps_device = Kokkos::Impl::cuda_internal_maximum_concurrent_block_count(); const int num_threads_warp = Kokkos::Impl::CudaTraits::WarpSize; - // The number on P100 is 16, but for some reason - // Kokkos::Impl::cuda_internal_maximum_grid_count() returns 8. I may be - // misusing the function. I have an open issue with the Kokkos team to resolve - // this. For now: -# ifdef KOKKOS_ENABLE_DEBUG - // Work around spurious "terminate called after throwing an instance - // of 'std::runtime_error'" message. - const int max_num_warps = 8; -# else - const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; //Kokkos::Impl::cuda_internal_maximum_grid_count(); -# endif - #elif defined(KOKKOS_ENABLE_HIP) - - //use 64 wavefronts per CU and 120 CUs + // Use 64 wavefronts per CU and 120 CUs. const int num_warps_device = 120*64; // no such thing Kokkos::Impl::hip_internal_maximum_warp_count(); - const int max_num_warps = HOMMEXX_CUDA_MAX_WARP_PER_TEAM; const int num_threads_warp = Kokkos::Experimental::Impl::HIPTraits::WarpSize; - #else - - // I want thread-distribution rules to be unit-testable even when Cuda is - // off. Thus, make up a P100-like machine: + // I want thread-distribution rules to be unit-testable even when GPU spaces + // are off. Thus, make up a GPU-like machine: const int num_warps_device = 1792; const int num_threads_warp = 32; - const int max_num_warps = 16; - + max_num_warps = 16; #endif + min_num_warps = std::min(min_num_warps, max_num_warps); + return Parallel::team_num_threads_vectors_for_gpu( num_warps_device, num_threads_warp, min_num_warps, max_num_warps,