From 1f3bfff9739227bb61f4ac0123653061df102a03 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Sat, 10 Feb 2024 04:22:01 -0800 Subject: [PATCH] ROCm workaround: Use ParallelFor instead of Reduce (#2749) Assuming the failure is not often, we can use ParallelFor with atomicAdd to obtain the number of failures. With this change, the ROCm memory issue seems to be gone. --- Source/reactions/Castro_react.cpp | 85 ++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/Source/reactions/Castro_react.cpp b/Source/reactions/Castro_react.cpp index 9a53e398d3..f49efc9247 100644 --- a/Source/reactions/Castro_react.cpp +++ b/Source/reactions/Castro_react.cpp @@ -186,12 +186,14 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra MultiFab tmp_mask_mf; const MultiFab& mask_mf = mask_covered_zones ? getLevel(level+1).build_fine_mask() : tmp_mask_mf; - ReduceOps reduce_op; - ReduceData reduce_data(reduce_op); - using ReduceTuple = typename decltype(reduce_data)::Type; +#if defined(AMREX_USE_GPU) + Gpu::Buffer d_num_failed({0}); + auto* p_num_failed = d_num_failed.data(); +#endif + int num_failed = 0; #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel reduction(+:num_failed) #endif for (MFIter mfi(s, TilingIfNotGPU()); mfi.isValid(); ++mfi) { @@ -208,8 +210,11 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra const auto problo = geom.ProbLoArray(); #endif - reduce_op.eval(bx, reduce_data, - [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) -> ReduceTuple +#if defined(AMREX_USE_GPU) + ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) +#else + LoopOnCpu(bx, [&] (int i, int j, int k) mutable +#endif { burn_t burn_state; @@ -230,7 +235,7 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra bool do_burn = true; burn_state.success = true; - Real burn_failed = 0.0_rt; + int burn_failed = 0; // Don't burn on zones inside shock regions, if the relevant option is set. @@ -329,7 +334,7 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra // If we were unsuccessful, update the failure count. if (!burn_state.success) { - burn_failed = 1.0_rt; + burn_failed = 1; } // Add burning rates to reactions MultiFab, but be @@ -399,19 +404,25 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra } - - return {burn_failed}; - +#if defined(AMREX_USE_GPU) + if (burn_failed) { + Gpu::Atomic::Add(p_num_failed, burn_failed); + } +#else + num_failed += burn_failed; +#endif }); +#if defined(AMREX_USE_HIP) + Gpu::streamSynchronize(); // otherwise HIP may faile to allocate the necessary resources. +#endif } - ReduceTuple hv = reduce_data.value(); - Real burn_failed = amrex::get<0>(hv); +#if defined(AMREX_USE_GPU) + num_failed = *(d_num_failed.copyToHost()); +#endif - if (burn_failed != 0.0) { - burn_success = 0; - } + burn_success = !num_failed; ParallelDescriptor::ReduceIntMin(burn_success); @@ -516,11 +527,13 @@ Castro::react_state(Real time, Real dt) int burn_success = 1; - ReduceOps reduce_op; - ReduceData reduce_data(reduce_op); - - using ReduceTuple = typename decltype(reduce_data)::Type; +#if defined(AMREX_USE_GPU) + Gpu::Buffer d_num_failed({0}); + auto* p_num_failed = d_num_failed.data(); +#endif + int num_failed = 0; + // why no omp here? for (MFIter mfi(S_new, TilingIfNotGPU()); mfi.isValid(); ++mfi) { const Box& bx = mfi.growntilebox(ng); @@ -542,8 +555,11 @@ Castro::react_state(Real time, Real dt) const auto dx = geom.CellSizeArray(); const auto problo = geom.ProbLoArray(); - reduce_op.eval(bx, reduce_data, - [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) -> ReduceTuple +#if defined(AMREX_USE_GPU) + ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) +#else + LoopOnCpu(bx, [&] (int i, int j, int k) mutable +#endif { burn_t burn_state; @@ -563,7 +579,7 @@ Castro::react_state(Real time, Real dt) bool do_burn = true; burn_state.success = true; - Real burn_failed = 0.0_rt; + int burn_failed = 0; // Don't burn on zones inside shock regions, if the // relevant option is set. @@ -687,7 +703,7 @@ Castro::react_state(Real time, Real dt) // If we were unsuccessful, update the failure count. if (!burn_state.success) { - burn_failed = 1.0_rt; + burn_failed = 1; } // update the state data. @@ -780,16 +796,25 @@ Castro::react_state(Real time, Real dt) } } - return {burn_failed}; +#if defined(AMREX_USE_GPU) + if (burn_failed) { + Gpu::Atomic::Add(p_num_failed, burn_failed); + } +#else + num_failed += burn_failed; +#endif }); + +#if defined(AMREX_USE_HIP) + Gpu::streamSynchronize(); // otherwise HIP may faile to allocate the necessary resources. +#endif } - ReduceTuple hv = reduce_data.value(); - Real burn_failed = amrex::get<0>(hv); +#if defined(AMREX_USE_GPU) + num_failed = *(d_num_failed.copyToHost()); +#endif - if (burn_failed != 0.0) { - burn_success = 0; - } + burn_success = !num_failed; ParallelDescriptor::ReduceIntMin(burn_success);