diff --git a/src/serac/numerics/functional/domain_integral_kernels.cuh b/src/serac/numerics/functional/domain_integral_kernels.cuh index 1188dd2d3e..2daf872717 100644 --- a/src/serac/numerics/functional/domain_integral_kernels.cuh +++ b/src/serac/numerics/functional/domain_integral_kernels.cuh @@ -10,6 +10,9 @@ #include "serac/infrastructure/accelerator.hpp" #include "serac/numerics/functional/quadrature_data.hpp" +#include "RAJA/RAJA.hpp" +#include + namespace serac { namespace detail { @@ -193,7 +196,8 @@ template __global__ void eval_cuda_quadrature(const solution_type u, residual_type r, GPUArrayView qf_derivatives, jacobian_type J, position_type X, - int num_elements, lambda qf, QuadratureData data) + int num_elements, lambda qf, QuadratureData data, + int blocks_quadrature_element, int block_dim, int thread_idx) { using test_element = finite_element; using trial_element = finite_element; @@ -201,9 +205,9 @@ __global__ void eval_cuda_quadrature(const solution_type u, residual_type r, static constexpr auto rule = GaussQuadratureRule(); static constexpr int dim = dimension_of(g); - const int grid_stride = blockDim.x * gridDim.x; + const int grid_stride = block_dim * blocks_quadrature_element; // launch a thread for each quadrature x element point - for (int qe = blockIdx.x * blockDim.x + threadIdx.x; qe < num_elements * rule.size(); qe += grid_stride) { + for (int qe = thread_idx; qe < num_elements * rule.size(); qe += grid_stride) { // warps won't fetch that many elements ... not great.. but not horrible int e = qe / rule.size(); int q = qe % rule.size(); @@ -426,7 +430,8 @@ template __global__ void gradient_cuda_quadrature(const dsolution_type du, dresidual_type dr, GPUArrayView qf_derivatives, - const mfem::DeviceTensor<4, const double> J, int num_elements) + const mfem::DeviceTensor<4, const double> J, int num_elements, + int grid_dim, int block_dim, int thread_id) { using test_element = finite_element; using trial_element = finite_element; @@ -434,8 +439,7 @@ __global__ void gradient_cuda_quadrature(const dsolution_type du, dresidual_type static constexpr auto rule = GaussQuadratureRule(); static constexpr int dim = dimension_of(g); - const int grid_stride = blockDim.x * gridDim.x; - auto thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int grid_stride = block_dim * grid_dim; auto num_quadrature_points = num_elements * rule.size(); #pragma unroll for (int qe = thread_id; qe < num_quadrature_points; qe += grid_stride) { @@ -518,19 +522,38 @@ void action_of_gradient_kernel(serac::detail::GPULaunchConfiguration config, con auto du = detail::Reshape(dU.Read(), trial_ndof, num_elements); auto dr = detail::Reshape(dR.ReadWrite(), test_ndof, num_elements); + using global_thread_x = RAJA::LoopPolicy - <<>>(du, dr, qf_derivatives, J, num_elements); - + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(blocks_quadrature_element), + RAJA::Threads(config.blocksize)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, range, [&](int t_idx) { + gradient_cuda_quadrature + (du, dr, qf_derivatives, J, num_elements, blocks_quadrature_element, config.blocksize, t_idx); + }); + }); } else if constexpr (policy == serac::detail::ThreadParallelizationStrategy::THREAD_PER_ELEMENT) { int blocks_element = (num_elements + config.blocksize - 1) / config.blocksize; - gradient_cuda_element - <<>>(du, dr, qf_derivatives, J, num_elements); + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(blocks_element), + RAJA::Threads(config.blocksize)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, range, [&](int t_idx) { + gradient_cuda_element + (du, dr, qf_derivatives, J, num_elements, blocks_element, config.blocksize, t_idx); + }); + }); } cudaDeviceSynchronize(); diff --git a/src/serac/numerics/functional/domain_integral_kernels.hpp b/src/serac/numerics/functional/domain_integral_kernels.hpp index ed7415fc04..5a59dc559b 100644 --- a/src/serac/numerics/functional/domain_integral_kernels.hpp +++ b/src/serac/numerics/functional/domain_integral_kernels.hpp @@ -5,12 +5,17 @@ // SPDX-License-Identifier: (BSD-3-Clause) #pragma once +#include "RAJA/RAJA.hpp" + #include "serac/infrastructure/accelerator.hpp" #include "serac/numerics/functional/quadrature_data.hpp" #include "serac/numerics/functional/function_signature.hpp" #include "serac/numerics/functional/differentiate_wrt.hpp" +#include +#include #include +#include namespace serac { @@ -159,8 +164,16 @@ void evaluation_kernel_impl(FunctionSignature, const std::vecto [[maybe_unused]] tuple u = { reinterpret_cast(trial_elements))::dof_type*>(inputs[indices])...}; + #if defined(RAJA_ENABLE_CUDA) + using policy = RAJA::cuda_exec<512>; + #elif defined(RAJA_ENABLE_OPENMP) + using policy = RAJA::omp_parallel_for_exec; + #else + using policy = RAJA::simd_exec; + #endif + // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { + RAJA::forall(RAJA::TypedRangeSegment(0, num_elements), [&] (uint32_t e) { // load the jacobians and positions for each quadrature point in this element auto J_e = J[e]; auto x_e = x[e]; @@ -201,7 +214,7 @@ void evaluation_kernel_impl(FunctionSignature, const std::vecto // (batch) integrate the material response against the test-space basis functions test_element::integrate(get_value(qf_outputs), rule, &r[e]); - } + }); } //clang-format off diff --git a/src/serac/physics/heat_transfer.hpp b/src/serac/physics/heat_transfer.hpp index 17163937b3..f049abb032 100644 --- a/src/serac/physics/heat_transfer.hpp +++ b/src/serac/physics/heat_transfer.hpp @@ -63,7 +63,7 @@ const TimesteppingOptions default_static_options = {TimestepMethod::QuasiStatic} /** * @brief An object containing the solver for a thermal conduction PDE * - * This is a generic linear thermal diffusion oeprator of the form + * This is a generic linear thermal diffusion operator of the form * * \f[ * \mathbf{M} \frac{\partial \mathbf{u}}{\partial t} = -\kappa \mathbf{K} \mathbf{u} + \mathbf{f} @@ -276,6 +276,39 @@ class HeatTransfer, std::integer_sequ cycle_ += 1; } + template + struct ThermalMaterialIntegrand { + ThermalMaterialIntegrand(MaterialType material) : material_(material) {} + + template + auto operator()(X x, T temperature, dT_dt dtemp_dt, Shape shape, Params... params) { + // Get the value and the gradient from the input tuple + auto [u, du_dX] = temperature; + auto [p, dp_dX] = shape; + auto du_dt = get(dtemp_dt); + + auto I_plus_dp_dX = I + dp_dX; + auto inv_I_plus_dp_dX = inv(I_plus_dp_dX); + auto det_I_plus_dp_dX = det(I_plus_dp_dX); + + // Note that the current configuration x = X + p, where X is the original reference + // configuration and p is the shape displacement. We need the gradient with + // respect to the perturbed reference configuration x = X + p for the material model. Therefore, we calculate + // du/dx = du/dX * dX/dx = du/dX * (dx/dX)^-1 = du/dX * (I + dp/dX)^-1 + + auto du_dx = dot(du_dX, inv_I_plus_dp_dX); + + auto [heat_capacity, heat_flux] = material_(x + p, u, du_dx, params...); + + // Note that the return is integrated in the perturbed reference + // configuration, hence the det(I + dp_dx) = det(dx/dX) + return serac::tuple{heat_capacity * du_dt * det_I_plus_dp_dX, + -1.0 * dot(inv_I_plus_dp_dX, heat_flux) * det_I_plus_dp_dX}; + } + private: + MaterialType material_; + }; + /** * @brief Set the thermal material model for the physics solver * @@ -300,33 +333,10 @@ class HeatTransfer, std::integer_sequ template void setMaterial(DependsOn, MaterialType material) { + ThermalMaterialIntegrand integrand (material); residual_->AddDomainIntegral( Dimension{}, DependsOn<0, 1, 2, active_parameters + NUM_STATE_VARS...>{}, - [material](auto x, auto temperature, auto dtemp_dt, auto shape, auto... params) { - // Get the value and the gradient from the input tuple - auto [u, du_dX] = temperature; - auto [p, dp_dX] = shape; - auto du_dt = get(dtemp_dt); - - auto I_plus_dp_dX = I + dp_dX; - auto inv_I_plus_dp_dX = inv(I_plus_dp_dX); - auto det_I_plus_dp_dX = det(I_plus_dp_dX); - - // Note that the current configuration x = X + p, where X is the original reference - // configuration and p is the shape displacement. We need the gradient with - // respect to the perturbed reference configuration x = X + p for the material model. Therefore, we calculate - // du/dx = du/dX * dX/dx = du/dX * (dx/dX)^-1 = du/dX * (I + dp/dX)^-1 - - auto du_dx = dot(du_dX, inv_I_plus_dp_dX); - - auto [heat_capacity, heat_flux] = material(x + p, u, du_dx, params...); - - // Note that the return is integrated in the perturbed reference - // configuration, hence the det(I + dp_dx) = det(dx/dX) - return serac::tuple{heat_capacity * du_dt * det_I_plus_dp_dX, - -1.0 * dot(inv_I_plus_dp_dX, heat_flux) * det_I_plus_dp_dX}; - }, - mesh_); + integrand, mesh_); } /// @overload