Skip to content

Commit

Permalink
Refactor domain_integral_kernels.cuh to use RAJA, change to use domai…
Browse files Browse the repository at this point in the history
…n_integral_kernels.hpp to use RAJA
  • Loading branch information
johnbowen42 committed Aug 22, 2023
1 parent a20a7c9 commit f7de220
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 39 deletions.
45 changes: 34 additions & 11 deletions src/serac/numerics/functional/domain_integral_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
#include "serac/infrastructure/accelerator.hpp"
#include "serac/numerics/functional/quadrature_data.hpp"

#include "RAJA/RAJA.hpp"
#include <RAJA/index/RangeSegment.hpp>

namespace serac {

namespace detail {
Expand Down Expand Up @@ -193,17 +196,18 @@ template <mfem::Geometry::Type g, typename test, typename trial, int Q, typename
typename qpt_data_type = void>
__global__ void eval_cuda_quadrature(const solution_type u, residual_type r,
GPUArrayView<derivatives_type, 2> qf_derivatives, jacobian_type J, position_type X,
int num_elements, lambda qf, QuadratureData<qpt_data_type> data)
int num_elements, lambda qf, QuadratureData<qpt_data_type> data,
int blocks_quadrature_element, int block_dim, int thread_idx)
{
using test_element = finite_element<g, test>;
using trial_element = finite_element<g, trial>;
using element_residual_type = typename test_element::residual_type;
static constexpr auto rule = GaussQuadratureRule<g, Q>();
static constexpr int dim = dimension_of(g);

const int grid_stride = blockDim.x * gridDim.x;
const int grid_stride = block_dim * blocks_quadrature_element;
// launch a thread for each quadrature x element point
for (int qe = blockIdx.x * blockDim.x + threadIdx.x; qe < num_elements * rule.size(); qe += grid_stride) {
for (int qe = thread_idx; qe < num_elements * rule.size(); qe += grid_stride) {
// warps won't fetch that many elements ... not great.. but not horrible
int e = qe / rule.size();
int q = qe % rule.size();
Expand Down Expand Up @@ -426,16 +430,16 @@ template <mfem::Geometry::Type g, typename test, typename trial, int Q, typename
typename dsolution_type, typename dresidual_type>
__global__ void gradient_cuda_quadrature(const dsolution_type du, dresidual_type dr,
GPUArrayView<derivatives_type, 2> qf_derivatives,
const mfem::DeviceTensor<4, const double> J, int num_elements)
const mfem::DeviceTensor<4, const double> J, int num_elements,
int grid_dim, int block_dim, int thread_id)
{
using test_element = finite_element<g, test>;
using trial_element = finite_element<g, trial>;
using element_residual_type = typename trial_element::residual_type;
static constexpr auto rule = GaussQuadratureRule<g, Q>();
static constexpr int dim = dimension_of(g);

const int grid_stride = blockDim.x * gridDim.x;
auto thread_id = blockIdx.x * blockDim.x + threadIdx.x;
const int grid_stride = block_dim * grid_dim;
auto num_quadrature_points = num_elements * rule.size();
#pragma unroll
for (int qe = thread_id; qe < num_quadrature_points; qe += grid_stride) {
Expand Down Expand Up @@ -518,19 +522,38 @@ void action_of_gradient_kernel(serac::detail::GPULaunchConfiguration config, con
auto du = detail::Reshape<trial>(dU.Read(), trial_ndof, num_elements);
auto dr = detail::Reshape<test>(dR.ReadWrite(), test_ndof, num_elements);

using global_thread_x = RAJA::LoopPolicy<loop_policy
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
, gpu_global_thread_x_policy
#endif
RAJA::RangeSegment range (0, num_elements);

cudaDeviceSynchronize();
serac::accelerator::displayLastCUDAMessage();

// call gradient_cuda
if constexpr (policy == serac::detail::ThreadParallelizationStrategy::THREAD_PER_QUADRATURE_POINT) {
int blocks_quadrature_element = (num_elements * rule.size() + config.blocksize - 1) / config.blocksize;
gradient_cuda_quadrature<g, test, trial, Q, derivatives_type>
<<<blocks_quadrature_element, config.blocksize>>>(du, dr, qf_derivatives, J, num_elements);

RAJA::launch<launch_policy>(RAJA::ExecPlace::DEVICE,
RAJA::LaunchParams(RAJA::Teams(blocks_quadrature_element),
RAJA::Threads(config.blocksize)),
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
RAJA::loop<global_thread_x>(ctx, range, [&](int t_idx) {
gradient_cuda_quadrature<g, test, trial, Q, derivatives_type>
(du, dr, qf_derivatives, J, num_elements, blocks_quadrature_element, config.blocksize, t_idx);
});
});
} else if constexpr (policy == serac::detail::ThreadParallelizationStrategy::THREAD_PER_ELEMENT) {
int blocks_element = (num_elements + config.blocksize - 1) / config.blocksize;
gradient_cuda_element<g, test, trial, Q, derivatives_type>
<<<blocks_element, config.blocksize>>>(du, dr, qf_derivatives, J, num_elements);
RAJA::launch<launch_policy>(RAJA::ExecPlace::DEVICE,
RAJA::LaunchParams(RAJA::Teams(blocks_element),
RAJA::Threads(config.blocksize)),
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
RAJA::loop<global_thread_x>(ctx, range, [&](int t_idx) {
gradient_cuda_element<g, test, trial, Q, derivatives_type>
(du, dr, qf_derivatives, J, num_elements, blocks_element, config.blocksize, t_idx);
});
});
}

cudaDeviceSynchronize();
Expand Down
17 changes: 15 additions & 2 deletions src/serac/numerics/functional/domain_integral_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,17 @@
// SPDX-License-Identifier: (BSD-3-Clause)
#pragma once

#include "RAJA/RAJA.hpp"

#include "serac/infrastructure/accelerator.hpp"
#include "serac/numerics/functional/quadrature_data.hpp"
#include "serac/numerics/functional/function_signature.hpp"
#include "serac/numerics/functional/differentiate_wrt.hpp"

#include <RAJA/index/RangeSegment.hpp>
#include <RAJA/pattern/forall.hpp>
#include <array>
#include <cstdint>

namespace serac {

Expand Down Expand Up @@ -159,8 +164,16 @@ void evaluation_kernel_impl(FunctionSignature<test(trials...)>, const std::vecto
[[maybe_unused]] tuple u = {
reinterpret_cast<const typename decltype(type<indices>(trial_elements))::dof_type*>(inputs[indices])...};

#if defined(RAJA_ENABLE_CUDA)
using policy = RAJA::cuda_exec<512>;
#elif defined(RAJA_ENABLE_OPENMP)
using policy = RAJA::omp_parallel_for_exec;
#else
using policy = RAJA::simd_exec;
#endif

// for each element in the domain
for (uint32_t e = 0; e < num_elements; e++) {
RAJA::forall<policy>(RAJA::TypedRangeSegment<uint32_t>(0, num_elements), [&] (uint32_t e) {
// load the jacobians and positions for each quadrature point in this element
auto J_e = J[e];
auto x_e = x[e];
Expand Down Expand Up @@ -201,7 +214,7 @@ void evaluation_kernel_impl(FunctionSignature<test(trials...)>, const std::vecto

// (batch) integrate the material response against the test-space basis functions
test_element::integrate(get_value(qf_outputs), rule, &r[e]);
}
});
}

//clang-format off
Expand Down
62 changes: 36 additions & 26 deletions src/serac/physics/heat_transfer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ const TimesteppingOptions default_static_options = {TimestepMethod::QuasiStatic}
/**
* @brief An object containing the solver for a thermal conduction PDE
*
* This is a generic linear thermal diffusion oeprator of the form
* This is a generic linear thermal diffusion operator of the form
*
* \f[
* \mathbf{M} \frac{\partial \mathbf{u}}{\partial t} = -\kappa \mathbf{K} \mathbf{u} + \mathbf{f}
Expand Down Expand Up @@ -276,6 +276,39 @@ class HeatTransfer<order, dim, Parameters<parameter_space...>, std::integer_sequ
cycle_ += 1;
}

template<typename MaterialType>
struct ThermalMaterialIntegrand {
ThermalMaterialIntegrand(MaterialType material) : material_(material) {}

template<typename X, typename T, typename dT_dt, typename Shape, typename... Params>
auto operator()(X x, T temperature, dT_dt dtemp_dt, Shape shape, Params... params) {
// Get the value and the gradient from the input tuple
auto [u, du_dX] = temperature;
auto [p, dp_dX] = shape;
auto du_dt = get<VALUE>(dtemp_dt);

auto I_plus_dp_dX = I + dp_dX;
auto inv_I_plus_dp_dX = inv(I_plus_dp_dX);
auto det_I_plus_dp_dX = det(I_plus_dp_dX);

// Note that the current configuration x = X + p, where X is the original reference
// configuration and p is the shape displacement. We need the gradient with
// respect to the perturbed reference configuration x = X + p for the material model. Therefore, we calculate
// du/dx = du/dX * dX/dx = du/dX * (dx/dX)^-1 = du/dX * (I + dp/dX)^-1

auto du_dx = dot(du_dX, inv_I_plus_dp_dX);

auto [heat_capacity, heat_flux] = material_(x + p, u, du_dx, params...);

// Note that the return is integrated in the perturbed reference
// configuration, hence the det(I + dp_dx) = det(dx/dX)
return serac::tuple{heat_capacity * du_dt * det_I_plus_dp_dX,
-1.0 * dot(inv_I_plus_dp_dX, heat_flux) * det_I_plus_dp_dX};
}
private:
MaterialType material_;
};

/**
* @brief Set the thermal material model for the physics solver
*
Expand All @@ -300,33 +333,10 @@ class HeatTransfer<order, dim, Parameters<parameter_space...>, std::integer_sequ
template <int... active_parameters, typename MaterialType>
void setMaterial(DependsOn<active_parameters...>, MaterialType material)
{
ThermalMaterialIntegrand integrand (material);
residual_->AddDomainIntegral(
Dimension<dim>{}, DependsOn<0, 1, 2, active_parameters + NUM_STATE_VARS...>{},
[material](auto x, auto temperature, auto dtemp_dt, auto shape, auto... params) {
// Get the value and the gradient from the input tuple
auto [u, du_dX] = temperature;
auto [p, dp_dX] = shape;
auto du_dt = get<VALUE>(dtemp_dt);

auto I_plus_dp_dX = I + dp_dX;
auto inv_I_plus_dp_dX = inv(I_plus_dp_dX);
auto det_I_plus_dp_dX = det(I_plus_dp_dX);

// Note that the current configuration x = X + p, where X is the original reference
// configuration and p is the shape displacement. We need the gradient with
// respect to the perturbed reference configuration x = X + p for the material model. Therefore, we calculate
// du/dx = du/dX * dX/dx = du/dX * (dx/dX)^-1 = du/dX * (I + dp/dX)^-1

auto du_dx = dot(du_dX, inv_I_plus_dp_dX);

auto [heat_capacity, heat_flux] = material(x + p, u, du_dx, params...);

// Note that the return is integrated in the perturbed reference
// configuration, hence the det(I + dp_dx) = det(dx/dX)
return serac::tuple{heat_capacity * du_dt * det_I_plus_dp_dX,
-1.0 * dot(inv_I_plus_dp_dX, heat_flux) * det_I_plus_dp_dX};
},
mesh_);
integrand, mesh_);
}

/// @overload
Expand Down

0 comments on commit f7de220

Please sign in to comment.