Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Refactor serac to support RAJA #987

Draft
wants to merge 7 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cmake/SeracMacros.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ endmacro(serac_convert_to_native_escaped_file_path)
##------------------------------------------------------------------------------
## serac_add_tests( SOURCES [source1 [source2 ...]]
## DEPENDS_ON [dep1 [dep2 ...]]
## NUM_MPI_TASKS [num tasks])
## NUM_MPI_TASKS [num tasks]
## )
##
## Creates an executable per given source and then adds the test to CTest
##------------------------------------------------------------------------------
Expand Down
121 changes: 84 additions & 37 deletions src/serac/numerics/functional/boundary_integral_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
#include "serac/numerics/functional/quadrature_data.hpp"
#include "serac/numerics/functional/differentiate_wrt.hpp"

#include <RAJA/index/RangeSegment.hpp>
#include <RAJA/RAJA.hpp>
#include <array>
#include <cstdint>

namespace serac {

namespace boundary_integral {
Expand All @@ -26,49 +31,49 @@ struct QFunctionArgument;

/// @overload
template <int p>
struct QFunctionArgument<H1<p, 1>, Dimension<1>> {
RAJA_HOST_DEVICE struct QFunctionArgument<H1<p, 1>, Dimension<1>> {
using type = serac::tuple<double, double>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int dim>
struct QFunctionArgument<H1<p, 1>, Dimension<dim>> {
RAJA_HOST_DEVICE struct QFunctionArgument<H1<p, 1>, Dimension<dim>> {
using type = serac::tuple<double, tensor<double, dim>>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int c>
struct QFunctionArgument<H1<p, c>, Dimension<1>> {
RAJA_HOST_DEVICE struct QFunctionArgument<H1<p, c>, Dimension<1>> {
using type = serac::tuple<tensor<double, c>, tensor<double, c>>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int c, int dim>
struct QFunctionArgument<H1<p, c>, Dimension<dim>> {
RAJA_HOST_DEVICE struct QFunctionArgument<H1<p, c>, Dimension<dim>> {
using type = serac::tuple<tensor<double, c>, tensor<double, c, dim>>; ///< what will be passed to the q-function
};

/// @overload
template <int p>
struct QFunctionArgument<L2<p, 1>, Dimension<1>> {
RAJA_HOST_DEVICE struct QFunctionArgument<L2<p, 1>, Dimension<1>> {
using type = serac::tuple<double, double>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int dim>
struct QFunctionArgument<L2<p, 1>, Dimension<dim>> {
RAJA_HOST_DEVICE struct QFunctionArgument<L2<p, 1>, Dimension<dim>> {
using type = serac::tuple<double, tensor<double, dim>>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int c>
struct QFunctionArgument<L2<p, c>, Dimension<1>> {
RAJA_HOST_DEVICE struct QFunctionArgument<L2<p, c>, Dimension<1>> {
using type = serac::tuple<tensor<double, c>, tensor<double, c>>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int c, int dim>
struct QFunctionArgument<L2<p, c>, Dimension<dim>> {
RAJA_HOST_DEVICE struct QFunctionArgument<L2<p, c>, Dimension<dim>> {
using type = serac::tuple<tensor<double, c>, tensor<double, c, dim>>; ///< what will be passed to the q-function
};

Expand Down Expand Up @@ -101,14 +106,14 @@ SERAC_HOST_DEVICE auto apply_qf(lambda&& qf, coords_type&& x_q, const serac::tup
}

template <int i, int dim, typename... trials, typename lambda>
auto get_derivative_type(lambda qf)
RAJA_HOST_DEVICE auto get_derivative_type(lambda qf)
{
using qf_arguments = serac::tuple<typename QFunctionArgument<trials, serac::Dimension<dim>>::type...>;
return tuple{get_gradient(apply_qf(qf, tensor<double, dim + 1>{}, make_dual_wrt<i>(qf_arguments{}))), zero{}};
};

template <typename lambda, int n, typename... T>
auto batch_apply_qf(lambda qf, const tensor<double, 2, n>& positions, const tensor<double, 1, 2, n>& jacobians,
RAJA_HOST_DEVICE auto batch_apply_qf(lambda qf, const tensor<double, 2, n>& positions, const tensor<double, 1, 2, n>& jacobians,
const T&... inputs)
{
constexpr int dim = 2;
Expand All @@ -130,7 +135,7 @@ auto batch_apply_qf(lambda qf, const tensor<double, 2, n>& positions, const tens
}

template <typename lambda, int n, typename... T>
auto batch_apply_qf(lambda qf, const tensor<double, 3, n>& positions, const tensor<double, 2, 3, n>& jacobians,
RAJA_HOST_DEVICE auto batch_apply_qf(lambda qf, const tensor<double, 3, n>& positions, const tensor<double, 2, 3, n>& jacobians,
const T&... inputs)
{
constexpr int dim = 3;
Expand All @@ -153,41 +158,63 @@ auto batch_apply_qf(lambda qf, const tensor<double, 3, n>& positions, const tens
return outputs;
}

template <uint32_t differentiation_index, int Q, mfem::Geometry::Type geom, typename test, typename... trials,
typename lambda_type, typename derivative_type, int... indices>
void evaluation_kernel_impl(FunctionSignature<test(trials...)>, const std::vector<const double*>& inputs,
double* outputs, const double* positions, const double* jacobians, lambda_type qf,
[[maybe_unused]] derivative_type* qf_derivatives, uint32_t num_elements,
std::integer_sequence<int, indices...>)
template <mfem::Geometry::Type geom, typename test, typename... trials>
auto get_trial_elements(FunctionSignature<test(trials...)>)
{
using test_element = finite_element<geom, test>;
return tuple<finite_element<geom, trials>...>{};
}

/// @brief the element type for each trial space
static constexpr tuple<finite_element<geom, trials>...> trial_elements{};
template <mfem::Geometry::Type geom, typename test, typename... trials>
auto get_test(FunctionSignature<test(trials...)>)
{
return finite_element<geom, test>{};
}

/// @param trial_elements the element type for each trial space
template <uint32_t differentiation_index, int Q, mfem::Geometry::Type geom, typename test_element,
typename trial_element_type, typename lambda_type, typename derivative_type, int... indices>
void evaluation_kernel_impl(trial_element_type trial_elements, test_element, const std::vector<const double*>& inputs,
double* outputs, const double* positions, const double* jacobians, lambda_type qf,
[[maybe_unused]] derivative_type* qf_derivatives, uint32_t num_elements,
camp::int_seq<int, indices...>)
{
// mfem provides this information as opaque arrays of doubles,
// so we reinterpret the pointer with
constexpr int dim = dimension_of(geom) + 1;
constexpr int nqp = num_quadrature_points(geom, Q);
auto J = reinterpret_cast<const tensor<double, dim - 1, dim, nqp>*>(jacobians);
auto x = reinterpret_cast<const tensor<double, dim, nqp>*>(positions);
auto r = reinterpret_cast<typename test_element::dof_type*>(outputs);
static constexpr TensorProductQuadratureRule<Q> rule{};
TensorProductQuadratureRule<Q> rule{};

static constexpr int qpts_per_elem = num_quadrature_points(geom, Q);
int qpts_per_elem = num_quadrature_points(geom, Q);

[[maybe_unused]] tuple u = {
reinterpret_cast<const typename decltype(type<indices>(trial_elements))::dof_type*>(inputs[indices])...};

#if defined(USE_CUDA)
std::cout<<"USING CUDA\n";
using policy = RAJA::cuda_exec<512>;
#else
using policy = RAJA::simd_exec;
#endif
printf("HERE 42\n");
// for each element in the domain
for (uint32_t e = 0; e < num_elements; e++) {
RAJA::forall<policy>(
RAJA::TypedRangeSegment<uint32_t>(0, num_elements),
[J, x, qf, u, rule, r, qpts_per_elem, qf_derivatives] RAJA_HOST_DEVICE (uint32_t e) {
// load the jacobians and positions for each quadrature point in this element
auto J_e = J[e];
auto x_e = x[e];
// Avoid unused warning/error ([[maybe_unused]] is not possible in the capture list)
(void)qf_derivatives;
(void)qpts_per_elem;
printf("HERE 43\n");

static constexpr trial_element_type empty_trial_element{};
// batch-calculate values / derivatives of each trial space, at each quadrature point
[[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when<indices == differentiation_index>(
get<indices>(trial_elements).interpolate(get<indices>(u)[e], rule))...};
get<indices>(empty_trial_element).interpolate(get<indices>(u)[e], rule))...};

// (batch) evaluate the q-function at each quadrature point
auto qf_outputs = batch_apply_qf(qf, x_e, J_e, get<indices>(qf_inputs)...);
Expand All @@ -197,26 +224,26 @@ void evaluation_kernel_impl(FunctionSignature<test(trials...)>, const std::vecto
// won't need to be applied in the action_of_gradient and element_gradient kernels
if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) {
for (int q = 0; q < leading_dimension(qf_outputs); q++) {
qf_derivatives[e * qpts_per_elem + uint32_t(q)] = get_gradient(qf_outputs[q]);
qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]);
}
}

// (batch) integrate the material response against the test-space basis functions
test_element::integrate(get_value(qf_outputs), rule, &r[e]);
}
});
}

// clang-format off
template <typename S, typename T>
auto chain_rule(const S& dfdx, const T& dx)
RAJA_HOST_DEVICE auto chain_rule(const S& dfdx, const T& dx)
{
return serac::chain_rule(serac::get<0>(serac::get<0>(dfdx)), serac::get<0>(dx)) +
serac::chain_rule(serac::get<1>(serac::get<0>(dfdx)), serac::get<1>(dx));
}
// clang-format on

template <typename derivative_type, int n, typename T>
auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor<T, n>& inputs)
RAJA_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor<T, n>& inputs)
{
using return_type = decltype(chain_rule(derivative_type{}, T{}));
tensor<tuple<return_type, zero>, n> outputs{};
Expand Down Expand Up @@ -252,20 +279,26 @@ auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor<T, n>&
* @param[in] num_elements The number of elements in the mesh
*/
template <int Q, mfem::Geometry::Type geom, typename test, typename trial, typename derivatives_type>
void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* qf_derivatives, std::size_t num_elements)
void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* qf_derivatives, uint32_t num_elements)
{
using test_element = finite_element<geom, test>;
using trial_element = finite_element<geom, trial>;

// mfem provides this information in 1D arrays, so we reshape it
// into strided multidimensional arrays before using
constexpr int nqp = num_quadrature_points(geom, Q);
uint32_t nqp = num_quadrature_points(geom, Q);
auto du = reinterpret_cast<const typename trial_element::dof_type*>(dU);
auto dr = reinterpret_cast<typename test_element::dof_type*>(dR);
static constexpr TensorProductQuadratureRule<Q> rule{};
static TensorProductQuadratureRule<Q> rule{};

#if defined(USE_CUDA)
using policy = RAJA::cuda_exec<512>;
#else
using policy = RAJA::simd_exec;
#endif

// for each element in the domain
for (uint32_t e = 0; e < num_elements; e++) {
RAJA::forall<policy>(RAJA::TypedRangeSegment<uint32_t>(0, num_elements), [=] RAJA_HOST_DEVICE(uint32_t e) {
// (batch) interpolate each quadrature point's value
auto qf_inputs = trial_element::interpolate(du[e], rule);

Expand All @@ -274,7 +307,7 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q

// (batch) integrate the material response against the test-space basis functions
test_element::integrate(qf_outputs, rule, &dr[e]);
}
});
}

/**
Expand All @@ -299,8 +332,12 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q
* @param[in] num_elements The number of elements in the mesh
*/
template <mfem::Geometry::Type g, typename test, typename trial, int Q, typename derivatives_type>
void element_gradient_kernel(ExecArrayView<double, 3, ExecutionSpace::CPU> dK, derivatives_type* qf_derivatives,
std::size_t num_elements)
#if defined(USE_CUDA)
void element_gradient_kernel(ExecArrayView<double, 3, ExecutionSpace::GPU> dK,
#else
void element_gradient_kernel(ExecArrayView<double, 3, ExecutionSpace::CPU> dK,
#endif
derivatives_type* qf_derivatives, std::size_t num_elements)
{
using test_element = finite_element<g, test>;
using trial_element = finite_element<g, trial>;
Expand Down Expand Up @@ -331,9 +368,11 @@ std::function<void(const std::vector<const double*>&, double*, bool)> evaluation
signature s, lambda_type qf, const double* positions, const double* jacobians,
std::shared_ptr<derivative_type> qf_derivatives, uint32_t num_elements)
{
auto trial_elements = get_trial_elements<geom>(s);
auto test = get_test<geom>(s);
return [=](const std::vector<const double*>& inputs, double* outputs, bool /* update state */) {
evaluation_kernel_impl<wrt, Q, geom>(s, inputs, outputs, positions, jacobians, qf, qf_derivatives.get(),
num_elements, s.index_seq);
evaluation_kernel_impl<wrt, Q, geom>(trial_elements, test, inputs, outputs, positions, jacobians, qf,
qf_derivatives.get(), num_elements, s.index_seq);
};
}

Expand All @@ -349,10 +388,18 @@ std::function<void(const double*, double*)> jacobian_vector_product_kernel(
}

template <int wrt, int Q, mfem::Geometry::Type geom, typename signature, typename derivative_type>
#if defined(USE_CUDA)
std::function<void(ExecArrayView<double, 3, ExecutionSpace::GPU>)> element_gradient_kernel(
#else
std::function<void(ExecArrayView<double, 3, ExecutionSpace::CPU>)> element_gradient_kernel(
#endif
signature, std::shared_ptr<derivative_type> qf_derivatives, uint32_t num_elements)
{
#if defined(USE_CUDA)
return [=](ExecArrayView<double, 3, ExecutionSpace::GPU> K_elem) {
#else
return [=](ExecArrayView<double, 3, ExecutionSpace::CPU> K_elem) {
#endif
using test_space = typename signature::return_type;
using trial_space = typename std::tuple_element<wrt, typename signature::parameter_types>::type;
element_gradient_kernel<geom, test_space, trial_space, Q>(K_elem, qf_derivatives.get(), num_elements);
Expand Down
7 changes: 4 additions & 3 deletions src/serac/numerics/functional/detail/hexahedron_H1.inl
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ struct finite_element<mfem::Geometry::CUBE, H1<p, c> > {
}

template <typename in_t, int q>
static auto batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
{
static constexpr bool apply_weights = false;
static constexpr auto B = calculate_B<apply_weights, q>();
Expand Down Expand Up @@ -167,7 +167,8 @@ struct finite_element<mfem::Geometry::CUBE, H1<p, c> > {
}

template <int q>
static auto interpolate(const dof_type& X, const TensorProductQuadratureRule<q>&)
RAJA_HOST_DEVICE
static auto interpolate(const dof_type& X, const TensorProductQuadratureRule<q>&)
{
// we want to compute the following:
//
Expand Down Expand Up @@ -228,7 +229,7 @@ struct finite_element<mfem::Geometry::CUBE, H1<p, c> > {
}

template <typename source_type, typename flux_type, int q>
static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
RAJA_HOST_DEVICE static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
const TensorProductQuadratureRule<q>&, dof_type* element_residual, int step = 1)
{
if constexpr (is_zero<source_type>{} && is_zero<flux_type>{}) {
Expand Down
7 changes: 4 additions & 3 deletions src/serac/numerics/functional/detail/hexahedron_Hcurl.inl
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ struct finite_element<mfem::Geometry::CUBE, Hcurl<p>> {
}

template <typename in_t, int q>
static auto batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
{
constexpr bool apply_weights = false;
constexpr tensor<double, q, p> B1 = calculate_B1<apply_weights, q>();
Expand Down Expand Up @@ -334,7 +334,8 @@ struct finite_element<mfem::Geometry::CUBE, Hcurl<p>> {
}

template <int q>
static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule<q>&)
RAJA_HOST_DEVICE
static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule<q>&)
{
constexpr bool apply_weights = false;
constexpr tensor<double, q, p> B1 = calculate_B1<apply_weights, q>();
Expand Down Expand Up @@ -395,7 +396,7 @@ struct finite_element<mfem::Geometry::CUBE, Hcurl<p>> {
}

template <typename source_type, typename flux_type, int q>
static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
RAJA_HOST_DEVICE static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
const TensorProductQuadratureRule<q>&, dof_type* element_residual,
[[maybe_unused]] int step = 1)
{
Expand Down
7 changes: 4 additions & 3 deletions src/serac/numerics/functional/detail/hexahedron_L2.inl
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ struct finite_element<mfem::Geometry::CUBE, L2<p, c> > {
}

template <typename in_t, int q>
static auto batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
{
static constexpr bool apply_weights = false;
static constexpr auto B = calculate_B<apply_weights, q>();
Expand Down Expand Up @@ -171,7 +171,8 @@ struct finite_element<mfem::Geometry::CUBE, L2<p, c> > {
}

template <int q>
static auto interpolate(const dof_type& X, const TensorProductQuadratureRule<q>&)
RAJA_HOST_DEVICE
static auto interpolate(const dof_type& X, const TensorProductQuadratureRule<q>&)
{
// we want to compute the following:
//
Expand Down Expand Up @@ -232,7 +233,7 @@ struct finite_element<mfem::Geometry::CUBE, L2<p, c> > {
}

template <typename source_type, typename flux_type, int q>
static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
RAJA_HOST_DEVICE static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
const TensorProductQuadratureRule<q>&, dof_type* element_residual, int step = 1)
{
if constexpr (is_zero<source_type>{} && is_zero<flux_type>{}) {
Expand Down
Loading