Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Refactor serac to support RAJA #987

Draft
wants to merge 7 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cmake/SeracMacros.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ endmacro(serac_convert_to_native_escaped_file_path)
##------------------------------------------------------------------------------
## serac_add_tests( SOURCES [source1 [source2 ...]]
## DEPENDS_ON [dep1 [dep2 ...]]
## NUM_MPI_TASKS [num tasks])
## NUM_MPI_TASKS [num tasks]
## )
##
## Creates an executable per given source and then adds the test to CTest
##------------------------------------------------------------------------------
Expand Down
121 changes: 84 additions & 37 deletions src/serac/numerics/functional/boundary_integral_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
#include "serac/numerics/functional/quadrature_data.hpp"
#include "serac/numerics/functional/differentiate_wrt.hpp"

#include <RAJA/index/RangeSegment.hpp>
#include <RAJA/RAJA.hpp>
#include <array>
#include <cstdint>

namespace serac {

namespace boundary_integral {
Expand All @@ -26,49 +31,49 @@ struct QFunctionArgument;

/// @overload
template <int p>
struct QFunctionArgument<H1<p, 1>, Dimension<1>> {
RAJA_HOST_DEVICE struct QFunctionArgument<H1<p, 1>, Dimension<1>> {
using type = serac::tuple<double, double>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int dim>
struct QFunctionArgument<H1<p, 1>, Dimension<dim>> {
RAJA_HOST_DEVICE struct QFunctionArgument<H1<p, 1>, Dimension<dim>> {
using type = serac::tuple<double, tensor<double, dim>>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int c>
struct QFunctionArgument<H1<p, c>, Dimension<1>> {
RAJA_HOST_DEVICE struct QFunctionArgument<H1<p, c>, Dimension<1>> {
using type = serac::tuple<tensor<double, c>, tensor<double, c>>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int c, int dim>
struct QFunctionArgument<H1<p, c>, Dimension<dim>> {
RAJA_HOST_DEVICE struct QFunctionArgument<H1<p, c>, Dimension<dim>> {
using type = serac::tuple<tensor<double, c>, tensor<double, c, dim>>; ///< what will be passed to the q-function
};

/// @overload
template <int p>
struct QFunctionArgument<L2<p, 1>, Dimension<1>> {
RAJA_HOST_DEVICE struct QFunctionArgument<L2<p, 1>, Dimension<1>> {
using type = serac::tuple<double, double>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int dim>
struct QFunctionArgument<L2<p, 1>, Dimension<dim>> {
RAJA_HOST_DEVICE struct QFunctionArgument<L2<p, 1>, Dimension<dim>> {
using type = serac::tuple<double, tensor<double, dim>>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int c>
struct QFunctionArgument<L2<p, c>, Dimension<1>> {
RAJA_HOST_DEVICE struct QFunctionArgument<L2<p, c>, Dimension<1>> {
using type = serac::tuple<tensor<double, c>, tensor<double, c>>; ///< what will be passed to the q-function
};

/// @overload
template <int p, int c, int dim>
struct QFunctionArgument<L2<p, c>, Dimension<dim>> {
RAJA_HOST_DEVICE struct QFunctionArgument<L2<p, c>, Dimension<dim>> {
using type = serac::tuple<tensor<double, c>, tensor<double, c, dim>>; ///< what will be passed to the q-function
};

Expand Down Expand Up @@ -101,14 +106,14 @@ SERAC_HOST_DEVICE auto apply_qf(lambda&& qf, coords_type&& x_q, const serac::tup
}

template <int i, int dim, typename... trials, typename lambda>
auto get_derivative_type(lambda qf)
RAJA_HOST_DEVICE auto get_derivative_type(lambda qf)
{
using qf_arguments = serac::tuple<typename QFunctionArgument<trials, serac::Dimension<dim>>::type...>;
return tuple{get_gradient(apply_qf(qf, tensor<double, dim + 1>{}, make_dual_wrt<i>(qf_arguments{}))), zero{}};
};

template <typename lambda, int n, typename... T>
auto batch_apply_qf(lambda qf, const tensor<double, 2, n>& positions, const tensor<double, 1, 2, n>& jacobians,
RAJA_HOST_DEVICE auto batch_apply_qf(lambda qf, const tensor<double, 2, n>& positions, const tensor<double, 1, 2, n>& jacobians,
const T&... inputs)
{
constexpr int dim = 2;
Expand All @@ -130,7 +135,7 @@ auto batch_apply_qf(lambda qf, const tensor<double, 2, n>& positions, const tens
}

template <typename lambda, int n, typename... T>
auto batch_apply_qf(lambda qf, const tensor<double, 3, n>& positions, const tensor<double, 2, 3, n>& jacobians,
RAJA_HOST_DEVICE auto batch_apply_qf(lambda qf, const tensor<double, 3, n>& positions, const tensor<double, 2, 3, n>& jacobians,
const T&... inputs)
{
constexpr int dim = 3;
Expand All @@ -153,41 +158,63 @@ auto batch_apply_qf(lambda qf, const tensor<double, 3, n>& positions, const tens
return outputs;
}

template <uint32_t differentiation_index, int Q, mfem::Geometry::Type geom, typename test, typename... trials,
typename lambda_type, typename derivative_type, int... indices>
void evaluation_kernel_impl(FunctionSignature<test(trials...)>, const std::vector<const double*>& inputs,
double* outputs, const double* positions, const double* jacobians, lambda_type qf,
[[maybe_unused]] derivative_type* qf_derivatives, uint32_t num_elements,
std::integer_sequence<int, indices...>)
template <mfem::Geometry::Type geom, typename test, typename... trials>
auto get_trial_elements(FunctionSignature<test(trials...)>)
{
using test_element = finite_element<geom, test>;
return tuple<finite_element<geom, trials>...>{};
}

/// @brief the element type for each trial space
static constexpr tuple<finite_element<geom, trials>...> trial_elements{};
template <mfem::Geometry::Type geom, typename test, typename... trials>
auto get_test(FunctionSignature<test(trials...)>)
{
return finite_element<geom, test>{};
}

/// @param trial_elements the element type for each trial space
template <uint32_t differentiation_index, int Q, mfem::Geometry::Type geom, typename test_element,
typename trial_element_type, typename lambda_type, typename derivative_type, int... indices>
void evaluation_kernel_impl(trial_element_type trial_elements, test_element, const std::vector<const double*>& inputs,
double* outputs, const double* positions, const double* jacobians, lambda_type qf,
[[maybe_unused]] derivative_type* qf_derivatives, uint32_t num_elements,
camp::int_seq<int, indices...>)
{
// mfem provides this information as opaque arrays of doubles,
// so we reinterpret the pointer with
constexpr int dim = dimension_of(geom) + 1;
constexpr int nqp = num_quadrature_points(geom, Q);
auto J = reinterpret_cast<const tensor<double, dim - 1, dim, nqp>*>(jacobians);
auto x = reinterpret_cast<const tensor<double, dim, nqp>*>(positions);
auto r = reinterpret_cast<typename test_element::dof_type*>(outputs);
static constexpr TensorProductQuadratureRule<Q> rule{};
TensorProductQuadratureRule<Q> rule{};

static constexpr int qpts_per_elem = num_quadrature_points(geom, Q);
int qpts_per_elem = num_quadrature_points(geom, Q);

[[maybe_unused]] tuple u = {
reinterpret_cast<const typename decltype(type<indices>(trial_elements))::dof_type*>(inputs[indices])...};

#if defined(USE_CUDA)
std::cout<<"USING CUDA\n";
using policy = RAJA::cuda_exec<512>;
#else
using policy = RAJA::simd_exec;
#endif
printf("HERE 42\n");
// for each element in the domain
for (uint32_t e = 0; e < num_elements; e++) {
RAJA::forall<policy>(
RAJA::TypedRangeSegment<uint32_t>(0, num_elements),
[J, x, qf, u, rule, r, qpts_per_elem, qf_derivatives] RAJA_HOST_DEVICE (uint32_t e) {
// load the jacobians and positions for each quadrature point in this element
auto J_e = J[e];
auto x_e = x[e];
// Avoid unused warning/error ([[maybe_unused]] is not possible in the capture list)
(void)qf_derivatives;
(void)qpts_per_elem;
printf("HERE 43\n");

static constexpr trial_element_type empty_trial_element{};
// batch-calculate values / derivatives of each trial space, at each quadrature point
[[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when<indices == differentiation_index>(
get<indices>(trial_elements).interpolate(get<indices>(u)[e], rule))...};
get<indices>(empty_trial_element).interpolate(get<indices>(u)[e], rule))...};

// (batch) evaluate the q-function at each quadrature point
auto qf_outputs = batch_apply_qf(qf, x_e, J_e, get<indices>(qf_inputs)...);
Expand All @@ -197,26 +224,26 @@ void evaluation_kernel_impl(FunctionSignature<test(trials...)>, const std::vecto
// won't need to be applied in the action_of_gradient and element_gradient kernels
if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) {
for (int q = 0; q < leading_dimension(qf_outputs); q++) {
qf_derivatives[e * qpts_per_elem + uint32_t(q)] = get_gradient(qf_outputs[q]);
qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]);
}
}

// (batch) integrate the material response against the test-space basis functions
test_element::integrate(get_value(qf_outputs), rule, &r[e]);
}
});
}

// clang-format off
template <typename S, typename T>
auto chain_rule(const S& dfdx, const T& dx)
RAJA_HOST_DEVICE auto chain_rule(const S& dfdx, const T& dx)
{
return serac::chain_rule(serac::get<0>(serac::get<0>(dfdx)), serac::get<0>(dx)) +
serac::chain_rule(serac::get<1>(serac::get<0>(dfdx)), serac::get<1>(dx));
}
// clang-format on

template <typename derivative_type, int n, typename T>
auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor<T, n>& inputs)
RAJA_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor<T, n>& inputs)
{
using return_type = decltype(chain_rule(derivative_type{}, T{}));
tensor<tuple<return_type, zero>, n> outputs{};
Expand Down Expand Up @@ -252,20 +279,26 @@ auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor<T, n>&
* @param[in] num_elements The number of elements in the mesh
*/
template <int Q, mfem::Geometry::Type geom, typename test, typename trial, typename derivatives_type>
void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* qf_derivatives, std::size_t num_elements)
void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* qf_derivatives, uint32_t num_elements)
{
using test_element = finite_element<geom, test>;
using trial_element = finite_element<geom, trial>;

// mfem provides this information in 1D arrays, so we reshape it
// into strided multidimensional arrays before using
constexpr int nqp = num_quadrature_points(geom, Q);
uint32_t nqp = num_quadrature_points(geom, Q);
auto du = reinterpret_cast<const typename trial_element::dof_type*>(dU);
auto dr = reinterpret_cast<typename test_element::dof_type*>(dR);
static constexpr TensorProductQuadratureRule<Q> rule{};
static TensorProductQuadratureRule<Q> rule{};

#if defined(USE_CUDA)
using policy = RAJA::cuda_exec<512>;
#else
using policy = RAJA::simd_exec;
#endif

// for each element in the domain
for (uint32_t e = 0; e < num_elements; e++) {
RAJA::forall<policy>(RAJA::TypedRangeSegment<uint32_t>(0, num_elements), [=] RAJA_HOST_DEVICE(uint32_t e) {
// (batch) interpolate each quadrature point's value
auto qf_inputs = trial_element::interpolate(du[e], rule);

Expand All @@ -274,7 +307,7 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q

// (batch) integrate the material response against the test-space basis functions
test_element::integrate(qf_outputs, rule, &dr[e]);
}
});
}

/**
Expand All @@ -299,8 +332,12 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q
* @param[in] num_elements The number of elements in the mesh
*/
template <mfem::Geometry::Type g, typename test, typename trial, int Q, typename derivatives_type>
void element_gradient_kernel(ExecArrayView<double, 3, ExecutionSpace::CPU> dK, derivatives_type* qf_derivatives,
std::size_t num_elements)
#if defined(USE_CUDA)
void element_gradient_kernel(ExecArrayView<double, 3, ExecutionSpace::GPU> dK,
#else
void element_gradient_kernel(ExecArrayView<double, 3, ExecutionSpace::CPU> dK,
#endif
derivatives_type* qf_derivatives, std::size_t num_elements)
{
using test_element = finite_element<g, test>;
using trial_element = finite_element<g, trial>;
Expand Down Expand Up @@ -331,9 +368,11 @@ std::function<void(const std::vector<const double*>&, double*, bool)> evaluation
signature s, lambda_type qf, const double* positions, const double* jacobians,
std::shared_ptr<derivative_type> qf_derivatives, uint32_t num_elements)
{
auto trial_elements = get_trial_elements<geom>(s);
auto test = get_test<geom>(s);
return [=](const std::vector<const double*>& inputs, double* outputs, bool /* update state */) {
evaluation_kernel_impl<wrt, Q, geom>(s, inputs, outputs, positions, jacobians, qf, qf_derivatives.get(),
num_elements, s.index_seq);
evaluation_kernel_impl<wrt, Q, geom>(trial_elements, test, inputs, outputs, positions, jacobians, qf,
qf_derivatives.get(), num_elements, s.index_seq);
};
}

Expand All @@ -349,10 +388,18 @@ std::function<void(const double*, double*)> jacobian_vector_product_kernel(
}

template <int wrt, int Q, mfem::Geometry::Type geom, typename signature, typename derivative_type>
#if defined(USE_CUDA)
std::function<void(ExecArrayView<double, 3, ExecutionSpace::GPU>)> element_gradient_kernel(
#else
std::function<void(ExecArrayView<double, 3, ExecutionSpace::CPU>)> element_gradient_kernel(
#endif
signature, std::shared_ptr<derivative_type> qf_derivatives, uint32_t num_elements)
{
#if defined(USE_CUDA)
return [=](ExecArrayView<double, 3, ExecutionSpace::GPU> K_elem) {
#else
return [=](ExecArrayView<double, 3, ExecutionSpace::CPU> K_elem) {
#endif
using test_space = typename signature::return_type;
using trial_space = typename std::tuple_element<wrt, typename signature::parameter_types>::type;
element_gradient_kernel<geom, test_space, trial_space, Q>(K_elem, qf_derivatives.get(), num_elements);
Expand Down
7 changes: 4 additions & 3 deletions src/serac/numerics/functional/detail/hexahedron_H1.inl
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ struct finite_element<mfem::Geometry::CUBE, H1<p, c> > {
}

template <typename in_t, int q>
static auto batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
{
static constexpr bool apply_weights = false;
static constexpr auto B = calculate_B<apply_weights, q>();
Expand Down Expand Up @@ -167,7 +167,8 @@ struct finite_element<mfem::Geometry::CUBE, H1<p, c> > {
}

template <int q>
static auto interpolate(const dof_type& X, const TensorProductQuadratureRule<q>&)
RAJA_HOST_DEVICE
static auto interpolate(const dof_type& X, const TensorProductQuadratureRule<q>&)
{
// we want to compute the following:
//
Expand Down Expand Up @@ -228,7 +229,7 @@ struct finite_element<mfem::Geometry::CUBE, H1<p, c> > {
}

template <typename source_type, typename flux_type, int q>
static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
RAJA_HOST_DEVICE static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
const TensorProductQuadratureRule<q>&, dof_type* element_residual, int step = 1)
{
if constexpr (is_zero<source_type>{} && is_zero<flux_type>{}) {
Expand Down
7 changes: 4 additions & 3 deletions src/serac/numerics/functional/detail/hexahedron_Hcurl.inl
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ struct finite_element<mfem::Geometry::CUBE, Hcurl<p>> {
}

template <typename in_t, int q>
static auto batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
{
constexpr bool apply_weights = false;
constexpr tensor<double, q, p> B1 = calculate_B1<apply_weights, q>();
Expand Down Expand Up @@ -334,7 +334,8 @@ struct finite_element<mfem::Geometry::CUBE, Hcurl<p>> {
}

template <int q>
static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule<q>&)
RAJA_HOST_DEVICE
static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule<q>&)
{
constexpr bool apply_weights = false;
constexpr tensor<double, q, p> B1 = calculate_B1<apply_weights, q>();
Expand Down Expand Up @@ -395,7 +396,7 @@ struct finite_element<mfem::Geometry::CUBE, Hcurl<p>> {
}

template <typename source_type, typename flux_type, int q>
static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
RAJA_HOST_DEVICE static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
const TensorProductQuadratureRule<q>&, dof_type* element_residual,
[[maybe_unused]] int step = 1)
{
Expand Down
7 changes: 4 additions & 3 deletions src/serac/numerics/functional/detail/hexahedron_L2.inl
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ struct finite_element<mfem::Geometry::CUBE, L2<p, c> > {
}

template <typename in_t, int q>
static auto batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
{
static constexpr bool apply_weights = false;
static constexpr auto B = calculate_B<apply_weights, q>();
Expand Down Expand Up @@ -171,7 +171,8 @@ struct finite_element<mfem::Geometry::CUBE, L2<p, c> > {
}

template <int q>
static auto interpolate(const dof_type& X, const TensorProductQuadratureRule<q>&)
RAJA_HOST_DEVICE
static auto interpolate(const dof_type& X, const TensorProductQuadratureRule<q>&)
{
// we want to compute the following:
//
Expand Down Expand Up @@ -232,7 +233,7 @@ struct finite_element<mfem::Geometry::CUBE, L2<p, c> > {
}

template <typename source_type, typename flux_type, int q>
static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
RAJA_HOST_DEVICE static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
const TensorProductQuadratureRule<q>&, dof_type* element_residual, int step = 1)
{
if constexpr (is_zero<source_type>{} && is_zero<flux_type>{}) {
Expand Down
Loading