diff --git a/src/serac/numerics/functional/boundary_integral_kernels.hpp b/src/serac/numerics/functional/boundary_integral_kernels.hpp
index 2c53b45b4..3959ac96c 100644
--- a/src/serac/numerics/functional/boundary_integral_kernels.hpp
+++ b/src/serac/numerics/functional/boundary_integral_kernels.hpp
@@ -153,18 +153,14 @@ auto batch_apply_qf(lambda qf, const tensor& positions, const tens
   return outputs;
 }
 
-template
-void evaluation_kernel_impl(FunctionSignature, const std::vector& inputs,
+/// @trial_elements the element type for each trial space
+template
+void evaluation_kernel_impl(trial_element_type trial_elements, test_element, const std::vector& inputs,
                             double* outputs, const double* positions, const double* jacobians, lambda_type qf,
                             [[maybe_unused]] derivative_type* qf_derivatives, uint32_t num_elements,
-                            std::integer_sequence)
+                            camp::int_seq)
 {
-  using test_element = finite_element;
-
-  /// @brief the element type for each trial space
-  static constexpr tuple...> trial_elements{};
-
   // mfem provides this information as opaque arrays of doubles,
   // so we reinterpret the pointer with
   constexpr int dim = dimension_of(geom) + 1;
@@ -331,9 +327,11 @@ std::function&, double*, bool)> evaluation
     signature s, lambda_type qf, const double* positions, const double* jacobians,
     std::shared_ptr qf_derivatives, uint32_t num_elements)
 {
+  auto trial_elements = trial_elements_tuple(s);
+  auto test = get_test_element(s);
   return [=](const std::vector& inputs, double* outputs, bool /* update state */) {
-    evaluation_kernel_impl(s, inputs, outputs, positions, jacobians, qf, qf_derivatives.get(),
-                           num_elements, s.index_seq);
+    evaluation_kernel_impl(trial_elements, test, inputs, outputs, positions, jacobians, qf,
+                           qf_derivatives.get(), num_elements, s.index_seq);
   };
 }
diff --git a/src/serac/numerics/functional/detail/hexahedron_H1.inl b/src/serac/numerics/functional/detail/hexahedron_H1.inl
index 25d3f5f37..2a5ef7479 100644
--- a/src/serac/numerics/functional/detail/hexahedron_H1.inl
+++ b/src/serac/numerics/functional/detail/hexahedron_H1.inl
@@ -167,7 +167,7 @@ struct finite_element > {
   }
 
   template
-  static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
   {
     // we want to compute the following:
     //
@@ -228,8 +228,9 @@ struct finite_element > {
   }
 
   template
-  static void integrate(const tensor, q * q * q>& qf_output,
-                        const TensorProductQuadratureRule&, dof_type* element_residual, int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output,
+                                          const TensorProductQuadratureRule&, dof_type* element_residual,
+                                          int step = 1)
   {
     if constexpr (is_zero{} && is_zero{}) {
       return;
diff --git a/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl b/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl
index 93d0d8e7d..b6277937b 100644
--- a/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl
+++ b/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl
@@ -334,7 +334,7 @@ struct finite_element> {
   }
 
   template
-  static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule&)
   {
     constexpr bool apply_weights = false;
     constexpr tensor B1 = calculate_B1();
@@ -395,9 +395,9 @@ struct finite_element> {
   }
 
   template
-  static void integrate(const tensor, q * q * q>& qf_output,
-                        const TensorProductQuadratureRule&, dof_type* element_residual,
-                        [[maybe_unused]] int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output,
+                                          const TensorProductQuadratureRule&, dof_type* element_residual,
+                                          [[maybe_unused]] int step = 1)
   {
     constexpr bool apply_weights = true;
     constexpr tensor B1 = calculate_B1();
diff --git a/src/serac/numerics/functional/detail/hexahedron_L2.inl b/src/serac/numerics/functional/detail/hexahedron_L2.inl
index ec7c09b1b..51efeaeec 100644
--- a/src/serac/numerics/functional/detail/hexahedron_L2.inl
+++ b/src/serac/numerics/functional/detail/hexahedron_L2.inl
@@ -171,7 +171,7 @@ struct finite_element > {
   }
 
   template
-  static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
   {
     // we want to compute the following:
     //
@@ -232,8 +232,9 @@ struct finite_element > {
   }
 
   template
-  static void integrate(const tensor, q * q * q>& qf_output,
-                        const TensorProductQuadratureRule&, dof_type* element_residual, int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output,
+                                          const TensorProductQuadratureRule&, dof_type* element_residual,
+                                          int step = 1)
   {
     if constexpr (is_zero{} && is_zero{}) {
       return;
diff --git a/src/serac/numerics/functional/detail/qoi.inl b/src/serac/numerics/functional/detail/qoi.inl
index 80f74ce01..189b7f205 100644
--- a/src/serac/numerics/functional/detail/qoi.inl
+++ b/src/serac/numerics/functional/detail/qoi.inl
@@ -24,15 +24,15 @@ struct finite_element {
   SERAC_HOST_DEVICE static constexpr double shape_functions(double /* xi */) { return 1.0; }
 
   template
-  static void integrate(const tensor&, const TensorProductQuadratureRule&, dof_type*,
-                        [[maybe_unused]] int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor&, const TensorProductQuadratureRule&, dof_type*,
+                                          [[maybe_unused]] int step = 1)
   {
     return;  // integrating zeros is a no-op
   }
 
   template
-  static void integrate(const tensor& qf_output, const TensorProductQuadratureRule&,
-                        dof_type* element_total, [[maybe_unused]] int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor& qf_output, const TensorProductQuadratureRule&,
+                                          dof_type* element_total, [[maybe_unused]] int step = 1)
   {
     if constexpr (geometry == mfem::Geometry::SEGMENT) {
       static_assert(Q == q);
@@ -77,8 +77,9 @@ struct finite_element {
   // this overload is used for boundary integrals, since they pad the
   // output to be a tuple with a hardcoded `zero` flux term
   template
-  static void integrate(const tensor, Q>& qf_output,
-                        const TensorProductQuadratureRule&, dof_type* element_total, [[maybe_unused]] int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, Q>& qf_output,
+                                          const TensorProductQuadratureRule&, dof_type* element_total,
+                                          [[maybe_unused]] int step = 1)
   {
     if constexpr (is_zero{}) {
       return;
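Most of the hunks in this patch only add SERAC_HOST_DEVICE to the per-element interpolate/integrate routines so they can be called from GPU kernels. As a reminder of what such an annotation macro typically looks like (serac's actual definition lives in its accelerator/infrastructure headers and may differ in detail), a minimal sketch:

```cpp
// Illustrative sketch only -- not serac's exact definition of SERAC_HOST_DEVICE.
#if defined(__CUDACC__)
#define SERAC_HOST_DEVICE __host__ __device__
#else
#define SERAC_HOST_DEVICE
#endif

// Any static member function reached from a GPU kernel (interpolate, integrate,
// batch_apply_shape_fn, ...) must carry the annotation; without it, nvcc refuses
// to call the function from device code.
struct example_element {
  SERAC_HOST_DEVICE static double shape_function(double xi) { return 1.0 - xi; }
};
```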
diff --git a/src/serac/numerics/functional/detail/quadrilateral_H1.inl b/src/serac/numerics/functional/detail/quadrilateral_H1.inl
index 953bab991..452bd6e17 100644
--- a/src/serac/numerics/functional/detail/quadrilateral_H1.inl
+++ b/src/serac/numerics/functional/detail/quadrilateral_H1.inl
@@ -202,7 +202,7 @@ struct finite_element > {
   // A(dy, qx)   := B(qx, dx) * X_e(dy, dx)
   // X_q(qy, qx) := B(qy, dy) * A(dy, qx)
   template
-  static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
   {
     static constexpr bool apply_weights = false;
     static constexpr auto B = calculate_B();
@@ -245,8 +245,9 @@ struct finite_element > {
   // flux can be one of: {zero, tensor, tensor, tensor,
   // tensor}
   template
-  static void integrate(const tensor, q * q>& qf_output,
-                        const TensorProductQuadratureRule&, dof_type* element_residual, int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, q * q>& qf_output,
+                                          const TensorProductQuadratureRule&, dof_type* element_residual,
+                                          int step = 1)
   {
     if constexpr (is_zero{} && is_zero{}) {
       return;
diff --git a/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl b/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl
index 420567561..6e332f27e 100644
--- a/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl
+++ b/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl
@@ -285,7 +285,7 @@ struct finite_element > {
   }
 
   template
-  static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule&)
   {
     constexpr bool apply_weights = false;
     constexpr tensor B1 = calculate_B1();
@@ -323,9 +323,9 @@ struct finite_element > {
   }
 
   template
-  static void integrate(const tensor, q * q>& qf_output,
-                        const TensorProductQuadratureRule&, dof_type* element_residual,
-                        [[maybe_unused]] int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, q * q>& qf_output,
+                                          const TensorProductQuadratureRule&, dof_type* element_residual,
+                                          [[maybe_unused]] int step = 1)
   {
     constexpr bool apply_weights = true;
     constexpr tensor B1 = calculate_B1();
diff --git a/src/serac/numerics/functional/detail/quadrilateral_L2.inl b/src/serac/numerics/functional/detail/quadrilateral_L2.inl
index 2f2008017..09d653b5f 100644
--- a/src/serac/numerics/functional/detail/quadrilateral_L2.inl
+++ b/src/serac/numerics/functional/detail/quadrilateral_L2.inl
@@ -201,7 +201,7 @@ struct finite_element > {
   // A(dy, qx)   := B(qx, dx) * X_e(dy, dx)
   // X_q(qy, qx) := B(qy, dy) * A(dy, qx)
   template
-  static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
   {
     static constexpr bool apply_weights = false;
     static constexpr auto B = calculate_B();
@@ -244,8 +244,9 @@ struct finite_element > {
   // flux can be one of: {zero, tensor, tensor, tensor,
   // tensor}
   template
-  static void integrate(const tensor, q * q>& qf_output,
-                        const TensorProductQuadratureRule&, dof_type* element_residual, int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, q * q>& qf_output,
+                                          const TensorProductQuadratureRule&, dof_type* element_residual,
+                                          int step = 1)
   {
     if constexpr (is_zero{} && is_zero{}) {
       return;
diff --git a/src/serac/numerics/functional/detail/segment_H1.inl b/src/serac/numerics/functional/detail/segment_H1.inl
index 4dbe8f0e5..5651be12d 100644
--- a/src/serac/numerics/functional/detail/segment_H1.inl
+++ b/src/serac/numerics/functional/detail/segment_H1.inl
@@ -94,7 +94,7 @@ struct finite_element > {
   }
 
   template
-  static auto batch_apply_shape_fn(int jx, tensor input, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto batch_apply_shape_fn(int jx, tensor input, const TensorProductQuadratureRule&)
   {
     static constexpr bool apply_weights = false;
     static constexpr auto B = calculate_B();
@@ -121,7 +121,7 @@ struct finite_element > {
   }
 
   template
-  static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
   {
     static constexpr bool apply_weights = false;
     static constexpr auto B = calculate_B();
@@ -155,9 +155,9 @@ struct finite_element > {
   }
 
   template
-  static void integrate(const tensor, q>& qf_output,
-                        const TensorProductQuadratureRule&, dof_type* element_residual,
-                        [[maybe_unused]] int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, q>& qf_output,
+                                          const TensorProductQuadratureRule&, dof_type* element_residual,
+                                          [[maybe_unused]] int step = 1)
   {
     if constexpr (is_zero{} && is_zero{}) {
       return;
diff --git a/src/serac/numerics/functional/detail/segment_L2.inl b/src/serac/numerics/functional/detail/segment_L2.inl
index af85ff6a1..214e757dc 100644
--- a/src/serac/numerics/functional/detail/segment_L2.inl
+++ b/src/serac/numerics/functional/detail/segment_L2.inl
@@ -121,7 +121,7 @@ struct finite_element > {
   }
 
   template
-  static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&)
   {
     static constexpr bool apply_weights = false;
     static constexpr auto B = calculate_B();
@@ -155,9 +155,9 @@ struct finite_element > {
   }
 
   template
-  static void integrate(const tensor, q>& qf_output,
-                        const TensorProductQuadratureRule&, dof_type* element_residual,
-                        [[maybe_unused]] int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, q>& qf_output,
+                                          const TensorProductQuadratureRule&, dof_type* element_residual,
+                                          [[maybe_unused]] int step = 1)
   {
     if constexpr (is_zero{} && is_zero{}) {
       return;
diff --git a/src/serac/numerics/functional/detail/tetrahedron_H1.inl b/src/serac/numerics/functional/detail/tetrahedron_H1.inl
index cf66e4a2f..d3ea86061 100644
--- a/src/serac/numerics/functional/detail/tetrahedron_H1.inl
+++ b/src/serac/numerics/functional/detail/tetrahedron_H1.inl
@@ -361,7 +361,7 @@ struct finite_element > {
   }
 
   template
-  static auto interpolate(const tensor& X, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&)
   {
     constexpr auto xi = GaussLegendreNodes();
@@ -384,8 +384,9 @@ struct finite_element > {
   }
 
   template
-  static void integrate(const tensor, nqpts(q)>& qf_output,
-                        const TensorProductQuadratureRule&, tensor* element_residual, int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, nqpts(q)>& qf_output,
+                                          const TensorProductQuadratureRule&,
+                                          tensor* element_residual, int step = 1)
   {
     if constexpr (is_zero{} && is_zero{}) {
       return;
diff --git a/src/serac/numerics/functional/detail/tetrahedron_L2.inl b/src/serac/numerics/functional/detail/tetrahedron_L2.inl
index 97f6467bb..df30f3409 100644
--- a/src/serac/numerics/functional/detail/tetrahedron_L2.inl
+++ b/src/serac/numerics/functional/detail/tetrahedron_L2.inl
@@ -366,7 +366,7 @@ struct finite_element > {
   }
 
   template
-  static auto interpolate(const tensor& X, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&)
   {
     constexpr auto xi = GaussLegendreNodes();
@@ -389,8 +389,9 @@ struct finite_element > {
   }
 
   template
-  static void integrate(const tensor, nqpts(q)>& qf_output,
-                        const TensorProductQuadratureRule&, tensor* element_residual, int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, nqpts(q)>& qf_output,
+                                          const TensorProductQuadratureRule&,
+                                          tensor* element_residual, int step = 1)
   {
     if constexpr (is_zero{} && is_zero{}) {
       return;
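The quadrilateral and hexahedron interpolate() routines above rely on the sum-factorization identity spelled out in their comments (A(dy, qx) := B(qx, dx) * X_e(dy, dx), then X_q(qy, qx) := B(qy, dy) * A(dy, qx)). A stand-alone sketch of that two-pass contraction with plain arrays instead of serac's tensor type (n = 1D nodes, q = 1D quadrature points; illustrative only):

```cpp
#include <array>
#include <cstddef>

// X_e(dy, dx): nodal values; B(qx, dx): 1D basis values at the 1D quadrature points.
// Two small contractions replace one large interpolation matrix applied to the element.
template <std::size_t q, std::size_t n>
std::array<std::array<double, q>, q> interpolate_quad(const std::array<std::array<double, n>, n>& X_e,
                                                      const std::array<std::array<double, n>, q>& B)
{
  std::array<std::array<double, q>, n> A{};  // A(dy, qx) = B(qx, dx) * X_e(dy, dx)
  for (std::size_t dy = 0; dy < n; dy++) {
    for (std::size_t qx = 0; qx < q; qx++) {
      for (std::size_t dx = 0; dx < n; dx++) {
        A[dy][qx] += B[qx][dx] * X_e[dy][dx];
      }
    }
  }

  std::array<std::array<double, q>, q> X_q{};  // X_q(qy, qx) = B(qy, dy) * A(dy, qx)
  for (std::size_t qy = 0; qy < q; qy++) {
    for (std::size_t qx = 0; qx < q; qx++) {
      for (std::size_t dy = 0; dy < n; dy++) {
        X_q[qy][qx] += B[qy][dy] * A[dy][qx];
      }
    }
  }
  return X_q;
}
```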
diff --git a/src/serac/numerics/functional/detail/triangle_H1.inl b/src/serac/numerics/functional/detail/triangle_H1.inl
index 22456d767..fe2bc560d 100644
--- a/src/serac/numerics/functional/detail/triangle_H1.inl
+++ b/src/serac/numerics/functional/detail/triangle_H1.inl
@@ -266,7 +266,7 @@ struct finite_element > {
   }
 
   template
-  static auto interpolate(const tensor& X, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&)
   {
     constexpr auto xi = GaussLegendreNodes();
     static constexpr int num_quadrature_points = q * (q + 1) / 2;
@@ -290,8 +290,9 @@ struct finite_element > {
   }
 
   template
-  static void integrate(const tensor, q*(q + 1) / 2>& qf_output,
-                        const TensorProductQuadratureRule&, tensor* element_residual, int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, q*(q + 1) / 2>& qf_output,
+                                          const TensorProductQuadratureRule&,
+                                          tensor* element_residual, int step = 1)
   {
     if constexpr (is_zero{} && is_zero{}) {
       return;
diff --git a/src/serac/numerics/functional/detail/triangle_L2.inl b/src/serac/numerics/functional/detail/triangle_L2.inl
index aa92a8eaf..51f64af5e 100644
--- a/src/serac/numerics/functional/detail/triangle_L2.inl
+++ b/src/serac/numerics/functional/detail/triangle_L2.inl
@@ -275,7 +275,7 @@ struct finite_element > {
   }
 
   template
-  static auto interpolate(const tensor& X, const TensorProductQuadratureRule&)
+  SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&)
   {
     constexpr auto xi = GaussLegendreNodes();
     static constexpr int num_quadrature_points = q * (q + 1) / 2;
@@ -299,8 +299,9 @@ struct finite_element > {
   }
 
   template
-  static void integrate(const tensor, q*(q + 1) / 2>& qf_output,
-                        const TensorProductQuadratureRule&, tensor* element_residual, int step = 1)
+  SERAC_HOST_DEVICE static void integrate(const tensor, q*(q + 1) / 2>& qf_output,
+                                          const TensorProductQuadratureRule&,
+                                          tensor* element_residual, int step = 1)
   {
     if constexpr (is_zero{} && is_zero{}) {
       return;
diff --git a/src/serac/numerics/functional/domain_integral_kernels.hpp b/src/serac/numerics/functional/domain_integral_kernels.hpp
index ed7415fc0..00e6db9ba 100644
--- a/src/serac/numerics/functional/domain_integral_kernels.hpp
+++ b/src/serac/numerics/functional/domain_integral_kernels.hpp
@@ -10,7 +10,10 @@
 #include "serac/numerics/functional/function_signature.hpp"
 #include "serac/numerics/functional/differentiate_wrt.hpp"
 
+#include
+#include
 #include
+#include
 
 namespace serac {
@@ -24,40 +27,40 @@ namespace domain_integral {
  * trial space
  */
 template
-struct QFunctionArgument;
+SERAC_HOST_DEVICE struct QFunctionArgument;
 
 /// @overload
 template
-struct QFunctionArgument, Dimension > {
+SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > {
   using type = tuple >;  ///< what will be passed to the q-function
 };
 
 /// @overload
 template
-struct QFunctionArgument, Dimension > {
+SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > {
   using type = tuple, tensor >;  ///< what will be passed to the q-function
 };
 
 /// @overload
 template
-struct QFunctionArgument, Dimension > {
+SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > {
   using type = tuple >;  ///< what will be passed to the q-function
 };
 
 /// @overload
 template
-struct QFunctionArgument, Dimension > {
+SERAC_HOST_DEVICE struct QFunctionArgument, Dimension > {
   using type = tuple, tensor >;  ///< what will be passed to the q-function
 };
 
 /// @overload
 template
-struct QFunctionArgument, Dimension<2> > {
+SERAC_HOST_DEVICE struct QFunctionArgument, Dimension<2> > {
   using type = tuple, double>;  ///< what will be passed to the q-function
 };
 
 /// @overload
 template
-struct QFunctionArgument, Dimension<3> > {
+SERAC_HOST_DEVICE struct QFunctionArgument, Dimension<3> > {
   using type = tuple, tensor >;  ///< what will be passed to the q-function
 };
@@ -99,7 +102,7 @@ auto get_derivative_type(lambda qf, qpt_data_type&& qpt_data)
 };
 
 template
-auto batch_apply_qf_no_qdata(lambda qf, const tensor x, const T&... inputs)
+SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata(lambda qf, const tensor x, const T&... inputs)
 {
   using return_type = decltype(qf(tensor{}, T{}[0]...));
   tensor outputs{};
@@ -114,8 +117,8 @@ auto batch_apply_qf_no_qdata(lambda qf, const tensor x, const T&
 }
 
 template
-auto batch_apply_qf(lambda qf, const tensor x, qpt_data_type* qpt_data, bool update_state,
-                    const T&... inputs)
+SERAC_HOST_DEVICE auto batch_apply_qf(lambda qf, const tensor x, qpt_data_type* qpt_data,
+                                      bool update_state, const T&... inputs)
 {
   using return_type = decltype(qf(tensor{}, qpt_data[0], T{}[0]...));
   tensor outputs{};
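The two hunks above annotate the batched q-function wrappers. Their job is simply to lift a pointwise material model onto arrays of per-quadrature-point inputs; a simplified stand-alone sketch of that pattern (not the serac implementation, which additionally threads quadrature-point state and dual-number types through):

```cpp
#include <array>
#include <cstddef>

// qf is a pointwise model: (position, input at one quadrature point) -> output.
// This wrapper applies it across all n quadrature points of one element.
template <std::size_t n, typename qfunction_type, typename position_type, typename input_type>
auto batch_apply(qfunction_type qf, const std::array<position_type, n>& x, const std::array<input_type, n>& u)
{
  using output_type = decltype(qf(x[0], u[0]));
  std::array<output_type, n> outputs{};
  for (std::size_t i = 0; i < n; i++) {
    outputs[i] = qf(x[i], u[i]);  // evaluate the material model at quadrature point i
  }
  return outputs;
}
```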
@@ -134,44 +137,42 @@ auto batch_apply_qf(lambda qf, const tensor x, qpt_data_type* qp
   return outputs;
 }
 
-template
-void evaluation_kernel_impl(FunctionSignature, const std::vector& inputs,
-                            double* outputs, const double* positions, const double* jacobians, lambda_type qf,
+template
+void evaluation_kernel_impl(trial_element_tuple_type trial_elements, test_element_type,
+                            const std::vector& inputs, double* outputs, const double* positions,
+                            const double* jacobians, lambda_type qf,
                             [[maybe_unused]] axom::ArrayView qf_state,
                             [[maybe_unused]] derivative_type* qf_derivatives, uint32_t num_elements, bool update_state,
-                            std::integer_sequence)
+                            camp::int_seq)
 {
-  using test_element = finite_element;
-
-  /// @brief the element type for each trial space
-  static constexpr tuple...> trial_elements{};
-
   // mfem provides this information as opaque arrays of doubles,
   // so we reinterpret the pointer with
-  auto r = reinterpret_cast(outputs);
-  auto x = reinterpret_cast::type*>(positions);
-  auto J = reinterpret_cast::type*>(jacobians);
-  static constexpr TensorProductQuadratureRule rule{};
+  auto r = reinterpret_cast(outputs);
+  auto x = reinterpret_cast::type*>(positions);
+  auto J = reinterpret_cast::type*>(jacobians);
+  TensorProductQuadratureRule rule{};
 
-  static constexpr int qpts_per_elem = num_quadrature_points(geom, Q);
+  [[maybe_unused]] auto qpts_per_elem = num_quadrature_points(geom, Q);
 
   [[maybe_unused]] tuple u = {
       reinterpret_cast(trial_elements))::dof_type*>(inputs[indices])...};
 
   // for each element in the domain
-  for (uint32_t e = 0; e < num_elements; e++) {
+  for (uint32_t e = 0; e < num_elements; ++e) {
     // load the jacobians and positions for each quadrature point in this element
     auto J_e = J[e];
     auto x_e = x[e];
 
+    [[maybe_unused]] static constexpr trial_element_tuple_type trial_element_tuple{};
     // batch-calculate values / derivatives of each trial space, at each quadrature point
     [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when(
-        get(trial_elements).interpolate(get(u)[e], rule))...};
+        get(trial_element_tuple).interpolate(get(u)[e], rule))...};
 
     // use J_e to transform values / derivatives on the parent element
     // to the to the corresponding values / derivatives on the physical element
-    (parent_to_physical(trial_elements).family>(get(qf_inputs), J_e), ...);
+    (parent_to_physical(trial_element_tuple).family>(get(qf_inputs), J_e), ...);
 
     // (batch) evalute the q-function at each quadrature point
     //
@@ -188,25 +189,27 @@ void evaluation_kernel_impl(FunctionSignature, const std::vecto
     // use J to transform sources / fluxes on the physical element
     // back to the corresponding sources / fluxes on the parent element
-    physical_to_parent(qf_outputs, J_e);
+    physical_to_parent(qf_outputs, J_e);
 
     // write out the q-function derivatives after applying the
     // physical_to_parent transformation, so that those transformations
     // won't need to be applied in the action_of_gradient and element_gradient kernels
     if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) {
       for (int q = 0; q < leading_dimension(qf_outputs); q++) {
-        qf_derivatives[e * qpts_per_elem + uint32_t(q)] = get_gradient(qf_outputs[q]);
+        qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]);
       }
     }
 
     // (batch) integrate the material response against the test-space basis functions
-    test_element::integrate(get_value(qf_outputs), rule, &r[e]);
+    test_element_type::integrate(get_value(qf_outputs), rule, &r[e]);
   }
+
+  return;
 }
 
 //clang-format off
 template
-auto chain_rule(const S& dfdx, const T& dx)
+SERAC_HOST_DEVICE auto chain_rule(const S& dfdx, const T& dx)
 {
   if constexpr (is_QOI) {
     return serac::chain_rule(serac::get<0>(dfdx), serac::get<0>(dx)) +
@@ -223,7 +226,7 @@ auto chain_rule(const S& dfdx, const T& dx)
 //clang-format on
 
 template
-auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor& inputs)
+SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor& inputs)
 {
   using return_type = decltype(chain_rule(derivative_type{}, T{}));
   tensor outputs{};
@@ -265,14 +268,14 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q
   using test_element = finite_element;
   using trial_element = finite_element;
 
-  static constexpr bool is_QOI = (test::family == Family::QOI);
-  static constexpr int num_qpts = num_quadrature_points(g, Q);
+  constexpr bool is_QOI = (test::family == Family::QOI);
+  constexpr int num_qpts = num_quadrature_points(g, Q);
 
   // mfem provides this information in 1D arrays, so we reshape it
   // into strided multidimensional arrays before using
-  auto du = reinterpret_cast(dU);
-  auto dr = reinterpret_cast(dR);
-  static constexpr TensorProductQuadratureRule rule{};
+  auto du = reinterpret_cast(dU);
+  auto dr = reinterpret_cast(dR);
+  constexpr TensorProductQuadratureRule rule{};
 
   // for each element in the domain
   for (uint32_t e = 0; e < num_elements; e++) {
@@ -351,9 +354,11 @@ std::function&, double*, bool)> evaluation
     std::shared_ptr > qf_state, std::shared_ptr qf_derivatives, uint32_t num_elements)
 {
+  auto trial_elements = trial_elements_tuple(s);
+  auto test = get_test_element(s);
   return [=](const std::vector& inputs, double* outputs, bool update_state) {
-    domain_integral::evaluation_kernel_impl(s, inputs, outputs, positions, jacobians, qf,
-                                            (*qf_state)[geom], qf_derivatives.get(), num_elements,
+    domain_integral::evaluation_kernel_impl(trial_elements, test, inputs, outputs, positions, jacobians,
+                                            qf, (*qf_state)[geom], qf_derivatives.get(), num_elements,
                                             update_state, s.index_seq);
   };
 }
diff --git a/src/serac/numerics/functional/finite_element.hpp b/src/serac/numerics/functional/finite_element.hpp
index eecc4b07a..28174f703 100644
--- a/src/serac/numerics/functional/finite_element.hpp
+++ b/src/serac/numerics/functional/finite_element.hpp
@@ -244,7 +244,7 @@ struct QOI {
  * @param jacobians the jacobians of the isoparametric map from parent to physical space of each quadrature point
  */
 template
-void parent_to_physical(tensor& qf_input, const tensor& jacobians)
+SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor& jacobians)
 {
   [[maybe_unused]] constexpr int VALUE = 0;
   [[maybe_unused]] constexpr int DERIVATIVE = 1;
@@ -285,7 +285,7 @@ void parent_to_physical(tensor& qf_input, const tensor
-void physical_to_parent(tensor& qf_output, const tensor& jacobians)
+SERAC_HOST_DEVICE void physical_to_parent(tensor& qf_output, const tensor& jacobians)
 {
   [[maybe_unused]] constexpr int SOURCE = 0;
   [[maybe_unused]] constexpr int FLUX = 1;
diff --git a/src/serac/numerics/functional/function_signature.hpp b/src/serac/numerics/functional/function_signature.hpp
index 07c2df520..e5ce5d9bb 100644
--- a/src/serac/numerics/functional/function_signature.hpp
+++ b/src/serac/numerics/functional/function_signature.hpp
@@ -1,4 +1,6 @@
 #pragma once
+#include
+#include "serac/numerics/functional/finite_element.hpp"
 
 template
 struct FunctionSignature;
@@ -17,5 +19,34 @@ struct FunctionSignature {
   static constexpr int num_args = sizeof...(input_types);
 
   /// integer sequence used to make iterating over arguments easier
-  static constexpr auto index_seq = std::make_integer_sequence{};
+  static constexpr auto index_seq = camp::make_int_seq{};
 };
+
+/**
+ * @brief This helper function template expands the variadic parameter trials and uses
+ * the pack to construct a tuple of finite elements.
+ *
+ * Expansion of the template parameter trials must occur in a separate template
+ * function from evaluation_kernel_impl. The __host__ __device__ marker on the
+ * lambda declared in evaluation_kernel_impl prevents use of multiple variadic
+ * template parameters in evaluation_kernel_impl. See section 14.7.3 of the
+ * CUDA programming guide, item 9.
+ */
+template
+auto trial_elements_tuple(FunctionSignature)
+{
+  return serac::tuple...>{};
+}
+
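trial_elements_tuple above is the heart of the refactor: the parameter-pack expansion over trial spaces now happens in a plain helper, and evaluation_kernel only captures the already-constructed element objects, which keeps the __host__ __device__ lambda free of multiple variadic packs. A minimal stand-alone sketch of the same pattern with simplified stand-in types (not serac's actual classes):

```cpp
#include <tuple>

template <typename T>
struct Signature;

template <typename test_space, typename... trial_spaces>
struct Signature<test_space(trial_spaces...)> {};

template <typename space>
struct element {};  // stand-in for serac::finite_element<geom, space>

// expand the trial pack in a dedicated helper, analogous to trial_elements_tuple()
template <typename test_space, typename... trial_spaces>
auto make_trial_elements(Signature<test_space(trial_spaces...)>)
{
  return std::tuple<element<trial_spaces>...>{};
}

// analogous to get_test_element()
template <typename test_space, typename... trial_spaces>
auto make_test_element(Signature<test_space(trial_spaces...)>)
{
  return element<test_space>{};
}

// the factory captures the pre-built objects by value, so the lambda body
// contains no pack expansion of its own
template <typename signature_type, typename kernel_type>
auto make_evaluation_kernel(signature_type s, kernel_type kernel)
{
  auto trial_elements = make_trial_elements(s);
  auto test_element   = make_test_element(s);
  return [=](const double* inputs, double* outputs) { kernel(trial_elements, test_element, inputs, outputs); };
}
```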
+/**
+ * @brief This helper function template returns a finite element based on the output
+ * type of FunctionSignature.
+ *
+ * See the comments for trial_elements_tuple to understand why this is needed in
+ * domain_integral_kernels::evaluation_kernel and boundary_integral_kernels::evaluation_kernel.
+ */
+template
+auto get_test_element(FunctionSignature)
+{
+  return serac::finite_element{};
+}
diff --git a/src/serac/numerics/functional/tensor.hpp b/src/serac/numerics/functional/tensor.hpp
index 373bdb305..bb89f83d9 100644
--- a/src/serac/numerics/functional/tensor.hpp
+++ b/src/serac/numerics/functional/tensor.hpp
@@ -1308,7 +1308,7 @@ auto matrix_sqrt(const tensor& A)
  * @param B the right operand
  */
 template
-auto contract(const tensor& A, const tensor& B)
+SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor& B)
 {
   constexpr int Adims[] = {m, n...};
   constexpr int Bdims[] = {p, q};
@@ -1366,7 +1366,7 @@ auto contract(const tensor& A, const tensor& B)
 /// @overload
 template
-auto contract(const zero&, const T&)
+SERAC_HOST_DEVICE auto contract(const zero&, const T&)
 {
   return zero{};
 }
diff --git a/src/serac/physics/heat_transfer.hpp b/src/serac/physics/heat_transfer.hpp
index 14a3377a6..d8563ed72 100644
--- a/src/serac/physics/heat_transfer.hpp
+++ b/src/serac/physics/heat_transfer.hpp
@@ -67,7 +67,7 @@ const TimesteppingOptions default_static_options = {TimestepMethod::QuasiStatic}
 /**
  * @brief An object containing the solver for a thermal conduction PDE
  *
- * This is a generic linear thermal diffusion oeprator of the form
+ * This is a generic linear thermal diffusion operator of the form
  *
  * \f[
  * \mathbf{M} \frac{\partial \mathbf{u}}{\partial t} = -\kappa \mathbf{K} \mathbf{u} + \mathbf{f}
 * \f]
@@ -285,6 +285,64 @@ class HeatTransfer, std::integer_sequ
     cycle_ += 1;
   }
 
+  /**
+   * @brief Functor representing the integrand of a thermal material. Material type must be
+   * a functor as well.
+   */
+  template
+  struct ThermalMaterialIntegrand {
+    /**
+     * @brief Construct a ThermalMaterialIntegrand functor with material model of type `MaterialType`.
+     * @param[in] material A functor representing the material model. Should be a functor, or a class/struct with
+     * public operator() method. Must NOT be a generic lambda, or serac will not compile due to static asserts below.
+     */
+    ThermalMaterialIntegrand(MaterialType material) : material_(material) {}
+
+    // Due to nvcc's lack of support for extended generic lambdas (i.e. functions of the form
+    // auto lambda = [] __host__ __device__ (auto) {}; ), MaterialType cannot be an extended
+    // generic lambda. The static asserts below check this.
+  private:
+    class DummyArgumentType {
+    };
+    static_assert(!std::is_invocable::value);
+    static_assert(!std::is_invocable::value);
+    static_assert(!std::is_invocable::value);
+
+  public:
+    /**
+     * @brief Evaluate integrand
+     */
+    template
+    auto operator()(X x, T temperature, dT_dt dtemp_dt, Shape shape, Params... params)
+    {
+      // Get the value and the gradient from the input tuple
+      auto [u, du_dX] = temperature;
+      auto [p, dp_dX] = shape;
+      auto du_dt = get(dtemp_dt);
+
+      auto I_plus_dp_dX = I + dp_dX;
+      auto inv_I_plus_dp_dX = inv(I_plus_dp_dX);
+      auto det_I_plus_dp_dX = det(I_plus_dp_dX);
+
+      // Note that the current configuration x = X + p, where X is the original reference
+      // configuration and p is the shape displacement. We need the gradient with
+      // respect to the perturbed reference configuration x = X + p for the material model. Therefore, we calculate
+      // du/dx = du/dX * dX/dx = du/dX * (dx/dX)^-1 = du/dX * (I + dp/dX)^-1
+
+      auto du_dx = dot(du_dX, inv_I_plus_dp_dX);
+
+      auto [heat_capacity, heat_flux] = material_(x + p, u, du_dx, params...);
+
+      // Note that the return is integrated in the perturbed reference
+      // configuration, hence the det(I + dp_dx) = det(dx/dX)
+      return serac::tuple{heat_capacity * du_dt * det_I_plus_dp_dX,
+                          -1.0 * dot(inv_I_plus_dp_dX, heat_flux) * det_I_plus_dp_dX};
+    }
+
+  private:
+    MaterialType material_;
+  };
+
   /**
    * @brief Set the thermal material model for the physics solver
    *
@@ -311,33 +369,9 @@ class HeatTransfer, std::integer_sequ
   template
   void setMaterial(DependsOn, MaterialType material)
   {
-    residual_->AddDomainIntegral(
-        Dimension{}, DependsOn<0, 1, 2, active_parameters + NUM_STATE_VARS...>{},
-        [material](auto x, auto temperature, auto dtemp_dt, auto shape, auto... params) {
-          // Get the value and the gradient from the input tuple
-          auto [u, du_dX] = temperature;
-          auto [p, dp_dX] = shape;
-          auto du_dt = get(dtemp_dt);
-
-          auto I_plus_dp_dX = I + dp_dX;
-          auto inv_I_plus_dp_dX = inv(I_plus_dp_dX);
-          auto det_I_plus_dp_dX = det(I_plus_dp_dX);
-
-          // Note that the current configuration x = X + p, where X is the original reference
-          // configuration and p is the shape displacement. We need the gradient with
-          // respect to the perturbed reference configuration x = X + p for the material model. Therefore, we calculate
-          // du/dx = du/dX * dX/dx = du/dX * (dx/dX)^-1 = du/dX * (I + dp/dX)^-1
-
-          auto du_dx = dot(du_dX, inv_I_plus_dp_dX);
-
-          auto [heat_capacity, heat_flux] = material(x + p, u, du_dx, params...);
-
-          // Note that the return is integrated in the perturbed reference
-          // configuration, hence the det(I + dp_dx) = det(dx/dX)
-          return serac::tuple{heat_capacity * du_dt * det_I_plus_dp_dX,
-                              -1.0 * dot(inv_I_plus_dp_dX, heat_flux) * det_I_plus_dp_dX};
-        },
-        mesh_);
+    ThermalMaterialIntegrand integrand(material);
+    residual_->AddDomainIntegral(Dimension{}, DependsOn<0, 1, 2, active_parameters + NUM_STATE_VARS...>{},
+                                 integrand, mesh_);
   }
 
   /// @overload
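For context on the heat_transfer.hpp change above: nvcc rejects extended __host__ __device__ lambdas that are also generic, so the integrand moves from a generic lambda into the named ThermalMaterialIntegrand functor with a templated operator(). The same pattern in isolation, with hypothetical names rather than serac's API:

```cpp
// What nvcc rejects: an extended generic lambda such as
//   auto qf = [material] __host__ __device__ (auto x, auto state) { ... };
// What the patch uses instead: a named functor whose operator() is a member template.
template <typename MaterialType>
struct IntegrandWrapper {
  explicit IntegrandWrapper(MaterialType material) : material_(material) {}

  // the member template plays the role of the generic lambda's `auto` parameters
  template <typename X, typename State>
  auto operator()(X x, State state) const
  {
    return material_(x, state);  // defer to the user-supplied pointwise model
  }

  MaterialType material_;
};

// usage sketch (hypothetical call, mirroring setMaterial above):
//   residual.AddDomainIntegral(..., IntegrandWrapper<MaterialType>(material), mesh);
```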