diff --git a/src/serac/numerics/functional/domain_integral_kernels.cuh b/src/serac/numerics/functional/domain_integral_kernels.cuh
index 1188dd2d3e..2daf872717 100644
--- a/src/serac/numerics/functional/domain_integral_kernels.cuh
+++ b/src/serac/numerics/functional/domain_integral_kernels.cuh
@@ -10,6 +10,9 @@
 #include "serac/infrastructure/accelerator.hpp"
 #include "serac/numerics/functional/quadrature_data.hpp"
 
+#include "RAJA/RAJA.hpp"
+#include <RAJA/index/RangeSegment.hpp>
+
 namespace serac {
 
 namespace detail {
@@ -193,7 +196,8 @@ template <mfem::Geometry::Type g, typename test, typename trial, int Q, typename
           typename qpt_data_type = void>
 __global__ void eval_cuda_quadrature(const solution_type u, residual_type r,
                                      GPUArrayView<derivatives_type, 2> qf_derivatives, jacobian_type J, position_type X,
-                                     int num_elements, lambda qf, QuadratureData<qpt_data_type> data)
+                                     int num_elements, lambda qf, QuadratureData<qpt_data_type> data,
+                                     int blocks_quadrature_element, int block_dim, int thread_idx)
 {
   using test_element          = finite_element<g, test>;
   using trial_element         = finite_element<g, trial>;
@@ -201,9 +205,9 @@ __global__ void eval_cuda_quadrature(const solution_type u, residual_type r,
   static constexpr auto rule  = GaussQuadratureRule<g, Q>();
   static constexpr int  dim   = dimension_of(g);
 
-  const int grid_stride = blockDim.x * gridDim.x;
+  const int grid_stride = block_dim * blocks_quadrature_element;
   // launch a thread for each quadrature x element point
-  for (int qe = blockIdx.x * blockDim.x + threadIdx.x; qe < num_elements * rule.size(); qe += grid_stride) {
+  for (int qe = thread_idx; qe < num_elements * rule.size(); qe += grid_stride) {
     // warps won't fetch that many elements ... not great.. but not horrible
     int e = qe / rule.size();
     int q = qe % rule.size();
@@ -426,7 +430,8 @@ template <mfem::Geometry::Type g, typename test, typename trial, int Q, typename
           typename dsolution_type, typename dresidual_type>
 __global__ void gradient_cuda_quadrature(const dsolution_type du, dresidual_type dr,
                                          GPUArrayView<derivatives_type, 2>         qf_derivatives,
-                                         const mfem::DeviceTensor<4, const double> J, int num_elements)
+                                         const mfem::DeviceTensor<4, const double> J, int num_elements,
+                                         int grid_dim, int block_dim, int thread_id)
 {
   using test_element          = finite_element<g, test>;
   using trial_element         = finite_element<g, trial>;
@@ -434,8 +439,7 @@ __global__ void gradient_cuda_quadrature(const dsolution_type du, dresidual_type
   static constexpr auto rule  = GaussQuadratureRule<g, Q>();
   static constexpr int  dim   = dimension_of(g);
 
-  const int grid_stride           = blockDim.x * gridDim.x;
-  auto      thread_id             = blockIdx.x * blockDim.x + threadIdx.x;
+  const int grid_stride           = block_dim * grid_dim;
   auto      num_quadrature_points = num_elements * rule.size();
 #pragma unroll
   for (int qe = thread_id; qe < num_quadrature_points; qe += grid_stride) {
@@ -518,19 +522,38 @@ void action_of_gradient_kernel(serac::detail::GPULaunchConfiguration config, con
   auto du = detail::Reshape<trial>(dU.Read(), trial_ndof, num_elements);
   auto dr = detail::Reshape<test>(dR.ReadWrite(), test_ndof, num_elements);
 
+  using global_thread_x = RAJA::LoopPolicy<loop_policy
+  #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+                                          , gpu_global_thread_x_policy
+  #endif
+  RAJA::RangeSegment range (0, num_elements);
+
   cudaDeviceSynchronize();
   serac::accelerator::displayLastCUDAMessage();
 
   // call gradient_cuda
   if constexpr (policy == serac::detail::ThreadParallelizationStrategy::THREAD_PER_QUADRATURE_POINT) {
     int blocks_quadrature_element = (num_elements * rule.size() + config.blocksize - 1) / config.blocksize;
-    gradient_cuda_quadrature<g, test, trial, Q, derivatives_type>
-        <<<blocks_quadrature_element, config.blocksize>>>(du, dr, qf_derivatives, J, num_elements);
-
+    RAJA::launch<launch_policy>(RAJA::ExecPlace::DEVICE,
+      RAJA::LaunchParams(RAJA::Teams(blocks_quadrature_element),
+                          RAJA::Threads(config.blocksize)),
+                          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+                            RAJA::loop<global_thread_x>(ctx, range, [&](int t_idx) {
+                              gradient_cuda_quadrature<g, test, trial, Q, derivatives_type>
+                                (du, dr, qf_derivatives, J, num_elements, blocks_quadrature_element, config.blocksize, t_idx);
+                            });
+                          });
   } else if constexpr (policy == serac::detail::ThreadParallelizationStrategy::THREAD_PER_ELEMENT) {
     int blocks_element = (num_elements + config.blocksize - 1) / config.blocksize;
-    gradient_cuda_element<g, test, trial, Q, derivatives_type>
-        <<<blocks_element, config.blocksize>>>(du, dr, qf_derivatives, J, num_elements);
+    RAJA::launch<launch_policy>(RAJA::ExecPlace::DEVICE,
+      RAJA::LaunchParams(RAJA::Teams(blocks_element),
+                          RAJA::Threads(config.blocksize)),
+                          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+                            RAJA::loop<global_thread_x>(ctx, range, [&](int t_idx) {
+                              gradient_cuda_element<g, test, trial, Q, derivatives_type>
+                                (du, dr, qf_derivatives, J, num_elements, blocks_element, config.blocksize, t_idx);
+                            });
+                          });
   }
 
   cudaDeviceSynchronize();
diff --git a/src/serac/numerics/functional/domain_integral_kernels.hpp b/src/serac/numerics/functional/domain_integral_kernels.hpp
index ed7415fc04..5a59dc559b 100644
--- a/src/serac/numerics/functional/domain_integral_kernels.hpp
+++ b/src/serac/numerics/functional/domain_integral_kernels.hpp
@@ -5,12 +5,17 @@
 // SPDX-License-Identifier: (BSD-3-Clause)
 #pragma once
 
+#include "RAJA/RAJA.hpp"
+
 #include "serac/infrastructure/accelerator.hpp"
 #include "serac/numerics/functional/quadrature_data.hpp"
 #include "serac/numerics/functional/function_signature.hpp"
 #include "serac/numerics/functional/differentiate_wrt.hpp"
 
+#include <RAJA/index/RangeSegment.hpp>
+#include <RAJA/pattern/forall.hpp>
 #include <array>
+#include <cstdint>
 
 namespace serac {
 
@@ -159,8 +164,16 @@ void evaluation_kernel_impl(FunctionSignature<test(trials...)>, const std::vecto
   [[maybe_unused]] tuple u = {
       reinterpret_cast<const typename decltype(type<indices>(trial_elements))::dof_type*>(inputs[indices])...};
 
+  #if defined(RAJA_ENABLE_CUDA)
+    using policy = RAJA::cuda_exec<512>;
+  #elif defined(RAJA_ENABLE_OPENMP)
+    using policy = RAJA::omp_parallel_for_exec;
+  #else 
+    using policy = RAJA::simd_exec;
+  #endif
+
   // for each element in the domain
-  for (uint32_t e = 0; e < num_elements; e++) {
+  RAJA::forall<policy>(RAJA::TypedRangeSegment<uint32_t>(0, num_elements), [&] (uint32_t e) {
     // load the jacobians and positions for each quadrature point in this element
     auto J_e = J[e];
     auto x_e = x[e];
@@ -201,7 +214,7 @@ void evaluation_kernel_impl(FunctionSignature<test(trials...)>, const std::vecto
 
     // (batch) integrate the material response against the test-space basis functions
     test_element::integrate(get_value(qf_outputs), rule, &r[e]);
-  }
+  });
 }
 
 //clang-format off
diff --git a/src/serac/physics/heat_transfer.hpp b/src/serac/physics/heat_transfer.hpp
index 17163937b3..f049abb032 100644
--- a/src/serac/physics/heat_transfer.hpp
+++ b/src/serac/physics/heat_transfer.hpp
@@ -63,7 +63,7 @@ const TimesteppingOptions default_static_options = {TimestepMethod::QuasiStatic}
 /**
  * @brief An object containing the solver for a thermal conduction PDE
  *
- * This is a generic linear thermal diffusion oeprator of the form
+ * This is a generic linear thermal diffusion operator of the form
  *
  * \f[
  * \mathbf{M} \frac{\partial \mathbf{u}}{\partial t} = -\kappa \mathbf{K} \mathbf{u} + \mathbf{f}
@@ -276,6 +276,39 @@ class HeatTransfer<order, dim, Parameters<parameter_space...>, std::integer_sequ
     cycle_ += 1;
   }
 
+  template<typename MaterialType>
+  struct ThermalMaterialIntegrand {
+    ThermalMaterialIntegrand(MaterialType material) : material_(material) {}
+
+    template<typename X, typename T, typename dT_dt, typename Shape, typename... Params>
+    auto operator()(X x, T temperature, dT_dt dtemp_dt, Shape shape, Params... params) {
+      // Get the value and the gradient from the input tuple
+      auto [u, du_dX] = temperature;
+      auto [p, dp_dX] = shape;
+      auto du_dt      = get<VALUE>(dtemp_dt);
+
+      auto I_plus_dp_dX     = I + dp_dX;
+      auto inv_I_plus_dp_dX = inv(I_plus_dp_dX);
+      auto det_I_plus_dp_dX = det(I_plus_dp_dX);
+
+      // Note that the current configuration x = X + p, where X is the original reference
+      // configuration and p is the shape displacement. We need the gradient with
+      // respect to the perturbed reference configuration x = X + p for the material model. Therefore, we calculate
+      // du/dx = du/dX * dX/dx = du/dX * (dx/dX)^-1 = du/dX * (I + dp/dX)^-1
+
+      auto du_dx = dot(du_dX, inv_I_plus_dp_dX);
+
+      auto [heat_capacity, heat_flux] = material_(x + p, u, du_dx, params...);
+
+      // Note that the return is integrated in the perturbed reference
+      // configuration, hence the det(I + dp_dx) = det(dx/dX)
+      return serac::tuple{heat_capacity * du_dt * det_I_plus_dp_dX,
+                          -1.0 * dot(inv_I_plus_dp_dX, heat_flux) * det_I_plus_dp_dX};
+    }
+    private:
+    MaterialType material_;
+  };
+
   /**
    * @brief Set the thermal material model for the physics solver
    *
@@ -300,33 +333,10 @@ class HeatTransfer<order, dim, Parameters<parameter_space...>, std::integer_sequ
   template <int... active_parameters, typename MaterialType>
   void setMaterial(DependsOn<active_parameters...>, MaterialType material)
   {
+    ThermalMaterialIntegrand integrand (material);
     residual_->AddDomainIntegral(
         Dimension<dim>{}, DependsOn<0, 1, 2, active_parameters + NUM_STATE_VARS...>{},
-        [material](auto x, auto temperature, auto dtemp_dt, auto shape, auto... params) {
-          // Get the value and the gradient from the input tuple
-          auto [u, du_dX] = temperature;
-          auto [p, dp_dX] = shape;
-          auto du_dt      = get<VALUE>(dtemp_dt);
-
-          auto I_plus_dp_dX     = I + dp_dX;
-          auto inv_I_plus_dp_dX = inv(I_plus_dp_dX);
-          auto det_I_plus_dp_dX = det(I_plus_dp_dX);
-
-          // Note that the current configuration x = X + p, where X is the original reference
-          // configuration and p is the shape displacement. We need the gradient with
-          // respect to the perturbed reference configuration x = X + p for the material model. Therefore, we calculate
-          // du/dx = du/dX * dX/dx = du/dX * (dx/dX)^-1 = du/dX * (I + dp/dX)^-1
-
-          auto du_dx = dot(du_dX, inv_I_plus_dp_dX);
-
-          auto [heat_capacity, heat_flux] = material(x + p, u, du_dx, params...);
-
-          // Note that the return is integrated in the perturbed reference
-          // configuration, hence the det(I + dp_dx) = det(dx/dX)
-          return serac::tuple{heat_capacity * du_dt * det_I_plus_dp_dX,
-                              -1.0 * dot(inv_I_plus_dp_dX, heat_flux) * det_I_plus_dp_dX};
-        },
-        mesh_);
+        integrand, mesh_);
   }
 
   /// @overload