diff --git a/Docs/sphinx_doc/Inputs.rst b/Docs/sphinx_doc/Inputs.rst
index 9768977c1..cc190297e 100644
--- a/Docs/sphinx_doc/Inputs.rst
+++ b/Docs/sphinx_doc/Inputs.rst
@@ -1165,41 +1165,41 @@ In addition, there is a run-time option to project the initial velocity field to
 List of Parameters
 ------------------
 
-+----------------------------------+-------------------+--------------------+------------------+
-| Parameter                        | Definition        | Acceptable         | Default          |
-|                                  |                   | Values             |                  |
-+==================================+===================+====================+==================+
-| **erf.init_type**                | Initialization    | “custom”,          | “*custom*”       |
-|                                  | type              | “ideal”,           |                  |
-|                                  |                   | "real",            |                  |
-|                                  |                   |"input_sounding"    |                  |
-+----------------------------------+-------------------+--------------------+------------------+
-| **erf.input_sounding_file**      | Path to WRF-style | String             | "input_sounding" |
-|                                  | input sounding    |                    |                  |
-|                                  | file              |                    |                  |
-+----------------------------------+-------------------+--------------------+------------------+
-| **erf.init_sounding_ideal**      | Perform           | true or false      | false            |
-|                                  | initialization    |                    |                  |
-|                                  | like WRF's        |                    |                  |
-|                                  | ideal.exe         |                    |                  |
-+----------------------------------+-------------------+--------------------+------------------+
-| **erf.use_real_bcs**             | If init_type is   | true or false      | true if          |
-|                                  | real or metgrid,  |                    | if init_type     |
-|                                  | do we want to use |                    | is real or       |
-|                                  | these bcs?        |                    | metgrid;         |
-|                                  |                   |                    | else false       |
-+----------------------------------+-------------------+--------------------+------------------+
-| **erf.nc_init_file**             | NetCDF file with  | String             | NONE             |
-|                                  | initial mesoscale |                    |                  |
-|                                  | data              |                    |                  |
-+----------------------------------+-------------------+--------------------+------------------+
-| **erf.nc_bdy_file**              | NetCDF file with  | String             | NONE             |
-|                                  | mesoscale data at |                    |                  |
-|                                  | lateral boundaries|                    |                  |
-+----------------------------------+-------------------+--------------------+------------------+
-| **erf.project_initial_velocity** | project initial   | Integer            | 1                |
-|                                  | velocity?         |                    |                  |
-+----------------------------------+-------------------+--------------------+------------------+
++----------------------------------+-------------------+--------------------+-----------------------+
+| Parameter                        | Definition        | Acceptable         | Default               |
+|                                  |                   | Values             |                       |
++==================================+===================+====================+=======================+
+| **erf.init_type**                | Initialization    | “custom”,          | “*custom*”            |
+|                                  | type              | “ideal”,           |                       |
+|                                  |                   | "real",            |                       |
+|                                  |                   |"input_sounding"    |                       |
++----------------------------------+-------------------+--------------------+-----------------------+
+| **erf.input_sounding_file**      | Path to WRF-style | String             | "input_sounding"      |
+|                                  | input sounding    |                    |                       |
+|                                  | file              |                    |                       |
++----------------------------------+-------------------+--------------------+-----------------------+
+| **erf.init_sounding_ideal**      | Perform           | true or false      | false                 |
+|                                  | initialization    |                    |                       |
+|                                  | like WRF's        |                    |                       |
+|                                  | ideal.exe         |                    |                       |
++----------------------------------+-------------------+--------------------+-----------------------+
+| **erf.use_real_bcs**             | If init_type is   | true or false      | true if               |
+|                                  | real or metgrid,  |                    | init_type is          |
+|                                  | do we want to use |                    | real or               |
+|                                  | these bcs?        |                    | metgrid;              |
+|                                  |                   |                    | else false            |
++----------------------------------+-------------------+--------------------+-----------------------+
+| **erf.nc_init_file**             | NetCDF file with  | String             | NONE                  |
+|                                  | initial mesoscale |                    |                       |
+|                                  | data              |                    |                       |
++----------------------------------+-------------------+--------------------+-----------------------+
+| **erf.nc_bdy_file**              | NetCDF file with  | String             | NONE                  |
+|                                  | mesoscale data at |                    |                       |
+|                                  | lateral boundaries|                    |                       |
++----------------------------------+-------------------+--------------------+-----------------------+
+| **erf.project_initial_velocity** | Project initial   | true or false      | true if anelastic;    |
+|                                  | velocity?         |                    | false if compressible |
++----------------------------------+-------------------+--------------------+-----------------------+
 
 Notes
 -----------------
@@ -1227,6 +1227,10 @@ integrating the hydrostatic equation from the surface.
 
 If **erf.init_type = custom** or **erf.init_type = input_sounding**, ``erf.nc_init_file`` and
 ``erf.nc_bdy_file`` do not need to be set.
+Note that the **erf.project_initial_velocity** option is available for all **init_type** options. If using the anelastic
+formulation, the initial velocity field is always projected regardless of the input; if using the compressible
+formulation the default is false, but that can be overridden.
+
 
 Map Scale Factors
 =================
diff --git a/Source/DataStructs/ERF_DataStruct.H b/Source/DataStructs/ERF_DataStruct.H
index c1cbec3d8..354d2d749 100644
--- a/Source/DataStructs/ERF_DataStruct.H
+++ b/Source/DataStructs/ERF_DataStruct.H
@@ -193,11 +193,6 @@ struct SolverChoice {
         }
 
         // *******************************************************************************
-        // Should we project the initial velocity field to make it divergence-free?
-        pp.query("project_initial_velocity", project_initial_velocity);
-
-        // *******************************************************************************
-
         bool any_anelastic = false;
         for (int i = 0; i <= max_level; ++i) {
             if (anelastic[i] == 1) any_anelastic = true;
@@ -209,10 +204,14 @@ struct SolverChoice {
             project_initial_velocity = true;
             buoyancy_type = 2; // uses Tprime
         } else {
+            pp.query("project_initial_velocity", project_initial_velocity);
+
             constant_density = false; // We default to false but allow the user to set it
             pp.query("constant_density", constant_density);
         }
 
+        // *******************************************************************************
+
         pp.query("project_every_stage", project_every_stage);
         pp.query("ncorr", ncorr);
         pp.query("poisson_abstol", poisson_abstol);
diff --git a/Source/ERF.H b/Source/ERF.H
index a914b926f..781fae659 100644
--- a/Source/ERF.H
+++ b/Source/ERF.H
@@ -164,10 +164,12 @@ public:
 #endif
 
     // Project the velocities to be divergence-free -- this is only relevant if anelastic == 1
-    void project_velocities (int lev, amrex::Real dt, amrex::Vector<amrex::MultiFab>& vars, amrex::MultiFab& p);
+    void project_velocities (int lev, amrex::Real dt, amrex::Vector<amrex::MultiFab>& vars,
+                             amrex::MultiFab& Omega, amrex::MultiFab& p);
 
     // Project the velocities to be divergence-free with a thin body
-    void project_velocities_tb (int lev, amrex::Real dt, amrex::Vector<amrex::MultiFab>& vars, amrex::MultiFab& p);
+    void project_velocities_tb (int lev, amrex::Real dt, amrex::Vector<amrex::MultiFab>& vars,
+                                amrex::MultiFab& Omega, amrex::MultiFab& p);
 
     // Define the projection bc's based on the domain bc types
     amrex::Array<amrex::LinOpBCType,AMREX_SPACEDIM>
@@ -731,6 +733,9 @@ private:
     amrex::Vector<amrex::MultiFab> rW_old;
     amrex::Vector<amrex::MultiFab> rW_new;
 
+    // Normal momentum on z-face
+    amrex::Vector<amrex::MultiFab> Omega;
+
     // amrex::Vector<amrex::MultiFab> xmom_crse_rhs;
    // amrex::Vector<amrex::MultiFab>
ymom_crse_rhs; amrex::Vector zmom_crse_rhs; diff --git a/Source/ERF.cpp b/Source/ERF.cpp index 08f123b8a..3fa57b44d 100644 --- a/Source/ERF.cpp +++ b/Source/ERF.cpp @@ -197,6 +197,8 @@ ERF::ERF_shared () rV_old.resize(nlevs_max); rW_old.resize(nlevs_max); + Omega.resize(nlevs_max); + // xmom_crse_rhs.resize(nlevs_max); // ymom_crse_rhs.resize(nlevs_max); zmom_crse_rhs.resize(nlevs_max); @@ -856,6 +858,11 @@ ERF::InitData_post () } } + // + // If we are starting from scratch, we have the option to project the initial velocity field + // regardless of how we initialized. + // pp_inc is used as scratch space here; we zero it out after the projection + // if (restart_chkfile == "") { // Note -- this projection is only defined for no terrain @@ -864,7 +871,7 @@ ERF::InitData_post () Real dummy_dt = 1.0; for (int lev = 0; lev <= finest_level; ++lev) { - project_velocities(lev, dummy_dt, vars_new[lev], pp_inc[lev]); + project_velocities(lev, dummy_dt, vars_new[lev], Omega[lev], pp_inc[lev]); pp_inc[lev].setVal(0.); } } diff --git a/Source/ERF_make_new_arrays.cpp b/Source/ERF_make_new_arrays.cpp index d1a856c19..b9d018ea0 100644 --- a/Source/ERF_make_new_arrays.cpp +++ b/Source/ERF_make_new_arrays.cpp @@ -137,6 +137,11 @@ ERF::init_stuff (int lev, const BoxArray& ba, const DistributionMapping& dm, pp_inc[lev].setVal(0.0); } + // We keep Omega as a persistent variable now in order to use the projected Omega + // directly rather than calculating it again from (u,v,w) + Omega[lev].define(convert(ba, IntVect(0,0,1)), dm, 1, 1); + Omega[lev].setVal(5.6e23); + // ******************************************************************************************** // These are just used for scratch in the time integrator but we might as well define them here // ******************************************************************************************** diff --git a/Source/ERF_make_new_level.cpp b/Source/ERF_make_new_level.cpp index dbe285ec8..07840783a 100644 --- a/Source/ERF_make_new_level.cpp +++ b/Source/ERF_make_new_level.cpp @@ -448,6 +448,8 @@ ERF::ClearLevel (int lev) rW_new[lev].clear(); rW_old[lev].clear(); + Omega[lev].clear(); + if (lev > 0) { zmom_crse_rhs[lev].clear(); } diff --git a/Source/TimeIntegration/ERF_TI_no_substep_fun.H b/Source/TimeIntegration/ERF_TI_no_substep_fun.H index 27a07bab3..c9ba95680 100644 --- a/Source/TimeIntegration/ERF_TI_no_substep_fun.H +++ b/Source/TimeIntegration/ERF_TI_no_substep_fun.H @@ -138,9 +138,9 @@ bool have_tb = (thin_xforce[0] || thin_yforce[0] || thin_zforce[0]); if (solverChoice.project_every_stage || (nrk==1)) { if (have_tb) { - project_velocities_tb(level, slow_dt, S_sum, pp_inc[level]); + project_velocities_tb(level, slow_dt, S_sum, Omega[level], pp_inc[level]); } else { - project_velocities(level, slow_dt, S_sum, pp_inc[level]); + project_velocities(level, slow_dt, S_sum, Omega[level], pp_inc[level]); } } } diff --git a/Source/TimeIntegration/ERF_TI_slow_rhs_fun.H b/Source/TimeIntegration/ERF_TI_slow_rhs_fun.H index 1e29bea7f..5dacd6b97 100644 --- a/Source/TimeIntegration/ERF_TI_slow_rhs_fun.H +++ b/Source/TimeIntegration/ERF_TI_slow_rhs_fun.H @@ -133,7 +133,7 @@ erf_slow_rhs_pre(level, finest_level, nrk, slow_dt, S_rhs, S_old, S_data, S_prim, S_scratch, xvel_new, yvel_new, zvel_new, - z_t_rk[level], Omega, cc_src, xmom_src, ymom_src, zmom_src, + z_t_rk[level], Omega[level], cc_src, xmom_src, ymom_src, zmom_src, (level > 0) ? 
&zmom_crse_rhs[level] : nullptr, Tau11_lev[level].get(), Tau22_lev[level].get(), Tau33_lev[level].get(), Tau12_lev[level].get(), Tau13_lev[level].get(), Tau21_lev[level].get(), Tau23_lev[level].get(), Tau31_lev[level].get(), @@ -242,7 +242,7 @@ erf_slow_rhs_pre(level, finest_level, nrk, slow_dt, S_rhs, S_old, S_data, S_prim, S_scratch, xvel_new, yvel_new, zvel_new, - z_t_rk[level], Omega, cc_src, xmom_src, ymom_src, zmom_src, + z_t_rk[level], Omega[level], cc_src, xmom_src, ymom_src, zmom_src, (level > 0) ? &zmom_crse_rhs[level] : nullptr, Tau11_lev[level].get(), Tau22_lev[level].get(), Tau33_lev[level].get(), Tau12_lev[level].get(), Tau13_lev[level].get(), Tau21_lev[level].get(), Tau23_lev[level].get(), Tau31_lev[level].get(), @@ -448,7 +448,7 @@ erf_slow_rhs_pre(level, finest_level, nrk, slow_dt, S_rhs, S_old, S_data, S_prim, S_scratch, xvel_new, yvel_new, zvel_new, - z_t_rk[level], Omega, cc_src, xmom_src, ymom_src, zmom_src, + z_t_rk[level], Omega[level], cc_src, xmom_src, ymom_src, zmom_src, (level > 0) ? &zmom_crse_rhs[level] : nullptr, Tau11_lev[level].get(), Tau22_lev[level].get(), Tau33_lev[level].get(), Tau12_lev[level].get(), Tau13_lev[level].get(), Tau21_lev[level].get(), Tau23_lev[level].get(), Tau31_lev[level].get(), diff --git a/Source/TimeIntegration/ERF_TI_substep_fun.H b/Source/TimeIntegration/ERF_TI_substep_fun.H index bddb14930..a93c6dfd5 100644 --- a/Source/TimeIntegration/ERF_TI_substep_fun.H +++ b/Source/TimeIntegration/ERF_TI_substep_fun.H @@ -102,7 +102,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, S_slow_rhs, S_old, S_stage, S_prim, pi_stage, fast_coeffs, S_data, S_scratch, fine_geom, solverChoice.gravity, solverChoice.use_lagged_delta_rt, - Omega, z_t_rk[level], z_t_pert.get(), + Omega[level], z_t_rk[level], z_t_pert.get(), z_phys_nd[level], z_phys_nd_new[level], z_phys_nd_src[level], detJ_cc[level], detJ_cc_new[level], detJ_cc_src[level], dtau, beta_s, inv_fac, @@ -114,7 +114,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, S_slow_rhs, S_data, S_stage, S_prim, pi_stage, fast_coeffs, S_data, S_scratch, fine_geom, solverChoice.gravity, solverChoice.use_lagged_delta_rt, - Omega, z_t_rk[level], z_t_pert.get(), + Omega[level], z_t_rk[level], z_t_pert.get(), z_phys_nd[level], z_phys_nd_new[level], z_phys_nd_src[level], detJ_cc[level], detJ_cc_new[level], detJ_cc_src[level], dtau, beta_s, inv_fac, @@ -133,7 +133,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, // and S_data is the new-time solution to be defined here erf_fast_rhs_T(fast_step, nrk, level, finest_level, S_slow_rhs, S_old, S_stage, S_prim, pi_stage, fast_coeffs, - S_data, S_scratch, fine_geom, solverChoice.gravity, Omega, + S_data, S_scratch, fine_geom, solverChoice.gravity, Omega[level], z_phys_nd[level], detJ_cc[level], dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); @@ -142,7 +142,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, // and as the new-time solution to be defined here erf_fast_rhs_T(fast_step, nrk, level, finest_level, S_slow_rhs, S_data, S_stage, S_prim, pi_stage, fast_coeffs, - S_data, S_scratch, fine_geom, solverChoice.gravity, Omega, + S_data, S_scratch, fine_geom, solverChoice.gravity, Omega[level], z_phys_nd[level], detJ_cc[level], dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); diff --git 
a/Source/TimeIntegration/ERF_advance_dycore.cpp b/Source/TimeIntegration/ERF_advance_dycore.cpp index e211ff464..a04c46a62 100644 --- a/Source/TimeIntegration/ERF_advance_dycore.cpp +++ b/Source/TimeIntegration/ERF_advance_dycore.cpp @@ -183,8 +183,6 @@ void ERF::advance_dycore(int level, } // l_use_diff } // profile - MultiFab Omega (state_old[IntVars::zmom].boxArray(),dm,1,1); - #include "ERF_TI_utils.H" // Additional SFS quantities, calculated once per timestep diff --git a/Source/TimeIntegration/ERF_slow_rhs_pre.cpp b/Source/TimeIntegration/ERF_slow_rhs_pre.cpp index 78e0b6519..973fe18f9 100644 --- a/Source/TimeIntegration/ERF_slow_rhs_pre.cpp +++ b/Source/TimeIntegration/ERF_slow_rhs_pre.cpp @@ -297,10 +297,11 @@ void erf_slow_rhs_pre (int level, int finest_level, const Array4< Real>& omega_arr = Omega.array(mfi); Array4 z_t; - if (z_t_mf) + if (z_t_mf) { z_t = z_t_mf->array(mfi); - else + } else { z_t = Array4{}; + } const Array4& rho_u_rhs = S_rhs[IntVars::xmom].array(mfi); const Array4& rho_v_rhs = S_rhs[IntVars::ymom].array(mfi); @@ -366,8 +367,16 @@ void erf_slow_rhs_pre (int level, int finest_level, { BL_PROFILE("slow_rhs_making_omega"); Box gbxo = surroundingNodes(bx,2); gbxo.grow(IntVect(1,1,1)); + // // Now create Omega with momentum (not velocity) with z_t subtracted if moving terrain - if (l_use_terrain) { + // ONLY if not doing anelastic + terrain -- in that case Omega will be defined coming + // out of the projection + // + if (!l_use_terrain) { + ParallelFor(gbxo, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { + omega_arr(i,j,k) = rho_w(i,j,k); + }); + } else if (!l_anelastic) { Box gbxo_lo = gbxo; gbxo_lo.setBig(2,domain.smallEnd(2)); int lo_z_face = domain.smallEnd(2); @@ -384,7 +393,7 @@ void erf_slow_rhs_pre (int level, int finest_level, }); } - if (z_t) { + if (z_t) { // Note we never do anelastic with moving terrain Box gbxo_mid = gbxo; gbxo_mid.setSmall(2,1); gbxo_mid.setBig(2,gbxo.bigEnd(2)-1); ParallelFor(gbxo_mid, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { // We define rho on the z-face the same way as in MomentumToVelocity/VelocityToMomentum @@ -404,10 +413,6 @@ void erf_slow_rhs_pre (int level, int finest_level, omega_arr(i,j,k) = OmegaFromW(i,j,k,rho_w(i,j,k),rho_u,rho_v,z_nd,dxInv); }); } - } else { - ParallelFor(gbxo, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { - omega_arr(i,j,k) = rho_w(i,j,k); - }); } } // end profile diff --git a/Source/TimeIntegration/MASK/ERF_TI_fast_headers.H b/Source/TimeIntegration/MASK/ERF_TI_fast_headers.H new file mode 100644 index 000000000..e53aa50a6 --- /dev/null +++ b/Source/TimeIntegration/MASK/ERF_TI_fast_headers.H @@ -0,0 +1,131 @@ +#ifndef ERF_FAST_INTEGRATION_H_ +#define ERF_FAST_INTEGRATION_H_ + +#include +#include +#include +#include "ERF_DataStruct.H" +#include "ERF_IndexDefines.H" +#include +#include + +#include +#include + +#ifdef ERF_USE_EB +#include +#endif + +/** + * Function for computing the fast RHS with no terrain + * + */ +void erf_fast_rhs_N (int step, int nrk, int level, int finest_level, + amrex::Vector& S_slow_rhs, + const amrex::Vector& S_prev, + amrex::Vector& S_stage_data, + const amrex::MultiFab& S_stage_prim, + const amrex::MultiFab& pi_stage, + const amrex::MultiFab& fast_coeffs, + amrex::Vector& S_data, + amrex::Vector& S_scratch, + const amrex::Geometry geom, + const amrex::Real gravity, + const amrex::Real dtau, const amrex::Real beta_s, + const amrex::Real facinv, + std::unique_ptr& mapfac_m, + std::unique_ptr& mapfac_u, + std::unique_ptr& mapfac_v, + 
ERFFillPatcher* FPr_u, + ERFFillPatcher* FPr_v, + ERFFillPatcher* FPr_w, + amrex::YAFluxRegister* fr_as_crse, + amrex::YAFluxRegister* fr_as_fine, + bool l_use_moisture, bool l_reflux, + bool l_implicit_substepping); + +/** + * Function for computing the fast RHS with fixed terrain + * + */ +void erf_fast_rhs_T (int step, int nrk, int level, int finest_level, + amrex::Vector& S_slow_rhs, + const amrex::Vector& S_prev, + amrex::Vector& S_stage_data, + const amrex::MultiFab& S_stage_prim, + const amrex::MultiFab& pi_stage, + const amrex::MultiFab& fast_coeffs, + amrex::Vector& S_data, + amrex::Vector& S_scratch, + const amrex::Geometry geom, + const amrex::Real gravity, + amrex::MultiFab& Omega, + std::unique_ptr& z_phys_nd, + std::unique_ptr& detJ_cc, + const amrex::Real dtau, const amrex::Real beta_s, + const amrex::Real facinv, + std::unique_ptr& mapfac_m, + std::unique_ptr& mapfac_u, + std::unique_ptr& mapfac_v, + amrex::YAFluxRegister* fr_as_crse, + amrex::YAFluxRegister* fr_as_fine, + bool l_use_moisture, bool l_reflux, + bool l_implicit_substepping); + +/** + * Function for computing the fast RHS with moving terrain + * + */ +void erf_fast_rhs_MT (int step, int nrk, int level, int finest_level, + amrex::Vector& S_slow_rhs, + const amrex::Vector& S_prev, + amrex::Vector& S_stg_data, + const amrex::MultiFab& S_stg_prim, + const amrex::MultiFab& pi_stage, + const amrex::MultiFab& fast_coeffs, + amrex::Vector& S_data, + amrex::Vector& S_scratch, + const amrex::Geometry geom, + const amrex::Real gravity, + const bool use_lagged_delta_rt, + amrex::MultiFab& Omega, + std::unique_ptr& z_t_rk, + const amrex::MultiFab* z_t_pert, + std::unique_ptr& z_phys_nd_old, + std::unique_ptr& z_phys_nd_new, + std::unique_ptr& z_phys_nd_stg, + std::unique_ptr& detJ_cc_old, + std::unique_ptr& detJ_cc_new, + std::unique_ptr& detJ_cc_stg, + const amrex::Real dtau, const amrex::Real beta_s, + const amrex::Real facinv, + std::unique_ptr& mapfac_m, + std::unique_ptr& mapfac_u, + std::unique_ptr& mapfac_v, + amrex::YAFluxRegister* fr_as_crse, + amrex::YAFluxRegister* fr_as_fine, + bool l_use_moisture, bool l_reflux, + bool l_implicit_substepping); + +/** + * Function for computing the coefficients for the tridiagonal solver used in the fast + * integrator (the acoustic substepping). 
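+ * Components 0-4 of fast_coeffs are consumed in erf_fast_rhs_* as the lower (A),
+ * pre-inverted diagonal (B) and upper (C) bands of that tridiagonal system, plus the
+ * P and Q coefficients that multiply the (rho theta) perturbations; see the make_alias
+ * MultiFabs coeff_A_mf, inv_coeff_B_mf, coeff_C_mf, coeff_P_mf and coeff_Q_mf in
+ * ERF_fast_rhs_N.cpp.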
+ */ +void make_fast_coeffs (int level, + amrex::MultiFab& fast_coeffs, + amrex::Vector& S_stage_data, + const amrex::MultiFab& S_stage_prim, + const amrex::MultiFab& pi_stage, + const amrex::Geometry geom, + const bool use_moisture, + const bool use_terrain, + const amrex::Real gravity, + const amrex::Real c_p, + std::unique_ptr& detJ_cc, + const amrex::MultiFab* r0, + const amrex::MultiFab* pi0, + const amrex::Real dtau, + const amrex::Real beta_s, + amrex::GpuArray &phys_bc_type); + +#endif diff --git a/Source/TimeIntegration/MASK/ERF_TI_fast_rhs_fun.H b/Source/TimeIntegration/MASK/ERF_TI_fast_rhs_fun.H new file mode 100644 index 000000000..71c44dbbf --- /dev/null +++ b/Source/TimeIntegration/MASK/ERF_TI_fast_rhs_fun.H @@ -0,0 +1,196 @@ +/** + * Wrapper for calling the routine that creates the fast RHS + */ +auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, + Vector& S_slow_rhs, + const Vector& S_old, + Vector& S_stage, + Vector& S_data, + Vector& S_scratch, + const Real dtau, + const Real inv_fac, + const Real old_substep_time, + const Real new_substep_time) + { + BL_PROFILE("fast_rhs_fun"); + if (verbose) amrex::Print() << "Calling fast rhs at level " << level << " with dt = " << dtau << std::endl; + + // Define beta_s here so that it is consistent between where we make the fast coefficients + // and where we use them + // Per p2902 of Klemp-Skamarock-Dudhia-2007 + // beta_s = -1.0 : fully explicit + // beta_s = 1.0 : fully implicit + Real beta_s; + if (solverChoice.substepping_type[level] == SubsteppingType::Implicit) + { + beta_s = 0.1; + } else { // Fully explicit + beta_s = -1.0; + } + + // ************************************************************************* + // Set up flux registers if using two_way coupling + // ************************************************************************* + const bool l_reflux = (solverChoice.coupling_type == CouplingType::TwoWay); + + YAFluxRegister* fr_as_crse = nullptr; + YAFluxRegister* fr_as_fine = nullptr; + if (l_reflux) { + if (level < finest_level) { + fr_as_crse = getAdvFluxReg(level+1); + } + if (level > 0) { + fr_as_fine = getAdvFluxReg(level); + } + } + + // Moving terrain + std::unique_ptr z_t_pert; + if ( solverChoice.use_terrain && (solverChoice.terrain_type == TerrainType::Moving) ) + { + // Make "old" fast geom -- store in z_phys_nd for convenience + if (verbose) Print() << "Making geometry at start of substep time: " << old_substep_time << std::endl; + prob->init_custom_terrain(fine_geom,*z_phys_nd[level],old_substep_time); + init_terrain_grid (level,fine_geom,*z_phys_nd[level], zlevels_stag[level], phys_bc_type); + make_J (fine_geom,*z_phys_nd[level], *detJ_cc[level]); + + // Make "new" fast geom + if (verbose) Print() << "Making geometry for end of substep time :" << new_substep_time << std::endl; + prob->init_custom_terrain(fine_geom,*z_phys_nd_new[level],new_substep_time); + init_terrain_grid (level,fine_geom,*z_phys_nd_new[level], zlevels_stag[level], phys_bc_type); + make_J (fine_geom,*z_phys_nd_new[level], *detJ_cc_new[level]); + make_areas (fine_geom,*z_phys_nd_new[level], *ax_new[level], *ay_new[level], *az_new[level]); + + Real inv_dt = 1./dtau; + + z_t_pert = std::make_unique(S_data[IntVars::zmom].boxArray(), S_data[IntVars::zmom].DistributionMap(), 1, 1); + + for (MFIter mfi(*z_t_rk[level],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + Box gbx = mfi.growntilebox(IntVect(1,1,0)); + + const Array4& z_t_arr = z_t_rk[level]->array(mfi); + const Array4& zp_t_arr = z_t_pert->array(mfi); + + const Array4& 
z_nd_new_arr = z_phys_nd_new[level]->const_array(mfi); + const Array4& z_nd_old_arr = z_phys_nd[level]->const_array(mfi); + + // Loop over horizontal plane + amrex::ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + // Evaluate between RK stages assuming the geometry is linear between old and new time + zp_t_arr(i,j,k) = 0.25 * inv_dt * (z_nd_new_arr(i+1,j+1,k) - z_nd_old_arr(i+1,j+1,k) + +z_nd_new_arr(i ,j+1,k) - z_nd_old_arr( i,j+1,k) + +z_nd_new_arr(i+1,j ,k) - z_nd_old_arr(i+1,j ,k) + +z_nd_new_arr(i ,j ,k) - z_nd_old_arr(i ,j ,k)); + // Convert to perturbation: z"_t(t) = z_t(t) - z_t^{RK} + zp_t_arr(i,j,k) -= z_t_arr(i,j,k); + }); + } // mfi + + // We have to call this each step since it depends on the substep time now + // Note we pass in the *old* detJ here + make_fast_coeffs(level, fast_coeffs, S_stage, S_prim, pi_stage, fine_geom, + l_use_moisture, solverChoice.use_terrain, solverChoice.gravity, solverChoice.c_p, + detJ_cc[level], r0, pi0, dtau, beta_s, phys_bc_type); + + if (fast_step == 0) { + // If this is the first substep we pass in S_old as the previous step's solution + erf_fast_rhs_MT(fast_step, nrk, level, finest_level, + S_slow_rhs, S_old, S_stage, S_prim, pi_stage, fast_coeffs, + S_data, S_scratch, fine_geom, + solverChoice.gravity, solverChoice.use_lagged_delta_rt, + Omega, z_t_rk[level], z_t_pert.get(), + z_phys_nd[level], z_phys_nd_new[level], z_phys_nd_src[level], + detJ_cc[level], detJ_cc_new[level], detJ_cc_src[level], + dtau, beta_s, inv_fac, + mapfac_m[level], mapfac_u[level], mapfac_v[level], + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); + } else { + // If this is not the first substep we pass in S_data as the previous step's solution + erf_fast_rhs_MT(fast_step, nrk, level, finest_level, + S_slow_rhs, S_data, S_stage, S_prim, pi_stage, fast_coeffs, + S_data, S_scratch, fine_geom, + solverChoice.gravity, solverChoice.use_lagged_delta_rt, + Omega, z_t_rk[level], z_t_pert.get(), + z_phys_nd[level], z_phys_nd_new[level], z_phys_nd_src[level], + detJ_cc[level], detJ_cc_new[level], detJ_cc_src[level], + dtau, beta_s, inv_fac, + mapfac_m[level], mapfac_u[level], mapfac_v[level], + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); + } + } else if (solverChoice.use_terrain && solverChoice.terrain_type == TerrainType::Static) { + if (fast_step == 0) { + + // If this is the first substep we make the coefficients since they are based only on stage data + make_fast_coeffs(level, fast_coeffs, S_stage, S_prim, pi_stage, fine_geom, + l_use_moisture, solverChoice.use_terrain, solverChoice.gravity, solverChoice.c_p, + detJ_cc[level], r0, pi0, dtau, beta_s, phys_bc_type); + + // If this is the first substep we pass in S_old as the previous step's solution + // and S_data is the new-time solution to be defined here + erf_fast_rhs_T(fast_step, nrk, level, finest_level, + S_slow_rhs, S_old, S_stage, S_prim, pi_stage, fast_coeffs, + S_data, S_scratch, fine_geom, solverChoice.gravity, Omega, + z_phys_nd[level], detJ_cc[level], dtau, beta_s, inv_fac, + mapfac_m[level], mapfac_u[level], mapfac_v[level], + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); + } else { + // If this is not the first substep we pass in S_data as both the previous step's solution + // and as the new-time solution to be defined here + erf_fast_rhs_T(fast_step, nrk, level, finest_level, + S_slow_rhs, S_data, S_stage, S_prim, pi_stage, fast_coeffs, + S_data, S_scratch, fine_geom, solverChoice.gravity, Omega, 
+ z_phys_nd[level], detJ_cc[level], dtau, beta_s, inv_fac, + mapfac_m[level], mapfac_u[level], mapfac_v[level], + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); + } + } else { + if (fast_step == 0) { + + // If this is the first substep we make the coefficients since they are based only on stage data + make_fast_coeffs(level, fast_coeffs, S_stage, S_prim, pi_stage, fine_geom, + l_use_moisture, solverChoice.use_terrain, solverChoice.gravity, solverChoice.c_p, + detJ_cc[level], r0, pi0, dtau, beta_s, phys_bc_type); + + // If this is the first substep we pass in S_old as the previous step's solution + // and S_data is the new-time solution to be defined here + erf_fast_rhs_N(fast_step, nrk, level, finest_level, + S_slow_rhs, S_old, S_stage, S_prim, pi_stage, fast_coeffs, + S_data, S_scratch, fine_geom, solverChoice.gravity, + dtau, beta_s, inv_fac, + mapfac_m[level], mapfac_u[level], mapfac_v[level], + (level > 0) ? &FPr_u[level-1] : nullptr, + (level > 0) ? &FPr_v[level-1] : nullptr, + (level > 0) ? &FPr_w[level-1] : nullptr, + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); + } else { + // If this is not the first substep we pass in S_data as both the previous step's solution + // and as the new-time solution to be defined here + erf_fast_rhs_N(fast_step, nrk, level, finest_level, + S_slow_rhs, S_data, S_stage, S_prim, pi_stage, fast_coeffs, + S_data, S_scratch, fine_geom, solverChoice.gravity, + dtau, beta_s, inv_fac, + mapfac_m[level], mapfac_u[level], mapfac_v[level], + (level > 0) ? &FPr_u[level-1] : nullptr, + (level > 0) ? &FPr_v[level-1] : nullptr, + (level > 0) ? &FPr_w[level-1] : nullptr, + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); + } + } + + // Even if we update all the conserved variables we don't need + // to fillpatch the slow ones every acoustic substep + + // NOTE: Numerical diffusion is tested on in FillPatchIntermediate and dictates the size of the + // box over which VelocityToMomentum is computed. V2M requires one more ghost cell be + // filled for rho than velocity. This logical condition ensures we fill enough ghost cells + // when use_NumDiff is true. 
+ int ng_cons = S_data[IntVars::cons].nGrowVect().max() - 1; + int ng_vel = S_data[IntVars::xmom].nGrowVect().max(); + if (!solverChoice.use_NumDiff) { + ng_cons = 1; + ng_vel = 1; + } + apply_bcs(S_data, new_substep_time, ng_cons, ng_vel, fast_only=true, vel_and_mom_synced=false); + }; diff --git a/Source/TimeIntegration/MASK/ERF_fast_rhs_N.cpp b/Source/TimeIntegration/MASK/ERF_fast_rhs_N.cpp new file mode 100644 index 000000000..a8079f36c --- /dev/null +++ b/Source/TimeIntegration/MASK/ERF_fast_rhs_N.cpp @@ -0,0 +1,638 @@ + +#include + +using namespace amrex; + +/** + * Function for computing the fast RHS with no terrain + * + * @param[in ] step which fast time step within each Runge-Kutta step + * @param[in ] nrk which Runge-Kutta step + * @param[in ] level level of resolution + * @param[in ] finest_level finest level of resolution + * @param[in ] S_slow_rhs slow RHS computed in erf_slow_rhs_pre + * @param[in ] S_prev if step == 0, this is S_old, else the previous fast solution + * @param[in ] S_stage_data solution at previous RK stage + * @param[in ] S_stage_prim primitive variables at previous RK stage + * @param[in ] pi_stage Exner function at previous RK stage + * @param[in ] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator + * @param[ out] S_data current solution + * @param[in ] S_scratch scratch space + * @param[in ] geom container for geometric information + * @param[in ] gravity magnitude of gravity + * @param[in ] dtau fast time step + * @param[in ] beta_s Coefficient which determines how implicit vs explicit the solve is + * @param[in ] facinv inverse factor for time-averaging the momenta + * @param[in ] mapfac_m map factor at cell centers + * @param[in ] mapfac_u map factor at x-faces + * @param[in ] mapfac_v map factor at y-faces + * @param[inout] fr_as_crse YAFluxRegister at level l at level l / l+1 interface + * @param[inout] fr_as_fine YAFluxRegister at level l at level l-1 / l interface + * @param[in ] l_use_moisture + * @param[in ] l_reflux should we add fluxes to the FluxRegisters? + * @param[in ] l_implicit_substepping + */ + +void erf_fast_rhs_N (int step, int nrk, + int level, int finest_level, + Vector& S_slow_rhs, // the slow RHS already computed + const Vector& S_prev, // if step == 0, this is S_old, else the previous solution + Vector& S_stage_data, // S_stage = S^n, S^* or S^** + const MultiFab & S_stage_prim, // Primitive version of S_stage_data[IntVars::cons] + const MultiFab & pi_stage, // Exner function evaluated at last stage + const MultiFab &fast_coeffs, // Coeffs for tridiagonal solve + Vector& S_data, // S_sum = most recent full solution + Vector& S_scratch, // S_sum_old at most recent fast timestep for (rho theta) + const Geometry geom, + const Real gravity, + const Real dtau, const Real beta_s, + const Real facinv, + std::unique_ptr& mapfac_m, + std::unique_ptr& mapfac_u, + std::unique_ptr& mapfac_v, + ERFFillPatcher* FPr_u, + ERFFillPatcher* FPr_v, + ERFFillPatcher* FPr_w, + YAFluxRegister* fr_as_crse, + YAFluxRegister* fr_as_fine, + bool l_use_moisture, + bool l_reflux, + bool l_implicit_substepping) +{ + // + // NOTE: for step > 0, S_data and S_prev point to the same MultiFab data!! 
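+    //       This aliasing is why the conserved-variable update at the end of this routine
+    //       uses "+=" for step > 0 rather than re-reading prev_cons.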
+ // + + BL_PROFILE_REGION("erf_fast_rhs_N()"); + + Real beta_1 = 0.5 * (1.0 - beta_s); // multiplies explicit terms + Real beta_2 = 0.5 * (1.0 + beta_s); // multiplies implicit terms + + // How much do we project forward the (rho theta) that is used in the horizontal momentum equations + Real beta_d = 0.1; + + const Real* dx = geom.CellSize(); + const GpuArray dxInv = geom.InvCellSizeArray(); + + const auto dom_lo = lbound(geom.Domain()); + const auto dom_hi = ubound(geom.Domain()); + + Real dxi = dxInv[0]; + Real dyi = dxInv[1]; + Real dzi = dxInv[2]; + + const auto& ba = S_stage_data[IntVars::cons].boxArray(); + const auto& dm = S_stage_data[IntVars::cons].DistributionMap(); + + MultiFab Delta_rho_theta( ba , dm, 1, 1); + MultiFab Delta_rho_w (convert(ba,IntVect(0,0,1)), dm, 1, IntVect(1,1,0)); + + MultiFab coeff_A_mf(fast_coeffs, make_alias, 0, 1); + MultiFab inv_coeff_B_mf(fast_coeffs, make_alias, 1, 1); + MultiFab coeff_C_mf(fast_coeffs, make_alias, 2, 1); + MultiFab coeff_P_mf(fast_coeffs, make_alias, 3, 1); + MultiFab coeff_Q_mf(fast_coeffs, make_alias, 4, 1); + + // ************************************************************************* + // Set gravity as a vector + const Array grav{0.0, 0.0, -gravity}; + const GpuArray grav_gpu{grav[0], grav[1], grav[2]}; + + // This will hold theta extrapolated forward in time + MultiFab extrap(S_data[IntVars::cons].boxArray(),S_data[IntVars::cons].DistributionMap(),1,1); + + // This will hold the update for (rho) and (rho theta) + MultiFab temp_rhs(S_stage_data[IntVars::zmom].boxArray(),S_stage_data[IntVars::zmom].DistributionMap(),2,0); + + // This will hold the new x- and y-momenta temporarily (so that we don't overwrite values we need when tiling) + MultiFab temp_cur_xmom(S_stage_data[IntVars::xmom].boxArray(),S_stage_data[IntVars::xmom].DistributionMap(),1,0); + MultiFab temp_cur_ymom(S_stage_data[IntVars::ymom].boxArray(),S_stage_data[IntVars::ymom].DistributionMap(),1,0); + + // We assume that in the first step (nrk == 0) we are only doing one substep. 
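+    // (i.e. when nrk == 0 the only admissible fast step is step == 0, which is what the
+    //  assertion below enforces)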
+ AMREX_ALWAYS_ASSERT(nrk > 0 || step == 0); + + iMultiFab* mask_u; if (level > 0) mask_u = FPr_u->GetMask(); + iMultiFab* mask_v; if (level > 0) mask_v = FPr_v->GetMask(); + iMultiFab* mask_w; if (level > 0) mask_w = FPr_w->GetMask(); + + // ************************************************************************* + // First set up some arrays we'll need + // ************************************************************************* + +#ifdef _OPENMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for ( MFIter mfi(S_stage_data[IntVars::cons],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Array4& prev_cons = S_prev[IntVars::cons].const_array(mfi); + const Array4& prev_zmom = S_prev[IntVars::zmom].const_array(mfi); + + const Array4& stage_cons = S_stage_data[IntVars::cons].const_array(mfi); + const Array4& stage_zmom = S_stage_data[IntVars::zmom].const_array(mfi); + + const Array4& prev_drho_w = Delta_rho_w.array(mfi); + const Array4& prev_drho_theta = Delta_rho_theta.array(mfi); + const Array4& lagged_delta_rt = S_scratch[IntVars::cons].array(mfi); + const Array4& theta_extrap = extrap.array(mfi); + + Box gbx = mfi.growntilebox(1); + ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + prev_drho_theta(i,j,k) = prev_cons(i,j,k,RhoTheta_comp) - stage_cons(i,j,k,RhoTheta_comp); + + if (step == 0) { + theta_extrap(i,j,k) = prev_drho_theta(i,j,k); + } else { + theta_extrap(i,j,k) = prev_drho_theta(i,j,k) + beta_d * + ( prev_drho_theta(i,j,k) - lagged_delta_rt(i,j,k,RhoTheta_comp) ); + } + + // We define lagged_delta_rt for our next step as the current delta_rt + // (after using it above to extrapolate theta for this step) + lagged_delta_rt(i,j,k,RhoTheta_comp) = prev_drho_theta(i,j,k); + }); + + // NOTE: We must do this here because for step > 0, prev_zmom and cur_zmom both point to the same data, + // so by the time we would use prev_zmom to define zflux, it would have already been over-written. 
+ Box gtbz = mfi.nodaltilebox(2); + gtbz.grow(IntVect(1,1,0)); + ParallelFor(gtbz, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { + prev_drho_w(i,j,k) = prev_zmom(i,j,k) - stage_zmom(i,j,k); + }); + } // mfi + + // ************************************************************************* + // Define updates in the current RK stage + // ************************************************************************* + +#ifdef _OPENMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for ( MFIter mfi(S_stage_data[IntVars::cons],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + Box tbx = mfi.nodaltilebox(0); + Box tby = mfi.nodaltilebox(1); + + const Array4 & stage_xmom = S_stage_data[IntVars::xmom].const_array(mfi); + const Array4 & stage_ymom = S_stage_data[IntVars::ymom].const_array(mfi); + const Array4 & prim = S_stage_prim.const_array(mfi); + + const Array4& slow_rhs_rho_u = S_slow_rhs[IntVars::xmom].const_array(mfi); + const Array4& slow_rhs_rho_v = S_slow_rhs[IntVars::ymom].const_array(mfi); + + const Array4& temp_cur_xmom_arr = temp_cur_xmom.array(mfi); + const Array4& temp_cur_ymom_arr = temp_cur_ymom.array(mfi); + + const Array4& prev_xmom = S_prev[IntVars::xmom].const_array(mfi); + const Array4& prev_ymom = S_prev[IntVars::ymom].const_array(mfi); + + // These store the advection momenta which we will use to update the slow variables + const Array4< Real>& avg_xmom = S_scratch[IntVars::xmom].array(mfi); + const Array4< Real>& avg_ymom = S_scratch[IntVars::ymom].array(mfi); + + const Array4& pi_stage_ca = pi_stage.const_array(mfi); + + const Array4& theta_extrap = extrap.array(mfi); + + // Map factors + const Array4& mf_u = mapfac_u->const_array(mfi); + const Array4& mf_v = mapfac_v->const_array(mfi); + + // Coarse/fine masking + const Array4 mask_u_arr = (level > 0) ? mask_u->const_array(mfi) : Array4{}; + const Array4 mask_v_arr = (level > 0) ? mask_v->const_array(mfi) : Array4{}; + + int mask_u_set = (level > 0) ? FPr_u->GetSetMaskVal() : 0; + int mask_v_set = (level > 0) ? FPr_v->GetSetMaskVal() : 0; + + // ********************************************************************* + // Define updates in the RHS of {x, y, z}-momentum equations + // ********************************************************************* + if (nrk == 0 and step == 0) { + ParallelFor(tbx, tby, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + temp_cur_xmom_arr(i,j,k) = prev_xmom(i,j,k) + dtau * slow_rhs_rho_u(i,j,k); + avg_xmom(i,j,k) += facinv * (temp_cur_xmom_arr(i,j,k) - stage_xmom(i,j,k)); + }, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + temp_cur_ymom_arr(i,j,k) = prev_ymom(i,j,k) + dtau * slow_rhs_rho_v(i,j,k); + avg_ymom(i,j,k) += facinv * (temp_cur_ymom_arr(i,j,k) - stage_ymom(i,j,k)); + }); + } else { + ParallelFor(tbx, tby, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + // Add (negative) gradient of (rho theta) multiplied by lagged "pi" + Real gpx = (theta_extrap(i,j,k) - theta_extrap(i-1,j,k))*dxi; + gpx *= mf_u(i,j,0); + + if (l_use_moisture) { + Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i-1,j,k,PrimQ1_comp) + +prim(i,j,k,PrimQ2_comp) + prim(i-1,j,k,PrimQ2_comp) ); + gpx /= (1.0 + q); + } + + Real pi_c = 0.5 * (pi_stage_ca(i-1,j,k,0) + pi_stage_ca(i,j,k,0)); + + Real fast_rhs_rho_u; + if (level > 0) { + fast_rhs_rho_u = (i != dom_lo.x && i != dom_hi.x+1 && mask_u_arr(i,j,k) == mask_u_set) ? 
+ -Gamma * R_d * pi_c * (theta_extrap(i-1,j,k) - theta_extrap(i-2,j,k))*dxi : + -Gamma * R_d * pi_c * gpx; + + // fast_rhs_rho_u = -Gamma * R_d * pi_c * gpx; + } else { + fast_rhs_rho_u = -Gamma * R_d * pi_c * gpx; + } + + temp_cur_xmom_arr(i,j,k) = prev_xmom(i,j,k) + dtau * (fast_rhs_rho_u + slow_rhs_rho_u(i,j,k)); + + avg_xmom(i,j,k) += facinv * (temp_cur_xmom_arr(i,j,k) - stage_xmom(i,j,k)); + }, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + // Add (negative) gradient of (rho theta) multiplied by lagged "pi" + Real gpy = (theta_extrap(i,j,k) - theta_extrap(i,j-1,k))*dyi; + gpy *= mf_v(i,j,0); + + if (l_use_moisture) { + Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i,j-1,k,PrimQ1_comp) + +prim(i,j,k,PrimQ2_comp) + prim(i,j-1,k,PrimQ2_comp) ); + gpy /= (1.0 + q); + } + + Real pi_c = 0.5 * (pi_stage_ca(i,j-1,k,0) + pi_stage_ca(i,j,k,0)); + + Real fast_rhs_rho_v; + if (level > 0) { + fast_rhs_rho_v = (j != dom_lo.y && j != dom_hi.y+1 && mask_v_arr(i,j,k) == mask_v_set) ? 0.0 : -Gamma * R_d * pi_c * gpy; + // fast_rhs_rho_v = -Gamma * R_d * pi_c * gpy; + } else { + fast_rhs_rho_v = -Gamma * R_d * pi_c * gpy; + } + + temp_cur_ymom_arr(i,j,k) = prev_ymom(i,j,k) + dtau * (fast_rhs_rho_v + slow_rhs_rho_v(i,j,k)); + + avg_ymom(i,j,k) += facinv * (temp_cur_ymom_arr(i,j,k) - stage_ymom(i,j,k)); + + }); + } // nrk > 0 and/or step > 0 + } //mfi + +#ifdef _OPENMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + { + std::array flux; + for ( MFIter mfi(S_stage_data[IntVars::cons],TileNoZ()); mfi.isValid(); ++mfi) + { + Box bx = mfi.tilebox(); + Box tbz = surroundingNodes(bx,2); + + Box vbx = mfi.validbox(); + const auto& vbx_hi = ubound(vbx); + + const Array4& stage_xmom = S_stage_data[IntVars::xmom].const_array(mfi); + const Array4& stage_ymom = S_stage_data[IntVars::ymom].const_array(mfi); + const Array4& stage_zmom = S_stage_data[IntVars::zmom].const_array(mfi); + const Array4 & prim = S_stage_prim.const_array(mfi); + + const Array4& prev_drho_theta = Delta_rho_theta.array(mfi); + + const Array4& prev_cons = S_prev[IntVars::cons].const_array(mfi); + const Array4& stage_cons = S_stage_data[IntVars::cons].const_array(mfi); + + const Array4& slow_rhs_cons = S_slow_rhs[IntVars::cons].const_array(mfi); + const Array4& slow_rhs_rho_w = S_slow_rhs[IntVars::zmom].const_array(mfi); + + const Array4& prev_zmom = S_prev[IntVars::zmom].const_array(mfi); + const Array4< Real>& cur_zmom = S_data[IntVars::zmom].array(mfi); + + const Array4& temp_cur_xmom_arr = temp_cur_xmom.array(mfi); + const Array4& temp_cur_ymom_arr = temp_cur_ymom.array(mfi); + + // These store the advection momenta which we will use to update the slow variables + const Array4< Real>& avg_zmom = S_scratch[IntVars::zmom].array(mfi); + + // Map factors + const Array4& mf_m = mapfac_m->const_array(mfi); + const Array4& mf_u = mapfac_u->const_array(mfi); + const Array4& mf_v = mapfac_v->const_array(mfi); + + FArrayBox RHS_fab; + RHS_fab.resize(tbz,1, The_Async_Arena()); + + FArrayBox soln_fab; + soln_fab.resize(tbz,1, The_Async_Arena()); + + auto const& RHS_a = RHS_fab.array(); + auto const& soln_a = soln_fab.array(); + + auto const& temp_rhs_arr = temp_rhs.array(mfi); + + auto const& coeffA_a = coeff_A_mf.array(mfi); + auto const& inv_coeffB_a = inv_coeff_B_mf.array(mfi); + auto const& coeffC_a = coeff_C_mf.array(mfi); + auto const& coeffP_a = coeff_P_mf.array(mfi); + auto const& coeffQ_a = coeff_Q_mf.array(mfi); + + // ************************************************************************* + // Define flux arrays for use in 
advection + // ************************************************************************* + for (int dir = 0; dir < AMREX_SPACEDIM; ++dir) { + flux[dir].resize(surroundingNodes(bx,dir),2); + flux[dir].setVal(0.); + } + const GpuArray, AMREX_SPACEDIM> + flx_arr{{AMREX_D_DECL(flux[0].array(), flux[1].array(), flux[2].array())}}; + + // ********************************************************************* + ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { + Real xflux_lo = (temp_cur_xmom_arr(i ,j,k) - stage_xmom(i ,j,k)) / mf_u(i ,j,0); + Real xflux_hi = (temp_cur_xmom_arr(i+1,j,k) - stage_xmom(i+1,j,k)) / mf_u(i+1,j,0); + Real yflux_lo = (temp_cur_ymom_arr(i,j ,k) - stage_ymom(i,j ,k)) / mf_v(i,j ,0); + Real yflux_hi = (temp_cur_ymom_arr(i,j+1,k) - stage_ymom(i,j+1,k)) / mf_v(i,j+1,0); + + Real mfsq = mf_m(i,j,0) * mf_m(i,j,0); + + temp_rhs_arr(i,j,k,Rho_comp ) = ( xflux_hi - xflux_lo ) * dxi * mfsq + + ( yflux_hi - yflux_lo ) * dyi * mfsq; + temp_rhs_arr(i,j,k,RhoTheta_comp) = (( xflux_hi * (prim(i,j,k,0) + prim(i+1,j,k,0)) - + xflux_lo * (prim(i,j,k,0) + prim(i-1,j,k,0)) ) * dxi * mfsq + + ( yflux_hi * (prim(i,j,k,0) + prim(i,j+1,k,0)) - + yflux_lo * (prim(i,j,k,0) + prim(i,j-1,k,0)) ) * dyi * mfsq) * 0.5; + + (flx_arr[0])(i,j,k,0) = xflux_lo; + (flx_arr[0])(i,j,k,1) = (flx_arr[0])(i ,j,k,0) * 0.5 * (prim(i,j,k,0) + prim(i-1,j,k,0)); + + (flx_arr[1])(i,j,k,0) = yflux_lo; + (flx_arr[1])(i,j,k,1) = (flx_arr[0])(i,j ,k,0) * 0.5 * (prim(i,j,k,0) + prim(i,j-1,k,0)); + + if (i == vbx_hi.x) { + (flx_arr[0])(i+1,j,k,0) = xflux_hi; + (flx_arr[0])(i+1,j,k,1) = (flx_arr[0])(i+1,j,k,0) * 0.5 * (prim(i,j,k,0) + prim(i+1,j,k,0)); + } + if (j == vbx_hi.y) { + (flx_arr[1])(i,j+1,k,0) = yflux_hi; + (flx_arr[1])(i,j+1,k,1) = (flx_arr[1])(i,j+1,k,0) * 0.5 * (prim(i,j,k,0) + prim(i,j+1,k,0)); + } + }); + + Box bx_shrunk_in_k = bx; + int klo = tbz.smallEnd(2); + int khi = tbz.bigEnd(2); + bx_shrunk_in_k.setSmall(2,klo+1); + bx_shrunk_in_k.setBig(2,khi-1); + + // Note that the notes use "g" to mean the magnitude of gravity, so it is positive + // We set grav_gpu[2] to be the vector component which is negative + // We define halfg to match the notes (which is why we take the absolute value) + Real halfg = std::abs(0.5 * grav_gpu[2]); + + // ********************************************************************* + // fast_loop_on_shrunk + // ********************************************************************* + //Note we don't act on the bottom or top boundaries of the domain + ParallelFor(bx_shrunk_in_k, [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + Real coeff_P = coeffP_a(i,j,k); + Real coeff_Q = coeffQ_a(i,j,k); + + if (l_use_moisture) { + Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i,j,k-1,PrimQ1_comp) + +prim(i,j,k,PrimQ2_comp) + prim(i,j,k-1,PrimQ2_comp) ); + coeff_P /= (1.0 + q); + coeff_Q /= (1.0 + q); + } + + Real theta_t_lo = 0.5 * ( prim(i,j,k-2,PrimTheta_comp) + prim(i,j,k-1,PrimTheta_comp) ); + Real theta_t_mid = 0.5 * ( prim(i,j,k-1,PrimTheta_comp) + prim(i,j,k ,PrimTheta_comp) ); + Real theta_t_hi = 0.5 * ( prim(i,j,k ,PrimTheta_comp) + prim(i,j,k+1,PrimTheta_comp) ); + + Real Omega_kp1 = prev_zmom(i,j,k+1) - stage_zmom(i,j,k+1); + Real Omega_k = prev_zmom(i,j,k ) - stage_zmom(i,j,k ); + Real Omega_km1 = prev_zmom(i,j,k-1) - stage_zmom(i,j,k-1); + + // line 2 last two terms (order dtau) + Real old_drho_k = prev_cons(i,j,k ,Rho_comp) - stage_cons(i,j,k ,Rho_comp); + Real old_drho_km1 = prev_cons(i,j,k-1,Rho_comp) - stage_cons(i,j,k-1,Rho_comp); + Real R0_tmp = coeff_P * 
prev_drho_theta(i,j,k) + coeff_Q * prev_drho_theta(i,j,k-1) + - halfg * ( old_drho_k + old_drho_km1 ); + + // lines 3-5 residuals (order dtau^2) 1.0 <-> beta_2 + Real R1_tmp = halfg * (-slow_rhs_cons(i,j,k ,Rho_comp) + -slow_rhs_cons(i,j,k-1,Rho_comp) + +temp_rhs_arr(i,j,k,0) + temp_rhs_arr(i,j,k-1) ) + + ( coeff_P * (slow_rhs_cons(i,j,k ,RhoTheta_comp) - temp_rhs_arr(i,j,k ,RhoTheta_comp)) + + coeff_Q * (slow_rhs_cons(i,j,k-1,RhoTheta_comp) - temp_rhs_arr(i,j,k-1,RhoTheta_comp)) ); + + // lines 6&7 consolidated (reuse Omega & metrics) (order dtau^2) + R1_tmp += beta_1 * dzi * ( (Omega_kp1 - Omega_km1) * halfg + -(Omega_kp1*theta_t_hi - Omega_k *theta_t_mid) * coeff_P + -(Omega_k *theta_t_mid - Omega_km1*theta_t_lo ) * coeff_Q ); + + // line 1 + RHS_a(i,j,k) = Omega_k + dtau * (slow_rhs_rho_w(i,j,k) + R0_tmp + dtau * beta_2 * R1_tmp); + + }); // bx_shrunk_in_k + + Box b2d = tbz; // Copy constructor + b2d.setRange(2,0); + + auto const lo = lbound(bx); + auto const hi = ubound(bx); + + ParallelFor(b2d, [=] AMREX_GPU_DEVICE (int i, int j, int) + { + // w at bottom boundary of grid is 0 if at domain boundary, otherwise w = w_old + dtau * slow_rhs + RHS_a (i,j,lo.z ) = prev_zmom(i,j,lo.z ) - stage_zmom(i,j,lo.z) + + dtau * slow_rhs_rho_w(i,j,lo.z); + + // w at top boundary of grid is 0 if at domain boundary, otherwise w = w_old + dtau * slow_rhs + RHS_a (i,j,hi.z+1) = prev_zmom(i,j,hi.z+1) - stage_zmom(i,j,hi.z+1) + + dtau * slow_rhs_rho_w(i,j,hi.z+1); + }); // b2d + +#ifdef AMREX_USE_GPU + if (l_implicit_substepping) { + + ParallelFor(b2d, [=] AMREX_GPU_DEVICE (int i, int j, int) + { + // w = specified Dirichlet value at k = lo.z + soln_a(i,j,lo.z) = RHS_a(i,j,lo.z) * inv_coeffB_a(i,j,lo.z); + cur_zmom(i,j,lo.z) = stage_zmom(i,j,lo.z) + soln_a(i,j,lo.z); + + for (int k = lo.z+1; k <= hi.z+1; k++) { + soln_a(i,j,k) = (RHS_a(i,j,k)-coeffA_a(i,j,k)*soln_a(i,j,k-1)) * inv_coeffB_a(i,j,k); + } + + cur_zmom(i,j,hi.z+1) = stage_zmom(i,j,hi.z+1) + soln_a(i,j,hi.z+1); + + for (int k = hi.z; k >= lo.z; k--) { + soln_a(i,j,k) -= ( coeffC_a(i,j,k) * inv_coeffB_a(i,j,k) ) *soln_a(i,j,k+1); + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + }); // b2d + + } else { // explicit substepping (beta_1 = 1; beta_2 = 0) + + ParallelFor(b2d, [=] AMREX_GPU_DEVICE (int i, int j, int) + { + for (int k = lo.z; k <= hi.z+1; k++) { + soln_a(i,j,k) = RHS_a(i,j,k); + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + }); // b2d + } // end of explicit substepping +#else + if (l_implicit_substepping) { + + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + soln_a(i,j,lo.z) = RHS_a(i,j,lo.z) * inv_coeffB_a(i,j,lo.z); + } + } + for (int k = lo.z+1; k <= hi.z+1; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + soln_a(i,j,k) = (RHS_a(i,j,k)-coeffA_a(i,j,k)*soln_a(i,j,k-1)) * inv_coeffB_a(i,j,k); + } + } + } + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + cur_zmom(i,j,hi.z+1) = stage_zmom(i,j,hi.z+1) + soln_a(i,j,hi.z+1); + } + } + for (int k = hi.z; k >= lo.z; --k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + soln_a(i,j,k) -= ( coeffC_a(i,j,k) * inv_coeffB_a(i,j,k) ) * soln_a(i,j,k+1); + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + } + } + } else { // explicit substepping (beta_1 = 1; beta_2 = 0) + + for (int k = lo.z; k <= hi.z+1; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + 
AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + + soln_a(i,j,k) = RHS_a(i,j,k); + + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + } + } + + } // end of explicit substepping +#endif + + // ************************************************************************** + // Define updates in the RHS of rho and (rho theta) + // ************************************************************************** + const Array4& prev_drho_w = Delta_rho_w.array(mfi); + ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + Real zflux_lo = beta_2 * soln_a(i,j,k ) + beta_1 * prev_drho_w(i,j,k ); + Real zflux_hi = beta_2 * soln_a(i,j,k+1) + beta_1 * prev_drho_w(i,j,k+1); + + avg_zmom(i,j,k) += facinv*zflux_lo / (mf_m(i,j,0) * mf_m(i,j,0)); + (flx_arr[2])(i,j,k,0) = zflux_lo / (mf_m(i,j,0) * mf_m(i,j,0)); + (flx_arr[2])(i,j,k,1) = (flx_arr[2])(i,j,k,0) * 0.5 * (prim(i,j,k) + prim(i,j,k-1)); + + if (k == vbx_hi.z) { + avg_zmom(i,j,k+1) += facinv * zflux_hi / (mf_m(i,j,0) * mf_m(i,j,0)); + (flx_arr[2])(i,j,k+1,0) = zflux_hi / (mf_m(i,j,0) * mf_m(i,j,0)); + (flx_arr[2])(i,j,k+1,1) = (flx_arr[2])(i,j,k+1,0) * 0.5 * (prim(i,j,k) + prim(i,j,k+1)); + } + + temp_rhs_arr(i,j,k,Rho_comp ) += dzi * ( zflux_hi - zflux_lo ); + temp_rhs_arr(i,j,k,RhoTheta_comp) += 0.5 * dzi * ( zflux_hi * (prim(i,j,k) + prim(i,j,k+1)) + - zflux_lo * (prim(i,j,k) + prim(i,j,k-1)) ); + }); + + // We only add to the flux registers in the final RK step + if (l_reflux && nrk == 2) { + int strt_comp_reflux = 0; + // For now we don't reflux (rho theta) because it seems to create issues at c/f boundaries + int num_comp_reflux = 1; + if (level < finest_level) { + fr_as_crse->CrseAdd(mfi, + {{AMREX_D_DECL(&(flux[0]), &(flux[1]), &(flux[2]))}}, + dx, dtau, strt_comp_reflux, strt_comp_reflux, num_comp_reflux, RunOn::Device); + } + if (level > 0) { + fr_as_fine->FineAdd(mfi, + {{AMREX_D_DECL(&(flux[0]), &(flux[1]), &(flux[2]))}}, + dx, dtau, strt_comp_reflux, strt_comp_reflux, num_comp_reflux, RunOn::Device); + } + + // This is necessary here so we don't go on to the next FArrayBox without + // having finished copying the fluxes into the FluxRegisters (since the fluxes + // are stored in temporary FArrayBox's) + Gpu::streamSynchronize(); + + } // two-way coupling + } // mfi + } // OMP + +#ifdef _OPENMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for ( MFIter mfi(S_stage_data[IntVars::cons],TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + + const Array4< Real>& cur_cons = S_data[IntVars::cons].array(mfi); + const Array4& prev_cons = S_prev[IntVars::cons].const_array(mfi); + auto const& temp_rhs_arr = temp_rhs.const_array(mfi); + auto const& slow_rhs_cons = S_slow_rhs[IntVars::cons].const_array(mfi); + + if (step == 0) { + ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + cur_cons(i,j,k,Rho_comp) = prev_cons(i,j,k,Rho_comp) + + dtau * (slow_rhs_cons(i,j,k,Rho_comp) - temp_rhs_arr(i,j,k,Rho_comp)); + cur_cons(i,j,k,RhoTheta_comp) = prev_cons(i,j,k,RhoTheta_comp) + + dtau * (slow_rhs_cons(i,j,k,RhoTheta_comp) - temp_rhs_arr(i,j,k,RhoTheta_comp)); + }); + } else { + ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + // + // We didn't need to set cur_cons = prev_cons above because they point to the same data for step > 0 + // + cur_cons(i,j,k,Rho_comp) += dtau * (slow_rhs_cons(i,j,k,Rho_comp) - temp_rhs_arr(i,j,k,Rho_comp)); + cur_cons(i,j,k,RhoTheta_comp) += dtau * (slow_rhs_cons(i,j,k,RhoTheta_comp) - 
temp_rhs_arr(i,j,k,RhoTheta_comp));
+            });
+        } // step = 0
+
+        const Array4<Real>& cur_xmom = S_data[IntVars::xmom].array(mfi);
+        const Array4<Real>& cur_ymom = S_data[IntVars::ymom].array(mfi);
+
+        const Array4<const Real>& temp_cur_xmom_arr = temp_cur_xmom.const_array(mfi);
+        const Array4<const Real>& temp_cur_ymom_arr = temp_cur_ymom.const_array(mfi);
+
+        Box tbx = surroundingNodes(bx,0);
+        Box tby = surroundingNodes(bx,1);
+
+        ParallelFor(tbx, tby,
+        [=] AMREX_GPU_DEVICE (int i, int j, int k)
+        {
+            cur_xmom(i,j,k) = temp_cur_xmom_arr(i,j,k);
+        },
+        [=] AMREX_GPU_DEVICE (int i, int j, int k)
+        {
+            cur_ymom(i,j,k) = temp_cur_ymom_arr(i,j,k);
+        });
+
+    } // mfi
+}
diff --git a/Source/Utils/ERF_PoissonSolve.cpp b/Source/Utils/ERF_PoissonSolve.cpp
index 0f731fe38..6ded377e7 100644
--- a/Source/Utils/ERF_PoissonSolve.cpp
+++ b/Source/Utils/ERF_PoissonSolve.cpp
@@ -45,7 +45,7 @@ bool ERF::projection_has_dirichlet (Array bcs) const
  * Project the single-level velocity field to enforce incompressibility
  * Note that the level may or may not be level 0.
  */
-void ERF::project_velocities (int lev, Real l_dt, Vector<MultiFab>& mom_mf, MultiFab& pmf)
+void ERF::project_velocities (int lev, Real l_dt, Vector<MultiFab>& mom_mf, MultiFab& /*Omega*/, MultiFab& pmf)
 {
     BL_PROFILE("ERF::project_velocities()");
diff --git a/Source/Utils/ERF_PoissonSolve_tb.cpp b/Source/Utils/ERF_PoissonSolve_tb.cpp
index 74139c6e4..165dfcc69 100644
--- a/Source/Utils/ERF_PoissonSolve_tb.cpp
+++ b/Source/Utils/ERF_PoissonSolve_tb.cpp
@@ -9,7 +9,7 @@ using namespace amrex;
  * Project the single-level velocity field to enforce incompressibility with a
  * thin body
  */
-void ERF::project_velocities_tb (int lev, Real l_dt, Vector<MultiFab>& vmf, MultiFab& pmf)
+void ERF::project_velocities_tb (int lev, Real l_dt, Vector<MultiFab>& vmf, MultiFab& /*Omega*/, MultiFab& pmf)
 {
     BL_PROFILE("ERF::project_velocities_tb()");
     AMREX_ALWAYS_ASSERT(!solverChoice.use_terrain);
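
For reference, the implicit branch of the acoustic substepping added above is a standard tridiagonal (Thomas) sweep
in k: a forward elimination that re-uses a pre-inverted diagonal, followed by back substitution. The stand-alone
sketch below is illustrative only -- the names thomas_solve, a, b, c and rhs are hypothetical and not part of ERF or
AMReX -- but it shows the same forward/backward pattern that erf_fast_rhs_N applies column by column, with the
diagonal inverted once up front in the spirit of make_fast_coeffs computing inv_coeff_B before the substeps.

#include <cstdio>
#include <vector>

// Solve a[k]*x[k-1] + b[k]*x[k] + c[k]*x[k+1] = rhs[k]  (a[0] and c[n-1] are unused).
// inv_b plays the role of inv_coeff_B: the inverse of the eliminated diagonal,
// computed once and then reused for every right-hand side.
std::vector<double> thomas_solve (const std::vector<double>& a,
                                  const std::vector<double>& b,
                                  const std::vector<double>& c,
                                  const std::vector<double>& rhs)
{
    const int n = static_cast<int>(rhs.size());
    std::vector<double> inv_b(n), x(n);

    // Pre-invert the eliminated diagonal (the analogue of make_fast_coeffs)
    inv_b[0] = 1.0 / b[0];
    for (int k = 1; k < n; ++k) {
        inv_b[k] = 1.0 / (b[k] - a[k] * c[k-1] * inv_b[k-1]);
    }

    // Forward elimination (the analogue of the k = lo.z+1 ... hi.z+1 loop)
    x[0] = rhs[0] * inv_b[0];
    for (int k = 1; k < n; ++k) {
        x[k] = (rhs[k] - a[k] * x[k-1]) * inv_b[k];
    }

    // Back substitution (the analogue of the k = hi.z ... lo.z loop)
    for (int k = n-2; k >= 0; --k) {
        x[k] -= (c[k] * inv_b[k]) * x[k+1];
    }
    return x;
}

int main ()
{
    // Small constant-coefficient system; the exact solution is x = (1, 1, 1, 1).
    std::vector<double> a  { 0.0, -1.0, -1.0, -1.0};
    std::vector<double> b  { 2.0,  2.0,  2.0,  2.0};
    std::vector<double> c  {-1.0, -1.0, -1.0,  0.0};
    std::vector<double> rhs{ 1.0,  0.0,  0.0,  1.0};

    for (double xk : thomas_solve(a, b, c, rhs)) { std::printf("%g\n", xk); }
    return 0;
}

Pre-computing inv_b once means each subsequent right-hand side costs only a multiply-add sweep up the column and one
down, with no divisions in the inner loops, which is presumably why the coefficients are carried in the fast_coeffs
MultiFab rather than re-derived inside every acoustic substep.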