From cf9593a186cffbb3ccfac4742b5bf798079ee1b0 Mon Sep 17 00:00:00 2001
From: Ann Almgren <asalmgren@lbl.gov>
Date: Sat, 23 Nov 2024 18:42:10 -0800
Subject: [PATCH] GMRES is now working with FFT preconditioner

---
 Docs/sphinx_doc/LinearSolvers.rst             |  16 +-
 Source/LinearSolvers/ERF_FFT_TerrainPrecond.H | 262 ++++++++++++++++++
 Source/LinearSolvers/ERF_PoissonSolve.cpp     |  75 ++++-
 Source/LinearSolvers/ERF_TerrainPoisson.H     |   5 +
 Source/LinearSolvers/ERF_TerrainPoisson.cpp   |  14 +-
 .../LinearSolvers/ERF_compute_divergence.cpp  |  16 +-
 Source/LinearSolvers/ERF_solve_with_fft.cpp   |   2 -
 Source/LinearSolvers/ERF_solve_with_gmres.cpp |   3 +-
 Source/LinearSolvers/Make.package             |   1 +
 9 files changed, 367 insertions(+), 27 deletions(-)
 create mode 100644 Source/LinearSolvers/ERF_FFT_TerrainPrecond.H

diff --git a/Docs/sphinx_doc/LinearSolvers.rst b/Docs/sphinx_doc/LinearSolvers.rst
index 9ac62c56c..6b6a01f2a 100644
--- a/Docs/sphinx_doc/LinearSolvers.rst
+++ b/Docs/sphinx_doc/LinearSolvers.rst
@@ -7,6 +7,18 @@
 Linear Solvers
 ==============
 
-Evolving the anelastic equation set requires solution of a Poisson equation in which we solve for the update to the perturbational pressure at cell centers.   AMReX provides several solver options: geometric multigrid, Fast Fourier Transforms (FFTs) and preconditioned GMRES.  For simulations with no terrain or grid stretching, one of the FFT options is generally the fastest solver, followed by multigrid.  We note that the multigrid solver has the option to ``ignore'' a coordinate direction if the domain is only one cell wide in that direction; this allows for efficient solution of effectively 2D problems.  Multigrid can also be used when the union of grids at a level is not in itself rectangular; the FFT solvers do not work in that general case.
+Evolving the anelastic equation set requires solution of a Poisson equation in which we solve for the update to the perturbational pressure at cell centers.
+ERF uses several solver options available through AMReX: geometric multigrid, Fast Fourier Transforms (FFTs) and preconditioned GMRES.
+For simulations with no terrain or grid stretching, one of the FFT options is generally the fastest solver,
+followed by multigrid.  We note that the multigrid solver has the option to ``ignore'' a coordinate direction
+if the domain is only one cell wide in that direction; this allows for efficient solution of effectively 2D problems.
+Multigrid can also be used when the union of grids at a level is not in itself rectangular; the FFT solvers do not work in that general case.
 
-For simulations using grid stretching in the vertical but flat terrain, we must use the hybrid FFT solver in which we perform 2D transforms only in the lateral directions and couple the solution in the vertical direction with a tridiagonal solve. In both these cases we use a 7-point stencil. To solve the Poisson equation on terrain-fitted coordinates with general terrain, we rely on the preconditioned GMRES solver since the stencil effectively has variable coefficients and requires 19 points.
+For simulations using grid stretching in the vertical but flat terrain, we must use the hybrid FFT solver in which
+we perform 2D transforms only in the lateral directions and couple the solution in the vertical direction with a tridiagonal solve.
+In both these cases we use a 7-point stencil.
+To solve the Poisson equation on terrain-fitted coordinates with general terrain,
+we rely on the FFT-preconditioned GMRES solver since the stencil effectively has variable coefficients and requires 19 points.
+
+   .. note::
+      **Currently only doubly periodic lateral boundary conditions are supported by the hybrid FFT, and therefore by the GMRES solver.  More general boundary conditions are a work in progress.**
diff --git a/Source/LinearSolvers/ERF_FFT_TerrainPrecond.H b/Source/LinearSolvers/ERF_FFT_TerrainPrecond.H
new file mode 100644
index 000000000..941fe3f29
--- /dev/null
+++ b/Source/LinearSolvers/ERF_FFT_TerrainPrecond.H
@@ -0,0 +1,262 @@
+#ifndef ERF_FFT_TERRAIN_PRECOND_H_
+#define ERF_FFT_TERRAIN_PRECOND_H_
+
+#include <AMReX_FFT.H>
+#include <AMReX_FFT_Poisson.H>
+#include <AMReX_Geometry.H>
+
+namespace amrex::FFT
+{
+
+/**
+ * \brief 3D Preconditioner for terrain problems with periodic boundaries in the first two
+ * dimensions and Neumann in the last dimension.
+ */
+template <typename MF = MultiFab>
+class PoissonTerrainPrecond
+{
+public:
+    using T = typename MF::value_type;
+
+    template <typename FA=MF, std::enable_if_t<IsFabArray_v<FA>,int> = 0>
+    explicit PoissonTerrainPrecond (Geometry const& geom)
+        : m_geom(geom), m_r2c(geom.Domain(), Info().setBatchMode(true))
+    {
+        AMREX_ALWAYS_ASSERT(geom.isPeriodic(0) && geom.isPeriodic(1));
+    }
+
+    void solve (MF& soln, MF const& rhs, MF const& height);
+
+    template <typename DZ>
+    void solve_doit (MF& soln, MF const& rhs, MF const& height, DZ const& dz); // has to be public for cuda
+
+private:
+    Geometry m_geom;
+    R2C<typename MF::value_type, Direction::both> m_r2c;
+};
+
+template <typename MF>
+void PoissonTerrainPrecond<MF>::solve (MF& soln, MF const& rhs, MF const& height)
+{
+    auto delz = T(m_geom.CellSize(AMREX_SPACEDIM-1));
+    solve_doit(soln, rhs, height, fft_poisson_detail::DZ<T>{delz});
+}
+
+template <typename MF>
+template <typename DZ>
+void PoissonTerrainPrecond<MF>::solve_doit (MF& soln, MF const& rhs, MF const& height, DZ const& dz)
+{
+    BL_PROFILE("FFT::PoissonTerrainPrecond::solve");
+
+#if (AMREX_SPACEDIM < 3)
+    amrex::ignore_unused(soln, rhs, dz);
+#else
+    auto facx = T(2)*Math::pi<T>()/T(m_geom.ProbLength(0));
+    auto facy = T(2)*Math::pi<T>()/T(m_geom.ProbLength(1));
+
+    auto dx =T(m_geom.CellSize(0));
+    auto dy = T(m_geom.CellSize(1));
+
+    auto dxinv = T(m_geom.InvCellSize(0));
+    auto dyinv = T(m_geom.InvCellSize(1));
+
+    auto scale = T(1.0)/(T(m_geom.Domain().length(0)) *
+                         T(m_geom.Domain().length(1)));
+    auto ny = m_geom.Domain().length(1);
+    auto nz = m_geom.Domain().length(2);
+
+    Box cdomain = m_geom.Domain();
+    cdomain.setBig(0,cdomain.length(0)/2);
+    auto cba = amrex::decompose(cdomain, ParallelContext::NProcsSub(),
+                                {AMREX_D_DECL(true,true,false)});
+    DistributionMapping dm = detail::make_iota_distromap(cba.size());
+    FabArray<BaseFab<GpuComplex<T> > > spmf(cba, dm, 1, 0);
+
+    m_r2c.forward(rhs, spmf);
+
+    for (MFIter mfi(spmf); mfi.isValid(); ++mfi)
+    {
+        auto const& spectral = spmf.array(mfi);
+        auto const& box = mfi.validbox();
+        auto const& xybox = amrex::makeSlab(box, 2, 0);
+
+        auto const zp = height.const_array(mfi);
+
+#ifdef AMREX_USE_GPU
+        // xxxxx TODO: We need to explore how to optimize this
+        // function. Maybe we can use cusparse. Maybe we should make
+        // z-direction to be the unit stride direction.
+
+        FArrayBox tridiag_workspace(box,4);
+        auto const& ald = tridiag_workspace.array(0);
+        auto const& bd = tridiag_workspace.array(1);
+        auto const& cud = tridiag_workspace.array(2);
+        auto const& scratch = tridiag_workspace.array(3);
+
+        amrex::ParallelFor(xybox, [=] AMREX_GPU_DEVICE (int i, int j, int)
+        {
+            T a = facx*i;
+            T b = (j < ny/2) ? facy*j : facy*(ny-j);
+
+            T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx)
+                +  T(2)*(std::cos(b*dy)-T(1))/(dy*dy);
+
+            // Tridiagonal solve with homogeneous Neumann
+            for(int k=0; k < nz; k++) {
+                Real hzeta_inv_on_cc = 4.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
+                                              -(zp(i,j,k  ) + zp(i+1,j,k  ) + zp(i,j+1,k  ) + zp(i+1,j+1,k  )) );
+                if(k==0) {
+
+                    Real hzeta_inv_on_zhi = 8.0 / ( (zp(i,j,k+2) + zp(i+1,j,k+2) + zp(i,j+1,k+2) + zp(i+1,j+1,k+2))
+                                                   -(zp(i,j,k  ) + zp(i+1,j,k  ) + zp(i,j+1,k  ) + zp(i+1,j+1,k  )) );
+                    Real h_xi_on_zhi  = 0.5 * (zp(i+1,j+1,k+1) + zp(i+1,j,k+1) - zp(i,j+1,k+1) - zp(i,j,k+1)) * dxinv;
+                    Real h_eta_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i,j+1,k+1) - zp(i+1,j,k+1) - zp(i,j,k+1)) * dyinv;
+
+                    ald(i,j,k) = 0.;
+                    cud(i,j,k) = hzeta_inv_on_cc * (1.0 + h_xi_on_zhi*h_xi_on_zhi + h_eta_on_zhi*h_eta_on_zhi) * hzeta_inv_on_zhi;
+                    bd(i,j,k) = k2 - ald(i,j,k) - cud(i,j,k);
+
+                } else if (k == nz-1) {
+
+                    Real hzeta_inv_on_zlo = 8.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
+                                                   -(zp(i,j,k-1) + zp(i+1,j,k-1) + zp(i,j+1,k-1) + zp(i+1,j+1,k-1)) );
+                    Real h_xi_on_zlo  = 0.5 * (zp(i+1,j+1,k  ) + zp(i+1,j,k  ) - zp(i,j+1,k  ) - zp(i,j,k  )) * dxinv;
+                    Real h_eta_on_zlo = 0.5 * (zp(i+1,j+1,k  ) + zp(i,j+1,k  ) - zp(i+1,j,k  ) - zp(i,j,k  )) * dyinv;
+                    ald(i,j,k) = hzeta_inv_on_cc * (1.0 + h_xi_on_zlo*h_xi_on_zlo + h_eta_on_zlo*h_eta_on_zlo) * hzeta_inv_on_zlo;
+                    cud(i,j,k) = 0.;
+                    bd(i,j,k) = k2 - ald(i,j,k) - cud(i,j,k);
+                    if (i == 0 && j == 0) {
+                        bd(i,j,k) *= 2.0;
+                    }
+                } else {
+                    Real hzeta_inv_on_zlo = 8.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
+                                                   -(zp(i,j,k-1) + zp(i+1,j,k-1) + zp(i,j+1,k-1) + zp(i+1,j+1,k-1)) );
+                    Real h_xi_on_zlo  = 0.5 * (zp(i+1,j+1,k  ) + zp(i+1,j,k  ) - zp(i,j+1,k  ) - zp(i,j,k  )) * dxinv;
+                    Real h_eta_on_zlo = 0.5 * (zp(i+1,j+1,k  ) + zp(i,j+1,k  ) - zp(i+1,j,k  ) - zp(i,j,k  )) * dyinv;
+
+                    Real hzeta_inv_on_zhi = 8.0 / ( (zp(i,j,k+2) + zp(i+1,j,k+2) + zp(i,j+1,k+2) + zp(i+1,j+1,k+2))
+                                                   -(zp(i,j,k  ) + zp(i+1,j,k  ) + zp(i,j+1,k  ) + zp(i+1,j+1,k  )) );
+                    Real h_xi_on_zhi  = 0.5 * (zp(i+1,j+1,k+1) + zp(i+1,j,k+1) - zp(i,j+1,k+1) - zp(i,j,k+1)) * dxinv;
+                    Real h_eta_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i,j+1,k+1) - zp(i+1,j,k+1) - zp(i,j,k+1)) * dyinv;
+
+                    ald(i,j,k) = hzeta_inv_on_cc * (1.0 + h_xi_on_zlo*h_xi_on_zlo + h_eta_on_zlo*h_eta_on_zlo) * hzeta_inv_on_zlo;
+                    cud(i,j,k) = hzeta_inv_on_cc * (1.0 + h_xi_on_zhi*h_xi_on_zhi + h_eta_on_zhi*h_eta_on_zhi) * hzeta_inv_on_zhi;
+                    bd(i,j,k) = k2 - ald(i,j,k) - cud(i,j,k);
+
+                }
+            }
+
+            scratch(i,j,0) = cud(i,j,0)/bd(i,j,0);
+            spectral(i,j,0) = spectral(i,j,0)/bd(i,j,0);
+
+            for (int k = 1; k < nz; k++) {
+                if (k < nz-1) {
+                    scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
+                }
+                spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k) * spectral(i,j,k - 1))
+                    / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
+            }
+
+            for (int k = nz - 2; k >= 0; k--) {
+                spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1);
+            }
+
+            for (int k = 0; k < nz; ++k) {
+                spectral(i,j,k) *= scale;
+            }
+        });
+        Gpu::streamSynchronize();
+
+#else
+
+        Gpu::DeviceVector<GpuComplex<Real>> ald(nz);
+        Gpu::DeviceVector<GpuComplex<Real>> bd(nz);
+        Gpu::DeviceVector<GpuComplex<Real>> cud(nz);
+        Gpu::DeviceVector<GpuComplex<Real>> scratch(nz);
+
+        amrex::LoopOnCpu(xybox, [&] (int i, int j, int)
+        {
+            T a = facx*i;
+            T b = (j < ny/2) ? facy*j : facy*(ny-j);
+
+            T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx)
+                +  T(2)*(std::cos(b*dy)-T(1))/(dy*dy);
+
+            // Tridiagonal solve with homogeneous Neumann
+            for(int k=0; k < nz; k++) {
+
+                Real hzeta_inv_on_cc = 4.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
+                                              -(zp(i,j,k  ) + zp(i+1,j,k  ) + zp(i,j+1,k  ) + zp(i+1,j+1,k  )) );
+
+                if(k==0) {
+
+                    Real hzeta_inv_on_zhi = 8.0 / ( (zp(i,j,k+2) + zp(i+1,j,k+2) + zp(i,j+1,k+2) + zp(i+1,j+1,k+2))
+                                                   -(zp(i,j,k  ) + zp(i+1,j,k  ) + zp(i,j+1,k  ) + zp(i+1,j+1,k  )) );
+                    Real h_xi_on_zhi  = 0.5 * (zp(i+1,j+1,k+1) + zp(i+1,j,k+1) - zp(i,j+1,k+1) - zp(i,j,k+1)) * dxinv;
+                    Real h_eta_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i,j+1,k+1) - zp(i+1,j,k+1) - zp(i,j,k+1)) * dyinv;
+
+                    ald[k] = 0.;
+                    cud[k] = hzeta_inv_on_cc * (1.0 + h_xi_on_zhi*h_xi_on_zhi + h_eta_on_zhi*h_eta_on_zhi) * hzeta_inv_on_zhi;
+                    bd[k]  = k2 -ald[k]-cud[k];
+
+                } else if (k == nz-1) {
+
+                    Real hzeta_inv_on_zlo = 8.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
+                                                   -(zp(i,j,k-1) + zp(i+1,j,k-1) + zp(i,j+1,k-1) + zp(i+1,j+1,k-1)) );
+                    Real h_xi_on_zlo  = 0.5 * (zp(i+1,j+1,k  ) + zp(i+1,j,k  ) - zp(i,j+1,k  ) - zp(i,j,k  )) * dxinv;
+                    Real h_eta_on_zlo = 0.5 * (zp(i+1,j+1,k  ) + zp(i,j+1,k  ) - zp(i+1,j,k  ) - zp(i,j,k  )) * dyinv;
+
+                    ald[k] = hzeta_inv_on_cc * (1.0 + h_xi_on_zlo*h_xi_on_zlo + h_eta_on_zlo*h_eta_on_zlo) * hzeta_inv_on_zlo;
+                    cud[k] = 0.;
+                    bd[k]  = k2 -ald[k]-cud[k];
+
+                    if (i == 0 && j == 0) {
+                        bd[k] *= 2.0;
+                    }
+                } else {
+
+                    Real hzeta_inv_on_zlo = 8.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
+                                                   -(zp(i,j,k-1) + zp(i+1,j,k-1) + zp(i,j+1,k-1) + zp(i+1,j+1,k-1)) );
+                    Real h_xi_on_zlo  = 0.5 * (zp(i+1,j+1,k  ) + zp(i+1,j,k  ) - zp(i,j+1,k  ) - zp(i,j,k  )) * dxinv;
+                    Real h_eta_on_zlo = 0.5 * (zp(i+1,j+1,k  ) + zp(i,j+1,k  ) - zp(i+1,j,k  ) - zp(i,j,k  )) * dyinv;
+
+                    Real hzeta_inv_on_zhi = 8.0 / ( (zp(i,j,k+2) + zp(i+1,j,k+2) + zp(i,j+1,k+2) + zp(i+1,j+1,k+2))
+                                                   -(zp(i,j,k  ) + zp(i+1,j,k  ) + zp(i,j+1,k  ) + zp(i+1,j+1,k  )) );
+                    Real h_xi_on_zhi  = 0.5 * (zp(i+1,j+1,k+1) + zp(i+1,j,k+1) - zp(i,j+1,k+1) - zp(i,j,k+1)) * dxinv;
+                    Real h_eta_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i,j+1,k+1) - zp(i+1,j,k+1) - zp(i,j,k+1)) * dyinv;
+
+                    ald[k] = hzeta_inv_on_cc * (1.0 + h_xi_on_zlo*h_xi_on_zlo + h_eta_on_zlo*h_eta_on_zlo) * hzeta_inv_on_zlo;
+                    cud[k] = hzeta_inv_on_cc * (1.0 + h_xi_on_zhi*h_xi_on_zhi + h_eta_on_zhi*h_eta_on_zhi) * hzeta_inv_on_zhi;
+                    bd[k]  = k2 - ald[k] - cud[k];
+                }
+            }
+
+            scratch[0] = cud[0]/bd[0];
+            spectral(i,j,0) = spectral(i,j,0)/bd[0];
+
+            for (int k = 1; k < nz; k++) {
+                if (k < nz-1) {
+                    scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1]);
+                }
+                spectral(i,j,k) = (spectral(i,j,k) - ald[k] * spectral(i,j,k - 1))
+                    / (bd[k] - ald[k] * scratch[k-1]);
+            }
+
+            for (int k = nz - 2; k >= 0; k--) {
+                spectral(i,j,k) -= scratch[k] * spectral(i,j,k + 1);
+            }
+
+            for (int k = 0; k < nz; ++k) {
+                spectral(i,j,k) *= scale;
+            }
+        });
+#endif
+    }
+
+    m_r2c.backward(spmf, soln);
+#endif
+}
+
+}
+
+#endif
diff --git a/Source/LinearSolvers/ERF_PoissonSolve.cpp b/Source/LinearSolvers/ERF_PoissonSolve.cpp
index f2f2c198f..63cb26ed0 100644
--- a/Source/LinearSolvers/ERF_PoissonSolve.cpp
+++ b/Source/LinearSolvers/ERF_PoissonSolve.cpp
@@ -17,7 +17,6 @@ void ERF::project_velocities (int lev, Real l_dt, Vector<MultiFab>& mom_mf, Mult
 #endif
 
     bool l_use_terrain = SolverChoice::terrain_type != TerrainType::None;
-    bool use_gmres     = (l_use_terrain && !SolverChoice::terrain_is_flat);
 
     // Make sure the solver only sees the levels over which we are solving
     Vector<BoxArray>            ba_tmp;   ba_tmp.push_back(mom_mf[Vars::cons].boxArray());
@@ -111,27 +110,77 @@ void ERF::project_velocities (int lev, Real l_dt, Vector<MultiFab>& mom_mf, Mult
     // ****************************************************************************
     // Choose the solver and solve
     // ****************************************************************************
+
+    // ****************************************************************************
+    // EB
+    // ****************************************************************************
 #ifdef ERF_USE_EB
     solve_with_EB_mlmg(lev, rhs, phi, fluxes);
 #else
 
-#ifdef ERF_USE_FFT
     bool boxes_make_rectangle = (geom_tmp[0].Domain().numPts() == ba_tmp[0].numPts());
-    if (use_fft && boxes_make_rectangle) {
 
-        solve_with_fft(lev, rhs[0], phi[0], fluxes[0]);
+    // ****************************************************************************
+    // No terrain or grid stretching
+    // ****************************************************************************
+    if (!l_use_terrain) {
+#ifdef ERF_USE_FFT
+        if (use_fft) {
+            if (boxes_make_rectangle) {
+                solve_with_fft(lev, rhs[0], phi[0], fluxes[0]);
+            } else {
+                amrex::Warning("FFT won't work unless the boxArray covers the domain: defaulting to MLMG");
+                solve_with_mlmg(lev, rhs, phi, fluxes);
+            }
+        } else {
+            solve_with_mlmg(lev, rhs, phi, fluxes);
+        }
+#else
+        if (use_fft) {
+            amrex::Warning("use_fft can't be used unless you build with USE_FFT = TRUE; defaulting to MLMG");
+        }
+        solve_with_mlmg(lev, rhs, phi, fluxes);
+#endif
+    } // No terrain or grid stretching
 
-    } else
+    // ****************************************************************************
+    // Grid stretching (flat terrain)
+    // ****************************************************************************
+    else if (l_use_terrain && SolverChoice::terrain_is_flat) {
+#ifndef ERF_USE_FFT
+        amrex::Abort("Rebuild with USE_FFT = TRUE so you can use the FFT solver");
+#else
+        if (!boxes_make_rectangle) {
+            amrex::Abort("FFT won't work unless the boxArray covers the domain");
+        } else {
+            if (!use_fft) {
+                amrex::Warning("Using FFT even though you didnt set use_fft = 0; it's the best choice");
+            }
+            solve_with_fft(lev, rhs[0], phi[0], fluxes[0]);
+        }
 #endif
-    if (use_gmres)
-    {
-        solve_with_gmres(lev, rhs, phi, fluxes[0]);
-    }
-    else
-    {
-        solve_with_mlmg(lev, rhs, phi, fluxes);
-    }
+    } // grid stretching
+
+    // ****************************************************************************
+    // General terrain
+    // ****************************************************************************
+    else if (l_use_terrain && !SolverChoice::terrain_is_flat) {
+#ifdef ERF_USE_FFT
+        if (use_fft)
+        {
+            amrex::Warning("FFT solver does not work for general terrain: switching to GMRES");
+        }
+        if (!boxes_make_rectangle) {
+            amrex::Abort("FFT preconditioner for GMRES won't work unless the boxArray covers the domain");
+        } else {
+            solve_with_gmres(lev, rhs, phi, fluxes[0]);
+        }
+#else
+        amrex::Abort("Rebuild with USE_FFT = TRUE so you can use the FFT preconditioner for GMRES");
 #endif
+    } // general terrain
+
+#endif // not EB
 
     // ****************************************************************************
     // Print time in solve
diff --git a/Source/LinearSolvers/ERF_TerrainPoisson.H b/Source/LinearSolvers/ERF_TerrainPoisson.H
index ed571718c..281293ef6 100644
--- a/Source/LinearSolvers/ERF_TerrainPoisson.H
+++ b/Source/LinearSolvers/ERF_TerrainPoisson.H
@@ -2,6 +2,7 @@
 #define ERF_TERRAIN_POISSON_H_
 
 #include "ERF_TerrainPoisson_3D_K.H"
+#include "ERF_FFT_TerrainPrecond.H"
 
 #include <AMReX_Geometry.H>
 #include <AMReX_MultiFab.H>
@@ -18,6 +19,8 @@ public:
 
     void apply(amrex::MultiFab& lhs, amrex::MultiFab const& rhs);
 
+    void usePrecond(bool precond_flag);
+
     void getFluxes(amrex::MultiFab& phi, amrex::Array<amrex::MultiFab,AMREX_SPACEDIM>& fluxes);
 
     void assign(amrex::MultiFab& lhs, amrex::MultiFab const& rhs);
@@ -42,10 +45,12 @@ public:
     void setToZero(amrex::MultiFab& v);
 
 private:
+    bool m_use_precond = false;
     amrex::Geometry m_geom;
     amrex::BoxArray m_grids;
     amrex::DistributionMapping m_dmap;
     const amrex::MultiFab* m_zphys;
+    std::unique_ptr<amrex::FFT::PoissonTerrainPrecond<amrex::MultiFab>> m_2D_fft_precond;
 };
 
 #endif
diff --git a/Source/LinearSolvers/ERF_TerrainPoisson.cpp b/Source/LinearSolvers/ERF_TerrainPoisson.cpp
index 07128f309..c6f2918ea 100644
--- a/Source/LinearSolvers/ERF_TerrainPoisson.cpp
+++ b/Source/LinearSolvers/ERF_TerrainPoisson.cpp
@@ -10,6 +10,14 @@ TerrainPoisson::TerrainPoisson (Geometry const& geom, BoxArray const& ba,
       m_dmap(dm),
       m_zphys(z_phys_nd)
 {
+    if (!m_2D_fft_precond) {
+        m_2D_fft_precond = std::make_unique<FFT::PoissonTerrainPrecond<MultiFab>>(geom);
+    }
+}
+
+void TerrainPoisson::usePrecond(bool use_precond_in)
+{
+    m_use_precond = use_precond_in;
 }
 
 void TerrainPoisson::apply(MultiFab& lhs, MultiFab const& rhs)
@@ -178,7 +186,11 @@ Real TerrainPoisson::norm2(MultiFab const& v)
 
 void TerrainPoisson::precond(MultiFab& lhs, MultiFab const& rhs)
 {
-    MultiFab::Copy(lhs, rhs, 0, 0, 1, 0);
+    if (m_use_precond) {
+        m_2D_fft_precond->solve(lhs, rhs, *m_zphys);
+    } else {
+        MultiFab::Copy(lhs, rhs, 0, 0, 1, 0);
+    }
 }
 
 void TerrainPoisson::setToZero(MultiFab& v)
diff --git a/Source/LinearSolvers/ERF_compute_divergence.cpp b/Source/LinearSolvers/ERF_compute_divergence.cpp
index b48fe23e1..6a96dd8c6 100644
--- a/Source/LinearSolvers/ERF_compute_divergence.cpp
+++ b/Source/LinearSolvers/ERF_compute_divergence.cpp
@@ -28,9 +28,8 @@ void ERF::compute_divergence (int lev, MultiFab& rhs, Vector<MultiFab>& mom_mf,
     bool already_on_centroids = true;
     EB_computeDivergence(rhs, rho0_u_const, geom_at_lev, already_on_centroids);
 #else
-    // if (l_use_terrain && SolverChoice::terrain_is_flat) {
-    if (0) {
-
+    if (l_use_terrain && SolverChoice::terrain_is_flat)
+    {
         for ( MFIter mfi(rhs,TilingIfNotGPU()); mfi.isValid(); ++mfi)
         {
             Box bx = mfi.tilebox();
@@ -47,9 +46,9 @@ void ERF::compute_divergence (int lev, MultiFab& rhs, Vector<MultiFab>& mom_mf,
                                  +(rho0w_arr(i,j,k+1) - rho0w_arr(i,j,k)) / dz;
             });
         } // mfi
-
-    } else if (l_use_terrain) { // terrain is not flat
-
+    }
+    else if (l_use_terrain) // terrain is not flat
+    {
         //
         // Note we compute the divergence using "rho0w" == Omega
         //
@@ -74,9 +73,10 @@ void ERF::compute_divergence (int lev, MultiFab& rhs, Vector<MultiFab>& mom_mf,
             });
         } // mfi
 
-    } else { // no terrain
+    }
+    else // no terrain
+    {
         computeDivergence(rhs, rho0_u_const, geom_at_lev);
-
     }
 #endif
 }
diff --git a/Source/LinearSolvers/ERF_solve_with_fft.cpp b/Source/LinearSolvers/ERF_solve_with_fft.cpp
index da837be7f..339741b18 100644
--- a/Source/LinearSolvers/ERF_solve_with_fft.cpp
+++ b/Source/LinearSolvers/ERF_solve_with_fft.cpp
@@ -50,8 +50,6 @@ void ERF::solve_with_fft (int lev, MultiFab& rhs, MultiFab& phi, Array<MultiFab,
 {
     BL_PROFILE("ERF::solve_with_fft()");
 
-    AMREX_ALWAYS_ASSERT(use_fft);
-
     // For now the logic only works correctly for lev == 0
     // TODO: enable FFT for nested domains at lev > 0
     AMREX_ALWAYS_ASSERT(lev == 0);
diff --git a/Source/LinearSolvers/ERF_solve_with_gmres.cpp b/Source/LinearSolvers/ERF_solve_with_gmres.cpp
index 9d53e9f43..189dce0dc 100644
--- a/Source/LinearSolvers/ERF_solve_with_gmres.cpp
+++ b/Source/LinearSolvers/ERF_solve_with_gmres.cpp
@@ -1,6 +1,7 @@
 #include "ERF.H"
 #include "ERF_Utils.H"
 #include "ERF_TerrainPoisson.H"
+#include "ERF_FFT_TerrainPrecond.H"
 
 #include <AMReX_GMRES.H>
 
@@ -24,7 +25,7 @@ void ERF::solve_with_gmres (int lev, Vector<MultiFab>& rhs, Vector<MultiFab>& ph
 
     gmsolver.setVerbose(mg_verbose);
 
-//  tp.usePrecond(false);
+    tp.usePrecond(true);
 
     gmsolver.solve(phi[0], rhs[0], reltol, abstol);
 
diff --git a/Source/LinearSolvers/Make.package b/Source/LinearSolvers/Make.package
index 068a92f8b..3ec95b6cc 100644
--- a/Source/LinearSolvers/Make.package
+++ b/Source/LinearSolvers/Make.package
@@ -1,5 +1,6 @@
 CEXE_headers += ERF_TerrainPoisson_3D_K.H
 CEXE_headers += ERF_TerrainPoisson.H
+CEXE_headers += ERF_FFT_TerrainPrecond.H
 CEXE_sources += ERF_TerrainPoisson.cpp
 
 CEXE_sources += ERF_compute_divergence.cpp