diff --git a/examples/09_Disorder_KPM.jl b/examples/09_Disorder_KPM.jl
index c7a286c46..7c6ca5901 100644
--- a/examples/09_Disorder_KPM.jl
+++ b/examples/09_Disorder_KPM.jl
@@ -47,12 +47,13 @@ plot_intensities(res)
 
 sys_inhom = to_inhomogeneous(repeat_periodically(sys, (10, 10, 1)))
 
-# Use [`symmetry_equivalent_bonds`](@ref) to iterate over all nearest neighbor
-# bonds of the inhomogeneous system. Modify each AFM exchange with a noise term
-# that has variance of 1/3. The newly minimized energy configuration allows for
-# long wavelength modulations on top of the original 120° order.
+# Iterate over all [`symmetry_equivalent_bonds`](@ref) for nearest neighbor
+# sites in the inhomogeneous system. Set the AFM exchange to include a noise
+# term that has a standard deviation of 1/3. The newly minimized energy
+# configuration allows for long wavelength modulations on top of the original
+# 120° order.
 
-for (site1, site2, offset) in symmetry_equivalent_bonds(sys_inhom, Bond(1,1,[1,0,0]))
+for (site1, site2, offset) in symmetry_equivalent_bonds(sys_inhom, Bond(1, 1, [1, 0, 0]))
     noise = randn()/3
     set_exchange_at!(sys_inhom, 1.0 + noise, site1, site2; offset)
 end
@@ -60,38 +61,32 @@ end
 minimize_energy!(sys_inhom, maxiters=5_000)
 plot_spins(sys_inhom; color=[S[3] for S in sys_inhom.dipoles], ndims=2)
 
-# Traditional spin wave theory calculations become impractical for large system
-# sizes. Significant acceleration is possible with the [kernel polynomial
-# method](https://arxiv.org/abs/2312.08349). Enable it by selecting
-# [`SpinWaveTheoryKPM`](@ref) in place of the traditional
-# [`SpinWaveTheory`](@ref). Using KPM, the cost of an [`intensities`](@ref)
-# calculation becomes linear in system size and scales inversely with the width
-# of the line broadening `kernel`. Error tolerance is controlled through the
-# dimensionless `tol` parameter. A relatively small value, `tol = 0.01`, helps
-# to resolve the large intensities near the ordering wavevector. The alternative
-# choice `tol = 0.1` would be twice faster, but would introduce significant
-# numerical artifacts.
+# Spin wave theory with exact diagonalization becomes impractical for large
+# system sizes. Significant acceleration is possible with an iterative Krylov
+# space solver. With [`SpinWaveTheoryKPM`](@ref), the cost of an
+# [`intensities`](@ref) calculation scales linearly in the system size and
+# inversely with the width of the line broadening `kernel`. Good choices for the
+# dimensionless error tolerance are `tol=0.05` (more speed) or `tol=0.01` (more
+# accuracy).
 #
 # Observe from the KPM calculation that disorder in the nearest-neighbor
 # exchange serves to broaden the discrete excitation bands into a continuum.
 
-swt = SpinWaveTheoryKPM(sys_inhom; measure=ssf_perp(sys_inhom), tol=0.01)
+swt = SpinWaveTheoryKPM(sys_inhom; measure=ssf_perp(sys_inhom), tol=0.05)
 res = intensities(swt, path; energies, kernel)
 plot_intensities(res)
 
 # Now apply a magnetic field of magnitude 7.5 (energy units) along the global
-# ``ẑ`` axis. This field fully polarizes the spins. Because gap opens, a larger
-# tolerance of `tol = 0.1` can be used to accelerate the KPM calculation without
-# sacrificing much accuracy. The resulting spin wave spectrum shows a sharp mode
-# at the Γ-point (zone center) that broadens into a continuum along the K and M
-# points (zone boundary).
+# ``ẑ`` axis. This field fully polarizes the spins and opens a gap. The spin
+# wave spectrum shows a sharp mode at the Γ-point (zone center) that broadens
+# into a continuum along the K and M points (zone boundary).
 
 set_field!(sys_inhom, [0, 0, 7.5])
 randomize_spins!(sys_inhom)
 minimize_energy!(sys_inhom)
 
 energies = range(0.0, 9.0, 150)
-swt = SpinWaveTheoryKPM(sys_inhom; measure=ssf_perp(sys_inhom), tol=0.1)
+swt = SpinWaveTheoryKPM(sys_inhom; measure=ssf_perp(sys_inhom), tol=0.05)
 res = intensities(swt, path; energies, kernel)
 plot_intensities(res)
 
@@ -105,8 +100,10 @@ for site in eachsite(sys_inhom)
     noise = randn()/6
     sys_inhom.gs[site] = [1 0 0; 0 1 0; 0 0 1+noise]
 end
+randomize_spins!(sys_inhom)
+minimize_energy!(sys_inhom)
 
-swt = SpinWaveTheoryKPM(sys_inhom; measure=ssf_perp(sys_inhom), tol=0.1)
+swt = SpinWaveTheoryKPM(sys_inhom; measure=ssf_perp(sys_inhom), tol=0.05)
 res = intensities(swt, path; energies, kernel)
 plot_intensities(res)
 
diff --git a/src/KPM/Lanczos.jl b/src/KPM/Lanczos.jl
index bb5ea2b67..a362e87dc 100644
--- a/src/KPM/Lanczos.jl
+++ b/src/KPM/Lanczos.jl
@@ -1,13 +1,40 @@
-# Reference implementation, without consideration of speed.
+# Reference implementation of the generalized Lanczos method, without
+# consideration of speed. The input A can be any Hermitian matrix. The optional
+# argument S is an inner-product that must be both Hermitian and positive
+# definite. Intuitively, Lanczos finds a low-rank approximant A S ≈ Q T Q† S,
+# where T is tridiagonal and Q is orthogonal in the sense that Q† S Q = I. The
+# first column of Q is the initial vector v, which must be normalized. The
+# Lanczos method is useful in two ways. First, eigenvalues of T are
+# representative of extremal eigenvalues of A (in an appropriate S-measure
+# sense). Second, one obtains a very good approximation to the matrix-vector
+# product, f(A S) v ≈ Q f(T) e₁, valid for any function f [1].
+# 
+# The generalization of Lanczos to an arbitrary measure S is implemented as
+# follows: Each matrix-vector product A v is replaced by A S v, and each vector
+# inner product w† v is replaced by w† S v. Similar generalizations of Lanczos
+# have been considered in [2] and [3].
+#
+# 1. N. Amsel, T. Chen, A. Greenbaum, C. Musco, C. Musco, Near-optimal
+#    approximation of matrix functions by the Lanczos method, (2013)
+#    https://arxiv.org/abs/2303.03358v2.
+# 2. H. A. van der Vorst, Math. Comp. 39, 559-561 (1982),
+#    https://doi.org/10.1090/s0025-5718-1982-0669648-0
+# 3. M. Grüning, A. Marini, X. Gonze, Comput. Mater. Sci. 50, 2148-2156 (2011),
+#    https://doi.org/10.1016/j.commatsci.2011.02.021.
 function lanczos_ref(A, S, v; niters)
     αs = Float64[]
     βs = Float64[]
+    vs = typeof(v)[]
+
+    c = sqrt(real(dot(v, S, v)))
+    @assert c ≈ 1 "Initial vector not normalized"
+    v /= c
 
-    v /= sqrt(real(dot(v, S, v)))
     w = A * S * v
     α = real(dot(w, S, v))
     w = w - α * v
     push!(αs, α)
+    push!(vs, v)
 
     for _ in 2:niters
         β = sqrt(real(dot(w, S, w)))
@@ -19,35 +46,54 @@ function lanczos_ref(A, S, v; niters)
         v = vp
         push!(αs, α)
         push!(βs, β)
+        push!(vs, v)
     end
 
-    return SymTridiagonal(αs, βs)
+    Q = reduce(hcat, vs)
+    T = SymTridiagonal(αs, βs)
+    return (; Q, T)
 end
 
 
-
-# Use Lanczos iterations to find a truncated tridiagonal representation of A S,
-# up to similarity transformation. Here, A is any Hermitian matrix, while S is
-# both Hermitian and positive definite. Traditional Lanczos uses the identity
-# matrix, S = I. The extension to non-identity matrices S is as follows: Each
-# matrix-vector product A v becomes A S v, and each vector inner product w† v
-# becomes w† S v. The implementation below follows the notation of Wikipedia,
-# and is the most stable of the four variants considered by Paige [1]. Each
-# Lanczos iteration requires only one matrix-vector multiplication for A and S,
-# respectively.
-
-# Similar generalizations of Lanczos have been considered in [2] and [3].
+# An optimized implementation of the generalized Lanczos method. See
+# `lanczos_ref` for explanation, and a reference implementation. This optimized
+# implementation follows "the most stable" of the four variants considered by
+# Paige [1], as listed on Wikipedia. Here, however, we allow for an arbitary
+# measure S. In practice, this means that each matrix-vector product A v is
+# replaced by A S v, and each inner product w† v is replaced by w† S v.
+#
+# In this implementation, each Lanczos iteration requires only one matrix-vector
+# multiplication involving A and S, respectively. 
+#
+# Details:
+#
+# * The return value is a pair, (T, lhs† Q). The symbol `lhs` denotes "left-hand
+#   side" column vectors, packed horizontally into a matrix. If the `lhs`
+#   argument is ommitted, its default value will be a matrix of width 0.
+# * If the initial vector `v` is not normalized, then this normalization will be
+#   performed automatically, and the scale `√v S v` will be absorbed into the
+#   return value `lhs† Q`.
+# * The number of iterations will never exceed length(v). If this limit is hit
+#   then, mathematically, the Krylov subspace should be complete, and the matrix
+#   T should be exactly similar to the matrix A S. In practice, however,
+#   numerical error at finite precision may be severe. Further testing is
+#   required to determine whether the Lanczos method is appropriate in this
+#   case, where the matrices have modest dimension. (Direct diagonalization may
+#   be preferable.)
+# * After `min_iters` iterations, this procedure will estimate the spectral
+#   bandwidth Δϵ between extreme eigenvalues of T. Then `Δϵ / resolution` will
+#   be an upper bound on the total number of iterations (which includes the
+#   initial `min_iters` iterations as well as subsequent ones).
 #
 # 1. C. C. Paige, IMA J. Appl. Math., 373-381 (1972),
-# https://doi.org/10.1093%2Fimamat%2F10.3.373.
-# 2. H. A. van der Vorst, Math. Comp. 39, 559-561 (1982),
-# https://doi.org/10.1090/s0025-5718-1982-0669648-0
-# 3. M. Grüning, A. Marini, X. Gonze, Comput. Mater. Sci. 50, 2148-2156 (2011),
-# https://doi.org/10.1016/j.commatsci.2011.02.021.
-function lanczos(mulA!, mulS!, v; niters)
+#    https://doi.org/10.1093%2Fimamat%2F10.3.373.
+
+function lanczos(mulA!, mulS!, v; min_iters, resolution=Inf, lhs=zeros(length(v), 0), verbose=false)
     αs = Float64[]
     βs = Float64[]
+    lhs_adj_Q = Vector{ComplexF64}[]
 
+    v = copy(v)
     vp  = zero(v)
     Sv  = zero(v)
     Svp = zero(v)
@@ -56,6 +102,7 @@ function lanczos(mulA!, mulS!, v; niters)
 
     mulS!(Sv, v)
     c = sqrt(real(dot(v, Sv)))
+    @assert c ≈ 1 "Initial vector not normalized"
     @. v /= c
     @. Sv /= c
 
@@ -64,10 +111,36 @@ function lanczos(mulA!, mulS!, v; niters)
     @. w = w - α * v
     mulS!(Sw, w)
     push!(αs, α)
-
-    for _ in 2:niters
-        β = sqrt(real(dot(w, Sw)))
-        iszero(β) && break
+    push!(lhs_adj_Q, lhs' * v)
+
+    # The maximum number of iterations is length(v)-1, because that is the
+    # dimension of the full vector space. Break out early if niters has been
+    # reached, which will be set according to the spectral bounds Δϵ after
+    # iteration i == min_iters.
+    niters = typemax(Int)
+    for i in 1:length(v)-1
+        if i == min_iters
+            T = SymTridiagonal(αs, βs)
+            Δϵ = eigmax(T) - eigmin(T)
+            niters = max(min_iters, fld(Δϵ, resolution))
+            niters += mod(niters, 2) # Round up to nearest even integer
+            if verbose
+                println("Δϵ=$Δϵ, niters=$niters")
+            end
+        end
+
+        i >= niters && break
+
+        # Let β = w† S w. If β < 0 then S is not a valid positive definite
+        # measure. If β = 0, this would formally indicate that v is a linear
+        # combination of only a subset of the eigenvectors of A S. In this case,
+        # it is valid to break early for purposes of approximating the
+        # matrix-vector product f(A S) v. 
+        β² = real(dot(w, Sw))
+        iszero(β²) && break
+        β² < 0 && error("S is not a positive definite measure")
+
+        β = sqrt(β²)
         @. vp = w / β
         @. Svp = Sw / β
         mulA!(w, Svp)
@@ -77,9 +150,10 @@ function lanczos(mulA!, mulS!, v; niters)
         @. v = vp
         push!(αs, α)
         push!(βs, β)
+        push!(lhs_adj_Q, lhs' * v)
     end
 
-    return SymTridiagonal(αs, βs)
+    return SymTridiagonal(αs, βs), reduce(hcat, lhs_adj_Q)
 end
 
 
@@ -107,6 +181,9 @@ function eigbounds(swt, q_reshaped, niters)
     end
 
     v = randn(ComplexF64, 2L)
-    tridiag = lanczos(mulA!, mulS!, v; niters)
-    return eigmin(tridiag), eigmax(tridiag)
+    Sv = zero(v)
+    mulS!(Sv, v)
+    v /= sqrt(v' * Sv)
+    T, _ = lanczos(mulA!, mulS!, v; min_iters=niters)
+    return eigmin(T), eigmax(T)
 end
diff --git a/src/KPM/SpinWaveTheoryKPM.jl b/src/KPM/SpinWaveTheoryKPM.jl
index e2e989ef7..31c51cb83 100644
--- a/src/KPM/SpinWaveTheoryKPM.jl
+++ b/src/KPM/SpinWaveTheoryKPM.jl
@@ -2,24 +2,16 @@
     SpinWaveTheoryKPM(sys::System; measure, regularization=1e-8, tol)
 
 A variant of [`SpinWaveTheory`](@ref) that uses the kernel polynomial method
-(KPM) to perform [`intensities`](@ref) calculations [1]. Instead of explicitly
-diagonalizing the dynamical matrix, KPM approximates intensities using
-polynomial expansion truncated at order ``M``. The reduces the computational
-cost from ``𝒪(N^3)`` to ``𝒪(N M)``, which is favorable for large system sizes
-``N``.
+(KPM) to calculate [`intensities`](@ref) [1]. This method avoids direct matrix
+diagonalization, which scales cubically in the system size ``N``. Instead, using
+``M`` iterative matrix-vector multiplications, the cost becomes ``𝒪(N M +
+M^2)``. This provides a very significant acceleration when ``N`` is large.
 
-The polynomial order ``M`` will be determined from the line broadening kernel
-and the specified error tolerance `tol`. Specifically, for each wavevector,
-``M`` scales like the spectral bandwidth of excitations, divided by the energy
-resolution of the broadening kernel, times the negative logarithm of `tol`.
-
-The error tolerance `tol` should be tuned empirically for each calculation.
-Reasonable starting points are `1e-1` (more speed) or `1e-2` (more accuracy).
-
-!!! warning "Missing intensity at small quasi-particle energy"  
-    The KPM-calculated intensities are unreliable at small energies ``ω``. In
-    particular, KPM may mask intensities that arise near the Goldstone modes of
-    an ordered state with continuous symmetry.
+The number of iterations is typically not too large: `M ≈ -2 log10(tol) Δϵ /
+fwhm` where `Δϵ` is the estimated spectral bandwidth of excitations, `fwhm` is
+the full width at half maximum of the broadening kernel, and `tol` is a
+dimensionless tolerance parameter. Good choices are `0.05` (more speed) or
+`0.01` (more accuracy).
 
 ## References
 
@@ -185,5 +177,137 @@ end
 function intensities(swt_kpm::SpinWaveTheoryKPM, qpts; energies, kernel::AbstractBroadening, kT=0.0, verbose=false)
     qpts = convert(AbstractQPoints, qpts)
     data = zeros(eltype(swt_kpm.swt.measure), length(energies), length(qpts.qs))
-    return intensities!(data, swt_kpm, qpts; energies, kernel, kT, verbose)
+    # return intensities!(data, swt_kpm, qpts; energies, kernel, kT, verbose)
+    return intensities2!(data, swt_kpm, qpts; energies, kernel, kT, verbose)
+end
+
+
+function intensities2!(data, swt_kpm::SpinWaveTheoryKPM, qpts; energies, kernel::AbstractBroadening, kT=0.0, verbose=false)
+    qpts = convert(AbstractQPoints, qpts)
+
+    (; swt, tol, lanczos_iters) = swt_kpm
+    (; sys, measure) = swt
+    cryst = orig_crystal(sys)
+
+    isnothing(kernel.fwhm) && error("Cannot determine the kernel fwhm")
+
+    @assert eltype(data) == eltype(measure)
+    @assert size(data) == (length(energies), length(qpts.qs))
+    fill!(data, zero(eltype(data)))
+
+    Na = nsites(sys)
+    Ncells = Na / natoms(cryst)
+    Nf = nflavors(swt)
+    L = Nf*Na
+    Avec_pref = zeros(ComplexF64, Na)
+
+    Nobs = size(measure.observables, 1)
+    Ncorr = length(measure.corr_pairs)
+    corrbuf = zeros(ComplexF64, Ncorr)
+
+    u = zeros(ComplexF64, 2L, Nobs)
+    v = zeros(ComplexF64, 2L)
+    Sv = zeros(ComplexF64, 2L)
+
+    for (iq, q) in enumerate(qpts.qs)
+        q_reshaped = to_reshaped_rlu(sys, q)
+        q_global = cryst.recipvecs * q
+
+        # Represent each local observable A(q) as a complex vector u(q) that
+        # denotes a linear combination of HP bosons.
+
+        for i in 1:Na
+            r = sys.crystal.positions[i]
+            ff = get_swt_formfactor(measure, 1, i)
+            Avec_pref[i] = exp(2π*im * dot(q_reshaped, r))
+            Avec_pref[i] *= compute_form_factor(ff, norm2(q_global))
+        end
+
+        if sys.mode == :SUN
+            (; observables_localized) = swt.data::SWTDataSUN
+            N = sys.Ns[1]
+            for μ in 1:Nobs, i in 1:Na
+                O = observables_localized[μ, i]
+                for f in 1:Nf
+                    u[f + (i-1)*Nf, μ]     = Avec_pref[i] * O[f, N]
+                    u[f + (i-1)*Nf + L, μ] = Avec_pref[i] * O[N, f]
+                end
+            end
+        else
+            @assert sys.mode in (:dipole, :dipole_uncorrected)
+            (; sqrtS, observables_localized) = swt.data::SWTDataDipole
+            for μ in 1:Nobs, i in 1:Na
+                O = observables_localized[μ, i]
+                u[i, μ]   = Avec_pref[i] * (sqrtS[i] / √2) * (O[1] + im*O[2])
+                u[i+L, μ] = Avec_pref[i] * (sqrtS[i] / √2) * (O[1] - im*O[2])
+            end
+        end
+
+        # Perform Lanczos calculation
+ 
+        # w = Ĩ v
+        function mulA!(w, v)
+            @views w[1:L]    = +v[1:L]
+            @views w[L+1:2L] = -v[L+1:2L]
+            return w
+        end
+
+        # w = D v
+        function mulS!(w, v)
+            mul_dynamical_matrix!(swt, reshape(w, 1, :), reshape(v, 1, :), [q_reshaped])
+            return w
+        end
+
+        resolution = (kernel.fwhm/2) / max(-log10(tol), 0)
+
+        for ξ in 1:Nobs
+            # Don't accumulate observables that are zero
+            iszero(view(u, :, ξ)) && continue
+
+            mulA!(v, view(u, :, ξ))
+            mulS!(Sv, v)
+            c = sqrt(real(Sv' * v))
+            v ./= c
+            tridiag, lhs_adj_Q = try
+                lanczos(mulA!, mulS!, v; lhs=u, min_iters=lanczos_iters, resolution, verbose)
+            catch e
+                if e.msg == "S is not a positive definite measure"
+                    rethrow(ErrorException("Not an energy-minimum; wavevector q = $q unstable."))
+                else
+                    rethrow()
+                end
+            end
+
+            (; values, vectors) = eigen(tridiag)
+
+            for (iω, ω) in enumerate(energies)
+                f(x) = kernel(x, ω) * thermal_prefactor(x; kT)
+
+                corr_ξ = c * lhs_adj_Q * vectors * Diagonal(f.(values)) * (vectors'[:, 1])
+
+                # This step assumes that each local observable in the
+                # correlation is Hermitian. In this case, bare correlations
+                # should be symmetric, C[μ, ν] = C[ν, μ]*. The Lanczos
+                # approximation C̃ breaks this symmetry. Restore it by looping
+                # over ξ in 1:Nobs and accumulate Lanczos data C̃[:, ξ] in a
+                # symmetric way. Accumulate C̃[μ, ξ] into C[μ, ν] if ξ = ν. Also
+                # accumulate C̃[ν, ξ]* into C[μ, ν] if ξ = μ. A factor of 1/2
+                # avoids double counting. In the special case that μ = ν, this
+                # assigns real(C̃[μ, μ]) to C[μ, μ] only once.
+                corrbuf .= 0
+                for (i, (μ, ν)) in enumerate(measure.corr_pairs)
+                    ξ == ν && (corrbuf[i] += (1/2) *     (corr_ξ[μ] / Ncells))
+                    ξ == μ && (corrbuf[i] += (1/2) * conj(corr_ξ[ν] / Ncells))
+                end
+
+                # This step assumes that combiner is linear, so that it is valid
+                # to move the ξ loop outside the data accumulation. One could
+                # relax this assumption by preallocating an array of size (Nω,
+                # Ncorr) to accumulate into corrbuf prior to calling combiner.
+                data[iω, iq] += measure.combiner(q_global, corrbuf)
+            end
+        end
+    end
+
+    return Intensities(cryst, qpts, collect(energies), data)
 end
diff --git a/src/SpinWaveTheory/DispersionAndIntensities.jl b/src/SpinWaveTheory/DispersionAndIntensities.jl
index 643a750c8..b43dd9c35 100644
--- a/src/SpinWaveTheory/DispersionAndIntensities.jl
+++ b/src/SpinWaveTheory/DispersionAndIntensities.jl
@@ -97,7 +97,7 @@ function excitations!(T, tmp, swt::SpinWaveTheory, q)
     try
         return bogoliubov!(T, tmp)
     catch _
-        error("Instability at wavevector q = $q")
+        rethrow(ErrorException("Not an energy-minimum; wavevector q = $q unstable."))
     end
 end
 
diff --git a/src/SpinWaveTheory/HamiltonianDipole.jl b/src/SpinWaveTheory/HamiltonianDipole.jl
index aaf5581c5..f3e7c55b1 100644
--- a/src/SpinWaveTheory/HamiltonianDipole.jl
+++ b/src/SpinWaveTheory/HamiltonianDipole.jl
@@ -4,7 +4,7 @@ function swt_hamiltonian_dipole!(H::Matrix{ComplexF64}, swt::SpinWaveTheory, q_r
     (; local_rotations, stevens_coefs, sqrtS) = data
     (; extfield, gs) = sys
 
-    L = nbands(swt) 
+    L = nbands(swt)
     @assert size(H) == (2L, 2L)
 
     # Initialize Hamiltonian buffer 
diff --git a/test/test_kpm.jl b/test/test_kpm.jl
index 9ba1dd259..617c916f4 100644
--- a/test/test_kpm.jl
+++ b/test/test_kpm.jl
@@ -1,27 +1,35 @@
 @testitem "Lanczos" begin
     using LinearAlgebra
 
-    N = 200
+    N = 40
     A = hermitianpart(randn(N, N))
-    A = diagm(vcat(ones(N÷2), -ones(N÷2)))
 
     S = randn(N, N)
     S = S' * S
     S = S / eigmax(S)
-    S = S + 0.0I
+    S = S + 1e-2I
 
+    # Check that fast and slow Lanczos implementations match
+
+    lhs = randn(N, 2)
     v = randn(N)
-    ev1 = eigvals(Sunny.lanczos_ref(A*S*A, S, v; niters=10))
+    v /= sqrt(dot(v, S, v))
+    (; Q, T) = Sunny.lanczos_ref(A, S, v; niters=10)
+    ev1 = eigvals(T)
 
-    mulA!(w, v) = (w .= A * S * A * v)
+    mulA!(w, v) = mul!(w, A, v)
     mulS!(w, v) = mul!(w, S, v)
-    ev2 = eigvals(Sunny.lanczos(mulA!, mulS!, copy(v); niters=10))
+    T, lhs_adj_Q = Sunny.lanczos(mulA!, mulS!, copy(v); lhs, min_iters=10)
+    ev2 = eigvals(T)
 
     @test ev1 ≈ ev2
+    @test lhs' * Q ≈ lhs_adj_Q
+
+    # Check that extremal eigenvalues match to reasonable accuracy
 
-    ev3 = extrema(eigvals(Sunny.lanczos_ref(A*S*A, S, v; niters=100)))
-    ev4 = extrema(eigvals((A*S)^2))
-    @test isapprox(collect(ev3), collect(ev4); atol=1e-3)
+    T, _ = Sunny.lanczos(mulA!, mulS!, copy(v); min_iters=20)
+    @test isapprox(eigmin(T), eigmin(A * S); atol=1e-3)
+    @test isapprox(eigmax(T), eigmax(A * S); atol=1e-3)
 end