diff --git a/examples/p4est_2d_dgsem/elixir_shallowwater_wellbalanced.jl b/examples/p4est_2d_dgsem/elixir_shallowwater_wellbalanced.jl
new file mode 100644
index 0000000..5be32db
--- /dev/null
+++ b/examples/p4est_2d_dgsem/elixir_shallowwater_wellbalanced.jl
@@ -0,0 +1,186 @@
+
+using Downloads: download
+using OrdinaryDiffEq
+using Trixi
+using TrixiShallowWater
+
+###############################################################################
+# semidiscretization of the shallow water equations with a continuous
+# bottom topography function
+
+equations = ShallowWaterEquationsWetDry2D(gravity_constant = 9.812, H0 = 3.0)
+
+function initial_condition_wb_testing(x, t, equations::ShallowWaterEquationsWetDry2D)
+    # Set up polar coordinates
+    inicenter = SVector(0.15, 0.15)
+    x_norm = x[1] - inicenter[1]
+    y_norm = x[2] - inicenter[2]
+    r = sqrt(x_norm^2 + y_norm^2)
+
+    # Calculate primitive variables
+    v1 = 0.0
+    v2 = 0.0
+    # v1 = r < 0.6 ? 1.75 : 0.0
+    # v2 = r < 0.6 ? -1.75 : 0.0
+    # bottom topography taken from Pond.control in [HOHQMesh](https://github.com/trixi-framework/HOHQMesh)
+    x1, x2 = x
+    b = (1.5 / exp(0.5 * ((x1 - 1.0)^2 + (x2 - 1.0)^2))
+         +
+         0.75 / exp(0.5 * ((x1 + 1.0)^2 + (x2 + 1.0)^2)))
+    # H = equations.H0
+    H = max(equations.H0, b + equations.threshold_limiter)
+
+    return prim2cons(SVector(H, v1, v2, b), equations)
+end
+
+initial_condition = initial_condition_wb_testing
+
+boundary_condition = Dict(:OuterCircle => boundary_condition_slip_wall)
+
+###############################################################################
+# Get the DG approximation space
+
+# # ersing flux in both
+# volume_flux = (flux_wintermeyer_etal, flux_nonconservative_ersing_etal)
+# surface_fluxes = (flux_wintermeyer_etal, flux_nonconservative_ersing_etal)
+
+# Wintermeyer flux in the volume
+volume_flux = (flux_wintermeyer_etal, flux_nonconservative_wintermeyer_etal)
+
+# # Wintermeyer flux in the surface for testing
+# surface_fluxes = (flux_wintermeyer_etal, flux_nonconservative_wintermeyer_etal)
+
+surface_fluxes = (FluxHydrostaticReconstruction(flux_hll_chen_noelle, hydrostatic_reconstruction_chen_noelle),
+                  flux_nonconservative_chen_noelle)
+
+
+# # Fjordholms flux for testing
+# surface_fluxes = (flux_fjordholm_etal, flux_nonconservative_fjordholm_etal)
+
+# Audusse HR with Rusanov flux
+# surface_fluxes = (FluxHydrostaticReconstruction(flux_lax_friedrichs,
+#                                                 hydrostatic_reconstruction_audusse_etal),
+#                   flux_nonconservative_audusse_etal)
+
+# Audusse HR with HLL flux
+# surface_fluxes = (FluxHydrostaticReconstruction(flux_hll, hydrostatic_reconstruction_audusse_etal),
+#                   flux_nonconservative_audusse_etal)
+
+# Create the solver
+solver = DGSEM(polydeg = 4, surface_flux = surface_fluxes,
+               volume_integral = VolumeIntegralFluxDifferencing(volume_flux))
+
+###############################################################################
+# This setup is for the curved, split form well-balancedness testing
+
+# Get the unstructured quad mesh from a file (downloads the file if not available locally)
+
+default_mesh_file = joinpath(@__DIR__, "abaqus_outer_circle.inp")
+isfile(default_mesh_file) ||
+    download("https://gist.githubusercontent.com/andrewwinters5000/df92dd4986909927e96af23c37f6db5f/raw/8620823342f98c505a36351b210aab7f3b368041/abaqus_outer_circle.inp",
+             default_mesh_file)
+mesh_file = default_mesh_file
+
+mesh_file = joinpath(@__DIR__, "pond.inp")
+
+mesh = P4estMesh{2}(mesh_file)
+
+# Refine bottom left quadrant of each tree to level 2
+function refine_fn(p4est, which_tree, quadrant)
+    quadrant_obj = unsafe_load(quadrant)
+    if quadrant_obj.x == 0 && quadrant_obj.y == 0 && quadrant_obj.level < 2
+          # return true (refine)
+        return Cint(1)
+    else
+        # return false (don't refine)
+        return Cint(0)
+    end
+end
+
+# Refine recursively until each bottom left quadrant of a tree has level 2
+# The mesh will be rebalanced before the simulation starts
+refine_fn_c = @cfunction(refine_fn, Cint,
+                         (Ptr{Trixi.p4est_t}, Ptr{Trixi.p4est_topidx_t},
+                          Ptr{Trixi.p4est_quadrant_t}))
+Trixi.refine_p4est!(mesh.p4est, true, refine_fn_c, C_NULL)
+
+# Create the semi discretization object
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
+                                    boundary_conditions = boundary_condition)
+
+###############################################################################
+# ODE solvers, callbacks, etc.
+
+tspan = (0.0, 3.0)
+ode = semidiscretize(semi, tspan)
+
+###############################################################################
+# # Workaround to set a discontinuous bottom topography for debugging and testing.
+
+# # alternative version of the initial conditinon used to setup a truly discontinuous
+# # bottom topography function for this academic testcase.
+# # The errors from the analysis callback are not important but the error for this lake at rest test case
+# # `∑|H0-(h+b)|` should be around machine roundoff
+# # In contrast to the usual signature of initial conditions, this one get passed the
+# # `element_id` explicitly. In particular, this initial conditions works as intended
+# # only for the specific mesh loaded above!
+# function initial_condition_discontinuous_well_balancedness(x, t, element_id, equations::ShallowWaterEquations2D)
+#   # Set the background values
+#   H = equations.H0
+#   v1 = 0.0
+#   v2 = 0.0
+
+#   x1, x2 = x
+#   b = (  1.5 / exp( 0.5 * ((x1 - 1.0)^2 + (x2 - 1.0)^2) )
+#        + 0.75 / exp( 0.5 * ((x1 + 1.0)^2 + (x2 + 1.0)^2) ) )
+
+#   # Setup a discontinuous bottom topography using the element id number
+# #  if element_id > 50 && element_id < 80 # for the original mesh file
+# #  if element_id > 200 && element_id < 300 # for no hanging node grid with < 1 in function above
+#   if element_id > 300 && element_id < 400 # for the forced hanging node grid with < 2 in function above
+# #  if element_id > 400 && element_id < 650 # for the forced hanging node grid with < 3 in function above
+#     b = 0.0
+#   end
+
+#   return prim2cons(SVector(H, v1, v2, b), equations)
+# end
+
+# # point to the data we want to augment
+# u = Trixi.wrap_array(ode.u0, semi)
+# # reset the initial condition
+# for element in eachelement(semi.solver, semi.cache)
+#   for j in eachnode(semi.solver), i in eachnode(semi.solver)
+#     x_node = Trixi.get_node_coords(semi.cache.elements.node_coordinates, equations, semi.solver, i, j, element)
+#     u_node = initial_condition_discontinuous_well_balancedness(x_node, first(tspan), element, equations)
+#     Trixi.set_node_vars!(u, u_node, equations, semi.solver, i, j, element)
+#   end
+# end
+###############################################################################
+
+summary_callback = SummaryCallback()
+
+analysis_interval = 100
+analysis_callback = AnalysisCallback(semi, interval = analysis_interval,
+                                     extra_analysis_integrals = (lake_at_rest_error,))
+
+alive_callback = AliveCallback(analysis_interval = analysis_interval)
+
+save_solution = SaveSolutionCallback(dt = 1.0,
+                                     save_initial_solution = true,
+                                     save_final_solution = true)
+
+stepsize_callback = StepsizeCallback(cfl = 2.0)
+
+callbacks = CallbackSet(summary_callback,
+                        analysis_callback,
+                        alive_callback,
+                        save_solution,
+                        stepsize_callback)
+
+###############################################################################
+# run the simulation
+
+sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false),
+            dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback
+            save_everystep = false, callback = callbacks);
+summary_callback() # print the timer summary
diff --git a/src/TrixiShallowWater.jl b/src/TrixiShallowWater.jl
index 5522fc5..0b73535 100644
--- a/src/TrixiShallowWater.jl
+++ b/src/TrixiShallowWater.jl
@@ -1,8 +1,8 @@
 module TrixiShallowWater
 
-# While `using Trixi` makes all exported symbols available, in order to extend a method from the 
+# While `using Trixi` makes all exported symbols available, in order to extend a method from the
 # `Trixi.jl` module, symbols need to be explicitly qualified with `Trixi.function_name`.
-# For more information, see 
+# For more information, see
 # https://github.com/trixi-framework/TrixiShallowWater.jl/pull/10#discussion_r1433720559
 using Trixi
 # Import additional symbols that are not exported by Trixi.jl
@@ -16,6 +16,7 @@ include("equations/equations.jl")
 include("equations/numerical_fluxes.jl")
 include("callbacks_stage/callbacks_stage.jl")
 include("solvers/indicators.jl")
+include("solvers/scratch_p4est.jl")
 
 # Export types/functions that define the public API of TrixiShallowWater.jl
 export ShallowWaterEquationsWetDry1D, ShallowWaterEquationsWetDry2D,
diff --git a/src/solvers/scratch_p4est.jl b/src/solvers/scratch_p4est.jl
new file mode 100644
index 0000000..e4e6b09
--- /dev/null
+++ b/src/solvers/scratch_p4est.jl
@@ -0,0 +1,302 @@
+# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
+# Since these FMAs can increase the performance of many numerical algorithms,
+# we need to opt-in explicitly.
+# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
+@muladd begin
+#! format: noindent
+
+function Trixi.prolong2mortars!(cache, u,
+                          mesh::Union{P4estMesh{2}, T8codeMesh{2}},
+                          equations::ShallowWaterEquationsWetDry2D,
+                          mortar_l2::Trixi.LobattoLegendreMortarL2,
+                          surface_integral, dg::DGSEM)
+    @unpack neighbor_ids, node_indices = cache.mortars
+    index_range = eachnode(dg)
+
+    Trixi.@threaded for mortar in Trixi.eachmortar(dg, cache)
+        # Copy solution data from the small elements using "delayed indexing" with
+        # a start value and a step size to get the correct face and orientation.
+        small_indices = node_indices[1, mortar]
+
+        i_small_start, i_small_step = Trixi.index_to_start_step_2d(small_indices[1],
+                                                             index_range)
+        j_small_start, j_small_step = Trixi.index_to_start_step_2d(small_indices[2],
+                                                             index_range)
+
+        for position in 1:2
+            i_small = i_small_start
+            j_small = j_small_start
+            element = neighbor_ids[position, mortar]
+            for i in eachnode(dg)
+                # TODO: need to form the sigma variable from Benov et al. (essentially H = h+b)
+                # and then save sigma, momenta, and bottom topography into the buffer for projection
+                temp = zeros(eltype(cache.mortars.u), 4)
+                if u[1, i_small, j_small, element] > equations.threshold_wet
+                    temp[1] = u[1, i_small, j_small, element] + u[4, i_small, j_small, element]
+                else
+                    temp[1] = equations.H0
+                end
+                temp[2:4] = u[2:4, i_small, j_small, element]
+
+                # println(temp)
+
+                for v in eachvariable(equations)
+                    cache.mortars.u[1, v, position, i, mortar] = temp[v]
+                    # cache.mortars.u[1, v, position, i, mortar] = u[v, i_small, j_small,
+                    #                                                element]
+                end
+                i_small += i_small_step
+                j_small += j_small_step
+            end
+        end
+
+        # Buffer to copy solution values of the large element in the correct orientation
+        # before interpolating
+        u_buffer = cache.u_threaded[Threads.threadid()]
+
+        # Copy solution of large element face to buffer in the
+        # correct orientation
+        large_indices = node_indices[2, mortar]
+
+        i_large_start, i_large_step = Trixi.index_to_start_step_2d(large_indices[1],
+                                                             index_range)
+        j_large_start, j_large_step = Trixi.index_to_start_step_2d(large_indices[2],
+                                                             index_range)
+
+        i_large = i_large_start
+        j_large = j_large_start
+        element = neighbor_ids[3, mortar]
+        for i in eachnode(dg)
+            # TODO: need to form the sigma variable from Benov et al. (essentially H = h+b)
+            # and then save sigma, momenta, and bottom topography into the buffer for projection
+            temp = zeros(eltype(cache.mortars.u), 4)
+            if u[1, i_large, j_large, element] > equations.threshold_wet
+                temp[1] = u[1, i_large, j_large, element] + u[4, i_large, j_large, element]
+            else
+                temp[1] = equations.H0
+            end
+            temp[2:4] = u[2:4, i_large, j_large, element]
+
+            for v in eachvariable(equations)
+                u_buffer[v, i] = temp[v]
+                # u_buffer[v, i] = u[v, i_large, j_large, element]
+            end
+            i_large += i_large_step
+            j_large += j_large_step
+        end
+
+        # Interpolate large element face data from buffer to small face locations
+        Trixi.multiply_dimensionwise!(view(cache.mortars.u, 2, :, 1, :, mortar),
+                                mortar_l2.forward_lower,
+                                u_buffer)
+        Trixi.multiply_dimensionwise!(view(cache.mortars.u, 2, :, 2, :, mortar),
+                                mortar_l2.forward_upper,
+                                u_buffer)
+        # # TODO: Force the bottom topography to not be projected?
+        # cache.mortars.u[2, 4, 1, :, mortar] .= cache.mortars.u[1, 4, 1, :, mortar]
+        # cache.mortars.u[2, 4, 2, :, mortar] .= cache.mortars.u[1, 4, 2, :, mortar]
+    end
+
+    return nothing
+end
+
+function Trixi.calc_mortar_flux!(surface_flux_values,
+                           mesh::Union{P4estMesh{2}, T8codeMesh{2}},
+                           nonconservative_terms,
+                           equations::ShallowWaterEquationsWetDry2D,
+                           mortar_l2::Trixi.LobattoLegendreMortarL2,
+                           surface_integral, dg::DG, cache)
+    @unpack neighbor_ids, node_indices = cache.mortars
+    @unpack contravariant_vectors = cache.elements
+    @unpack fstar_upper_threaded, fstar_lower_threaded = cache
+    index_range = eachnode(dg)
+
+    # println("in mortar flux computation")
+
+    Trixi.@threaded for mortar in Trixi.eachmortar(dg, cache)
+        # Choose thread-specific pre-allocated container
+        fstar = (fstar_lower_threaded[Threads.threadid()],
+                 fstar_upper_threaded[Threads.threadid()])
+
+        # Get index information on the small elements
+        small_indices = node_indices[1, mortar]
+        small_direction = Trixi.indices2direction(small_indices)
+
+        i_small_start, i_small_step = Trixi.index_to_start_step_2d(small_indices[1],
+                                                             index_range)
+        j_small_start, j_small_step = Trixi.index_to_start_step_2d(small_indices[2],
+                                                             index_range)
+
+        for position in 1:2
+            i_small = i_small_start
+            j_small = j_small_start
+            element = neighbor_ids[position, mortar]
+            for node in eachnode(dg)
+                # Get the normal direction on the small element.
+                # Note, contravariant vectors at interfaces in negative coordinate direction
+                # are pointing inwards. This is handled by `get_normal_direction`.
+                normal_direction = Trixi.get_normal_direction(small_direction,
+                                                        contravariant_vectors,
+                                                        i_small, j_small, element)
+
+                Trixi.calc_mortar_flux!(fstar, mesh, nonconservative_terms, equations,
+                                  surface_integral, dg, cache,
+                                  mortar, position, normal_direction,
+                                  node)
+
+                i_small += i_small_step
+                j_small += j_small_step
+            end
+        end
+
+        # Buffer to interpolate flux values of the large element to before
+        # copying in the correct orientation
+        u_buffer = cache.u_threaded[Threads.threadid()]
+
+        # in calc_interface_flux!, the interface flux is computed once over each
+        # interface using the normal from the "primary" element. The result is then
+        # passed back to the "secondary" element, flipping the sign to account for the
+        # change in the normal direction. For mortars, this sign flip occurs in
+        # "mortar_fluxes_to_elements!" instead.
+        Trixi.mortar_fluxes_to_elements!(surface_flux_values,
+                                   mesh, equations, mortar_l2, dg, cache,
+                                   mortar, fstar, u_buffer)
+    end
+
+    # wololo()
+
+    return nothing
+end
+
+# Inlined version of the mortar flux computation on small elements for equations with conservative and
+# nonconservative terms
+@inline function Trixi.calc_mortar_flux!(fstar,
+                                   mesh::Union{P4estMesh{2}, T8codeMesh{2}},
+                                   nonconservative_terms::True,
+                                   equations::ShallowWaterEquationsWetDry2D,
+                                   surface_integral, dg::DG, cache,
+                                   mortar_index, position_index, normal_direction,
+                                   node_index)
+    @unpack u = cache.mortars
+    surface_flux, nonconservative_flux = surface_integral.surface_flux
+
+    u_ll, u_rr = Trixi.get_surface_node_vars(u, equations, dg, position_index, node_index,
+                                       mortar_index)
+    # TODO: These surface values are still in terms of sigma, momenta, and bottom topography
+    # (1) unpack the sigma to create the water height on the mortar from Eq. 41 in Benov et al.
+    # (2) perform hydrostatic reconstruction on the mortar solution
+    # (3) use these HR quantities to compute the flux and nonconservative terms
+    # These last two steps, as far as I recall, occur within the surface flux from the
+    # provided solution states
+    u_ll_temp = zeros(eltype(u_ll), 4)
+    u_rr_temp = zeros(eltype(u_rr), 4)
+
+    # go back to the conservative variables on either side
+    u_ll_temp[1] = max(u_ll[1] - u_ll[4], equations.threshold_wet) # zero(eltype(u_ll)))
+    u_ll_temp[2:4] = u_ll[2:4]
+
+    u_rr_temp[1] = max(u_rr[1] - u_rr[4], equations.threshold_wet) # zero(eltype(u_rr)))
+    u_rr_temp[2:4] = u_rr[2:4]
+
+    # println(u_rr_temp[4] - u_ll_temp[4])
+
+    # println(u_ll_temp)
+    # println(u_rr_temp)
+    # println()
+
+    # Compute conservative flux
+    flux = surface_flux(u_ll_temp, u_rr_temp, normal_direction, equations)
+    # flux = surface_flux(u_ll, u_rr, normal_direction, equations)
+
+    # Compute nonconservative flux and add it to the conservative flux.
+    # The nonconservative flux is scaled by a factor of 0.5 based on
+    # the interpretation of global SBP operators coupled discontinuously via
+    # central fluxes/SATs
+    noncons = nonconservative_flux(u_ll_temp, u_rr_temp,
+                                   normal_direction, normal_direction,
+                                   equations)
+    # noncons = nonconservative_flux(u_rr_temp, u_ll_temp,
+    #                                normal_direction, normal_direction,
+    #                                equations)
+    # noncons = nonconservative_flux(u_ll, u_rr, normal_direction, normal_direction,
+    #                                equations)
+
+    flux_plus_noncons = flux + 0.5 * noncons
+    # println(flux)
+    # println(surface_flux(u_ll_temp, u_ll_temp, normal_direction, equations))
+    # println(nonconservative_flux(u_ll_temp, u_rr_temp,
+    #                              normal_direction, normal_direction,
+    #                              equations))
+    # println(noncons)
+    # println()
+
+    # Copy to buffer
+    set_node_vars!(fstar[position_index], flux_plus_noncons, equations, dg, node_index)
+end
+
+@inline function Trixi.mortar_fluxes_to_elements!(surface_flux_values,
+                                            mesh::Union{P4estMesh{2}, T8codeMesh{2}},
+                                            equations::ShallowWaterEquationsWetDry2D,
+                                            mortar_l2::Trixi.LobattoLegendreMortarL2,
+                                            dg::DGSEM, cache, mortar, fstar, u_buffer)
+    @unpack neighbor_ids, node_indices = cache.mortars
+
+    # Copy solution small to small
+    small_indices = node_indices[1, mortar]
+    small_direction = Trixi.indices2direction(small_indices)
+
+    for position in 1:2
+        element = neighbor_ids[position, mortar]
+        for i in eachnode(dg)
+            for v in eachvariable(equations)
+                surface_flux_values[v, i, small_direction, element] = fstar[position][v,
+                                                                                      i]
+            end
+        end
+    end
+
+    # TODO: Does this back projection need modified, e.g., as a strong form penalty?
+
+    # Project small fluxes to large element.
+    Trixi.multiply_dimensionwise!(u_buffer,
+                            mortar_l2.reverse_upper, fstar[2],
+                            mortar_l2.reverse_lower, fstar[1])
+
+    # println(u_buffer[2,:])
+
+    # The flux is calculated in the outward direction of the small elements,
+    # so the sign must be switched to get the flux in outward direction
+    # of the large element.
+    # The contravariant vectors of the large element (and therefore the normal
+    # vectors of the large element as well) are twice as large as the
+    # contravariant vectors of the small elements. Therefore, the flux needs
+    # to be scaled by a factor of 2 to obtain the flux of the large element.
+    u_buffer .*= -2
+
+    # Copy interpolated flux values from buffer to large element face in the
+    # correct orientation.
+    # Note that the index of the small sides will always run forward but
+    # the index of the large side might need to run backwards for flipped sides.
+    large_element = neighbor_ids[3, mortar]
+    large_indices = node_indices[2, mortar]
+    large_direction = Trixi.indices2direction(large_indices)
+
+    if :i_backward in large_indices
+        for i in eachnode(dg)
+            for v in eachvariable(equations)
+                surface_flux_values[v, end + 1 - i, large_direction, large_element] = u_buffer[v,
+                                                                                               i]
+            end
+        end
+    else
+        for i in eachnode(dg)
+            for v in eachvariable(equations)
+                surface_flux_values[v, i, large_direction, large_element] = u_buffer[v,
+                                                                                     i]
+            end
+        end
+    end
+
+    return nothing
+end
+end # @muladd
diff --git a/src/solvers/scratch_tree.jl b/src/solvers/scratch_tree.jl
new file mode 100644
index 0000000..ecddb59
--- /dev/null
+++ b/src/solvers/scratch_tree.jl
@@ -0,0 +1,269 @@
+# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
+# Since these FMAs can increase the performance of many numerical algorithms,
+# we need to opt-in explicitly.
+# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
+@muladd begin
+#! format: noindent
+
+function prolong2mortars!(cache, u,
+                          mesh::TreeMesh{2}, equations,
+                          mortar_l2::LobattoLegendreMortarL2, surface_integral,
+                          dg::DGSEM)
+    @threaded for mortar in eachmortar(dg, cache)
+        large_element = cache.mortars.neighbor_ids[3, mortar]
+        upper_element = cache.mortars.neighbor_ids[2, mortar]
+        lower_element = cache.mortars.neighbor_ids[1, mortar]
+
+        # Copy solution small to small
+        if cache.mortars.large_sides[mortar] == 1 # -> small elements on right side
+            if cache.mortars.orientations[mortar] == 1
+                # L2 mortars in x-direction
+                for l in eachnode(dg)
+                    for v in eachvariable(equations)
+                        cache.mortars.u_upper[2, v, l, mortar] = u[v, 1, l,
+                                                                   upper_element]
+                        cache.mortars.u_lower[2, v, l, mortar] = u[v, 1, l,
+                                                                   lower_element]
+                    end
+                end
+            else
+                # L2 mortars in y-direction
+                for l in eachnode(dg)
+                    for v in eachvariable(equations)
+                        cache.mortars.u_upper[2, v, l, mortar] = u[v, l, 1,
+                                                                   upper_element]
+                        cache.mortars.u_lower[2, v, l, mortar] = u[v, l, 1,
+                                                                   lower_element]
+                    end
+                end
+            end
+        else # large_sides[mortar] == 2 -> small elements on left side
+            if cache.mortars.orientations[mortar] == 1
+                # L2 mortars in x-direction
+                for l in eachnode(dg)
+                    for v in eachvariable(equations)
+                        cache.mortars.u_upper[1, v, l, mortar] = u[v, nnodes(dg), l,
+                                                                   upper_element]
+                        cache.mortars.u_lower[1, v, l, mortar] = u[v, nnodes(dg), l,
+                                                                   lower_element]
+                    end
+                end
+            else
+                # L2 mortars in y-direction
+                for l in eachnode(dg)
+                    for v in eachvariable(equations)
+                        cache.mortars.u_upper[1, v, l, mortar] = u[v, l, nnodes(dg),
+                                                                   upper_element]
+                        cache.mortars.u_lower[1, v, l, mortar] = u[v, l, nnodes(dg),
+                                                                   lower_element]
+                    end
+                end
+            end
+        end
+
+        # Interpolate large element face data to small interface locations
+        if cache.mortars.large_sides[mortar] == 1 # -> large element on left side
+            leftright = 1
+            if cache.mortars.orientations[mortar] == 1
+                # L2 mortars in x-direction
+                u_large = view(u, :, nnodes(dg), :, large_element)
+                element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright,
+                                              mortar, u_large)
+            else
+                # L2 mortars in y-direction
+                u_large = view(u, :, :, nnodes(dg), large_element)
+                element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright,
+                                              mortar, u_large)
+            end
+        else # large_sides[mortar] == 2 -> large element on right side
+            leftright = 2
+            if cache.mortars.orientations[mortar] == 1
+                # L2 mortars in x-direction
+                u_large = view(u, :, 1, :, large_element)
+                element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright,
+                                              mortar, u_large)
+            else
+                # L2 mortars in y-direction
+                u_large = view(u, :, :, 1, large_element)
+                element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright,
+                                              mortar, u_large)
+            end
+        end
+    end
+
+    return nothing
+end
+
+@inline function element_solutions_to_mortars!(mortars,
+                                               mortar_l2::LobattoLegendreMortarL2,
+                                               leftright, mortar,
+                                               u_large::AbstractArray{<:Any, 2})
+    multiply_dimensionwise!(view(mortars.u_upper, leftright, :, :, mortar),
+                            mortar_l2.forward_upper, u_large)
+    multiply_dimensionwise!(view(mortars.u_lower, leftright, :, :, mortar),
+                            mortar_l2.forward_lower, u_large)
+    return nothing
+end
+
+function calc_mortar_flux!(surface_flux_values,
+                           mesh::TreeMesh{2},
+                           nonconservative_terms::True, equations,
+                           mortar_l2::LobattoLegendreMortarL2,
+                           surface_integral, dg::DG, cache)
+    surface_flux, nonconservative_flux = surface_integral.surface_flux
+    @unpack u_lower, u_upper, orientations, large_sides = cache.mortars
+    @unpack fstar_upper_threaded, fstar_lower_threaded = cache
+
+    @threaded for mortar in eachmortar(dg, cache)
+        # Choose thread-specific pre-allocated container
+        fstar_upper = fstar_upper_threaded[Threads.threadid()]
+        fstar_lower = fstar_lower_threaded[Threads.threadid()]
+
+        # Calculate fluxes
+        orientation = orientations[mortar]
+        calc_fstar!(fstar_upper, equations, surface_flux, dg, u_upper, mortar,
+                    orientation)
+        calc_fstar!(fstar_lower, equations, surface_flux, dg, u_lower, mortar,
+                    orientation)
+
+        # Add nonconservative fluxes.
+        # These need to be adapted on the geometry (left/right) since the order of
+        # the arguments matters, based on the global SBP operator interpretation.
+        # The same interpretation (global SBP operators coupled discontinuously via
+        # central fluxes/SATs) explains why we need the factor 0.5.
+        # Alternatively, you can also follow the argumentation of Bohm et al. 2018
+        # ("nonconservative diamond flux")
+        if large_sides[mortar] == 1 # -> small elements on right side
+            for i in eachnode(dg)
+                # Pull the left and right solutions
+                u_upper_ll, u_upper_rr = get_surface_node_vars(u_upper, equations, dg,
+                                                               i, mortar)
+                u_lower_ll, u_lower_rr = get_surface_node_vars(u_lower, equations, dg,
+                                                               i, mortar)
+                # Call pointwise nonconservative term
+                noncons_upper = nonconservative_flux(u_upper_ll, u_upper_rr,
+                                                     orientation, equations)
+                noncons_lower = nonconservative_flux(u_lower_ll, u_lower_rr,
+                                                     orientation, equations)
+                # Add to primary and secondary temporary storage
+                multiply_add_to_node_vars!(fstar_upper, 0.5, noncons_upper, equations,
+                                           dg, i)
+                multiply_add_to_node_vars!(fstar_lower, 0.5, noncons_lower, equations,
+                                           dg, i)
+            end
+        else # large_sides[mortar] == 2 -> small elements on the left
+            for i in eachnode(dg)
+                # Pull the left and right solutions
+                u_upper_ll, u_upper_rr = get_surface_node_vars(u_upper, equations, dg,
+                                                               i, mortar)
+                u_lower_ll, u_lower_rr = get_surface_node_vars(u_lower, equations, dg,
+                                                               i, mortar)
+                # Call pointwise nonconservative term
+                noncons_upper = nonconservative_flux(u_upper_rr, u_upper_ll,
+                                                     orientation, equations)
+                noncons_lower = nonconservative_flux(u_lower_rr, u_lower_ll,
+                                                     orientation, equations)
+                # Add to primary and secondary temporary storage
+                multiply_add_to_node_vars!(fstar_upper, 0.5, noncons_upper, equations,
+                                           dg, i)
+                multiply_add_to_node_vars!(fstar_lower, 0.5, noncons_lower, equations,
+                                           dg, i)
+            end
+        end
+
+        mortar_fluxes_to_elements!(surface_flux_values,
+                                   mesh, equations, mortar_l2, dg, cache,
+                                   mortar, fstar_upper, fstar_lower)
+    end
+
+    return nothing
+end
+
+@inline function calc_fstar!(destination::AbstractArray{<:Any, 2}, equations,
+                             surface_flux, dg::DGSEM,
+                             u_interfaces, interface, orientation)
+    for i in eachnode(dg)
+        # Call pointwise two-point numerical flux function
+        u_ll, u_rr = get_surface_node_vars(u_interfaces, equations, dg, i, interface)
+        flux = surface_flux(u_ll, u_rr, orientation, equations)
+
+        # Copy flux to left and right element storage
+        set_node_vars!(destination, flux, equations, dg, i)
+    end
+
+    return nothing
+end
+
+@inline function mortar_fluxes_to_elements!(surface_flux_values,
+                                            mesh::TreeMesh{2}, equations,
+                                            mortar_l2::LobattoLegendreMortarL2,
+                                            dg::DGSEM, cache,
+                                            mortar, fstar_upper, fstar_lower)
+    large_element = cache.mortars.neighbor_ids[3, mortar]
+    upper_element = cache.mortars.neighbor_ids[2, mortar]
+    lower_element = cache.mortars.neighbor_ids[1, mortar]
+
+    # Copy flux small to small
+    if cache.mortars.large_sides[mortar] == 1 # -> small elements on right side
+        if cache.mortars.orientations[mortar] == 1
+            # L2 mortars in x-direction
+            direction = 1
+        else
+            # L2 mortars in y-direction
+            direction = 3
+        end
+    else # large_sides[mortar] == 2 -> small elements on left side
+        if cache.mortars.orientations[mortar] == 1
+            # L2 mortars in x-direction
+            direction = 2
+        else
+            # L2 mortars in y-direction
+            direction = 4
+        end
+    end
+    surface_flux_values[:, :, direction, upper_element] .= fstar_upper
+    surface_flux_values[:, :, direction, lower_element] .= fstar_lower
+
+    # Project small fluxes to large element
+    if cache.mortars.large_sides[mortar] == 1 # -> large element on left side
+        if cache.mortars.orientations[mortar] == 1
+            # L2 mortars in x-direction
+            direction = 2
+        else
+            # L2 mortars in y-direction
+            direction = 4
+        end
+    else # large_sides[mortar] == 2 -> large element on right side
+        if cache.mortars.orientations[mortar] == 1
+            # L2 mortars in x-direction
+            direction = 1
+        else
+            # L2 mortars in y-direction
+            direction = 3
+        end
+    end
+
+    # TODO: Taal performance
+    # for v in eachvariable(equations)
+    #   # The code below is semantically equivalent to
+    #   # surface_flux_values[v, :, direction, large_element] .=
+    #   #   (mortar_l2.reverse_upper * fstar_upper[v, :] + mortar_l2.reverse_lower * fstar_lower[v, :])
+    #   # but faster and does not allocate.
+    #   # Note that `true * some_float == some_float` in Julia, i.e. `true` acts as
+    #   # a universal `one`. Hence, the second `mul!` means "add the matrix-vector
+    #   # product to the current value of the destination".
+    #   @views mul!(surface_flux_values[v, :, direction, large_element],
+    #               mortar_l2.reverse_upper, fstar_upper[v, :])
+    #   @views mul!(surface_flux_values[v, :, direction, large_element],
+    #               mortar_l2.reverse_lower,  fstar_lower[v, :], true, true)
+    # end
+    # The code above could be replaced by the following code. However, the relative efficiency
+    # depends on the types of fstar_upper/fstar_lower and dg.l2mortar_reverse_upper.
+    # Using StaticArrays for both makes the code above faster for common test cases.
+    multiply_dimensionwise!(view(surface_flux_values, :, :, direction, large_element),
+                            mortar_l2.reverse_upper, fstar_upper,
+                            mortar_l2.reverse_lower, fstar_lower)
+
+    return nothing
+end
+end # @muladd