diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..175ace3
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,81 @@
+cmake_minimum_required (VERSION 3.5)
+project (compose CXX)
+set (CMAKE_CXX_STANDARD 11)           # The sources use C++11 features (auto, lambdas, override).
+set (CMAKE_CXX_STANDARD_REQUIRED ON)  # Fail at configure time instead of silently decaying to an older standard.
+
+function (prc var)  # Debug helper: print "<name>: <value>" for the variable whose name is passed in var.
+  message ("${var}: ${${var}}")  # ${${var}} dereferences the variable named by var.
+endfunction ()
+
+find_package (MPI REQUIRED)
+
+# Kokkos is located only through the user-supplied Kokkos_DIR; there is no fallback search.
+if (NOT Kokkos_DIR)
+  message (FATAL_ERROR "COMPOSE requires Kokkos_DIR")
+endif ()
+include (${Kokkos_DIR}/kokkos.cmake)
+set (Kokkos_INCLUDE ${Kokkos_DIR}/include)
+
+set (SOURCES  # Translation units passed to add_library for the compose target.
+  cedr/cedr_caas.cpp
+  cedr/cedr_local.cpp
+  cedr/cedr_mpi.cpp
+  cedr/cedr_qlt.cpp
+  cedr/cedr_test.cpp
+  cedr/cedr_test_1d_transport.cpp
+  cedr/cedr_test_randomized.cpp
+  cedr/cedr_util.cpp)
+
+set (HEADERS  # Headers installed by the install (FILES ...) rule; not used for compilation.
+  cedr/cedr.hpp
+  cedr/cedr_caas.hpp
+  cedr/cedr_caas_inl.hpp
+  cedr/cedr_cdr.hpp
+  cedr/cedr_kokkos.hpp
+  cedr/cedr_local.hpp
+  cedr/cedr_local_inl.hpp
+  cedr/cedr_mpi.hpp
+  cedr/cedr_mpi_inl.hpp
+  cedr/cedr_qlt.hpp
+  cedr/cedr_qlt_inl.hpp
+  cedr/cedr_test.hpp
+  cedr/cedr_test_randomized.hpp
+  cedr/cedr_util.hpp
+  siqk/siqk.hpp
+  siqk/siqk_defs.hpp
+  siqk/siqk_geometry.hpp
+  siqk/siqk_intersect.hpp
+  siqk/siqk_quadrature.hpp
+  siqk/siqk_search.hpp
+  siqk/siqk_sqr.hpp)
+
+if (NOT COMPOSE_TEST_MPIRUN)  # MPI launcher used by the MPI test in cedr/CMakeLists.txt; defaults to mpirun.
+  set (COMPOSE_TEST_MPIRUN mpirun)
+endif ()
+if (NOT COMPOSE_TEST_NRANK)  # Rank count handed to the launcher through -np; defaults to 8.
+  set (COMPOSE_TEST_NRANK 8)
+endif ()
+
+set (COMPOSE_COMPILE_FLAGS "${MPI_COMPILE_FLAGS} ${KOKKOS_CXXFLAGS} ${CMAKE_CXX_FLAGS}")
+set (COMPOSE_LINK_FLAGS "${MPI_LINK_FLAGS} ${KOKKOS_LDFLAGS}")
+# COMPILE_FLAGS and LINK_FLAGS each take one space-separated string, so flatten list-valued inputs.
+string (REPLACE ";" " " COMPOSE_COMPILE_FLAGS "${COMPOSE_COMPILE_FLAGS}")
+string (REPLACE ";" " " COMPOSE_LINK_FLAGS "${COMPOSE_LINK_FLAGS}")
+set (COMPOSE_INCLUDES "${Kokkos_INCLUDE}")
+set (COMPOSE_LIBRARIES ${MPI_LIBRARIES} ${KOKKOS_LIBS})
+prc(MPI_COMPILE_FLAGS)
+prc(MPI_LINK_FLAGS)
+prc(MPI_LIBRARIES)
+add_library (${PROJECT_NAME} ${SOURCES})
+set_target_properties (${PROJECT_NAME} PROPERTIES
+  COMPILE_FLAGS "${COMPOSE_COMPILE_FLAGS}"
+  LINK_FLAGS "${COMPOSE_LINK_FLAGS}")
+target_include_directories (${PROJECT_NAME} PUBLIC cedr siqk ${COMPOSE_INCLUDES})
+target_link_libraries (${PROJECT_NAME} ${COMPOSE_LIBRARIES})
+
+install (TARGETS ${PROJECT_NAME} ARCHIVE DESTINATION lib LIBRARY DESTINATION lib)  # LIBRARY covers BUILD_SHARED_LIBS=ON builds.
+install (FILES ${HEADERS} DESTINATION include/compose)  # Every header lands directly in include/compose.
+
+enable_testing ()  # Enabled at the top level, ahead of the subdirectories (cedr registers its tests with add_test).
+add_subdirectory (siqk)
+add_subdirectory (cedr)
diff --git a/README.md b/README.md
index 8dec62e..165fdd3 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,22 @@
 # COMPOSE
-Compact Multi-moment Performance-Portable Semi-Lagrangian methods for non-hydrostatic dynamics
+Compact Multi-moment Performance-Portable Semi-Lagrangian methods
+
+COMPOSE provides libraries for semi-Lagrangian transport and, together or
+separately, property preservation.
+
+CEDR: Communication-Efficient Constrained Density Reconstructors.
+SIQK: Spherical Polygon Intersection and Quadrature.
+
+First, install Kokkos:
+    https://github.com/kokkos/kokkos
+For example, in a typical environment using OpenMP, a simple build line is:
+    ./kokkos/generate_makefile.bash --with-serial --with-openmp --prefix=/path/to/my/libs --compiler=g++
+    make -j8 install
+
+Second, configure, build, and test COMPOSE:
+    cmake \
+        -D Kokkos_DIR=/path/to/my/kokkos/install \
+        -D CMAKE_INSTALL_PREFIX=/path/to/my/compose/install \
+        /path/to/compose/repo
+    make -j8
+    ctest
diff --git a/cedr/CMakeLists.txt b/cedr/CMakeLists.txt
new file mode 100644
index 0000000..f0f5c88
--- /dev/null
+++ b/cedr/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_executable (cedr_test cedr_test.cpp)
+set_target_properties (cedr_test PROPERTIES
+  COMPILE_FLAGS "${COMPOSE_COMPILE_FLAGS}"  # Quoted: each property value must arrive as exactly one argument.
+  LINK_FLAGS "${COMPOSE_LINK_FLAGS}")
+
+target_include_directories (cedr_test PRIVATE ${COMPOSE_INCLUDES})
+target_link_libraries (cedr_test ${PROJECT_NAME} ${COMPOSE_LIBRARIES})
+
+add_test (NAME cedr-test-unit  # cedr_test -t started directly, without the MPI launcher.
+  COMMAND $<TARGET_FILE:cedr_test> -t)
+add_test (NAME cedr-test-unit-mpi  # Same binary under ${COMPOSE_TEST_MPIRUN} with ${COMPOSE_TEST_NRANK} ranks.
+  COMMAND ${COMPOSE_TEST_MPIRUN} -np ${COMPOSE_TEST_NRANK}
+  $<TARGET_FILE:cedr_test> -t --proc-random -nc 111 -nt 11)
+add_test (NAME cedr-test-t1d  # -t1d presumably selects the 1D transport test (cedr_test_1d_transport.cpp) - confirm.
+  COMMAND $<TARGET_FILE:cedr_test> -t -t1d -nc 111)
diff --git a/cedr/cedr.hpp b/cedr/cedr.hpp
new file mode 100644
index 0000000..dea74e7
--- /dev/null
+++ b/cedr/cedr.hpp
@@ -0,0 +1,31 @@
+#ifndef INCLUDE_CEDR_HPP
+#define INCLUDE_CEDR_HPP
+
+#include "cedr_kokkos.hpp"
+
+// Communication-Efficient Constrained Density Reconstructors
+namespace cedr {
+using Int = int;           // Index and count type used throughout CEDR.
+using Long = long int;
+using Size = std::size_t;
+using Real = double;       // Floating-point type for masses and densities.
+
+// CDRs in general implement
+// * tracer mass, Qm, conservation;
+// * mixing ratio, q, shape preservation, either local bound preservation or
+//   dynamic range preservation; and
+// * tracer consistency, which follows from dynamic range preservation or
+//   stronger (including local bound preservation) with rhom coming from the
+//   dynamics.
+//
+// One can solve a subset of these.
+//   If !conserve, then the CDR does not alter the tracer mass, but it does not
+// correct for any failure in mass conservation in the field given to it.
+//   If consistent but !shapepreserve, then the CDR solves the dynamic range
+// preservation problem rather than the local bound preservation problem.
+struct ProblemType { // Bit flags; a problem type is the bitwise OR of the requested properties.
+  enum : Int { conserve = 0x1, shapepreserve = 0x2, consistent = 0x4 };
+};
+}
+
+#endif
diff --git a/cedr/cedr_caas.cpp b/cedr/cedr_caas.cpp
new file mode 100644
index 0000000..2694024
--- /dev/null
+++ b/cedr/cedr_caas.cpp
@@ -0,0 +1,214 @@
+#include "cedr_caas.hpp"
+#include "cedr_util.hpp"
+#include "cedr_test_randomized.hpp"
+
+namespace cedr {
+namespace caas {
+
+template <typename ES>
+CAAS<ES>::CAAS (const mpi::Parallel::Ptr& p, const Int nlclcells)
+  : p_(p), nlclcells_(nlclcells), nrhomidxs_(0), need_conserve_(false)
+{ // A count <= 0 is rejected here; it would otherwise size the data view in end_tracer_declarations.
+  cedr_throw_if(nlclcells <= 0, "CAAS does not support 0 cells on a rank.");
+  tracer_decls_ = std::make_shared<std::vector<Decl> >();
+}
+
+template <typename ES>
+void CAAS<ES>::declare_tracer(int problem_type, const Int& rhomidx) {
+  // tracer_decls_ is reset by end_tracer_declarations; throw rather than dereference null.
+  cedr_throw_if( ! tracer_decls_, "declare_tracer called after end_tracer_declarations.");
+  cedr_throw_if( ! (problem_type & ProblemType::shapepreserve), "CAAS is a WIP; ! shapepreserve is not supported yet.");
+  cedr_throw_if(rhomidx > 0, "rhomidx > 0 is not supported yet.");
+  tracer_decls_->push_back(Decl(problem_type, rhomidx));
+  if (problem_type & ProblemType::conserve) need_conserve_ = true; // one conserving tracer is enough to store Qm_prev
+  nrhomidxs_ = std::max(nrhomidxs_, rhomidx+1);
+}
+
+template <typename ES>
+void CAAS<ES>::end_tracer_declarations () { // Freeze the declared tracers and allocate the data and MPI buffers.
+  cedr_throw_if(tracer_decls_->size() == 0, "#tracers is 0."); // NOTE(review): tracer_decls_ is null after a first call, so a second call dereferences null.
+  cedr_throw_if(nrhomidxs_ == 0, "#rhomidxs is 0.");
+  probs_ = IntList("CAAS probs", static_cast<Int>(tracer_decls_->size()));
+  t2r_ = IntList("CAAS t2r", static_cast<Int>(tracer_decls_->size()));
+  for (Int i = 0; i < probs_.extent_int(0); ++i) {
+    probs_(i) = (*tracer_decls_)[i].probtype;
+    t2r_(i) = (*tracer_decls_)[i].rhomidx;
+  }
+  tracer_decls_ = nullptr; // The declaration list is dropped once copied into probs_ and t2r_.
+  // Sections of d_: (rho, Qm, Qm_min, Qm_max, [Qm_prev]); Qm_prev exists only if some tracer conserves.
+  const Int e = need_conserve_ ? 1 : 0;
+  d_ = RealList("CAAS data", nlclcells_ * ((3+e)*probs_.size() + 1));
+  const auto nslots = 4*probs_.size();
+  // Four sums per tracer: (e'Qm_clip, e'Qm or e'Qm_prev, e'Qm_min, e'Qm_max); see reduce_locally.
+  send_ = RealList("CAAS send", nslots);
+  recv_ = RealList("CAAS recv", nslots);
+}
+
+template <typename ES>
+int CAAS<ES>::get_problem_type (const Int& tracer_idx) const {
+  cedr_assert(tracer_idx >= 0 && tracer_idx < probs_.extent_int(0));
+  return probs_(tracer_idx); // ProblemType bit mask recorded when the tracer was declared
+}
+
+template <typename ES>
+Int CAAS<ES>::get_num_tracers () const {
+  return static_cast<Int>(probs_.extent_int(0)); // probs_ holds one entry per declared tracer
+}
+
+template <typename ES>
+void CAAS<ES>::reduce_locally () { // Clip each tracer to its bounds and fill send_ with this rank's partial sums.
+  const Int nt = probs_.size();
+  Int k = 0;
+  Int os = nlclcells_; // Offset of the current tracer's Qm block; the first nlclcells_ entries of d_ hold rhom.
+  // Qm_clip
+  for ( ; k < nt; ++k) {
+    Real Qm_sum = 0, Qm_clip_sum = 0;
+    for (Int i = 0; i < nlclcells_; ++i) {
+      const Real Qm = d_(os+i);
+      Qm_sum += (probs_(k) & ProblemType::conserve ? // Conserving tracers sum the previous mass instead.
+                 d_(os + nlclcells_*3*nt + i) /* Qm_prev */ :
+                 Qm);
+      const Real Qm_min = d_(os + nlclcells_*  nt + i);
+      const Real Qm_max = d_(os + nlclcells_*2*nt + i);
+      const Real Qm_clip = cedr::impl::min(Qm_max, cedr::impl::max(Qm_min, Qm));
+      Qm_clip_sum += Qm_clip;
+      d_(os+i) = Qm_clip; // The clipped value replaces Qm in place.
+    }
+    send_(     k) = Qm_clip_sum;
+    send_(nt + k) = Qm_sum;
+    os += nlclcells_;
+  }
+  k += nt; // Skip the Qm_sum slots [nt, 2 nt) already written above.
+  // Qm_min, Qm_max
+  for ( ; k < 4*nt; ++k) { // os now walks the Qm_min blocks and then the Qm_max blocks.
+    Real accum = 0;
+    for (Int i = 0; i < nlclcells_; ++i)
+      accum += d_(os+i);
+    send_(k) = accum;
+    os += nlclcells_;
+  }
+}
+
+template <typename ES>
+void CAAS<ES>::reduce_globally () {
+  // Sum the send_ slots over the ranks of p_ into recv_.
+  const int err = mpi::all_reduce(*p_, send_.data(), recv_.data(), send_.size(), MPI_SUM);
+  cedr_throw_if(err != MPI_SUCCESS, "CAAS::reduce_globally MPI_Allreduce returned " << err);
+}
+
+template <typename ES>
+void CAAS<ES>::finish_locally () { // Spread the global mass discrepancy left by clipping over the local cells.
+  const Int nt = probs_.size();
+  Int os = nlclcells_;
+  for (Int k = 0; k < nt; ++k) {
+    const Real Qm_clip_sum = recv_(     k);
+    const Real Qm_sum      = recv_(nt + k);
+    const Real m = Qm_sum - Qm_clip_sum; // Mass still to add (m > 0) or remove (m < 0) globally.
+    if (m < 0) {
+      const Real Qm_min_sum = recv_(2*nt + k);
+      Real fac = Qm_clip_sum - Qm_min_sum; // Global room between the clipped values and the lower bounds.
+      if (fac > 0) {
+        fac = m/fac;
+        for (Int i = 0; i < nlclcells_; ++i) {
+          const Real Qm_min = d_(os + nlclcells_*  nt + i);
+          Real& Qm = d_(os+i);
+          Qm += fac*(Qm - Qm_min); // Remove mass in proportion to the cell's distance from its lower bound.
+        }
+      }
+    } else if (m > 0) {
+      const Real Qm_max_sum = recv_(3*nt + k);
+      Real fac = Qm_max_sum - Qm_clip_sum; // Global room between the clipped values and the upper bounds.
+      if (fac > 0) {
+        fac = m/fac;
+        for (Int i = 0; i < nlclcells_; ++i) {
+          const Real Qm_max = d_(os + nlclcells_*2*nt + i);
+          Real& Qm = d_(os+i);
+          Qm += fac*(Qm_max - Qm); // Add mass in proportion to the cell's distance from its upper bound.
+        }
+      }
+    }
+    os += nlclcells_;
+  }
+}
+
+template <typename ES>
+void CAAS<ES>::run () {
+  reduce_locally();  // clip in place and form this rank's partial sums
+  reduce_globally(); // sum the partial sums over the ranks
+  finish_locally();  // spread the remaining mass discrepancy over the local cells
+}
+
+namespace test {
+struct TestCAAS : public cedr::test::TestRandomized {
+  typedef CAAS<Kokkos::DefaultExecutionSpace> CAAST;
+
+  // Size a CAAS for this rank's share of the ncells global cells, then run the
+  // base-class setup through init().
+  TestCAAS (const mpi::Parallel::Ptr& p, const Int& ncells, const bool verbose)
+    : TestRandomized("CAAS", p, ncells, verbose),
+      p_(p),
+      nlclcells_(get_nllclcells(ncells, p->size(), p->rank())),
+      caas_(std::make_shared<CAAST>(p, nlclcells_))
+  {
+    init();
+  }
+
+  CDR& get_cdr () override { return *caas_; }
+
+  // Global cell indices are contiguous per rank: this rank's block starts after
+  // the cells owned by all lower ranks.
+  void init_numbering () override {
+    const auto nrank = p_->size(), myrank = p_->rank();
+    Int first = 0;
+    for (Int r = 0; r < myrank; ++r)
+      first += get_nllclcells(ncells_, nrank, r);
+    gcis_.resize(nlclcells_);
+    for (Int i = 0; i < nlclcells_; ++i)
+      gcis_[i] = first + i;
+  }
+
+  void init_tracers () override {
+    // CAAS doesn't yet support everything, so keep only the shapepreserve tracers
+    // flagged local_should_hold, renumber them densely, and declare them.
+    std::vector<TestRandomized::Tracer> kept;
+    Int next_idx = 0;
+    for (auto& t : tracers_) {
+      const bool supported =
+        (t.problem_type & ProblemType::shapepreserve) && t.local_should_hold;
+      if ( ! supported) continue;
+      t.idx = next_idx++;
+      kept.push_back(t);
+      caas_->declare_tracer(t.problem_type, 0);
+    }
+    tracers_ = kept;
+    caas_->end_tracer_declarations();
+  }
+
+  void run_impl (const Int trial) override { caas_->run(); }
+
+private:
+  mpi::Parallel::Ptr p_;
+  Int nlclcells_;
+  CAAST::Ptr caas_;
+
+  // Number of cells owned by rank when ncells are split as evenly as possible
+  // over np ranks; the first (ncells - (ncells/np)*np) ranks get one extra cell.
+  static Int get_nllclcells (const Int& ncells, const Int& np, const Int& rank) {
+    const Int base = ncells / np, extra = ncells - base * np;
+    return rank < extra ? base + 1 : base;
+  }
+};
+
+Int unittest (const mpi::Parallel::Ptr& p) { // Run TestCAAS for several cells-per-rank sizes; returns the sum of the run() results.
+  const auto np = p->size();
+  Int nerr = 0;
+  for (Int nlclcells : {1, 2, 4, 11}) {
+    Long ncells = np*nlclcells;
+    if (ncells > np) ncells -= np/2; // Make the split uneven while leaving every rank at least one cell.
+    nerr += TestCAAS(p, ncells, false).run(1, false);
+  }
+  return nerr;
+}
+} // namespace test
+} // namespace caas
+} // namespace cedr
diff --git a/cedr/cedr_caas.hpp b/cedr/cedr_caas.hpp
new file mode 100644
index 0000000..849636f
--- /dev/null
+++ b/cedr/cedr_caas.hpp
@@ -0,0 +1,75 @@
+#ifndef INCLUDE_CEDR_CAAS_HPP
+#define INCLUDE_CEDR_CAAS_HPP
+
+#include "cedr_cdr.hpp"
+
+namespace cedr {
+// ClipAndAssuredSum.
+namespace caas {
+
+template <typename ExeSpace = Kokkos::DefaultExecutionSpace>
+class CAAS : public CDR {
+public:
+  using Device = typename cedr::impl::DeviceType<ExeSpace>::type;
+  using Me = CAAS<ExeSpace>;
+  using Ptr = std::shared_ptr<Me>;
+
+public:
+  CAAS(const mpi::Parallel::Ptr& p, const Int nlclcells);
+  // Tracer setup: declare every tracer, then close the declarations once.
+  void declare_tracer(int problem_type, const Int& rhomidx) override;
+
+  void end_tracer_declarations() override;
+  // Queries on the declared tracers.
+  int get_problem_type(const Int& tracer_idx) const override;
+
+  Int get_num_tracers() const override;
+
+  // lclcellidx is trivial; it is the user's index for the cell.
+  KOKKOS_INLINE_FUNCTION
+  void set_rhom(const Int& lclcellidx, const Int& rhomidx, const Real& rhom) override;
+
+  KOKKOS_INLINE_FUNCTION
+  void set_Qm(const Int& lclcellidx, const Int& tracer_idx,
+              const Real& Qm, const Real& Qm_min, const Real& Qm_max,
+              const Real Qm_prev = -1) override;
+  // Runs the local reduction, the global reduction, and the local finish, in that order.
+  void run() override;
+
+  KOKKOS_INLINE_FUNCTION
+  Real get_Qm(const Int& lclcellidx, const Int& tracer_idx) override;
+
+private:
+  using RealList = Kokkos::View<Real*, Kokkos::LayoutLeft, Device>;
+  using UnmanagedRealList = cedr::impl::Unmanaged<RealList>;
+  using IntList = Kokkos::View<Int*, Kokkos::LayoutLeft, Device>;
+
+  struct Decl { // One declare_tracer call, kept until end_tracer_declarations.
+    int probtype;
+    Int rhomidx;
+    Decl (const int probtype_, const Int rhomidx_)
+      : probtype(probtype_), rhomidx(rhomidx_) {}
+  };
+
+  mpi::Parallel::Ptr p_;
+  // Local cell count, and 1 + the largest declared rhom index.
+  Int nlclcells_, nrhomidxs_;
+  std::shared_ptr<std::vector<Decl> > tracer_decls_;
+  bool need_conserve_; // true once any declared tracer requests conservation
+  IntList probs_, t2r_; // per tracer: problem type and rhom index
+  RealList d_, send_, recv_; // cell data, and the reduction send and receive buffers
+
+  void reduce_locally();
+  void reduce_globally();
+  void finish_locally();
+};
+
+namespace test {
+Int unittest(const mpi::Parallel::Ptr& p); // Runs the CAAS unit tests; defined in cedr_caas.cpp.
+} // namespace test
+} // namespace caas
+} // namespace cedr
+
+#include "cedr_caas_inl.hpp"
+
+#endif
diff --git a/cedr/cedr_caas_inl.hpp b/cedr/cedr_caas_inl.hpp
new file mode 100644
index 0000000..f1a64fd
--- /dev/null
+++ b/cedr/cedr_caas_inl.hpp
@@ -0,0 +1,42 @@
+#ifndef INCLUDE_CEDR_CAAS_INL_HPP
+#define INCLUDE_CEDR_CAAS_INL_HPP
+
+#include "cedr_util.hpp"
+
+namespace cedr {
+// ClipAndAssuredSum.
+namespace caas {
+
+template <typename ES> KOKKOS_INLINE_FUNCTION
+void CAAS<ES>::set_rhom (const Int& lclcellidx, const Int& rhomidx, const Real& rhom) {
+  cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_);
+  cedr_kernel_assert(rhomidx >= 0 && rhomidx < nrhomidxs_);
+  d_(lclcellidx) = rhom; // rhom occupies the first nlclcells_ entries of d_; rhomidx is validated but does not enter the index.
+}
+
+template <typename ES> KOKKOS_INLINE_FUNCTION
+void CAAS<ES>
+::set_Qm (const Int& lclcellidx, const Int& tracer_idx,
+          const Real& Qm, const Real& Qm_min, const Real& Qm_max,
+          const Real Qm_prev) {
+  cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_);
+  cedr_kernel_assert(tracer_idx >= 0 && tracer_idx < probs_.extent_int(0));
+  // d_ layout: [rhom | Qm | Qm_min | Qm_max | Qm_prev]; each Q section holds one block of
+  // nlclcells_ entries per tracer. at is this cell's Qm slot; sections are section apart.
+  const Int section = static_cast<Int>(probs_.size())*nlclcells_;
+  const Int at = (1 + tracer_idx)*nlclcells_ + lclcellidx;
+  d_(at) = Qm; d_(at + section) = Qm_min; d_(at + 2*section) = Qm_max;
+  if (need_conserve_) d_(at + 3*section) = Qm_prev; // the Qm_prev section exists only in this case
+}
+
+template <typename ES> KOKKOS_INLINE_FUNCTION
+Real CAAS<ES>::get_Qm (const Int& lclcellidx, const Int& tracer_idx) {
+  cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_);
+  cedr_kernel_assert(tracer_idx >= 0 && tracer_idx < probs_.extent_int(0));
+  return d_((1 + tracer_idx)*nlclcells_ + lclcellidx); // the same slot set_Qm stores Qm in
+}
+
+} // namespace caas
+} // namespace cedr
+
+#endif
diff --git a/cedr/cedr_cdr.hpp b/cedr/cedr_cdr.hpp
new file mode 100644
index 0000000..6dad452
--- /dev/null
+++ b/cedr/cedr_cdr.hpp
@@ -0,0 +1,65 @@
+#ifndef INCLUDE_CEDR_CDR_HPP
+#define INCLUDE_CEDR_CDR_HPP
+
+#include "cedr_mpi.hpp"
+
+namespace cedr {
+// Constrained Density Reconstructor interface.
+struct CDR {
+  typedef std::shared_ptr<CDR> Ptr;
+  virtual ~CDR () {} // Virtual so that destroying a derived CDR through a base pointer is well defined.
+  virtual void print(std::ostream& os) const {}
+
+  // Set up tracer metadata. Call declare_tracer in order of the tracer
+  // index in the caller's numbering. Once end_tracer_declarations is called, it
+  // is an error to call declare_tracer again.
+  //   Associate the tracer with a rhom index. In many problems, there will be
+  // only one rhom, so rhomidx is always 0.
+  //   It is an error to call this function from a parallel region.
+  virtual void declare_tracer(int problem_type, const Int& rhomidx) = 0;
+
+  // It is an error to call this function from a parallel region.
+  virtual void end_tracer_declarations() = 0;
+
+  virtual int get_problem_type(const Int& tracer_idx) const = 0; // problem_type given to declare_tracer
+
+  virtual Int get_num_tracers() const = 0;
+
+  // set_{rhom,Qm}: Set cell values prior to running the CDR algorithm.
+  //
+  //   Notation:
+  //     rho: Total density.
+  //       Q: Tracer density.
+  //       q: Tracer mixing ratio = Q/rho.
+  //      *m: Mass corresponding to the density; results from an integral over a
+  //          region, such as a cell.
+  //   Some CDRs have a nontrivial local <-> global cell index map. For these
+  // CDRs, lclcellidx may be nontrivial. For others, the caller should provide
+  // the index into the local cell.
+  //
+  //   set_rhom must be called before set_Qm.
+  virtual void set_rhom(
+    const Int& lclcellidx, const Int& rhomidx,
+    // Current total mass in this cell.
+    const Real& rhom) = 0;
+
+  virtual void set_Qm(
+    const Int& lclcellidx, const Int& tracer_idx,
+    // Current tracer mass in this cell.
+    const Real& Qm,
+    // Minimum and maximum permitted tracer mass in this cell.
+    const Real& Qm_min, const Real& Qm_max,
+    // If mass conservation is requested, provide the previous Qm, which will be
+    // summed to give the desired global mass.
+    const Real Qm_prev = -1) = 0;
+
+  // Run the CDR algorithm with the values set by set_{rhom,Qm}. It is an error
+  // to call this function from a parallel region.
+  virtual void run() = 0;
+
+  // Get a cell's tracer mass Qm after the CDR algorithm has run.
+  virtual Real get_Qm(const Int& lclcellidx, const Int& tracer_idx) = 0;
+};
+} // namespace cedr
+
+#endif
diff --git a/cedr/cedr_kokkos.hpp b/cedr/cedr_kokkos.hpp
new file mode 100644
index 0000000..ec25b02
--- /dev/null
+++ b/cedr/cedr_kokkos.hpp
@@ -0,0 +1,53 @@
+#ifndef INCLUDE_CEDR_KOKKOS_HPP
+#define INCLUDE_CEDR_KOKKOS_HPP
+
+#include <Kokkos_Core.hpp>
+
+namespace cedr {
+namespace impl {
+template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag> // Traits of MemoryTraitsType with flag OR-ed in.
+using MemoryTraits = Kokkos::MemoryTraits<
+  MemoryTraitsType::Unmanaged | MemoryTraitsType::RandomAccess | // NOTE(review): assumes these enumerators are flag bits - confirm for the Kokkos version in use.
+  MemoryTraitsType::Atomic | flag>;
+// View aliases: same data type, layout, and device as View, with adjusted traits or constness.
+template <typename View>
+using Unmanaged = Kokkos::View< // View's type with Kokkos::Unmanaged added to its memory traits.
+  typename View::data_type, typename View::array_layout,
+  typename View::device_type, MemoryTraits<typename View::memory_traits,
+                                           Kokkos::Unmanaged> >;
+template <typename View>
+using Const = Kokkos::View< // View's type with its const_data_type as the data type.
+  typename View::const_data_type, typename View::array_layout,
+  typename View::device_type, typename View::memory_traits>;
+template <typename View>
+using ConstUnmanaged = Const<Unmanaged<View> >; // Both adjustments combined.
+
+template <typename ExeSpace> // Maps ExeSpace to the Kokkos::Device built from its execution and memory spaces.
+struct DeviceType {
+  typedef Kokkos::Device<typename ExeSpace::execution_space,
+                         typename ExeSpace::memory_space> type;
+};
+// DefaultDeviceType: built from Kokkos::CudaSpace under CUDA, else from Kokkos::DefaultExecutionSpace.
+#ifdef KOKKOS_HAVE_CUDA
+typedef Kokkos::Device<Kokkos::CudaSpace::execution_space,
+                       Kokkos::CudaSpace::memory_space> DefaultDeviceType;
+// NOTE(review): KOKKOS_HAVE_CUDA is presumably defined by Kokkos when CUDA is enabled - confirm for the Kokkos version in use.
+template <> struct DeviceType<Kokkos::Cuda> {
+  typedef DefaultDeviceType type;
+};
+#else
+typedef Kokkos::Device<Kokkos::DefaultExecutionSpace::execution_space,
+                       Kokkos::DefaultExecutionSpace::memory_space> DefaultDeviceType;
+#endif
+
+// GPU-friendly replacements for std::*: min, max, and swap callable from kernels.
+template <typename T> KOKKOS_INLINE_FUNCTION
+const T& min (const T& a, const T& b) { if (a < b) return a; return b; }
+template <typename T> KOKKOS_INLINE_FUNCTION
+const T& max (const T& a, const T& b) { if (a > b) return a; return b; }
+template <typename T> KOKKOS_INLINE_FUNCTION
+void swap (T& a, T& b) { const T held = a; a = b; b = held; }
+}
+}
+
+#endif
diff --git a/cedr/cedr_local.cpp b/cedr/cedr_local.cpp
new file mode 100644
index 0000000..15167e5
--- /dev/null
+++ b/cedr/cedr_local.cpp
@@ -0,0 +1,223 @@
+#include "cedr_local.hpp"
+#include "cedr_local_inl.hpp"
+
+namespace cedr {
+namespace local {
+namespace test {
+// Check the first-order optimality conditions of the candidate solution x. Return
+// true if OK, false otherwise. Diagnostics are printed to std::cout only if verbose.
+bool check_1eq_bc_qp_foc (
+  const char* label, const Int n, const Real* w, const Real* a, const Real b,
+  const Real* xlo, const Real* xhi, const Real* y, const Real* x, const bool verbose)
+{
+  auto& os = std::cout;
+  bool ok = true;
+  Real xtmp;
+  // Check the bound constraints.
+  for (Int i = 0; i < n; ++i)
+    if (x[i] < (xtmp = xlo[i])) {
+      if (verbose)
+        os << "x[" << i << "] = " << x[i]
+           << " but x[i] - xlo[i] = " << (x[i] - xtmp) << "\n";
+      ok = false;
+    }
+  for (Int i = 0; i < n; ++i)
+    if (x[i] > (xtmp = xhi[i])) {
+      if (verbose)
+        os << "x[" << i << "] = " << x[i]
+           << " but xhi[i] - x[i] = " << (xtmp - x[i]) << "\n";
+      ok = false;
+    }
+  // Check the equality constraint.
+  Real r = 0;
+  for (Int i = 0; i < n; ++i)
+    r += a[i]*x[i];
+  r -= b;
+  if (std::abs(r) > impl::calc_r_tol(b, a, y, n)) {
+    if (verbose)
+      os << "r = " << r << "\n";
+    ok = false;
+  }
+  // Check the gradient is 0 when projected into the constraints. Compute
+  //     g = W (x - y)
+  //     g_reduced = g - C ((C'C) \ (C'g))
+  // where
+  //     IA = I(:,A)
+  //     C = [IA a],
+  // and A is the active set.
+  const Real padtol = 1e5*std::numeric_limits<Real>::epsilon(); // Entries within pad of a bound are excluded from the interior set.
+  Real lambda = 0, den = 0;
+  for (Int i = 0; i < n; ++i) {
+    const Real pad = padtol*(xhi[i] - xlo[i]);
+    if (xlo[i] + pad <= x[i] && x[i] <= xhi[i] - pad) {
+      const Real gi = w[i]*(x[i] - y[i]);
+      lambda += a[i]*gi;
+      den += a[i]*a[i];
+    }
+  }
+  lambda /= den; // NOTE(review): if no entry is interior, this is 0/0 = NaN and the comparisons below can never fail.
+  Real normg = 0, normy = 0;
+  for (Int i = 0; i < n; ++i) {
+    normy += cedr::util::square(y[i]);
+    const Real pad = padtol*(xhi[i] - xlo[i]);
+    if (xlo[i] + pad <= x[i] && x[i] <= xhi[i] - pad)
+      normg += cedr::util::square(w[i]*(x[i] - y[i]) - a[i]*lambda);
+  }
+  normy = std::sqrt(normy);
+  normg = std::sqrt(normg);
+  const Real gtol = 1e4*std::numeric_limits<Real>::epsilon()*normy; // Tolerance relative to norm(y).
+  if (normg > gtol) {
+    if (verbose)
+      os << "norm(g) = " << normg << " gtol = " << gtol << "\n";
+    ok = false;
+  }
+  // Check the gradient at the active boundaries.
+  for (Int i = 0; i < n; ++i) {
+    const bool onlo = x[i] == xlo[i]; // Exact comparison: only entries exactly on a bound are checked here.
+    const bool onhi = onlo ? false : x[i] == xhi[i];
+    if (onlo || onhi) {
+      const Real rg = w[i]*(x[i] - y[i]) - a[i]*lambda;
+      if (onlo && rg < -gtol) {
+        if (verbose)
+          os << "onlo but rg = " << rg << "\n";
+        ok = false;
+      } else if (onhi && rg > gtol) {
+        if (verbose)
+          os << "onhi but rg = " << rg << "\n";
+        ok = false;
+      }
+    }
+  }
+  if ( ! ok && verbose)
+    os << "label: " << label << "\n";
+  return ok;
+}
+} // namespace test
+
+Int unittest () { // Randomized test of solve_1eq_bc_qp, solve_1eq_bc_qp_2d, and caas; returns the number of failed checks.
+  bool verbose = true;
+  Int nerr = 0;
+
+  Int n;
+  static const Int N = 16; // Capacity of the problem arrays; n below never exceeds it.
+  Real w[N], a[N], b, xlo[N], xhi[N], y[N], x[N], al, au; // al and au are set by genbnds to a'xlo and a'xhi.
+
+  auto run = [&] () { // Solve the current problem with each solver and add any violations to nerr.
+    const Int info = solve_1eq_bc_qp(n, w, a, b, xlo, xhi, y, x);
+    const bool ok = test::check_1eq_bc_qp_foc(
+      "unittest", n, w, a, b, xlo, xhi, y, x, verbose);
+    if ( ! ok) ++nerr;
+
+    if (n == 2) { // Cross-check the general solver against the 2D specialization.
+      // This version never returns 0.
+      Real x2[2];
+      const Int info2 = solve_1eq_bc_qp_2d(w, a, b, xlo, xhi, y, x2);
+      if (info2 != 1 && (info == 0 || info == 1)) {
+        if (verbose) pr(puf(info) pu(info2));
+        ++nerr;
+      }
+      const Real rd = cedr::util::reldif(x, x2, 2);
+      if (rd > 1e4*std::numeric_limits<Real>::epsilon()) {
+        if (verbose)
+          printf("%1.1e | y %1.15e %1.15e | x %1.15e %1.15e | "
+                 "x2 %1.15e %1.15e | l %1.15e %1.15e | u %1.15e %1.15e\n",
+                 rd, y[0], y[1], x[0], x[1], x2[0], x2[1],
+                 xlo[0], xlo[1], xhi[0], xhi[1]);
+        ++nerr;
+      }
+    }
+
+    caas(n, a, b, xlo, xhi, y, x); // x is then checked against the bounds and the mass constraint a'x = b.
+    Real m = 0, den = 0;
+    for (Int i = 0; i < n; ++i) {
+      m += a[i]*x[i];
+      den += std::abs(a[i]*x[i]);
+      if (x[i] < xlo[i]) ++nerr;
+      else if (x[i] > xhi[i]) ++nerr;
+    }
+    const Real rd = std::abs(b - m)/den; // Mass error relative to the sum of |a(i) x(i)|.
+    if (rd > 1e3*std::numeric_limits<Real>::epsilon()) {
+      if (verbose) pr(puf(rd) pu(n) pu(b) pu(m));
+      ++nerr;
+    }
+  };
+
+  auto gena = [&] () { // Random a(i), offset by 0.1; presumably urand() is in [0, 1), making a(i) > 0 - confirm in cedr_util.
+    for (Int i = 0; i < n; ++i)
+      a[i] = 0.1 + cedr::util::urand();
+  };
+  auto genw = [&] () { // Random w(i), generated the same way as a(i).
+    for (Int i = 0; i < n; ++i)
+      w[i] = 0.1 + cedr::util::urand();
+  };
+  auto genbnds = [&] () { // Random bounds xlo, xhi = xlo + urand(); accumulates al = a'xlo and au = a'xhi.
+    al = au = 0;
+    for (Int i = 0; i < n; ++i) {
+      xlo[i] = cedr::util::urand() - 0.5;
+      al += a[i]*xlo[i];
+      xhi[i] = xlo[i] + cedr::util::urand();
+      au += a[i]*xhi[i];
+    }
+  };
+  auto genb = [&] (const bool in) { // in: b is a blend of al and au; otherwise b is placed beyond au or below al.
+    if (in) {
+      const Real alpha = cedr::util::urand();
+      b = alpha*al + (1 - alpha)*au;
+    } else {
+      if (cedr::util::urand() > 0.5)
+        b = au + 0.01 + cedr::util::urand();
+      else
+        b = al - 0.01 - cedr::util::urand();
+    }
+  };
+  auto geny = [&] (const bool in) { // in: every y(i) is a blend of its bounds; otherwise some or all entries are pushed past them.
+    if (in) {
+      for (Int i = 0; i < n; ++i) {
+        const Real alpha = cedr::util::urand();
+        y[i] = alpha*xlo[i] + (1 - alpha)*xhi[i];
+      }
+    } else if (cedr::util::urand() > 0.2) { // Mixed case: odd entries inside, even entries alternately below and above.
+      for (Int i = 1; i < n; i += 2) {
+        const Real alpha = cedr::util::urand();
+        y[i] = alpha*xlo[i] + (1 - alpha)*xhi[i];
+        cedr_assert(y[i] >= xlo[i] && y[i] <= xhi[i]);
+      }
+      for (Int i = 0; i < n; i += 4)
+        y[i] = xlo[i] - cedr::util::urand();
+      for (Int i = 2; i < n; i += 4)
+        y[i] = xhi[i] + cedr::util::urand();
+    } else { // All entries outside: even entries below xlo, odd entries above xhi.
+      for (Int i = 0; i < n; i += 2)
+        y[i] = xlo[i] - cedr::util::urand();
+      for (Int i = 1; i < n; i += 2)
+        y[i] = xhi[i] + cedr::util::urand();
+    }
+  };
+  auto b4y = [&] () { // Set b = a'y, so that y itself satisfies the equality constraint.
+    b = 0;
+    for (Int i = 0; i < n; ++i)
+      b += a[i]*y[i];
+  };
+
+  for (n = 2; n <= 16; ++n) {
+    const Int count = n == 2 ? 100 : 10; // More trials for n == 2, which also exercises the 2D solver.
+    for (Int i = 0; i < count; ++i) {
+      gena();
+      genw();
+      genbnds();
+      genb(true);
+      geny(true);
+      run(); // Case 1: b within [al, au] and y within its bounds.
+      b4y();
+      run(); // Case 2: same y, with b reset to a'y.
+      genb(true);
+      geny(false);
+      run(); // Case 3: b within [al, au] but y partly or wholly outside its bounds.
+    }
+  }
+
+  return  nerr;
+}
+
+}
+}
diff --git a/cedr/cedr_local.hpp b/cedr/cedr_local.hpp
new file mode 100644
index 0000000..e6a6ba8
--- /dev/null
+++ b/cedr/cedr_local.hpp
@@ -0,0 +1,41 @@
+#ifndef INCLUDE_CEDR_LOCAL_HPP
+#define INCLUDE_CEDR_LOCAL_HPP
+
+#include "cedr.hpp"
+#include "cedr_kokkos.hpp"
+
+namespace cedr {
+namespace local {
+
+// Solve
+//     min_x sum_i w(i) (x(i) - y(i))^2
+//      st   a' x = b
+//           xlo <= x <= xhi,
+// a(i), w(i) > 0. Return 0 on success and x == y, 1 on success and x != y, -1
+// if infeasible, -2 if max_its hit with no solution. See Section 3 of Bochev,
+// Ridzal, Shashkov, Fast optimization-based conservative remap of scalar fields
+// through aggregate mass transfer. NOTE(review): the original remark "lambda is used in check_1eq_bc_qp_foc" names no argument of this function; it looks stale.
+KOKKOS_INLINE_FUNCTION
+Int solve_1eq_bc_qp(const Int n, const Real* w, const Real* a, const Real b,
+                    const Real* xlo, const Real* xhi,
+                    const Real* y, Real* x, const Int max_its = 100);
+// The same problem specialized to n = 2; there is no n argument.
+KOKKOS_INLINE_FUNCTION
+Int solve_1eq_bc_qp_2d(const Real* w, const Real* a, const Real b,
+                       const Real* xlo, const Real* xhi,
+                       const Real* y, Real* x);
+
+// ClipAndAssuredSum. Does not check for feasibility.
+KOKKOS_INLINE_FUNCTION
+void caas(const Int n, const Real* a, const Real b,
+          const Real* xlo, const Real* xhi,
+          const Real* y, Real* x);
+// Randomized self-test of the functions above, defined in cedr_local.cpp; returns the number of failed checks.
+Int unittest();
+
+}
+}
+
+#include "cedr_local_inl.hpp"
+
+#endif
diff --git a/cedr/cedr_local_inl.hpp b/cedr/cedr_local_inl.hpp
new file mode 100644
index 0000000..5c3c867
--- /dev/null
+++ b/cedr/cedr_local_inl.hpp
@@ -0,0 +1,303 @@
+#ifndef INCLUDE_CEDR_LOCAL_INL_HPP
+#define INCLUDE_CEDR_LOCAL_INL_HPP
+
+#include "cedr_util.hpp"
+
+namespace cedr {
+namespace local {
+
+namespace impl {
+// Residual tolerance for the constraint a' x = b: ten machine epsilons times
+// the largest magnitude among b and the products a(i) y(i), i < n.
+KOKKOS_INLINE_FUNCTION
+Real calc_r_tol (const Real b, const Real* a, const Real* y, const Int n) {
+  Real scale = std::abs(b);
+  for (Int k = 0; k < n; ++k) {
+    const Real term = std::abs(a[k]*y[k]);
+    scale = std::max(scale, term);
+  }
+  const Real eps = std::numeric_limits<Real>::epsilon();
+  return 1e1*eps*std::abs(scale);
+}
+
+// Evaluate the residual r = a' x - b at the two corners x = xlo and x = xhi to
+// test feasibility, which also gives a quick exit on a common case.
+//   Returns -1 if infeasible (r > 0 at xlo or r < 0 at xhi), 1 if a corner
+// satisfies |r| <= r_tol (x then holds that corner), and 0 if the problem is
+// feasible but neither corner is a solution (x then holds xhi). y is unused.
+KOKKOS_INLINE_FUNCTION
+Int check_lu (const Int n, const Real* a, const Real& b,
+              const Real* xlo, const Real* xhi, const Real* y, const Real& r_tol,
+              Real* x) {
+  // Pass 0 tests the lower corner, pass 1 the upper corner.
+  for (Int pass = 0; pass < 2; ++pass) {
+    const bool at_lower = pass == 0;
+    const Real* const corner = at_lower ? xlo : xhi;
+    Real resid = -b;
+    for (Int i = 0; i < n; ++i) {
+      x[i] = corner[i];
+      resid += a[i]*x[i];
+    }
+    if (std::abs(resid) <= r_tol) return 1;
+    // Too much mass already at the lower corner, or too little even at the
+    // upper corner, means no point in the box can satisfy the constraint.
+    const bool infeasible = at_lower ? resid > 0 : resid < 0;
+    if (infeasible) return -1;
+  }
+  return 0;
+}
+
+// For a given multiplier lambda, set x(i) = clip(y(i) + lambda a(i)/w(i)) to
+// [xlo(i), xhi(i)] and return
+//     r        = a' x - b,
+//     r_lambda = sum of a(i)^2/w(i) over the entries that were not clipped,
+// which is the derivative of r with respect to lambda.
+KOKKOS_INLINE_FUNCTION
+void calc_r (const Int n, const Real* w, const Real* a, const Real b,
+             const Real* xlo, const Real* xhi,  const Real* y, const Real& lambda,
+             Real* x, Real& r, Real& r_lambda) {
+  r = 0;
+  r_lambda = 0;
+  for (Int i = 0; i < n; ++i) {
+    const Real ratio = a[i]/w[i];
+    const Real trial = y[i] + lambda*ratio;
+    const Real lo = xlo[i];
+    if (trial < lo) {
+      x[i] = lo;
+    } else {
+      const Real hi = xhi[i];
+      if (trial > hi) {
+        x[i] = hi;
+      } else {
+        // Only free (unclipped) entries contribute to the slope.
+        x[i] = trial;
+        r_lambda += a[i]*ratio;
+      }
+    }
+    r += a[i]*x[i];
+  }
+  r -= b;
+}
+} // namespace impl
+
+// 2D special case for efficiency. Solves
+//     min_x sum_i w(i) (x(i) - y(i))^2  st  a' x = b, xlo <= x <= xhi
+// for n = 2 without iterating; every array argument has length 2.
+//   Steps: (1) check_lu tests the two corners for infeasibility or a corner
+// solution; (2) the bound-free optimum is accepted if it lies in the box;
+// (3) otherwise the constraint line is intersected with the four bounding
+// lines and the better of the two middle intersections is returned.
+//   Returns 1 when a solution is written to x and -1 if check_lu reports the
+// problem infeasible. -2 is reserved for the unreachable switch default.
+KOKKOS_INLINE_FUNCTION
+Int solve_1eq_bc_qp_2d (const Real* w, const Real* a, const Real b,
+                        const Real* xlo, const Real* xhi,
+                        const Real* y, Real* x) {
+  const Real r_tol = impl::calc_r_tol(b, a, y, 2);
+  Int info = impl::check_lu(2, a, b, xlo, xhi, y, r_tol, x);
+  if (info != 0) return info;
+
+  { // Check if the optimal point ignoring bound constraints is in bounds.
+    Real qmass = 0, dm = b;
+    for (int i = 0; i < 2; ++i) {
+      const Real qi = a[i]/w[i];
+      qmass += a[i]*qi;
+      dm -= a[i]*y[i];
+    }
+    const Real lambda = dm/qmass;
+    bool ok = true;
+    for (int i = 0; i < 2; ++i) {
+      x[i] = y[i] + lambda*(a[i]/w[i]);
+      if (x[i] < xlo[i] || x[i] > xhi[i]) {
+        ok = false;
+        break;
+      }
+    }
+    if (ok) return 1;
+  }
+
+  // Solve for intersection of a'x = b, given by the parameterized line
+  //     p(alpha) = x_base + alpha x_dir,
+  // with a bounding line.
+
+  // Get parameterized line. x_base satisfies a' x_base = b and x_dir is
+  // orthogonal to a.
+  Real x_base[2];
+  for (int i = 0; i < 2; ++i)
+    x_base[i] = 0.5*b/a[i];
+  Real x_dir[] = {-a[1], a[0]};
+
+  // Get the 4 alpha values.
+  Real alphas[4];
+  alphas[0] = (xlo[1] - x_base[1])/x_dir[1]; // bottom
+  alphas[1] = (xhi[0] - x_base[0])/x_dir[0]; // right
+  alphas[2] = (xhi[1] - x_base[1])/x_dir[1]; // top
+  alphas[3] = (xlo[0] - x_base[0])/x_dir[0]; // left
+
+  // Find the middle two in the sorted alphas.
+  Real min = alphas[0], max = min;
+  Int imin = 0, imax = 0;
+  for (Int i = 1; i < 4; ++i) {
+    const Real alpha = alphas[i];
+    if (alpha < min) { min = alpha; imin = i; }
+    if (alpha > max) { max = alpha; imax = i; }
+  }
+  Int ais[2];
+  Int cnt = 0;
+  for (Int i = 0; i < 4; ++i)
+    if (i != imin && i != imax) {
+      ais[cnt++] = i;
+      if (cnt == 2) break;
+    }
+
+  // Evaluate the objective at the two candidate intersections. x is used as
+  // scratch space here and is overwritten with the final answer below.
+  Real objs[2];
+  for (Int j = 0; j < 2; ++j) {
+    const Real alpha = alphas[ais[j]];
+    Real obj = 0;
+    for (Int i = 0; i < 2; ++i) {
+      x[i] = x_base[i] + alpha*x_dir[i];
+      obj += w[i]*cedr::util::square(y[i] - x[i]);
+    }
+    objs[j] = obj;
+  }
+
+  const Int ai = ais[objs[0] <= objs[1] ? 0 : 1];
+
+  // Set the coordinate that lies on the chosen bounding line exactly to that
+  // bound; compute the other from the line and clip it for numerics.
+  info = 1;
+  Int clipidx = 0;
+  const Real alpha = alphas[ai];
+  switch (ai) {
+  case 0: case 2:
+    x[0] = x_base[0] + alpha*x_dir[0];
+    x[1] = ai == 0 ? xlo[1] : xhi[1];
+    clipidx = 0;
+    break;
+  case 1: case 3:
+    x[0] = ai == 1 ? xhi[0] : xlo[0];
+    x[1] = x_base[1] + alpha*x_dir[1];
+    clipidx = 1;
+    break;
+  default: cedr_assert(0); info = -2;
+  }
+  x[clipidx] = cedr::impl::min(xhi[clipidx], cedr::impl::max(xlo[clipidx], x[clipidx]));
+  return info;
+}
+
+KOKKOS_INLINE_FUNCTION // General-n solver: corner check, then safeguarded Newton on r(lambda) = a' x(lambda) - b.
+Int solve_1eq_bc_qp (const Int n, const Real* w, const Real* a, const Real b,
+                     const Real* xlo, const Real* xhi, const Real* y, Real* x,
+                     const Int max_its) {
+  const Real r_tol = impl::calc_r_tol(b, a, y, n);
+  Int info = impl::check_lu(n, a, b, xlo, xhi, y, r_tol, x);
+  if (info != 0) return info; // -1: infeasible; 1: a corner is the solution.
+
+  for (int i = 0; i < n; ++i)
+    if (x[i] != y[i]) {
+      info = 1; // NOTE(review): only the disabled block below reads this; it is reset to -2 before the iteration.
+      x[i] = y[i];
+    }
+
+  // In our use case, the caller has already checked (more cheaply) for a quick
+  // exit.
+#if 0
+  { // Check for a quick exit.
+    bool all_in = true;
+    Real r = 0;
+    for (Int i = 0; i < n; ++i) {
+      if (x[i] < xlo[i] || x[i] > xhi[i]) {
+        all_in = false;
+        break;
+      }
+      r += a[i]*x[i];
+    }
+    if (all_in) {
+      r -= b;
+      if (std::abs(r) <= r_tol)
+        return info;
+    }
+  }
+#endif
+
+  const Real wall_dist = 1e-3; // Fraction of the bracket width used by the safeguard below.
+
+  // Get lambda endpoints: the extreme multipliers at which an entry hits a bound.
+  Real lamlo = 0, lamhi = 0;
+  for (Int i = 0; i < n; ++i) {
+    const Real rq = w[i]/a[i];
+    const Real lamlo_i = rq*(xlo[i] - y[i]);
+    const Real lamhi_i = rq*(xhi[i] - y[i]);
+    if (i == 0) {
+      lamlo = lamlo_i;
+      lamhi = lamhi_i;
+    } else {
+      lamlo = cedr::impl::min(lamlo, lamlo_i);
+      lamhi = cedr::impl::max(lamhi, lamhi_i);
+    }
+  }
+  const Real lamlo_feas = lamlo, lamhi_feas = lamhi;
+  Real lambda = lamlo <= 0 && lamhi >= 0 ? 0 : lamlo; // Start at 0 (x = clipped y) when the bracket contains it.
+
+  // Bisection-safeguarded Newton iteration for r(lambda) = 0.
+  bool prev_step_bisect = false;
+  Int nbisect = 0;
+  info = -2; // Stays -2 if max_its iterations pass without convergence.
+  for (Int iteration = 0; iteration < max_its; ++iteration) {
+    // Compute x, r, r_lambda.
+    Real r, r_lambda;
+    impl::calc_r(n, w, a, b, xlo, xhi, y, lambda, x, r, r_lambda);
+    // Is r(lambda) sufficiently == 0? (calc_r has already subtracted b.)
+    if (std::abs(r) <= r_tol) {
+      info = 1;
+      break;
+    }
+    // Check if the lambda bounds are too close.
+    if (nbisect > 64) {
+      if (lamhi == lamhi_feas || lamlo == lamlo_feas) {
+        // r isn't small enough and one lambda bound is on the feasibility
+        // limit. The QP must not be feasible.
+        info = -1;
+        break;
+      }
+      info = 1; // Bracket has collapsed strictly inside the feasible range; accept x.
+      break;
+    }
+    // Adjust lambda bounds.
+    if (r > 0)
+      lamhi = lambda;
+    else
+      lamlo = lambda;
+    if (r_lambda != 0) {
+      // Newton step.
+      lambda -= r/r_lambda;
+    } else {
+      // Force bisection.
+      lambda = lamlo; // On the bracket edge, so the safeguard below bisects whenever D > 0.
+    }
+    // Safeguard. The wall distance check assures progress, but use it only
+    // every other potential bisection.
+    const Real D = prev_step_bisect ? 0 : wall_dist*(lamhi - lamlo);
+    if (lambda - lamlo < D || lamhi - lambda < D) {
+      lambda = 0.5*(lamlo + lamhi);
+      ++nbisect;
+      prev_step_bisect = true;
+    } else {
+      prev_step_bisect = false;
+    }
+  }
+
+  return info;
+}
+
+// ClipAndAssuredSum. Clip y into [xlo, xhi] to get x, then remove the
+// constraint residual b - a' x by moving every entry toward xhi (residual
+// positive) or xlo (residual negative) by a common fraction of its remaining
+// room. Nothing is redistributed if no room is left, so feasibility is not
+// checked. All arrays have length n.
+KOKKOS_INLINE_FUNCTION
+void caas (const Int n, const Real* a, const Real b,
+           const Real* xlo, const Real* xhi,
+           const Real* y, Real* x) {
+  // Clip and accumulate the residual in one sweep.
+  Real resid = b;
+  for (Int i = 0; i < n; ++i) {
+    const Real capped = cedr::impl::min(xhi[i], y[i]);
+    x[i] = cedr::impl::max(xlo[i], capped);
+    resid -= a[i]*x[i];
+  }
+  if (resid == 0) return;
+  if (resid > 0) {
+    // Mass is missing: push entries up toward xhi.
+    Real room = 0;
+    for (Int i = 0; i < n; ++i)
+      room += a[i]*(xhi[i] - x[i]);
+    if (room > 0) {
+      const Real frac = resid/room;
+      for (Int i = 0; i < n; ++i)
+        x[i] += frac*(xhi[i] - x[i]);
+    }
+  } else if (resid < 0) {
+    // Too much mass: pull entries down toward xlo (frac is negative).
+    Real room = 0;
+    for (Int i = 0; i < n; ++i)
+      room += a[i]*(x[i] - xlo[i]);
+    if (room > 0) {
+      const Real frac = resid/room;
+      for (Int i = 0; i < n; ++i)
+        x[i] += frac*(x[i] - xlo[i]);
+    }
+  }
+  // Clip again for numerics.
+  for (Int i = 0; i < n; ++i) {
+    const Real capped = cedr::impl::min(xhi[i], x[i]);
+    x[i] = cedr::impl::max(xlo[i], capped);
+  }
+}
+
+} // namespace local
+} // namespace cedr
+
+#endif
diff --git a/cedr/cedr_mpi.cpp b/cedr/cedr_mpi.cpp
new file mode 100644
index 0000000..1569a66
--- /dev/null
+++ b/cedr/cedr_mpi.cpp
@@ -0,0 +1,41 @@
+#include "cedr_mpi.hpp"
+
+namespace cedr {
+namespace mpi {
+
+// Create a shared Parallel object wrapping the communicator comm.
+Parallel::Ptr make_parallel (MPI_Comm comm) {
+  Parallel::Ptr parallel = std::make_shared<Parallel>(comm);
+  return parallel;
+}
+
+// Number of ranks in the wrapped communicator, as reported by MPI_Comm_size.
+Int Parallel::size () const {
+  int nranks = 0;
+  MPI_Comm_size(comm_, &nranks);
+  return nranks;
+}
+
+// This process's rank in the wrapped communicator, as reported by
+// MPI_Comm_rank.
+Int Parallel::rank () const {
+  int my_rank = 0;
+  MPI_Comm_rank(comm_, &my_rank);
+  return my_rank;
+}
+
+// Map C++ scalar types to the corresponding predefined MPI datatype handles.
+template <> MPI_Datatype get_type<int>() { return MPI_INT; }
+template <> MPI_Datatype get_type<double>() { return MPI_DOUBLE; }
+// A C long is MPI_LONG. MPI_LONG_INT is the {long, int} pair type meant for
+// MPI_MAXLOC/MPI_MINLOC and would describe the wrong buffer layout here.
+template <> MPI_Datatype get_type<long>() { return MPI_LONG; }
+
+// Wrapper for MPI_Waitany. A null stats pointer means the caller does not want
+// the status, in which case MPI_STATUS_IGNORE is passed.
+int waitany (int count, MPI_Request* reqs, int* index, MPI_Status* stats) {
+  MPI_Status* const status = stats ? stats : MPI_STATUS_IGNORE;
+  return MPI_Waitany(count, reqs, index, status);
+}
+
+// Wrapper for MPI_Waitall. A null stats pointer means the caller does not want
+// the statuses. MPI_Waitall takes an array of statuses, so the sentinel for
+// "ignore" is MPI_STATUSES_IGNORE; MPI_STATUS_IGNORE is only valid where a
+// single status is expected.
+int waitall (int count, MPI_Request* reqs, MPI_Status* stats) {
+  return MPI_Waitall(count, reqs, stats ? stats : MPI_STATUSES_IGNORE);
+}
+
+// Collective check: returns true on every rank iff im_ok is true on every
+// rank. Implemented as a logical-AND all-reduce over p's communicator.
+bool all_ok (const Parallel& p, bool im_ok) {
+  int mine = im_ok;
+  int everyone;
+  all_reduce<int>(p, &mine, &everyone, 1, MPI_LAND);
+  return everyone != 0;
+}
+
+}
+}
diff --git a/cedr/cedr_mpi.hpp b/cedr/cedr_mpi.hpp
new file mode 100644
index 0000000..1f28594
--- /dev/null
+++ b/cedr/cedr_mpi.hpp
@@ -0,0 +1,77 @@
+#ifndef INCLUDE_CEDR_MPI_HPP
+#define INCLUDE_CEDR_MPI_HPP
+
+#include <memory>
+
+#include <mpi.h>
+
+#include "cedr.hpp"
+
+namespace cedr {
+namespace mpi {
+
+// Thin wrapper around an MPI communicator. Rank 0 is designated the root.
+class Parallel {
+public:
+  typedef std::shared_ptr<Parallel> Ptr;
+
+  Parallel(MPI_Comm comm) : comm_(comm) {}
+
+  // The wrapped communicator.
+  MPI_Comm comm () const { return comm_; }
+  // Number of ranks in the communicator and this process's rank in it.
+  Int size() const;
+  Int rank() const;
+  // The root rank is always 0; amroot() is true on that rank only.
+  Int root () const { return 0; }
+  bool amroot () const { return root() == rank(); }
+
+private:
+  MPI_Comm comm_;
+};
+
+Parallel::Ptr make_parallel(MPI_Comm comm); // Shared Parallel wrapping comm.
+
+template <typename T> MPI_Datatype get_type(); // MPI datatype for T; specialized in cedr_mpi.cpp for int, double, long.
+
+template <typename T> // Wraps MPI_Reduce on p.comm(); returns the MPI error code.
+int reduce(const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op,
+           int root);
+
+template <typename T> // Wraps MPI_Allreduce on p.comm(); returns the MPI error code.
+int all_reduce(const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op);
+
+template <typename T> // Wraps MPI_Isend. If ireq is null, the request is freed immediately.
+int isend(const Parallel& p, const T* buf, int count, int dest, int tag,
+          MPI_Request* ireq);
+
+template <typename T> // Wraps MPI_Irecv. If ireq is null, the request is freed immediately.
+int irecv(const Parallel& p, T* buf, int count, int src, int tag, MPI_Request* ireq);
+
+int waitany(int count, MPI_Request* reqs, int* index, MPI_Status* stats = nullptr); // Null stats: status is ignored.
+
+int waitall(int count, MPI_Request* reqs, MPI_Status* stats = nullptr); // Null stats: statuses are ignored.
+
+template<typename T> // Wraps MPI_Gather with the same datatype on both sides.
+int gather(const Parallel& p, const T* sendbuf, int sendcount,
+           T* recvbuf, int recvcount, int root);
+
+template <typename T> // Wraps MPI_Gatherv with the same datatype on both sides.
+int gatherv(const Parallel& p, const T* sendbuf, int sendcount,
+            T* recvbuf, const int* recvcounts, const int* displs, int root);
+
+bool all_ok(const Parallel& p, bool im_ok); // True iff im_ok is true on every rank (MPI_LAND all-reduce).
+
+// RAII owner of a user-defined MPI reduction operator: the handle is created
+// with MPI_Op_create in the constructor and released with MPI_Op_free in the
+// destructor. Share an Op through Op::Ptr rather than by value.
+struct Op {
+  typedef std::shared_ptr<Op> Ptr;
+
+  // function: the user reduction callback; commute: whether it is commutative.
+  Op (MPI_User_function* function, bool commute) {
+    MPI_Op_create(function, static_cast<int>(commute), &op_);
+  }
+
+  // Not copyable: a copy would hold the same handle and free it a second time.
+  Op (const Op&) = delete;
+  Op& operator= (const Op&) = delete;
+
+  ~Op () { MPI_Op_free(&op_); }
+
+  // The underlying handle, valid for the lifetime of this object.
+  const MPI_Op& get () const { return op_; }
+
+private:
+  MPI_Op op_;
+};
+
+} // namespace mpi
+} // namespace cedr
+
+#include "cedr_mpi_inl.hpp"
+
+#endif
diff --git a/cedr/cedr_mpi_inl.hpp b/cedr/cedr_mpi_inl.hpp
new file mode 100644
index 0000000..e4f28db
--- /dev/null
+++ b/cedr/cedr_mpi_inl.hpp
@@ -0,0 +1,59 @@
+#ifndef INCLUDE_CEDR_MPI_INL_HPP
+#define INCLUDE_CEDR_MPI_INL_HPP
+
+namespace cedr {
+namespace mpi {
+
+// MPI_Reduce over p's communicator, with the datatype deduced from T. The
+// const_cast accommodates MPI bindings whose send buffer is not const.
+template <typename T>
+int reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op,
+            int root) {
+  T* const src = const_cast<T*>(sendbuf);
+  return MPI_Reduce(src, rcvbuf, count, get_type<T>(), op, root, p.comm());
+}
+
+// MPI_Allreduce over p's communicator, with the datatype deduced from T. The
+// const_cast accommodates MPI bindings whose send buffer is not const.
+template <typename T>
+int all_reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op) {
+  T* const src = const_cast<T*>(sendbuf);
+  return MPI_Allreduce(src, rcvbuf, count, get_type<T>(), op, p.comm());
+}
+
+// MPI_Isend over p's communicator, with the datatype deduced from T. If ireq
+// is null, a temporary request is used and released with MPI_Request_free
+// right after the send is posted. Returns the MPI_Isend error code.
+template <typename T>
+int isend (const Parallel& p, const T* buf, int count, int dest, int tag,
+           MPI_Request* ireq) {
+  T* const src = const_cast<T*>(buf);
+  if (ireq)
+    return MPI_Isend(src, count, get_type<T>(), dest, tag, p.comm(), ireq);
+  // Caller does not want to track the request.
+  MPI_Request scratch;
+  const int err = MPI_Isend(src, count, get_type<T>(), dest, tag, p.comm(),
+                            &scratch);
+  MPI_Request_free(&scratch);
+  return err;
+}
+
+// MPI_Irecv over p's communicator, with the datatype deduced from T. If ireq
+// is null, a temporary request is used and released with MPI_Request_free
+// right after the receive is posted. Returns the MPI_Irecv error code.
+// NOTE(review): with a freed request the caller has no handle to learn when
+// buf has been filled -- confirm no caller passes a null ireq.
+template <typename T>
+int irecv (const Parallel& p, T* buf, int count, int src, int tag, MPI_Request* ireq) {
+  if (ireq)
+    return MPI_Irecv(buf, count, get_type<T>(), src, tag, p.comm(), ireq);
+  // Caller does not want to track the request.
+  MPI_Request scratch;
+  const int err = MPI_Irecv(buf, count, get_type<T>(), src, tag, p.comm(),
+                            &scratch);
+  MPI_Request_free(&scratch);
+  return err;
+}
+
+// MPI_Gather over p's communicator; send and receive sides use the datatype
+// deduced from T. As in reduce/all_reduce/isend, the send buffer is
+// const_cast so this also compiles against MPI bindings whose send buffer
+// parameter is not const.
+template<typename T>
+int gather (const Parallel& p, const T* sendbuf, int sendcount,
+            T* recvbuf, int recvcount, int root) {
+  MPI_Datatype dt = get_type<T>();
+  return MPI_Gather(const_cast<T*>(sendbuf), sendcount, dt, recvbuf, recvcount, dt,
+                    root, p.comm());
+}
+
+// MPI_Gatherv over p's communicator; send and receive sides use the datatype
+// deduced from T. As in reduce/all_reduce/isend, the read-only arguments are
+// const_cast so this also compiles against MPI bindings whose corresponding
+// parameters are not const.
+template <typename T>
+int gatherv (const Parallel& p, const T* sendbuf, int sendcount,
+             T* recvbuf, const int* recvcounts, const int* displs, int root) {
+  MPI_Datatype dt = get_type<T>();
+  return MPI_Gatherv(const_cast<T*>(sendbuf), sendcount, dt, recvbuf,
+                     const_cast<int*>(recvcounts), const_cast<int*>(displs), dt,
+                     root, p.comm());
+}
+
+} // namespace mpi
+} // namespace cedr
+
+#endif
diff --git a/cedr/cedr_qlt.cpp b/cedr/cedr_qlt.cpp
new file mode 100644
index 0000000..6a04f04
--- /dev/null
+++ b/cedr/cedr_qlt.cpp
@@ -0,0 +1,1159 @@
+#include "cedr_qlt.hpp"
+#include "cedr_test_randomized.hpp"
+
+#include <sys/time.h>
+
+#include <cassert>
+#include <cmath>
+
+#include <set>
+#include <list>
+#include <limits>
+#include <algorithm>
+
+namespace cedr {
+namespace qlt {
+
+class Timer { // Static wall-clock timers, one per Op; every method is a no-op unless QLT_TIME is defined.
+public:
+  enum Op { tree, analyze, qltrun, qltrunl2r, qltrunr2l, snp, waitall,
+            total, NTIMERS }; // NTIMERS is the count, not a timer.
+  static inline void init () { // Zero every timer's elapsed time and call count.
+#ifdef QLT_TIME
+    for (int i = 0; i < NTIMERS; ++i) {
+      et_[i] = 0;
+      cnt_[i] = 0;
+    }
+#endif
+  }
+  static inline void reset (const Op op) { // Zero one timer.
+#ifdef QLT_TIME
+    et_[op] = 0;
+    cnt_[op] = 0;
+#endif
+  }
+  static inline void start (const Op op) { // Record the start time and count the call.
+#ifdef QLT_TIME
+    gettimeofday(&t_start_[op], 0);
+    ++cnt_[op];
+#endif
+  }
+  static inline void stop (const Op op) { // Add the seconds elapsed since start(op) to et_[op].
+#ifdef QLT_TIME
+    timeval t2;
+    gettimeofday(&t2, 0);
+    const timeval& t1 = t_start_[op];
+    static const double us = 1.0e6;
+    et_[op] += (t2.tv_sec*us + t2.tv_usec - t1.tv_sec*us - t1.tv_usec)/us;
+#endif
+  }
+# define tpr(op) do {                                                   \
+    printf("%-20s %10.3e %10.1f (%4d %10.3e)\n",                        \
+           #op, et_[op], 100*et_[op]/tot, cnt_[op], et_[op]/cnt_[op]);  \
+  } while (0)
+  static void print () { // Per timer: name, seconds, percent of total, (call count, seconds per call).
+#ifdef QLT_TIME
+    const double tot = et_[total];
+    tpr(tree); tpr(analyze);
+    tpr(qltrun); tpr(qltrunl2r); tpr(qltrunr2l); tpr(snp); tpr(waitall);
+    printf("%-20s %10.3e %10.1f\n", "total", tot, 100.0);
+#endif
+  }
+#undef tpr
+private:
+#ifdef QLT_TIME
+  static timeval t_start_[NTIMERS]; // Time of the most recent start(op).
+  static double et_[NTIMERS]; // Accumulated elapsed time in seconds.
+  static int cnt_[NTIMERS]; // Number of start(op) calls.
+#endif
+};
+#ifdef QLT_TIME
+timeval Timer::t_start_[Timer::NTIMERS];
+double Timer::et_[Timer::NTIMERS];
+int Timer::cnt_[Timer::NTIMERS];
+#endif
+
+namespace impl {
+struct NodeSets { // Rank-local, pruned view of the tree, organized into a level schedule.
+  typedef std::shared_ptr<const NodeSets> ConstPtr;
+  
+  enum : int { mpitag = 42 }; // Tag used for the point-to-point messages in this file.
+
+  // A node in the tree that is relevant to this rank.
+  struct Node {
+    // Rank of the node. If the node is in a level, then its rank is my rank. If
+    // it's not in a level, then it is a comm partner of a node on this rank.
+    Int rank;
+    // Globally unique identifier; cellidx if leaf node, ie, if nkids == 0.
+    Int id;
+    // This node's parent, a comm partner, if such a partner is required.
+    const Node* parent;
+    // This node's kids, comm partners, if such partners are required. Parent
+    // and kid nodes are pruned relative to the full tree over the mesh to
+    // contain just the nodes that matter to this rank.
+    Int nkids;
+    const Node* kids[2];
+    // Offset factor into bulk data. An offset is a unit; actual buffer sizes
+    // are multiples of this unit.
+    Int offset;
+
+    Node () : rank(-1), id(-1), parent(nullptr), nkids(0), offset(-1) {} // -1 marks rank, id, offset as unset.
+  };
+
+  // A level in the level schedule that is constructed to orchestrate
+  // communication. A node in a level depends only on nodes in lower-numbered
+  // levels (l2r) or higher-numbered (r2l).
+  //
+  // The communication patterns are as follows:
+  //   > l2r
+  //   MPI rcv into kids
+  //   sum into node
+  //   MPI send from node
+  //   > r2l
+  //   MPI rcv into node
+  //   solve QP for kids
+  //   MPI send from kids
+  struct Level {
+    struct MPIMetaData {
+      Int rank;   // Rank of comm partner.
+      Int offset; // Offset to start of buffer for this comm.
+      Int size;   // Size of this buffer in units of offsets.
+    };
+    
+    // The nodes in the level.
+    std::vector<Node*> nodes;
+    // MPI information for this level.
+    std::vector<MPIMetaData> me, kids;
+    // Have to keep requests separate so we can call waitall if we want to.
+    mutable std::vector<MPI_Request> me_req, kids_req;
+  };
+  
+  // Levels. levels[0] is level 0, the leaf level.
+  std::vector<Level> levels;
+  // Number of data slots this rank needs. Each node owned by this rank, plus
+  // kids on other ranks, have an associated slot.
+  Int nslots;
+  
+  // Allocate a node. The list node_mem_ is the mechanism for memory ownership;
+  // node_mem_ isn't used for anything other than owning nodes.
+  Node* alloc () {
+    node_mem_.push_front(Node());
+    return &node_mem_.front();
+  }
+
+  void print(std::ostream& os) const;
+  
+private:
+  std::list<Node> node_mem_;
+};
+
+// Write a summary of this rank's level schedule to os. For each level it
+// lists the number of owned nodes, the distinct remote ranks owning kids of
+// those nodes (with the kid message count), and the distinct remote ranks
+// owning their parents (with the parent message count). The text is built in
+// a buffer and written to os in one piece. Prints nothing if there are no
+// levels.
+void NodeSets::print (std::ostream& os) const {
+  std::stringstream buf;
+  if (levels.empty()) return;
+  // Every node in a level is owned by this rank, so any of them gives my rank.
+  const Int me_rank = levels[0].nodes[0]->rank;
+  buf << "pid " << me_rank << ":" << " #levels " << levels.size();
+  size_t level_idx = 0;
+  for (const Level& lvl : levels) {
+    buf << "\n  " << level_idx << ": " << lvl.nodes.size();
+    ++level_idx;
+    std::set<Int> parent_ranks, kid_ranks;
+    for (const Node* n : lvl.nodes) {
+      for (Int k = 0; k < n->nkids; ++k) {
+        const Int kid_rank = n->kids[k]->rank;
+        if (kid_rank != me_rank) kid_ranks.insert(kid_rank);
+      }
+      if (n->parent && n->parent->rank != me_rank)
+        parent_ranks.insert(n->parent->rank);
+    }
+    buf << " |";
+    for (const Int r : kid_ranks) buf << " " << r;
+    if ( ! lvl.kids.empty()) buf << " (" << lvl.kids.size() << ") |";
+    for (const Int r : parent_ranks) buf << " " << r;
+    if ( ! lvl.me.empty()) buf << " (" << lvl.me.size() << ")";
+  }
+  buf << "\n";
+  os << buf.str();
+}
+
+// Recursively find the depth of the subtree rooted at node, clear 'reserved'
+// on every node, and give each non-leaf node the rank of its first kid and
+// the next free id (ids are handed out in post-order starting from the
+// incoming value of id). Leaf nodes must already have a cellidx in [0, id).
+// Returns the subtree depth, counting node itself.
+Int init_tree (const tree::Node::Ptr& node, Int& id) {
+  node->reserved = nullptr;
+  Int deepest_kid = 0;
+  for (Int i = 0; i < node->nkids; ++i) {
+    cedr_assert(node.get() == node->kids[i]->parent);
+    const Int kid_depth = init_tree(node->kids[i], id);
+    if (kid_depth > deepest_kid) deepest_kid = kid_depth;
+  }
+  if ( ! node->nkids) {
+    cedr_throw_if(node->cellidx < 0 || node->cellidx >= id,
+                  "cellidx is " << node->cellidx << " but should be between " <<
+                  0 << " and " << id);
+  } else {
+    node->rank = node->kids[0]->rank;
+    node->cellidx = id++;
+  }
+  return deepest_kid + 1;
+}
+
+void level_schedule_and_collect ( // Post-order pass: assign levels and build the rank-pruned NodeSets nodes.
+  NodeSets& ns, const Int& my_rank, const tree::Node::Ptr& node, Int& level,
+  bool& need_parent_ns_node)
+{
+  cedr_assert(node->rank != -1);
+  level = -1; // A node without kids ends up at level 0 after the increment below.
+  bool make_ns_node = false;
+  for (Int i = 0; i < node->nkids; ++i) {
+    Int kid_level;
+    bool kid_needs_ns_node;
+    level_schedule_and_collect(ns, my_rank, node->kids[i], kid_level,
+                               kid_needs_ns_node);
+    level = std::max(level, kid_level);
+    if (kid_needs_ns_node) make_ns_node = true; // An owned kid needs this node as its comm partner.
+  }
+  ++level; // One above the highest kid level.
+  // Is parent node needed for isend?
+  const bool node_is_owned = node->rank == my_rank;
+  need_parent_ns_node = node_is_owned;
+  if (node_is_owned || make_ns_node) {
+    cedr_assert( ! node->reserved);
+    NodeSets::Node* ns_node = ns.alloc();
+    // Levels hold only owned nodes.
+    if (node_is_owned) ns.levels[level].nodes.push_back(ns_node);
+    node->reserved = ns_node; // Lets the parent's visit find this NodeSets node.
+    ns_node->rank = node->rank;
+    ns_node->id = node->cellidx;
+    ns_node->parent = nullptr; // Filled in later if the parent gets a NodeSets node.
+    if (node_is_owned) {
+      // If this node is owned, it needs to have information about all kids.
+      ns_node->nkids = node->nkids;
+      for (Int i = 0; i < node->nkids; ++i) {
+        const auto& kid = node->kids[i];
+        if ( ! kid->reserved) {
+          // This kid isn't owned by this rank. But need it for irecv.
+          NodeSets::Node* ns_kid;
+          kid->reserved = ns_kid = ns.alloc();
+          ns_node->kids[i] = ns_kid;
+          cedr_assert(kid->rank != my_rank);
+          ns_kid->rank = kid->rank;
+          ns_kid->id = kid->cellidx;
+          ns_kid->parent = nullptr; // Not needed.
+          // The kid may have kids in the original tree, but in the tree pruned
+          // according to rank, it does not.
+          ns_kid->nkids = 0;
+        } else {
+          // This kid is owned by this rank, so fill in its parent pointer.
+          NodeSets::Node* ns_kid = static_cast<NodeSets::Node*>(kid->reserved);
+          ns_node->kids[i] = ns_kid;
+          ns_kid->parent = ns_node;
+        }
+      }
+    } else {
+      // This node is not owned. Update the owned kids with its parent.
+      ns_node->nkids = 0;
+      for (Int i = 0; i < node->nkids; ++i) {
+        const auto& kid = node->kids[i];
+        if (kid->reserved && kid->rank == my_rank) {
+          NodeSets::Node* ns_kid = static_cast<NodeSets::Node*>(kid->reserved);
+          ns_node->kids[ns_node->nkids++] = ns_kid; // Packed: only owned kids are recorded.
+          ns_kid->parent = ns_node;
+        }
+      }
+    }
+  }
+}
+
+// Entry point for the recursive overload: run it from the root of tree and
+// discard the root's level and need-parent outputs.
+void level_schedule_and_collect (NodeSets& ns, const Int& my_rank,
+                                 const tree::Node::Ptr& tree) {
+  Int root_level;
+  bool root_needs_parent;
+  level_schedule_and_collect(ns, my_rank, tree, root_level, root_needs_parent);
+}
+
+// Drop every level that holds no nodes on this rank, keeping the remaining
+// levels in their original order.
+void consolidate (NodeSets& ns) {
+  std::vector<NodeSets::Level> kept;
+  for (const auto& lvl : ns.levels) {
+    if (lvl.nodes.empty()) continue;
+    kept.push_back(lvl);
+  }
+  ns.levels.swap(kept);
+}
+
+// (comm partner rank, node) pair used to group a level's nodes by partner.
+typedef std::pair<Int, NodeSets::Node*> RankNode;
+
+// Assign buffer offsets to the nodes in rns and build one MPIMetaData entry
+// per remote partner rank.
+//   rns:    (partner rank, node) pairs for one level; reordered in place.
+//   mmds:   receives one entry per distinct remote rank, in increasing rank
+//           order, covering a contiguous range of offsets.
+//   offset: running slot counter; advanced by the number of offsets assigned.
+// Nodes whose partner is my_rank need no message. They get an offset only if
+// they do not already have one.
+void init_offsets (const Int my_rank, std::vector<RankNode>& rns,
+                   std::vector<NodeSets::Level::MPIMetaData>& mmds, Int& offset) {
+  // Set nodes on my rank to have rank -1 so that they sort first.
+  for (auto& rn : rns)
+    if (rn.first == my_rank)
+      rn.first = -1;
+
+  // Sort so that all comms with a given rank are contiguous. Stable sort so
+  // that rns retains its order, in particular in the leaf node level. Compare
+  // on rank only: the default pair ordering would also compare the node
+  // pointers, making the order within a rank depend on allocation addresses
+  // rather than on the original order.
+  std::stable_sort(rns.begin(), rns.end(),
+                   [] (const RankNode& lhs, const RankNode& rhs) {
+                     return lhs.first < rhs.first;
+                   });
+
+  // Collect nodes into groups by rank and set up comm metadata for each group.
+  Int prev_rank = -1;
+  for (auto& rn : rns) {
+    const Int rank = rn.first;
+    if (rank == -1) {
+      if (rn.second->offset == -1)
+        rn.second->offset = offset++;
+      continue;
+    }
+    if (rank != prev_rank) {
+      // First node of a new partner: open a new message starting here.
+      cedr_assert(rank > prev_rank);
+      prev_rank = rank;
+      mmds.push_back(NodeSets::Level::MPIMetaData());
+      auto& mmd = mmds.back();
+      mmd.rank = rank;
+      mmd.offset = offset;
+      mmd.size = 0;
+    }
+    ++mmds.back().size;
+    rn.second->offset = offset++;
+  }
+}
+
+// Set up comm data. For each level, pair every owned node with the rank of
+// its parent (or my_rank if it has none) and every kid with the kid's rank,
+// then let init_offsets consolidate these so that there is only one message
+// between me and another rank per level. Each node also gets an offset, to be
+// multiplied by data-size factors later, for use in data buffers. ns.nslots
+// ends up as the total number of offsets assigned.
+void init_comm (const Int my_rank, NodeSets& ns) {
+  ns.nslots = 0;
+  for (auto& lvl : ns.levels) {
+    std::vector<RankNode> me, kids;
+    me.reserve(lvl.nodes.size());
+    for (NodeSets::Node* n : lvl.nodes) {
+      const Int parent_rank = n->parent ? n->parent->rank : my_rank;
+      me.push_back(RankNode(parent_rank, n));
+      for (Int k = 0; k < n->nkids; ++k) {
+        const NodeSets::Node* kid = n->kids[k];
+        kids.push_back(RankNode(kid->rank, const_cast<NodeSets::Node*>(kid)));
+      }
+    }
+
+    // Parent-side messages first, then kid-side, so offsets are assigned in
+    // that order within the level.
+    init_offsets(my_rank, me, lvl.me, ns.nslots);
+    lvl.me_req.resize(lvl.me.size());
+    init_offsets(my_rank, kids, lvl.kids, ns.nslots);
+    lvl.kids_req.resize(lvl.kids.size());
+  }
+}
+
+// Analyze the tree to extract levels. Levels are run from 0 to #level - 1. Each
+// level has nodes whose corresponding operations depend on only nodes in
+// lower-indexed levels. This mechanism prevents deadlock in the general case of
+// multiple cells per rank, with multiple ranks appearing in a subtree other
+// than the root.
+//   In addition, the set of nodes collected into levels are just those owned by
+// this rank, and those with which owned nodes must communicate.
+//   Non-leaf nodes receive ids starting at ncells. tree must be the root (no
+// parent). Once this function is done, the tree can be deleted.
+NodeSets::ConstPtr analyze (const Parallel::Ptr& p, const Int& ncells,
+                            const tree::Node::Ptr& tree) {
+  const auto ns = std::make_shared<NodeSets>();
+  cedr_assert( ! tree->parent);
+  const Int my_rank = p->rank();
+  // One (possibly empty) level per tree depth; consolidate drops empty ones.
+  Int next_id = ncells;
+  ns->levels.resize(init_tree(tree, next_id));
+  level_schedule_and_collect(*ns, my_rank, tree);
+  consolidate(*ns);
+  init_comm(my_rank, *ns);
+  return ns;
+}
+
+// Check that the offsets are self consistent: every slot in [0, nslots) must
+// be claimed exactly once, either by an owned node or by a kid that lives on
+// another rank. Returns the number of slots that are not claimed exactly once.
+Int check_comm (const NodeSets::ConstPtr& ns) {
+  std::vector<Int> claims(ns->nslots, 0);
+  for (const auto& lvl : ns->levels) {
+    for (const auto& n : lvl.nodes) {
+      cedr_assert(n->offset < ns->nslots);
+      ++claims[n->offset];
+      for (Int i = 0; i < n->nkids; ++i) {
+        const auto* kid = n->kids[i];
+        if (kid->rank == n->rank) continue;
+        ++claims[kid->offset];
+      }
+    }
+  }
+  Int nerr = 0;
+  for (const Int c : claims)
+    nerr += c != 1 ? 1 : 0;
+  return nerr;
+}
+
+// Check the leaf level. Asserts that level-0 nodes have no kids, that their
+// offsets come first (are below this rank's leaf count), and that their ids
+// are valid cell indices. Then sums the leaf counts over all ranks and
+// returns 1 if the total differs from ncells, 0 otherwise.
+Int check_leaf_nodes (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns,
+                      const Int ncells) {
+  cedr_assert( ! ns->levels.empty());
+  cedr_assert( ! ns->levels[0].nodes.empty());
+  const auto& leaves = ns->levels[0].nodes;
+  Int my_nleaves = 0;
+  for (const auto& n : leaves) {
+    cedr_assert( ! n->nkids);
+    my_nleaves += 1;
+  }
+  // The offset bound needs the final leaf count, hence the second pass.
+  for (const auto& n : leaves) {
+    cedr_assert(n->offset < my_nleaves);
+    cedr_assert(n->id < ncells);
+  }
+  Int glbl_nleaves = 0;
+  mpi::all_reduce(*p, &my_nleaves, &glbl_nleaves, 1, MPI_SUM);
+  return glbl_nleaves != ncells ? 1 : 0;
+}
+
+// Sum cellidx using the QLT comm pattern: reduce leaves to root, pass the sum back root to leaves, and count leaves whose value is not ncells*(ncells-1)/2.
+Int test_comm_pattern (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns,
+                       const Int ncells) {
+  Int nerr = 0;
+  // Rank-wide data buffer.
+  std::vector<Int> data(ns->nslots);
+  // Seed each leaf slot with its cell index.
+  for (auto& n : ns->levels[0].nodes)
+    data[n->offset] = n->id;
+  // Leaves to root.
+  for (size_t il = 0; il < ns->levels.size(); ++il) {
+    auto& lvl = ns->levels[il];
+    // Set up receives.
+    for (size_t i = 0; i < lvl.kids.size(); ++i) {
+      const auto& mmd = lvl.kids[i];
+      mpi::irecv(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag,
+                 &lvl.kids_req[i]);
+    }
+    //todo Replace with simultaneous waitany and isend.
+    mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data());
+    // Combine kids' data.
+    for (auto& n : lvl.nodes) {
+      if ( ! n->nkids) continue;
+      data[n->offset] = 0;
+      for (Int i = 0; i < n->nkids; ++i)
+        data[n->offset] += data[n->kids[i]->offset];
+    }
+    // Send to parents.
+    for (size_t i = 0; i < lvl.me.size(); ++i) {
+      const auto& mmd = lvl.me[i];
+      mpi::isend(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag,
+                 &lvl.me_req[i]);
+    }
+    if (il+1 == ns->levels.size()) // Last level: its me_req is reused for receives right below.
+      mpi::waitall(lvl.me_req.size(), lvl.me_req.data());
+  }
+  // Root to leaves.
+  for (size_t il = ns->levels.size(); il > 0; --il) {
+    auto& lvl = ns->levels[il-1];
+    // Get the global sum from parent.
+    for (size_t i = 0; i < lvl.me.size(); ++i) {
+      const auto& mmd = lvl.me[i];
+      mpi::irecv(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag,
+                 &lvl.me_req[i]);
+    }    
+    //todo Replace with simultaneous waitany and isend.
+    mpi::waitall(lvl.me_req.size(), lvl.me_req.data());
+    // Pass to kids.
+    for (auto& n : lvl.nodes) {
+      if ( ! n->nkids) continue;
+      for (Int i = 0; i < n->nkids; ++i)
+        data[n->kids[i]->offset] = data[n->offset];
+    }
+    // Send.
+    for (size_t i = 0; i < lvl.kids.size(); ++i) {
+      const auto& mmd = lvl.kids[i];
+      mpi::isend(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag,
+                 &lvl.kids_req[i]);
+    }
+  }
+  // Wait on sends to clean up.
+  for (size_t il = 0; il < ns->levels.size(); ++il) {
+    auto& lvl = ns->levels[il];
+    if (il+1 < ns->levels.size()) // The last level's me_req was already waited on above.
+      mpi::waitall(lvl.me_req.size(), lvl.me_req.data());
+    mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data());
+  }
+  { // Check that all leaf nodes have the right number.
+    const Int desired_sum = (ncells*(ncells - 1)) / 2; // 0 + 1 + ... + (ncells - 1).
+    for (const auto& n : ns->levels[0].nodes)
+      if (data[n->offset] != desired_sum) ++nerr;
+    if (p->amroot()) {
+      std::cout << " " << data[ns->levels[0].nodes[0]->offset];
+      std::cout.flush();
+    }
+  }
+  return nerr;
+}
+
+// Unit tests for NodeSets. Runs the offset check, the leaf-node check, and
+// the communication-pattern test in that order, stopping at the first one
+// that reports errors. Returns that error count, or 0 if all pass.
+Int unittest (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns,
+              const Int ncells) {
+  Int nerr = check_comm(ns);
+  if (nerr) return nerr;
+  nerr = check_leaf_nodes(p, ns, ncells);
+  if (nerr) return nerr;
+  return test_comm_pattern(p, ns, ncells);
+}
+} // namespace impl
+
+// Allocate the integer list d with n entries, labeled "QLT <name>", and set h
+// to a host mirror of it.
+template <typename ES>
+void QLT<ES>::init (const std::string& name, IntList& d,
+                    typename IntList::HostMirror& h, size_t n) {
+  const std::string label = "QLT " + name;
+  d = IntList(label, n);
+  h = Kokkos::create_mirror_view(d);
+}
+
+// Problem-type mask stored at position idx of the problem_type_ table.
+template <typename ES>
+int QLT<ES>::MetaData::get_problem_type (const int& idx) {
+  const int mask = problem_type_[idx];
+  return mask;
+}
+    
+// Map a problem-type mask to its index in 0..3; an unrecognized mask triggers
+// cedr_kernel_throw_if and yields -1.
+// icpc doesn't let us use problem_type_ here, even though it's constexpr.
+template <typename ES>
+int QLT<ES>::MetaData::get_problem_type_idx (const int& mask) {
+  int idx = -1;
+  switch (mask) {
+  case CPT::s:
+  case CPT::st:
+    idx = 0;
+    break;
+  case CPT::cs:
+  case CPT::cst:
+    idx = 1;
+    break;
+  case CPT::t:
+    idx = 2;
+    break;
+  case CPT::ct:
+    idx = 3;
+    break;
+  default:
+    cedr_kernel_throw_if(true, "Invalid problem type.");
+    break;
+  }
+  return idx;
+}
+
+// Bulk-data entries per tracer in the l2r direction: 4 if the mask has the
+// conserve bit set, otherwise 3.
+template <typename ES>
+int QLT<ES>::MetaData::get_problem_type_l2r_bulk_size (const int& mask) {
+  return (mask & ProblemType::conserve) ? 4 : 3;
+}
+
+// Bulk-data entries per tracer in the r2l direction: 1 if the mask has the
+// shapepreserve bit set, otherwise 3.
+template <typename ES>
+int QLT<ES>::MetaData::get_problem_type_r2l_bulk_size (const int& mask) {
+  return (mask & ProblemType::shapepreserve) ? 1 : 3;
+}
+
+template <typename ES>
+void QLT<ES>::MetaData::init (const MetaDataBuilder& mdb) {
+  const Int ntracers = mdb.trcr2prob.size(); // One entry per declared tracer.
+
+  Me::init("trcr2prob", a_d_.trcr2prob, a_h_.trcr2prob, ntracers);
+  std::copy(mdb.trcr2prob.begin(), mdb.trcr2prob.end(), a_h_.trcr2prob.data());
+  Kokkos::deep_copy(a_d_.trcr2prob, a_h_.trcr2prob);
+
+  Me::init("bidx2trcr", a_d_.bidx2trcr, a_h_.bidx2trcr, ntracers);
+  Me::init("trcr2bl2r", a_d_.trcr2bl2r, a_h_.trcr2bl2r, ntracers);
+  Me::init("trcr2br2l", a_d_.trcr2br2l, a_h_.trcr2br2l, ntracers);
+  a_h_.prob2trcrptr[0] = 0;
+  a_h_.prob2bl2r[0] = 1; // rho is at 0.
+  a_h_.prob2br2l[0] = 0;
+  for (Int pi = 0; pi < nprobtypes; ++pi) { // Group the tracers by problem-type index pi.
+    a_h_.prob2trcrptr[pi+1] = a_h_.prob2trcrptr[pi];
+    const Int l2rbulksz = get_problem_type_l2r_bulk_size(get_problem_type(pi));
+    const Int r2lbulksz = get_problem_type_r2l_bulk_size(get_problem_type(pi));
+    for (Int ti = 0; ti < ntracers; ++ti) {
+      const auto problem_type = a_h_.trcr2prob[ti];
+      if (problem_type != problem_type_[pi]) continue;
+      const auto tcnt = a_h_.prob2trcrptr[pi+1] - a_h_.prob2trcrptr[pi]; // Tracers of type pi placed so far.
+      a_h_.trcr2bl2r[ti] = a_h_.prob2bl2r[pi] + tcnt*l2rbulksz;
+      a_h_.trcr2br2l[ti] = a_h_.prob2br2l[pi] + tcnt*r2lbulksz;
+      a_h_.bidx2trcr[a_h_.prob2trcrptr[pi+1]++] = ti; // Append ti to type pi's list.
+    }
+    Int ni = a_h_.prob2trcrptr[pi+1] - a_h_.prob2trcrptr[pi]; // Number of tracers of type pi.
+    a_h_.prob2bl2r[pi+1] = a_h_.prob2bl2r[pi] + ni*l2rbulksz;
+    a_h_.prob2br2l[pi+1] = a_h_.prob2br2l[pi] + ni*r2lbulksz;
+  }
+  Kokkos::deep_copy(a_d_.bidx2trcr, a_h_.bidx2trcr);
+  Kokkos::deep_copy(a_d_.trcr2bl2r, a_h_.trcr2bl2r);
+  Kokkos::deep_copy(a_d_.trcr2br2l, a_h_.trcr2br2l);
+
+  Me::init("trcr2bidx", a_d_.trcr2bidx, a_h_.trcr2bidx, ntracers);
+  for (Int ti = 0; ti < ntracers; ++ti) // Invert bidx2trcr.
+    a_h_.trcr2bidx(a_h_.bidx2trcr(ti)) = ti;
+  Kokkos::deep_copy(a_d_.trcr2bidx, a_h_.trcr2bidx);
+            
+  a_h = a_h_; // Publish the host arrays through the public const/unmanaged struct.
+
+  // Won't default construct Unmanaged, so have to do pointer stuff and raw
+  // array copy explicitly.
+  a_d.trcr2prob = a_d_.trcr2prob;
+  a_d.bidx2trcr = a_d_.bidx2trcr;
+  a_d.trcr2bidx = a_d_.trcr2bidx;
+  a_d.trcr2bl2r = a_d_.trcr2bl2r;
+  a_d.trcr2br2l = a_d_.trcr2br2l;
+  std::copy(a_h_.prob2trcrptr, a_h_.prob2trcrptr + nprobtypes + 1,
+            a_d.prob2trcrptr);
+  std::copy(a_h_.prob2bl2r, a_h_.prob2bl2r + nprobtypes + 1, a_d.prob2bl2r);
+  std::copy(a_h_.prob2br2l, a_h_.prob2br2l + nprobtypes + 1, a_d.prob2br2l);
+  cedr_assert(a_d.prob2trcrptr[nprobtypes] == ntracers); // All tracers must have been placed.
+}
+
+template <typename ES>
+void QLT<ES>::BulkData::init (const MetaData& md, const Int& nslots) {
+  l2r_data_ = RealList("QLT l2r_data", md.a_h.prob2bl2r[md.nprobtypes]*nslots); // l2r data per slot times nslots.
+  r2l_data_ = RealList("QLT r2l_data", md.a_h.prob2br2l[md.nprobtypes]*nslots); // r2l data per slot times nslots.
+  l2r_data = l2r_data_; // The public members are unmanaged views of the two lists above.
+  r2l_data = r2l_data_;
+}
+
+template <typename ES>
+void QLT<ES>::init (const Parallel::Ptr& p, const Int& ncells,
+                    const tree::Node::Ptr& tree) {
+  p_ = p;
+  Timer::start(Timer::analyze);
+  ns_ = impl::analyze(p, ncells, tree); // Build the NodeSets from the caller's tree.
+  init_ordinals();                      // Fill gci2lci_ from the level-0 nodes.
+  Timer::stop(Timer::analyze);
+  mdb_ = std::make_shared<MetaDataBuilder>(); // Non-null mdb_ means tracer declarations are open.
+}
+
+template <typename ES>
+void QLT<ES>::init_ordinals () {
+  for (const auto& n : ns_->levels[0].nodes) // Level 0 holds this rank's cells (see nlclcells).
+    gci2lci_[n->id] = n->offset;             // Node id -> node offset.
+}
+
+template <typename ES>
+QLT<ES>::QLT (const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree) {
+  init(p, ncells, tree); // Analyze the tree first; nlclcells() reads the result.
+  cedr_throw_if(nlclcells() == 0, "QLT does not support 0 cells on a rank.");
+}
+
+template <typename ES>
+void QLT<ES>::print (std::ostream& os) const {
+  ns_->print(os); // Forwards to the NodeSets' own print.
+}
+
+// Number of cells owned by this rank, i.e., the number of level-0 nodes.
+template <typename ES>
+Int QLT<ES>::nlclcells () const { return ns_->levels[0].nodes.size(); }
+
+// Fill gcis with the global indices of the cells this rank owns, ordered by
+// local index, so that gci2lci(gcis[i]) == i. Callers should prefer this list
+// for working out local cell indices rather than calling gci2lci() for each
+// cell.
+template <typename ES>
+void QLT<ES>::get_owned_glblcells (std::vector<Long>& gcis) const {
+  const auto& leaves = ns_->levels[0].nodes;
+  gcis.resize(leaves.size());
+  for (const auto& leaf : leaves) gcis[leaf->offset] = leaf->id;
+}
+
+// For global cell index cellidx, i.e., the globally unique ordinal associated
+// with a cell in the caller's tree, return this rank's local index for
+// it. This is not an efficient operation. Throws if gci is not in the map.
+template <typename ES>
+Int QLT<ES>::gci2lci (const Int& gci) const {
+  const auto it = gci2lci_.find(gci);
+  if (it == gci2lci_.end()) { // Unknown gci: emit diagnostics before the throw below.
+    pr(puf(gci)); // presumably debug-print macros defined elsewhere -- TODO confirm
+    std::vector<Long> gcis;
+    get_owned_glblcells(gcis);
+    mprarr(gcis); // presumably prints the list of owned cells -- TODO confirm
+  }
+  cedr_throw_if(it == gci2lci_.end(), "gci " << gci << " not in gci2lci map.");
+  return it->second;
+}
+
+template <typename ES>
+void QLT<ES>::declare_tracer (int problem_type, const Int& rhomidx) {
+  cedr_throw_if( ! mdb_, "end_tracer_declarations was already called; "
+                "it is an error to call declare_tracer now.");
+  cedr_throw_if(rhomidx > 0, "rhomidx > 0 is not supported yet."); // NOTE(review): a negative rhomidx passes this check.
+  // For its exception side effect, and to get canonical problem type, since
+  // some possible problem types map to the same canonical one:
+  problem_type = md_.get_problem_type(md_.get_problem_type_idx(problem_type));
+  mdb_->trcr2prob.push_back(problem_type); // A tracer's index is its position in declaration order.
+}
+
+template <typename ES>
+void QLT<ES>::end_tracer_declarations () {
+  cedr_throw_if( ! mdb_, "end_tracer_declarations was already called."); // Guard the dereference below.
+  md_.init(*mdb_); mdb_ = nullptr; // Build the metadata; a null mdb_ marks declarations as closed.
+  bd_.init(md_, ns_->nslots);      // Size the bulk data from the metadata and the slot count.
+}
+
+template <typename ES>
+int QLT<ES>::get_problem_type (const Int& tracer_idx) const {
+  cedr_throw_if(tracer_idx < 0 || tracer_idx >= md_.a_h.trcr2prob.extent_int(0), // Valid: [0, extent).
+                "tracer_idx is out of bounds: " << tracer_idx);
+  return md_.a_h.trcr2prob[tracer_idx]; // Canonical problem type recorded by declare_tracer.
+}
+
+template <typename ES>
+Int QLT<ES>::get_num_tracers () const {
+  return md_.a_h.trcr2prob.size(); // trcr2prob has one entry per declared tracer.
+}
+
+template <typename ES>
+void QLT<ES>::run () {
+  Timer::start(Timer::qltrunl2r);
+  using namespace impl;
+  // Number of data per slot.
+  const Int l2rndps = md_.a_d.prob2bl2r[md_.nprobtypes];
+  const Int r2lndps = md_.a_d.prob2br2l[md_.nprobtypes];
+  // Leaves to root: at each level, receive kids' data, combine, send to parents.
+  for (size_t il = 0; il < ns_->levels.size(); ++il) {
+    auto& lvl = ns_->levels[il];
+    // Set up receives.
+    if (lvl.kids.size()) {
+      for (size_t i = 0; i < lvl.kids.size(); ++i) {
+        const auto& mmd = lvl.kids[i];
+        mpi::irecv(*p_, &bd_.l2r_data(mmd.offset*l2rndps), mmd.size*l2rndps, mmd.rank,
+                   NodeSets::mpitag, &lvl.kids_req[i]);
+      }
+      //todo Replace with simultaneous waitany and isend.
+      Timer::start(Timer::waitall);
+      mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data());
+      Timer::stop(Timer::waitall);
+    }
+    // Combine kids' data.
+    //todo Kernelize, interacting with waitany todo above.
+    for (const auto& n : lvl.nodes) {
+      if ( ! n->nkids) continue;
+      cedr_kernel_assert(n->nkids == 2);
+      // Total density.
+      bd_.l2r_data(n->offset*l2rndps) = (bd_.l2r_data(n->kids[0]->offset*l2rndps) +
+                                         bd_.l2r_data(n->kids[1]->offset*l2rndps));
+      // Tracers.
+      for (Int pti = 0; pti < md_.nprobtypes; ++pti) {
+        const Int problem_type = md_.get_problem_type(pti);
+        const bool sum_only = problem_type & ProblemType::shapepreserve;
+        const Int bsz = md_.get_problem_type_l2r_bulk_size(problem_type);
+        const Int bis = md_.a_d.prob2trcrptr[pti], bie = md_.a_d.prob2trcrptr[pti+1];
+        for (Int bi = bis; bi < bie; ++bi) {
+          const Int bdi = md_.a_d.trcr2bl2r(md_.a_d.bidx2trcr(bi));
+          Real* const me = &bd_.l2r_data(n->offset*l2rndps + bdi);
+          const Real* const k0 = &bd_.l2r_data(n->kids[0]->offset*l2rndps + bdi);
+          const Real* const k1 = &bd_.l2r_data(n->kids[1]->offset*l2rndps + bdi);
+          me[0] = sum_only ? k0[0] + k1[0] : cedr::impl::min(k0[0], k1[0]); // Lower bound: sum, or min.
+          me[1] =            k0[1] + k1[1] ;                                // Qm is always summed.
+          me[2] = sum_only ? k0[2] + k1[2] : cedr::impl::max(k0[2], k1[2]); // Upper bound: sum, or max.
+          if (bsz == 4)
+            me[3] =          k0[3] + k1[3] ;                                // Qm_prev, when present.
+        }
+      }
+    }
+    // Send to parents.
+    if (lvl.me.size()) {
+      for (size_t i = 0; i < lvl.me.size(); ++i) {
+        const auto& mmd = lvl.me[i];
+        mpi::isend(*p_, &bd_.l2r_data(mmd.offset*l2rndps), mmd.size*l2rndps, mmd.rank,
+                   NodeSets::mpitag, &lvl.me_req[i]);
+      }
+      if (il+1 == ns_->levels.size()) { // Only the final level's sends are waited on here.
+        Timer::start(Timer::waitall);
+        mpi::waitall(lvl.me_req.size(), lvl.me_req.data());
+        Timer::stop(Timer::waitall);
+      }
+    }
+  }
+  Timer::stop(Timer::qltrunl2r); Timer::start(Timer::qltrunr2l);
+  // Root: handled only where the last level is a single node that has no parent.
+  if ( ! ns_->levels.empty() && ns_->levels.back().nodes.size() == 1 &&
+       ! ns_->levels.back().nodes[0]->parent) {
+    const auto& n = ns_->levels.back().nodes[0];
+    for (Int pti = 0; pti < md_.nprobtypes; ++pti) {
+      const Int problem_type = md_.get_problem_type(pti);
+      const Int bis = md_.a_d.prob2trcrptr[pti], bie = md_.a_d.prob2trcrptr[pti+1];
+      for (Int bi = bis; bi < bie; ++bi) {
+        const Int l2rbdi = md_.a_d.trcr2bl2r(md_.a_d.bidx2trcr(bi));
+        const Int r2lbdi = md_.a_d.trcr2br2l(md_.a_d.bidx2trcr(bi));
+        // If QLT is enforcing global mass conservation, set the root's r2l Qm
+        // value to the l2r Qm_prev's sum; otherwise, copy the l2r Qm value to
+        // the r2l one.
+        const Int os = problem_type & ProblemType::conserve ? 3 : 1;
+        bd_.r2l_data(n->offset*r2lndps + r2lbdi) =
+          bd_.l2r_data(n->offset*l2rndps + l2rbdi + os);
+        if ( ! (problem_type & ProblemType::shapepreserve)) {
+          // We now know the global q_{min,max}. Start propagating it
+          // leafward.
+          bd_.r2l_data(n->offset*r2lndps + r2lbdi + 1) =
+            bd_.l2r_data(n->offset*l2rndps + l2rbdi + 0);
+          bd_.r2l_data(n->offset*r2lndps + r2lbdi + 2) =
+            bd_.l2r_data(n->offset*l2rndps + l2rbdi + 2);
+        }
+      }
+    }
+  }
+  // Root to leaves: at each level, receive from parents, solve, send to kids.
+  for (size_t il = ns_->levels.size(); il > 0; --il) {
+    auto& lvl = ns_->levels[il-1];
+    if (lvl.me.size()) {
+      for (size_t i = 0; i < lvl.me.size(); ++i) {
+        const auto& mmd = lvl.me[i];
+        mpi::irecv(*p_, &bd_.r2l_data(mmd.offset*r2lndps), mmd.size*r2lndps, mmd.rank,
+                   NodeSets::mpitag, &lvl.me_req[i]);
+      }
+      //todo Replace with simultaneous waitany and isend.
+      Timer::start(Timer::waitall);
+      mpi::waitall(lvl.me_req.size(), lvl.me_req.data());
+      Timer::stop(Timer::waitall);
+    }
+    // Solve QP for kids' values.
+    //todo Kernelize, interacting with waitany todo above.
+    Timer::start(Timer::snp);
+    for (const auto& n : lvl.nodes) {
+      if ( ! n->nkids) continue;
+      for (Int pti = 0; pti < md_.nprobtypes; ++pti) {
+        const Int problem_type = md_.get_problem_type(pti);
+        const Int bis = md_.a_d.prob2trcrptr[pti], bie = md_.a_d.prob2trcrptr[pti+1];
+        for (Int bi = bis; bi < bie; ++bi) {
+          const Int l2rbdi = md_.a_d.trcr2bl2r(md_.a_d.bidx2trcr(bi));
+          const Int r2lbdi = md_.a_d.trcr2br2l(md_.a_d.bidx2trcr(bi));
+          cedr_assert(n->nkids == 2);
+          if ( ! (problem_type & ProblemType::shapepreserve)) {
+            // Pass q_{min,max} info along. l2r data are updated for use in
+            // solve_node_problem. r2l data are updated for use in isend.
+            const Real q_min = bd_.r2l_data(n->offset*r2lndps + r2lbdi + 1);
+            const Real q_max = bd_.r2l_data(n->offset*r2lndps + r2lbdi + 2);
+            bd_.l2r_data(n->offset*l2rndps + l2rbdi + 0) = q_min;
+            bd_.l2r_data(n->offset*l2rndps + l2rbdi + 2) = q_max;
+            for (Int k = 0; k < 2; ++k) {
+              bd_.l2r_data(n->kids[k]->offset*l2rndps + l2rbdi + 0) = q_min;
+              bd_.l2r_data(n->kids[k]->offset*l2rndps + l2rbdi + 2) = q_max;
+              bd_.r2l_data(n->kids[k]->offset*r2lndps + r2lbdi + 1) = q_min;
+              bd_.r2l_data(n->kids[k]->offset*r2lndps + r2lbdi + 2) = q_max;
+            }
+          }
+          const auto& k0 = n->kids[0];
+          const auto& k1 = n->kids[1];
+          solve_node_problem( // Arguments: (rhom, l2r data, r2l Qm) for the node, then kid 0, then kid 1.
+            problem_type,
+             bd_.l2r_data( n->offset*l2rndps),
+            &bd_.l2r_data( n->offset*l2rndps + l2rbdi),
+             bd_.r2l_data( n->offset*r2lndps + r2lbdi),
+             bd_.l2r_data(k0->offset*l2rndps),
+            &bd_.l2r_data(k0->offset*l2rndps + l2rbdi),
+             bd_.r2l_data(k0->offset*r2lndps + r2lbdi),
+             bd_.l2r_data(k1->offset*l2rndps),
+            &bd_.l2r_data(k1->offset*l2rndps + l2rbdi),
+             bd_.r2l_data(k1->offset*r2lndps + r2lbdi));
+        }
+      }
+    }
+    Timer::stop(Timer::snp);
+    // Send.
+    if (lvl.kids.size())
+      for (size_t i = 0; i < lvl.kids.size(); ++i) {
+        const auto& mmd = lvl.kids[i];
+        mpi::isend(*p_, &bd_.r2l_data(mmd.offset*r2lndps), mmd.size*r2lndps, mmd.rank,
+                   NodeSets::mpitag, &lvl.kids_req[i]);
+      }
+  }
+  // Wait on sends to clean up. NOTE(review): me_req was reused for the r2l irecvs above, so the l2r isend requests of non-final levels appear never to be waited on -- confirm.
+  for (size_t il = 0; il < ns_->levels.size(); ++il) {
+    auto& lvl = ns_->levels[il];
+    if (il+1 < ns_->levels.size())
+      mpi::waitall(lvl.me_req.size(), lvl.me_req.data());
+    mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data());
+  }
+  Timer::stop(Timer::qltrunr2l);
+}
+
+template <typename ES>
+constexpr Int QLT<ES>::MetaData::problem_type_[]; // Out-of-class definition of the static constexpr array declared in the header.
+
+namespace test {
+using namespace impl;
+
+class TestQLT : public cedr::test::TestRandomized { // Randomized-test driver that exercises a QLT instance.
+public:
+  typedef QLT<Kokkos::DefaultExecutionSpace> QLTT;
+
+  TestQLT (const Parallel::Ptr& p, const tree::Node::Ptr& tree,
+           const Int& ncells, const bool verbose=false)
+    : TestRandomized("QLT", p, ncells, verbose),
+      qlt_(p, ncells, tree), tree_(tree)
+  {
+    if (verbose) qlt_.print(std::cout);
+    init();
+  }
+
+private:
+  QLTT qlt_;
+  tree::Node::Ptr tree_; // Kept so init_numbering() can walk the tree.
+
+  CDR& get_cdr () override { return qlt_; }
+
+  void init_numbering () override {
+    check(qlt_); // Validate the gci <-> lci maps once, rather than once per tree node.
+    init_numbering(tree_);
+  }
+
+  void init_numbering (const tree::Node::Ptr& node) {
+    // TestQLT doesn't actually care about a particular ordering, as there is no
+    // geometry to the test problem. However, use *some* ordering to model what
+    // a real problem must do.
+    if ( ! node->nkids) {
+      if (node->rank == p_->rank()) // Record only the leaves this rank owns.
+        gcis_.push_back(node->cellidx);
+      return;
+    }
+    for (Int i = 0; i < node->nkids; ++i)
+      init_numbering(node->kids[i]);
+  }
+
+  static void check (const QLTT& qlt) { // Assert that gci2lci(gcis[i]) == i for every owned cell.
+    const Int n = qlt.nlclcells();
+    std::vector<Long> gcis;
+    qlt.get_owned_glblcells(gcis);
+    cedr_assert(static_cast<Int>(gcis.size()) == n);
+    for (Int i = 0; i < n; ++i)
+      cedr_assert(qlt.gci2lci(gcis[i]) == i);
+  }
+
+  void init_tracers () override {
+    for (const auto& t : tracers_)
+      qlt_.declare_tracer(t.problem_type, 0);
+    qlt_.end_tracer_declarations();
+    cedr_assert(qlt_.get_num_tracers() == static_cast<Int>(tracers_.size()));
+    for (size_t i = 0; i < tracers_.size(); ++i) // Every canonical type has the consistent bit set.
+      cedr_assert(qlt_.get_problem_type(i) == (tracers_[i].problem_type |
+                                               ProblemType::consistent));
+  }
+  
+  void run_impl (const Int trial) override {
+    MPI_Barrier(p_->comm()); // Barriers bracket the run so the timer covers all ranks.
+    Timer::start(Timer::qltrun);
+    qlt_.run();
+    MPI_Barrier(p_->comm());
+    Timer::stop(Timer::qltrun);
+    if (trial == 0) { // Discard the first trial's timings.
+      Timer::reset(Timer::qltrun);
+      Timer::reset(Timer::qltrunl2r);
+      Timer::reset(Timer::qltrunr2l);
+      Timer::reset(Timer::waitall);
+      Timer::reset(Timer::snp);
+    }
+  }
+};
+
+// Test all QLT variations and situations; returns what TestQLT::run returns.
+Int test_qlt (const Parallel::Ptr& p, const tree::Node::Ptr& tree, const Int& ncells,
+              const Int nrepeat = 1,
+              // Diagnostic output for dev and illustration purposes. To be
+              // clear, no QLT unit test requires output to be checked; each
+              // checks in-memory data and returns a failure count.
+              const bool write = false,
+              const bool verbose = false) {
+  return TestQLT(p, tree, ncells, verbose).run(nrepeat, write);
+}
+} // namespace test
+
+// Tree for a 1-D periodic domain, for unit testing.
+namespace oned {
+struct Mesh {
+  struct ParallelDecomp {
+    enum Enum {
+      // The obvious distribution of ranks: 1 rank takes exactly 1 contiguous
+      // set of cell indices.
+      contiguous,
+      // For heavy-duty testing of QLT comm pattern, use a ridiculous assignment
+      // of ranks to cell indices. This forces the QLT tree to communicate,
+      // pack, and unpack in silly ways.
+      pseudorandom
+    };
+  };
+  
+  Mesh (const Int nc, const Parallel::Ptr& p,
+        const ParallelDecomp::Enum& parallel_decomp = ParallelDecomp::contiguous) {
+    init(nc, p, parallel_decomp);
+  }
+  
+  void init (const Int nc, const Parallel::Ptr& p,
+             const ParallelDecomp::Enum& parallel_decomp) {
+    nc_ = nc;
+    nranks_ = p->size();
+    p_ = p;
+    pd_ = parallel_decomp;
+    cedr_throw_if(nranks_ > nc_, "#GIDs < #ranks is not supported."); // Reject fewer cells than ranks.
+  }
+
+  Int ncell () const { return nc_; }
+
+  const Parallel::Ptr& parallel () const { return p_; }
+
+  Int rank (const Int& ci) const { // Rank that owns cell ci under the chosen decomposition.
+    switch (pd_) {
+    case ParallelDecomp::contiguous:
+      return std::min(nranks_ - 1, ci / (nc_ / nranks_)); // Blocks of nc_/nranks_ cells; the last rank also takes the remainder.
+    default: {
+      const auto chunk = ci / nranks_; // Index of the group of nranks_ consecutive cells holding ci.
+      return (ci + chunk) % nranks_;   // Round-robin, shifted by one for each successive group.
+    }
+    }
+  }
+
+  static Int unittest (const Parallel::Ptr& p) { // Count cells whose rank falls outside [0, p->size()).
+    const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::pseudorandom,
+                                                 Mesh::ParallelDecomp::contiguous };
+    Int ne = 0;
+    for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id) {
+      Mesh m(std::max(42, 3*p->size()), p, dists[id]);
+      const Int nc = m.ncell();
+      for (Int ci = 0; ci < nc; ++ci)
+        if (m.rank(ci) < 0 || m.rank(ci) >= p->size())
+          ++ne;
+    }
+    return ne;
+  }
+
+private:
+  Int nc_, nranks_; // Number of cells; number of ranks (p->size()).
+  Parallel::Ptr p_;
+  ParallelDecomp::Enum pd_;
+};
+
+tree::Node::Ptr make_tree (const Mesh& m, const Int cs, const Int ce,
+                           const tree::Node* parent, const bool imbalanced) {
+  // Build the subtree over cells [cs, ce). A single cell becomes a leaf owned
+  // by m.rank(cs); a larger range is split after its first third (imbalanced
+  // and more than 2 cells) or its first half, and each part is built in turn.
+  const Int ncell = ce - cs;
+  tree::Node::Ptr node = std::make_shared<tree::Node>();
+  node->parent = parent;
+  if (ncell == 1) {
+    node->nkids = 0;
+    node->rank = m.rank(cs);
+    node->cellidx = cs;
+  } else {
+    const Int split = cs + (imbalanced && ncell > 2 ? ncell/3 : ncell/2);
+    node->nkids = 2;
+    node->kids[0] = make_tree(m, cs, split, node.get(), imbalanced);
+    node->kids[1] = make_tree(m, split, ce, node.get(), imbalanced);
+  }
+  return node;
+}
+
+tree::Node::Ptr make_tree (const Mesh& m, const bool imbalanced) {
+  return make_tree(m, 0, m.ncell(), nullptr, imbalanced); // Whole mesh; the root has no parent.
+}
+
+tree::Node::Ptr make_tree (const Parallel::Ptr& p, const Int& ncells,
+                           const bool imbalanced) {
+  Mesh m(ncells, p); // Uses Mesh's default decomposition (contiguous).
+  return make_tree(m, imbalanced);
+}
+
+namespace test {
+void mark_cells (const tree::Node::Ptr& node, std::vector<Int>& cells) {
+  // Tally one visit for each leaf's cell; interior nodes only recurse.
+  if (node->nkids == 0)
+    cells[node->cellidx] += 1;
+  else
+    for (Int k = 0; k < node->nkids; ++k)
+      mark_cells(node->kids[k], cells);
+}
+
+Int unittest (const Parallel::Ptr& p) { // Returns how many cells are not covered by exactly one leaf.
+  const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::pseudorandom,
+                                               Mesh::ParallelDecomp::contiguous };
+  Int ne = 0;
+  for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id)
+    for (bool imbalanced: {false, true}) {
+      Mesh m(std::max(42, 3*p->size()), p, dists[id]); // Exercise each decomposition in turn.
+      tree::Node::Ptr tree = make_tree(m, imbalanced);
+      std::vector<Int> cells(m.ncell(), 0);
+      mark_cells(tree, cells); // cells[i] becomes the number of leaves that claim cell i.
+      for (Int i = 0; i < m.ncell(); ++i)
+        if (cells[i] != 1) ++ne;
+    }
+  return ne;
+}
+} // namespace test
+} // namespace oned
+
+tree::Node::Ptr tree::make_tree_over_1d_mesh (const Parallel::Ptr& p, const Int& ncells,
+                                              const bool imbalanced) {
+  return oned::make_tree(oned::Mesh(ncells, p), imbalanced); // Mesh uses its default (contiguous) decomposition.
+}
+
+namespace test {
+Int unittest_NodeSets (const Parallel::Ptr& p) { // Sum of impl::unittest errors over all size/decomposition/balance cases.
+  using Mesh = oned::Mesh;
+  const Int szs[] = { p->size(), 3*p->size() };
+  const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::pseudorandom,
+                                               Mesh::ParallelDecomp::contiguous };
+  Int nerr = 0;
+  for (size_t is = 0; is < sizeof(szs)/sizeof(*szs); ++is)
+    for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id)
+      for (bool imbalanced: {false, true}) {
+        Mesh m(szs[is], p, dists[id]);
+        tree::Node::Ptr tree = make_tree(m, imbalanced);
+        impl::NodeSets::ConstPtr nodesets = impl::analyze(p, m.ncell(), tree);
+        tree = nullptr; // Drop the tree before running the checks on the NodeSets.
+        nerr += impl::unittest(p, nodesets, m.ncell());
+      }
+  return nerr;
+}
+
+Int unittest_QLT (const Parallel::Ptr& p, const bool write_requested=false) { // Sum of test_qlt failures over all cases.
+  using Mesh = oned::Mesh;
+  const Int szs[] = { p->size(), 2*p->size(), 7*p->size(), 21*p->size() };
+  const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::contiguous,
+                                               Mesh::ParallelDecomp::pseudorandom };
+  Int nerr = 0;
+  for (size_t is = 0, islim = sizeof(szs)/sizeof(*szs); is < islim; ++is)
+    for (size_t id = 0, idlim = sizeof(dists)/sizeof(*dists); id < idlim; ++id)
+    for (bool imbalanced: {false, true}) {
+      if (p->amroot()) { // Progress output: (ncells, decomposition index, imbalanced).
+        std::cout << " (" << szs[is] << ", " << id << ", " << imbalanced << ")";
+        std::cout.flush();
+      }
+      Mesh m(szs[is], p, dists[id]);
+      tree::Node::Ptr tree = make_tree(m, imbalanced);
+      const bool write = (write_requested && m.ncell() < 3000 && // Write only for the last size and decomposition.
+                          is == islim-1 && id == idlim-1);
+      nerr += test::test_qlt(p, tree, m.ncell(), 1, write);
+    }
+  return nerr;
+}
+
+Int run_unit_and_randomized_tests (const Parallel::Ptr& p, const Input& in) { // Returns the unit tests' total error count.
+  Int nerr = 0;
+  if (in.unittest) {
+    Int ne;
+    ne = oned::Mesh::unittest(p);
+    if (ne && p->amroot()) std::cerr << "FAIL: Mesh::unittest()\n";
+    nerr += ne;
+    ne = oned::test::unittest(p);
+    if (ne && p->amroot()) std::cerr << "FAIL: oned::unittest_tree()\n";
+    nerr += ne;
+    ne = unittest_NodeSets(p);
+    if (ne && p->amroot()) std::cerr << "FAIL: oned::unittest_NodeSets()\n";
+    nerr += ne;
+    ne = unittest_QLT(p, in.write);
+    if (ne && p->amroot()) std::cerr << "FAIL: oned::unittest_QLT()\n";
+    nerr += ne;
+    if (p->amroot()) std::cout << "\n"; // Ends the line of progress output printed by the tests.
+  }
+  // Performance test.
+  if (in.perftest && in.ncells > 0) {
+    oned::Mesh m(in.ncells, p,
+                 (in.pseudorandom ?
+                  oned::Mesh::ParallelDecomp::pseudorandom :
+                  oned::Mesh::ParallelDecomp::contiguous));
+    Timer::init();
+    Timer::start(Timer::total); Timer::start(Timer::tree);
+    tree::Node::Ptr tree = make_tree(m, false);
+    Timer::stop(Timer::tree);
+    test::test_qlt(p, tree, in.ncells, in.nrepeat, false, in.verbose); // NOTE(review): return value is discarded -- confirm intended.
+    Timer::stop(Timer::total);
+    if (p->amroot()) Timer::print();
+  }
+  return nerr;
+}
+
+} // namespace test
+} // namespace qlt
+} // namespace cedr
+
+#ifdef KOKKOS_HAVE_SERIAL
+template class cedr::qlt::QLT<Kokkos::Serial>; // Explicit instantiation for the Serial execution space.
+#endif // KOKKOS_HAVE_SERIAL
+#ifdef KOKKOS_HAVE_OPENMP
+template class cedr::qlt::QLT<Kokkos::OpenMP>; // Explicit instantiation for the OpenMP execution space.
+#endif // KOKKOS_HAVE_OPENMP
+#ifdef KOKKOS_HAVE_CUDA
+template class cedr::qlt::QLT<Kokkos::Cuda>; // Explicit instantiation for the Cuda execution space.
+#endif // KOKKOS_HAVE_CUDA
diff --git a/cedr/cedr_qlt.hpp b/cedr/cedr_qlt.hpp
new file mode 100644
index 0000000..e923600
--- /dev/null
+++ b/cedr/cedr_qlt.hpp
@@ -0,0 +1,225 @@
+#ifndef INCLUDE_CEDR_QLT_HPP
+#define INCLUDE_CEDR_QLT_HPP
+
+#include <mpi.h>
+
+#include <memory>
+#include <string>
+#include <iostream>
+#include <vector>
+#include <map>
+
+#include "cedr_cdr.hpp"
+
+namespace cedr {
+// QLT: Quasi-local tree-based non-iterative tracer density reconstructor for
+//      mass conservation, shape preservation, and tracer consistency.
+namespace qlt {
+using cedr::mpi::Parallel;
+
+namespace impl { class NodeSets; }
+
+namespace tree {
+// The caller builds a tree of these nodes to pass to QLT.
+struct Node {
+  typedef std::shared_ptr<Node> Ptr;
+  const Node* parent; // (Can't be a shared_ptr: would be a circular dependency.)
+  Int rank;           // Owning rank.
+  Long cellidx;       // If a leaf, the cell to which this node corresponds.
+  Int nkids;          // 0 at leaf, 1 or 2 otherwise.
+  Node::Ptr kids[2];  // Children; only the first nkids entries are used.
+  void* reserved;     // For internal use.
+  Node () : parent(nullptr), rank(-1), cellidx(-1), nkids(0), reserved(nullptr) {} // No parent, owner, cell, or kids.
+};
+
+// Utility to make a tree over a 1D mesh. For testing, it can be useful to
+// create an imbalanced tree.
+Node::Ptr make_tree_over_1d_mesh(const Parallel::Ptr& p, const Int& ncells,
+                                 const bool imbalanced = false);
+} // namespace tree
+
+template <typename ExeSpace = Kokkos::DefaultExecutionSpace>
+class QLT : public cedr::CDR {
+public:
+  typedef typename cedr::impl::DeviceType<ExeSpace>::type Device;
+  typedef QLT<ExeSpace> Me;
+  typedef std::shared_ptr<Me> Ptr;
+  
+  // Set up QLT topology and communication data structures based on a tree.
+  QLT(const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree);
+
+  void print(std::ostream& os) const override;
+
+  // Number of cells owned by this rank.
+  Int nlclcells() const;
+
+  // Cells owned by this rank, in order of local numbering. Thus,
+  // gci2lci(gcis[i]) == i. Ideally, the caller never actually calls gci2lci(),
+  // and instead uses the information from get_owned_glblcells to determine
+  // local cell indices.
+  void get_owned_glblcells(std::vector<Long>& gcis) const;
+
+  // For global cell index cellidx, i.e., the globally unique ordinal associated
+  // with a cell in the caller's tree, return this rank's local index for
+  // it. This is not an efficient operation.
+  Int gci2lci(const Int& gci) const;
+
+  void declare_tracer(int problem_type, const Int& rhomidx) override;
+
+  void end_tracer_declarations() override;
+
+  int get_problem_type(const Int& tracer_idx) const override;
+
+  Int get_num_tracers() const override;
+
+  // lclcellidx is gci2lci(cellidx).
+  KOKKOS_INLINE_FUNCTION
+  void set_rhom(const Int& lclcellidx, const Int& rhomidx, const Real& rhom) override;
+
+  // lclcellidx is gci2lci(cellidx).
+  KOKKOS_INLINE_FUNCTION
+  void set_Qm(const Int& lclcellidx, const Int& tracer_idx,
+              const Real& Qm, const Real& Qm_min, const Real& Qm_max,
+              const Real Qm_prev = -1) override;
+
+  void run() override;
+
+  KOKKOS_INLINE_FUNCTION
+  Real get_Qm(const Int& lclcellidx, const Int& tracer_idx) override;
+
+private:
+  typedef Kokkos::View<Int*, Kokkos::LayoutLeft, Device> IntList;
+  typedef cedr::impl::Const<IntList> ConstIntList;
+  typedef cedr::impl::ConstUnmanaged<IntList> ConstUnmanagedIntList;
+
+  static void init(const std::string& name, IntList& d,
+                   typename IntList::HostMirror& h, size_t n);
+
+  struct MetaDataBuilder {
+    typedef std::shared_ptr<MetaDataBuilder> Ptr;
+    std::vector<int> trcr2prob; // Canonical problem type of each declared tracer, in declaration order.
+  };
+
+  struct MetaData {
+    enum : Int { nprobtypes = 4 }; // Number of canonical problem types; see problem_type_.
+
+    template <typename IntListT>
+    struct Arrays {
+      // trcr2prob(i) is the ProblemType of tracer i.
+      IntListT trcr2prob;
+      // bidx2trcr(prob2trcrptr(i) : prob2trcrptr(i+1)-1) is the list of
+      // tracers having ProblemType index i. bidx2trcr maps a position in the
+      // bulk data's ordering (bidx) to the user's tracer index.
+      Int prob2trcrptr[nprobtypes+1];
+      IntListT bidx2trcr;
+      // Inverse of bidx2trcr.
+      IntListT trcr2bidx;
+      // Points to the start of l2r bulk data for each problem type, within a
+      // slot.
+      Int prob2bl2r[nprobtypes + 1];
+      // Points to the start of l2r bulk data for each tracer, within a slot.
+      IntListT trcr2bl2r;
+      // Same for r2l bulk data.
+      Int prob2br2l[nprobtypes + 1];
+      IntListT trcr2br2l;
+    };
+
+    static int get_problem_type(const int& idx);
+    
+    // icpc doesn't let us use problem_type_ here, even though it's constexpr.
+    static int get_problem_type_idx(const int& mask);
+
+    static int get_problem_type_l2r_bulk_size(const int& mask);
+
+    static int get_problem_type_r2l_bulk_size(const int& mask);
+
+    struct CPT {
+      // We could make the l2r buffer smaller by one entry, Qm. However, the
+      // l2r comm is more efficient if it's done with one buffer. Similarly,
+      // we separate the r2l data into a separate buffer for packing and MPI
+      // efficiency.
+      //   There are 7 possible problems.
+      //   The only problem not supported is conservation alone. It makes very
+      // little sense to use QLT for conservation alone.
+      //   The remaining 6 fall into 4 categories of details. These 4 categories
+      // are tracked by QLT; which of the original 6 problems being solved is
+      // not important.
+      enum {
+        // l2r: rhom, (Qm_min, Qm, Qm_max)*; l2r, r2l: Qm*
+        s  = ProblemType::shapepreserve,
+        st = ProblemType::shapepreserve | ProblemType::consistent,
+        // l2r: rhom, (Qm_min, Qm, Qm_max, Qm_prev)*; l2r, r2l: Qm*
+        cs  = ProblemType::conserve | s,
+        cst = ProblemType::conserve | st,
+        // l2r: rhom, (q_min, Qm, q_max)*; l2r, r2l: Qm*
+        t = ProblemType::consistent,
+        // l2r: rhom, (q_min, Qm, q_max, Qm_prev)*; l2r, r2l: Qm*
+        ct = ProblemType::conserve | t
+      };
+    };
+
+    Arrays<typename ConstUnmanagedIntList::HostMirror> a_h; // Host arrays, assigned from a_h_ in init().
+    Arrays<ConstUnmanagedIntList> a_d;                      // Device arrays, assigned from a_d_ in init().
+
+    void init(const MetaDataBuilder& mdb);
+
+  private:
+    static constexpr Int problem_type_[] = { CPT::st, CPT::cst, CPT::t, CPT::ct }; // Canonical type per problem-type index.
+    Arrays<typename IntList::HostMirror> a_h_;
+    Arrays<IntList> a_d_;
+  };
+
+  struct BulkData {
+    typedef Kokkos::View<Real*, Kokkos::LayoutLeft, Device> RealList;
+    typedef cedr::impl::Unmanaged<RealList> UnmanagedRealList;
+
+    UnmanagedRealList l2r_data, r2l_data; // Unmanaged views of the private lists below.
+
+    void init(const MetaData& md, const Int& nslots);
+
+  private:
+    RealList l2r_data_, r2l_data_;
+  };
+
+private:
+  void init(const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree);
+
+  void init_ordinals();
+
+  KOKKOS_INLINE_FUNCTION
+  static void solve_node_problem(const Int problem_type,
+                                 const Real& rhom, const Real* pd, const Real& Qm,
+                                 const Real& rhom0, const Real* k0d, Real& Qm0,
+                                 const Real& rhom1, const Real* k1d, Real& Qm1);
+
+private:
+  Parallel::Ptr p_;
+  // Tree and communication topology.
+  std::shared_ptr<const impl::NodeSets> ns_;
+  // Globally unique cellidx -> rank-local index.
+  std::map<Int,Int> gci2lci_;
+  // Temporary to collect caller's tracer information prior to calling
+  // end_tracer_declarations().
+  typename MetaDataBuilder::Ptr mdb_;
+  // Constructed in end_tracer_declarations().
+  MetaData md_;
+  BulkData bd_;
+};
+
+namespace test {
+struct Input { // Options read by run_unit_and_randomized_tests.
+  bool unittest, perftest, write; // Run unit tests; run the performance test; request diagnostic output.
+  Int ncells, ntracers, tracer_type, nrepeat; // ncells and nrepeat size the performance test.
+  bool pseudorandom, verbose; // Perf test: pseudorandom (else contiguous) decomposition; verbose output.
+};
+
+Int run_unit_and_randomized_tests(const Parallel::Ptr& p, const Input& in);
+} // namespace test
+} // namespace qlt
+} // namespace cedr
+
+// These are the definitions that must be visible in the calling translation
+// unit, unless Cuda relocatable device code is enabled.
+#include "cedr_qlt_inl.hpp"
+
+#endif
diff --git a/cedr/cedr_qlt_inl.hpp b/cedr/cedr_qlt_inl.hpp
new file mode 100644
index 0000000..fb9290f
--- /dev/null
+++ b/cedr/cedr_qlt_inl.hpp
@@ -0,0 +1,161 @@
+#ifndef INCLUDE_CEDR_QLT_INL_HPP
+#define INCLUDE_CEDR_QLT_INL_HPP
+
+#include <cassert>
+
+#include "cedr_local.hpp"
+
+namespace cedr {
+namespace qlt {
+
+template <typename ES> KOKKOS_INLINE_FUNCTION
+void QLT<ES>::set_rhom (const Int& lclcellidx, const Int& rhomidx, const Real& rhom) {
+  // rhom is stored in slot 0 of the cell's l2r record; rhomidx is not used.
+  bd_.l2r_data(md_.a_d.prob2bl2r[md_.nprobtypes]*lclcellidx) = rhom;
+}
+
+template <typename ES> KOKKOS_INLINE_FUNCTION
+void QLT<ES>::set_Qm (const Int& lclcellidx, const Int& tracer_idx,
+                      const Real& Qm,
+                      const Real& Qm_min, const Real& Qm_max,
+                      const Real Qm_prev) {
+  // A cell's record in l2r_data is cell_stride Reals long. Slot 0 is rhom (see
+  // set_rhom); this tracer's entries begin at offset trcr2bl2r(tracer_idx) and
+  // are laid out as [0] lower bound, [1] Qm, [2] upper bound, [3] Qm_prev
+  // (the last only for conserving problems).
+  const Int cell_stride = md_.a_d.prob2bl2r[md_.nprobtypes];
+  const Int cell_base = cell_stride*lclcellidx;
+  Real* const rec = &bd_.l2r_data(cell_base + md_.a_d.trcr2bl2r(tracer_idx));
+  const Int ptype = md_.a_d.trcr2prob(tracer_idx);
+  rec[1] = Qm;
+  if (ptype & ProblemType::shapepreserve) {
+    rec[0] = Qm_min;
+    rec[2] = Qm_max;
+  } else if (ptype & ProblemType::consistent) {
+    // Without shape preservation the bounds are stored divided by the cell's
+    // rhom, so set_rhom must already have been called for this cell.
+    const Real cell_rhom = bd_.l2r_data(cell_base);
+    rec[0] = Qm_min / cell_rhom;
+    rec[2] = Qm_max / cell_rhom;
+  } else {
+    cedr_kernel_throw_if(true, "set_Q: invalid problem_type.");
+  }
+  if ( ! (ptype & ProblemType::conserve)) return;
+  cedr_kernel_throw_if(Qm_prev < -0.5, "Qm_prev was not provided to set_Q.");
+  rec[3] = Qm_prev;
+}
+
+template <typename ES> KOKKOS_INLINE_FUNCTION
+Real QLT<ES>::get_Qm (const Int& lclcellidx, const Int& tracer_idx) {
+  // Read the tracer's entry at offset trcr2br2l(tracer_idx) in the cell's r2l record.
+  const Int cell_base = md_.a_d.prob2br2l[md_.nprobtypes]*lclcellidx;
+  return bd_.r2l_data(cell_base + md_.a_d.trcr2br2l(tracer_idx));
+}
+
+//todo Replace this and the calling code with ReconstructSafely.
+KOKKOS_INLINE_FUNCTION // Distribute the mass Qm_extra over the two bounds Qm_bnd, in place.
+void r2l_nl_adjust_bounds (Real Qm_bnd[2], const Real rhom[2], Real Qm_extra) {
+  Real q[2]; // The bounds as ratios Qm_bnd[i]/rhom[i].
+  for (Int i = 0; i < 2; ++i) q[i] = Qm_bnd[i] / rhom[i];
+  if (Qm_extra < 0) {
+    Int i0, i1; // i0: entry with the larger q; it is lowered first.
+    if (q[0] >= q[1]) { i0 = 0; i1 = 1; } else { i0 = 1; i1 = 0; }
+    const Real Qm_gap = (q[i1] - q[i0])*rhom[i0]; // Mass change taking q[i0] to q[i1] (<= 0 for rhom > 0).
+    if (Qm_gap <= Qm_extra) { // Entry i0 alone absorbs Qm_extra without passing q[i1].
+      Qm_bnd[i0] += Qm_extra;
+      return;
+    }
+  } else {
+    Int i0, i1; // i0: entry with the smaller q; it is raised first.
+    if (q[0] <= q[1]) { i0 = 0; i1 = 1; } else { i0 = 1; i1 = 0; }
+    const Real Qm_gap = (q[i1] - q[i0])*rhom[i0]; // Mass change taking q[i0] to q[i1] (>= 0 for rhom > 0).
+    if (Qm_gap >= Qm_extra) { // Entry i0 alone absorbs Qm_extra without passing q[i1].
+      Qm_bnd[i0] += Qm_extra;
+      return;
+    }
+  }
+  { // Have to adjust both. Adjust so that the q bounds are the same. This
+    // procedure assures that as long as rhom is conservative, then the
+    // adjustment never pushes q_{min,max} out of the safety bounds.
+    const Real Qm_tot = Qm_bnd[0] + Qm_bnd[1] + Qm_extra;
+    const Real rhom_tot = rhom[0] + rhom[1];
+    const Real q_tot = Qm_tot / rhom_tot; // Common ratio shared by both entries.
+    for (Int i = 0; i < 2; ++i)
+      Qm_bnd[i] = q_tot*rhom[i];
+  }
+}
+
+namespace impl {
+KOKKOS_INLINE_FUNCTION // Split the node's mass Qm between its two kids; results go to Qm0 and Qm1.
+void solve_node_problem (const Real& rhom, const Real* pd, const Real& Qm,
+                         const Real& rhom0, const Real* k0d, Real& Qm0,
+                         const Real& rhom1, const Real* k1d, Real& Qm1) { // rhom is not used in this body.
+  Real Qm_min_kids [] = {k0d[0], k1d[0]}; // Entry [0] of each kid: lower bound.
+  Real Qm_orig_kids[] = {k0d[1], k1d[1]}; // Entry [1] of each kid: current value.
+  Real Qm_max_kids [] = {k0d[2], k1d[2]}; // Entry [2] of each kid: upper bound.
+  { // The ideal problem is not assuredly feasible. Test for feasibility. If not
+    // feasible, adjust bounds to solve the safety problem, which is assuredly
+    // feasible if the total density field rho is mass conserving (Q doesn't
+    // have to be mass conserving, of course; achieving mass conservation is one
+    // use for QLT).
+    const Real Qm_min = pd[0], Qm_max = pd[2]; // The node's own bounds.
+    const bool lo = Qm < Qm_min, hi = Qm > Qm_max;
+    if (lo || hi) {
+      // If the discrepancy is numerical noise, don't act on it.
+      const Real tol = 10*std::numeric_limits<Real>::epsilon();
+      const Real discrepancy = lo ? Qm_min - Qm : Qm - Qm_max;
+      if (discrepancy > tol*Qm_max) {
+        const Real rhom_kids[] = {rhom0, rhom1};
+        r2l_nl_adjust_bounds(lo ? Qm_min_kids : Qm_max_kids, // Relax only the violated side.
+                             rhom_kids,
+                             Qm - (lo ? Qm_min : Qm_max));
+      }
+    } else {
+      // Quick exit if everything is OK as is. This is a speedup, and it also
+      // lets the subnode solver make ~1 ulp changes instead of having to keep x
+      // = y if y satisfies the conditions. Without this block, the
+      // no_change_should_hold tests can fail.
+      if (Qm == pd[1] && // Was our total tracer mass adjusted?
+          // Are the kids' problems feasible?
+          Qm_orig_kids[0] >= Qm_min_kids[0] && Qm_orig_kids[0] <= Qm_max_kids[0] &&
+          Qm_orig_kids[1] >= Qm_min_kids[1] && Qm_orig_kids[1] <= Qm_max_kids[1]) {
+        // Don't need to do anything, so skip even the math-based quick exits in
+        // solve_node_problem.
+        Qm0 = Qm_orig_kids[0];
+        Qm1 = Qm_orig_kids[1];
+        return;
+      }
+    }
+  }
+  { // Solve the node's QP.
+    static const Real ones[] = {1, 1};
+    const Real w[] = {1/rhom0, 1/rhom1}; // Per-kid weights, the reciprocals of the kids' rhom.
+    Real Qm_kids[2] = {k0d[1], k1d[1]}; // Starts at the kids' current values; overwritten by the solver.
+    local::solve_1eq_bc_qp_2d(w, ones, Qm, Qm_min_kids, Qm_max_kids,
+                              Qm_orig_kids, Qm_kids);
+    Qm0 = Qm_kids[0];
+    Qm1 = Qm_kids[1];
+  }
+}
+} // namespace impl
+
+template <typename ES> KOKKOS_INLINE_FUNCTION
+void QLT<ES>::solve_node_problem (const Int problem_type,
+                                  const Real& rhom, const Real* pd, const Real& Qm,
+                                  const Real& rhom0, const Real* k0d, Real& Qm0,
+                                  const Real& rhom1, const Real* k1d, Real& Qm1) {
+  if (problem_type & ProblemType::shapepreserve) {
+    // Entries 0 and 2 are used as given.
+    impl::solve_node_problem(rhom, pd, Qm, rhom0, k0d, Qm0, rhom1, k1d, Qm1);
+    return;
+  }
+  // Otherwise scale entries 0 and 2 by the matching rhom (set_Qm divided them by rhom).
+  const Real mpd [3] = {pd [0]*rhom , pd [1], pd [2]*rhom };
+  const Real mk0d[3] = {k0d[0]*rhom0, k0d[1], k0d[2]*rhom0};
+  const Real mk1d[3] = {k1d[0]*rhom1, k1d[1], k1d[2]*rhom1};
+  impl::solve_node_problem(rhom, mpd, Qm, rhom0, mk0d, Qm0, rhom1, mk1d, Qm1);
+}
+
+} // namespace qlt
+} // namespace cedr
+
+#endif
diff --git a/cedr/cedr_test.cpp b/cedr/cedr_test.cpp
new file mode 100644
index 0000000..ebb76f1
--- /dev/null
+++ b/cedr/cedr_test.cpp
@@ -0,0 +1,109 @@
+#include "cedr_qlt.hpp"
+#include "cedr_caas.hpp"
+#include "cedr_mpi.hpp"
+#include "cedr_util.hpp"
+#include "cedr_test.hpp"
+
+#include <stdexcept>
+#include <sstream>
+
+namespace cedr { 
+struct InputParser { // Parses the test driver's command line into qin and tin.
+  qlt::test::Input qin;
+  test::transport1d::Input tin;
+
+  class ArgAdvancer { // Cursor over argv, starting at argv[1].
+    const int argc_;
+    char const* const* argv_;
+    int i_;
+  public:
+    ArgAdvancer (int argc, char** argv) : argc_(argc), argv_(argv), i_(1) {}
+    const char* advance () { // Step to the value that follows the current option.
+      if (i_+1 >= argc_) cedr_throw_if(true, "Command line is missing an argument.");
+      return argv_[++i_];
+    }
+    const char* token () const { return argv_[i_]; }
+    void incr () { ++i_; }
+    bool more () const { return i_ < argc_; }
+  };
+  // Set defaults, apply each option in order, then validate the result.
+  InputParser (int argc, char** argv, const qlt::Parallel::Ptr& p) {
+    using util::eq;
+    qin.unittest = false;
+    qin.perftest = false;
+    qin.write = false;
+    qin.ncells = 0;
+    qin.ntracers = 1;
+    qin.tracer_type = 0;
+    qin.nrepeat = 1;
+    qin.pseudorandom = false;
+    qin.verbose = false;
+    tin.ncells = 0;
+    tin.verbose = false; // Give the flag a defined value even when -t1d is absent.
+    for (ArgAdvancer aa(argc, argv); aa.more(); aa.incr()) {
+      const char* token = aa.token();
+      if (eq(token, "-t", "--unittest")) qin.unittest = true;
+      else if (eq(token, "-pt", "--perftest")) qin.perftest = true;
+      else if (eq(token, "-w", "--write")) qin.write = true;
+      else if (eq(token, "-nc", "--ncells")) qin.ncells = std::atoi(aa.advance());
+      else if (eq(token, "-nt", "--ntracers")) qin.ntracers = std::atoi(aa.advance());
+      else if (eq(token, "-tt", "--tracertype")) qin.tracer_type = std::atoi(aa.advance());
+      else if (eq(token, "-nr", "--nrepeat")) qin.nrepeat = std::atoi(aa.advance());
+      else if (eq(token, "--proc-random")) qin.pseudorandom = true;
+      else if (eq(token, "-v", "--verbose")) qin.verbose = true;
+      else if (eq(token, "-t1d", "--transport1dtest")) tin.ncells = 1;
+      else cedr_throw_if(true, "Invalid token " << token);
+    }
+    if (tin.ncells) { // -t1d was given: the 1D test takes its size and verbosity from qin.
+      tin.ncells = qin.ncells;
+      tin.verbose = qin.verbose;
+    }
+
+    cedr_throw_if(qin.tracer_type < 0 || qin.tracer_type >= 4,
+                  "Tracer type is out of bounds [0, 3].");
+    cedr_throw_if(qin.ntracers < 1, "Number of tracers is < 1.");
+  }
+
+  void print (std::ostream& os) const { // One-line summary of the run configuration.
+    os << "ncells " << qin.ncells
+       << " nrepeat " << qin.nrepeat;
+    if (qin.pseudorandom) os << " random";
+    os << "\n";
+  }
+};
+} // namespace cedr
+
+int main (int argc, char** argv) { // Test driver: returns 0 on PASS, -1 on failure or exception.
+  int nerr = 0, retval = 0;
+  MPI_Init(&argc, &argv);
+  auto p = cedr::mpi::make_parallel(MPI_COMM_WORLD);
+  srand(p->rank()); // Seed rand() differently on each rank.
+  Kokkos::initialize(argc, argv);
+  try {
+    cedr::InputParser inp(argc, argv, p);
+    if (p->amroot()) inp.print(std::cout);
+    if (inp.qin.unittest) {
+      nerr += cedr::local::unittest();
+      nerr += cedr::caas::test::unittest(p);
+    }
+    if (inp.qin.unittest || inp.qin.perftest)
+      nerr += cedr::qlt::test::run_unit_and_randomized_tests(p, inp.qin);
+    if (inp.tin.ncells > 0)
+      nerr += cedr::test::transport1d::run(p, inp.tin);
+    {
+      int gnerr = 0; // A rooted reduce defines its result on the root only; keep other ranks defined.
+      cedr::mpi::reduce(*p, &nerr, &gnerr, 1, MPI_SUM, p->root());
+      retval = gnerr != 0 ? -1 : 0;
+      if (p->amroot())
+        std::cout << (gnerr != 0 ? "FAIL" : "PASS") << "\n";
+    }
+  } catch (const std::exception& e) {
+    if (p->amroot())
+      std::cerr << e.what();
+    retval = -1;
+  }
+  Kokkos::finalize(); // Counterpart of Kokkos::initialize; finalize_all() is deprecated.
+  if (nerr) prc(nerr);
+  MPI_Finalize();
+  return retval;
+}
diff --git a/cedr/cedr_test.hpp b/cedr/cedr_test.hpp
new file mode 100644
index 0000000..afa9b5c
--- /dev/null
+++ b/cedr/cedr_test.hpp
@@ -0,0 +1,22 @@
+#ifndef INCLUDE_CEDR_TEST_HPP
+#define INCLUDE_CEDR_TEST_HPP
+
+#include "cedr.hpp"
+#include "cedr_mpi.hpp"
+
+namespace cedr {
+namespace test {
+namespace transport1d {
+
+struct Input { // Options for the 1D transport test.
+  Int ncells;   // cedr_test.cpp copies this from -nc when -t1d is given and runs the test only if it is > 0.
+  bool verbose; // cedr_test.cpp copies this from -v when -t1d is given.
+};
+// The implementation rejects parallel->size() > 1; main() adds the return value to its error count.
+Int run(const mpi::Parallel::Ptr& p, const Input& in);
+
+} // namespace transport1d
+} // namespace test
+} // namespace cedr
+
+#endif
diff --git a/cedr/cedr_test_1d_transport.cpp b/cedr/cedr_test_1d_transport.cpp
new file mode 100644
index 0000000..dbf2d9b
--- /dev/null
+++ b/cedr/cedr_test_1d_transport.cpp
@@ -0,0 +1,320 @@
+#include "cedr_test.hpp"
+#include "cedr_qlt.hpp"
+#include "cedr_caas.hpp"
+
+#include <algorithm>
+
+namespace cedr {
+namespace test {
+namespace transport1d {
+
+namespace interp {
+inline Real to_periodic_core (const Real& xl, const Real& xr, const Real& x) {
+  // Points already in [xl, xr] are returned untouched; all others are shifted
+  // by a whole number of periods of length xr - xl.
+  const Real period = xr - xl;
+  return (x >= xl && x <= xr) ? x : x - period*std::floor((x - xl) / period); }
+
+inline Real get_slope (const Real x[2], const Real y[2]) {
+  const Real dx = x[1] - x[0]; return (y[1] - y[0]) / dx; // Secant slope through the two points.
+}
+
+inline void
+get_cubic (Real dx, Real v1, Real s1, Real v2, Real s2, Real c[4]) {
+  // Coefficients of p(t) = ((c[0]*t + c[1])*t + c[2])*t + c[3] such that
+  // p(0) = v1, p'(0) = s1, p(dx) = v2, and p'(dx) = s2.
+  const Real dx_sq = dx*dx;
+  const Real neg_dx_cu = -(dx_sq*dx);
+  c[3] = v1;
+  c[2] = s1;
+  const Real dv = v2 - dx*c[2] - c[3]; // Value left to match at t = dx.
+  const Real ds = s2 - c[2];           // Slope left to match at t = dx.
+  c[0] = (2.0*dv - dx*ds) / neg_dx_cu;
+  c[1] = (-3.0*dx*dv + dx_sq*ds) / neg_dx_cu;
+}
+
+void cubic_interp_periodic ( // Piecewise-cubic periodic interpolation of (x, y) at the points xi.
+  const Real* const x, const Int nx, const Real* const y,
+  const Real* const xi, const Int nxi, Real* const yi,
+  Int* const dod) // Out: four indices in [0, nx-2] per xi entry.
+{
+  const int nc = nx - 1; // Number of intervals; [x[0], x[nc]] is used as one period.
+#ifdef _OPENMP
+# pragma omp parallel for
+#endif
+  for (Int j = 0; j < nxi; ++j) {
+    const Real xi_per = to_periodic_core(x[0], x[nc], xi[j]);
+    Int ip1 = std::upper_bound(x, x + nx, xi_per) - x; // Index of the first node > xi_per.
+    // Handle numerical issues at boundaries.
+    if (ip1 == 0) ++ip1;
+    else if (ip1 == nx) --ip1;
+    const Int i = ip1 - 1; // xi_per is interpolated on the interval [x[i], x[ip1]].
+    // Domain of dependence.
+    Int* dodj = dod + 4*j;
+    for (Int k = 0; k < 4; ++k)
+      dodj[k] = (i - 1 + k + nc) % nc; // Indices i-1 .. i+2, wrapped modulo nc.
+    // Slopes.
+    const bool at_start = i == 0, at_end = i == nc - 1;
+    const Real smid = get_slope(x+i, y+i); // Secant slope on the interval itself.
+    Real s1, s2; // Slopes at x[i] and x[ip1]: weighted means of neighboring secant slopes.
+    if (at_start) { // The left neighbor wraps around to the last interval.
+      const Real a = (x[nc] - x[nc-1]) / ((x[1] - x[0]) + (x[nc] - x[nc-1]));
+      s1 = (1 - a)*get_slope(x+nc-1, y+nc-1) + a*smid;
+    } else {
+      const Real a = (x[i] - x[i-1]) / (x[ip1] - x[i-1]);
+      s1 = (1 - a)*get_slope(x+i-1, y+i-1) + a*smid;
+    }
+    if (at_end) { // The right neighbor wraps around to the first interval.
+      const Real a = (x[ip1] - x[i]) / ((x[ip1] - x[i]) + (x[1] - x[0]));
+      s2 = (1 - a)*smid + a*get_slope(x, y);
+    } else {
+      const Real a = (x[ip1] - x[i]) / (x[i+2] - x[i]);
+      s2 = (1 - a)*smid + a*get_slope(x+ip1, y+ip1);
+    }
+    // Interp.
+    Real c[4];
+    get_cubic(x[ip1] - x[i], y[i], s1, y[ip1], s2, c);
+    const Real xij = xi_per - x[i]; // Offset from the interval's left node.
+    yi[j] = (((c[0]*xij + c[1])*xij) + c[2])*xij + c[3];
+  }
+}
+} // namespace interp
+
+class PyWriter { // Writes named Real vectors to <filename>.py as entries of a Python dict s.
+  typedef std::unique_ptr<FILE, cedr::util::FILECloser> FilePtr;
+  FilePtr fh_;
+public:
+  PyWriter(const std::string& filename);
+  void write(const std::string& field_name, const std::vector<Real>& v) const;
+};
+// Open <filename>.py and start the dict; report a failed open instead of writing to a null FILE*.
+PyWriter::PyWriter (const std::string& filename) : fh_(fopen((filename + ".py").c_str(), "w")) {
+  cedr_throw_if( ! fh_, "PyWriter could not open " << filename << ".py for writing.");
+  fprintf(fh_.get(), "s = {};\n");
+}
+
+void PyWriter::write (const std::string& field_name, const std::vector<Real>& v) const {
+  FILE* const out = fh_.get(); // Emits one line: s['<field_name>'] = [ v0, v1, ...,]
+  fprintf(out, "s['%s'] = [", field_name.c_str());
+  for (size_t i = 0; i < v.size(); ++i) fprintf(out, " %1.15e,", v[i]);
+  fprintf(out, "]\n");
+}
+
+struct InitialCondition { // Named initial-condition profiles for the 1D test.
+  enum Enum { sin, bell, rect, uniform };
+  // Enum -> name. cedr_throw_if fires for a value that is not an enumerator.
+  static std::string convert (const Enum& e) {
+    if (e == Enum::sin) return "sin";
+    if (e == Enum::bell) return "bell";
+    if (e == Enum::rect) return "rect";
+    if (e == Enum::uniform) return "uniform";
+    cedr_throw_if(true, "InitialCondition::convert can't convert " << e);
+  }
+  // Name -> Enum. cedr_throw_if fires for an unrecognized name.
+  static Enum convert (const std::string& s) {
+    using util::eq;
+    if (eq(s, "sin")) return Enum::sin;
+    if (eq(s, "bell")) return Enum::bell;
+    if (eq(s, "rect")) return Enum::rect;
+    if (eq(s, "uniform")) return Enum::uniform;
+    cedr_throw_if(true, "InitialCondition::convert can't convert " << s);
+  }
+  // Value of profile ic at x. rect is 1 on [0.33, 0.66] and 0 elsewhere; bell
+  // is a half sine wave on x < 0.5 and 0 elsewhere; uniform is 0.42 everywhere.
+  static Real eval (const Enum& ic, const Real x) {
+    if (ic == Enum::sin) return 0.1 + 0.8*0.5*(1 + std::sin(6*M_PI*x));
+    if (ic == Enum::bell) return x < 0.5 ? std::sin(2*M_PI*x) : 0;
+    if (ic == Enum::rect) return x > 0.66 || x < 0.33 ? 0 : 1;
+    if (ic == Enum::uniform) return 0.42;
+    cedr_throw_if(true, "InitialCondition::eval can't convert " << ic);
+  }
+};
+
+class Problem1D { // Periodic 1D mesh on [0, 1] and a transport stepper on it.
+  std::vector<Real> xb_, xcp_, rwrk_; // Cell boundaries; cell centers plus one periodic image; workspace.
+  std::vector<Int> iwrk_;
+  // Set the ncells+1 boundaries xb_ on [0, 1] and the centers xcp_ (last entry = 1 + first).
+  void init_mesh (const Int ncells, const bool nonuniform_mesh) {
+    xb_.resize(ncells+1);
+    xcp_.resize(ncells+1);
+    xb_[0] = 0;
+    if (nonuniform_mesh) {
+      // Large-scale, continuous variation in cell size, plus a huge jump at the
+      // periodic boundary.
+      for (Int i = 1; i <= ncells; ++i) {
+        const Real x = cedr::util::square(Real(i) / ncells);
+        xb_[i] = 0.01 + sin(0.5*M_PI*x*x*x*x);
+      }
+      // Random local cell sizes.
+      for (Int i = 1; i <= ncells; ++i)
+        xb_[i] *= 0.3 + cedr::util::urand();
+      // Cumsum.
+      for (Int i = 1; i <= ncells; ++i)
+        xb_[i] += xb_[i-1];
+      // Normalize.
+      for (Int i = 1; i <= ncells; ++i)
+        xb_[i] /= xb_[ncells];
+    } else {
+      xb_.back() = 1;
+      for (Int i = 1; i < ncells; ++i)
+        xb_[i] = Real(i) / ncells;
+    }
+    for (Int i = 0; i < ncells; ++i)
+      xcp_[i] = 0.5*(xb_[i] + xb_[i+1]);
+    xcp_.back() = 1 + xcp_[0];
+  }
+  // Limit y with cdr, using per-cell bounds taken from yp over each cell's four dod entries.
+  static void run_cdr (const Problem1D& p, CDR& cdr,
+                       const Real* yp, Real* y, const Int* dods) {
+    const Int n = p.ncells();
+    for (Int i = 0; i < n; ++i) {
+      const Int* dod = dods + 4*i;
+      Real min = yp[dod[0]], max = min;
+      for (Int j = 1; j < 4; ++j) {
+        const Real v = yp[dod[j]];
+        min = std::min(min, v);
+        max = std::max(max, v);
+      }
+      const Real area_i = p.area(i);
+      cdr.set_Qm(i, 0, y[i]*area_i, min*area_i, max*area_i, yp[i]*area_i);
+    }
+    cdr.run();
+    for (Int i = 0; i < n; ++i)
+      y[i] = cdr.get_Qm(i, 0) / p.area(i);
+    y[n] = y[0];
+  }
+  // Clip y to the same bounds, then spread the resulting mass change m over the cells.
+  static void run_caas (const Problem1D& p, const Real* yp, Real* y, const Int* dods) {
+    const Int n = p.ncells();
+    std::vector<Real> lo(n), up(n), w(n);
+    Real m = 0;
+    for (Int i = 0; i < n; ++i) {
+      const Int* dod = dods + 4*i;
+      Real min = yp[dod[0]], max = min;
+      for (Int j = 1; j < 4; ++j) {
+        const Real v = yp[dod[j]];
+        min = std::min(min, v);
+        max = std::max(max, v);
+      }
+      const Real area_i = p.area(i);
+      lo[i] = min*area_i;
+      up[i] = max*area_i;
+      y[i] = std::max(min, std::min(max, y[i]));
+      m += (yp[i] - y[i])*area_i;
+    }
+    Real wsum = 0;
+    for (Int i = 0; i < n; ++i) {
+      w[i] = m >= 0 ? up[i] - y[i]*p.area(i) : y[i]*p.area(i) - lo[i];
+      wsum += w[i];
+    }
+    // If no cell has room to move (wsum == 0, e.g. a uniform field), there is
+    // nothing to redistribute; dividing by wsum would make every y[i] NaN.
+    if (wsum > 0)
+      for (Int i = 0; i < n; ++i)
+        y[i] += (m/(wsum*p.area(i)))*w[i];
+    y[n] = y[0]; // Keep the periodic image in step with cell 0, as run_cdr does.
+  }
+public:
+  Problem1D (const Int ncells, const bool nonuniform_mesh = false) {
+    init_mesh(ncells, nonuniform_mesh);
+  }
+  Int ncells () const { return xb_.size() - 1; }
+  Real xb (const Int& i) const { return xb_[i]; }
+  Real xcp (const Int& i) const { return xcp_[i]; }
+  Real area (const Int& i) const { return xb_[i+1] - xb_[i]; }
+  const std::vector<Real> get_xb () const { return xb_; }
+  const std::vector<Real> get_xcp () const { return xcp_; }
+  // Take nsteps steps from y0 to yf (ncells()+1 entries each; they may alias), each shifting by 1/nsteps.
+  void cycle (const Int& nsteps, const Real* y0, Real* yf, CDR* cdr = nullptr) {
+    const Int n = xcp_.size();
+    rwrk_.resize(2*n);
+    iwrk_.resize(4*n);
+    Real* xcpi = rwrk_.data();
+    Int* dod = iwrk_.data();
+    const Real xos = -1.0 / nsteps;
+    for (Int i = 0; i < n; ++i)
+      xcpi[i] = xcp_[i] + xos;
+
+    Real* ys[] = {xcpi + n, yf};
+    std::copy(y0, y0 + n, ys[0]);
+    for (Int ti = 0; ti < nsteps; ++ti) {
+      interp::cubic_interp_periodic(xcp_.data(), n, ys[0],
+                                    xcpi, n, ys[1], dod);
+      if (cdr)
+        run_cdr(*this, *cdr, ys[0], ys[1], dod);
+      else
+        run_caas(*this, ys[0], ys[1], dod);
+      std::swap(ys[0], ys[1]);
+    }
+    if (ys[0] != yf) std::copy(ys[0], ys[0] + n, yf); // After an odd step count the result is already in yf.
+  }
+};
+
+//todo Clean this up. Right now everything is hardcoded and kludgy.
+// - optional write
+// - some sort of brief quantitative output
+// - better, more canonical IC
+// - optional tree imbalance
+// - optional mesh nonuniformity
+// - parallel?
+Int run (const mpi::Parallel::Ptr& parallel, const Input& in) { // Runs the 1D test and writes out_transport1d.py.
+  cedr_throw_if(parallel->size() > 1, "run_1d_transport_test runs in serial only.");
+  Int nerr = 0; // Never incremented below, so the function currently returns 0.
+
+  Problem1D p(in.ncells, false /* nonuniform_mesh */ );
+
+  auto tree = qlt::tree::make_tree_over_1d_mesh(parallel, in.ncells,
+                                                false /* imbalanced */);
+  typedef qlt::QLT<Kokkos::DefaultHostExecutionSpace> QLTT;
+  QLTT qlt(parallel, in.ncells, tree);
+
+  typedef caas::CAAS<Kokkos::DefaultHostExecutionSpace> CAAST;
+  CAAST caas(parallel, in.ncells);
+
+  CDR* cdrs[] = {&qlt, &caas}; // Same order as names[] below.
+  const int ncdrs = sizeof(cdrs)/sizeof(*cdrs);
+
+  for (CDR* cdr : cdrs) { // One conserving, shape-preserving tracer; rhom is the cell width.
+    cdr->declare_tracer(cedr::ProblemType::conserve |
+                        cedr::ProblemType::shapepreserve, 0);
+    cdr->end_tracer_declarations();
+    for (Int i = 0; i < in.ncells; ++i)
+      cdr->set_rhom(i, 0, p.area(i));
+    cdr->print(std::cout);
+  }
+
+  std::vector<Real> y0(in.ncells+1); // One extra entry for the periodic image of cell 0.
+  for (Int i = 0, nc = p.ncells(); i < nc; ++i)
+    y0[i] = (p.xcp(i) < 0.4 || p.xcp(i) > 0.9 ? // sin outside [0.4, 0.9], rect inside.
+             InitialCondition::eval(InitialCondition::sin, p.xcp(i)) :
+             InitialCondition::eval(InitialCondition::rect, p.xcp(i)));
+  y0.back() = y0[0];
+
+  PyWriter w("out_transport1d");
+  w.write("xb", p.get_xb());
+  w.write("xcp", p.get_xcp());
+  w.write("y0", y0);
+
+  std::vector<Real> yf(in.ncells+1);
+  const Int nsteps = Int(3.17*in.ncells);
+  const Int ncycles = 1;
+
+  const char* names[] = {"yqlt", "ycaas"};
+  for (int ic = 0; ic < ncdrs; ++ic) { // Restart from y0 for each CDR and record its result.
+    std::copy(y0.begin(), y0.end(), yf.begin());
+    for (Int i = 0; i < ncycles; ++i)
+      p.cycle(nsteps, yf.data(), yf.data(), cdrs[ic]);
+    w.write(names[ic], yf);
+  }
+
+  std::copy(y0.begin(), y0.end(), yf.begin());
+  for (Int i = 0; i < ncycles; ++i)
+    p.cycle(nsteps, yf.data(), yf.data()); // No CDR given: cycle() falls back to Problem1D::run_caas.
+  w.write("ylcaas", yf);
+
+  return nerr;
+}
+
+} // namespace transport1d
+} // namespace test
+} // namespace cedr
diff --git a/cedr/cedr_test_randomized.cpp b/cedr/cedr_test_randomized.cpp
new file mode 100644
index 0000000..32ede9b
--- /dev/null
+++ b/cedr/cedr_test_randomized.cpp
@@ -0,0 +1,439 @@
+#include "cedr_test_randomized.hpp"
+
+namespace cedr {
+namespace test {
+
+std::string TestRandomized::Tracer::str () const {
+  // Summary: index, problem-type letters c/s/t, perturbation type, should-hold flags.
+  std::stringstream out;
+  out << "(ti " << idx << (problem_type & PT::conserve ? " c" : "")
+      << (problem_type & PT::shapepreserve ? " s" : "")
+      << (problem_type & PT::consistent ? " t" : "")
+      << " pt " << perturbation_type << " ssh " << safe_should_hold
+      << " lsh " << local_should_hold << ")";
+  return out.str();
+}
+
+TestRandomized::Writer::~Writer () {
+  // Finish the Python function begun in write_pre, if a file is open.
+  if (fh) fprintf(fh.get(), "  return s\n");
+}
+
+void TestRandomized::init_tracers_vector () {
+  typedef Tracer::PT PT;
+  static const Int pts[] = {
+    PT::conserve | PT::shapepreserve | PT::consistent,
+    PT::shapepreserve, // Test a noncanonical problem type.
+    PT::conserve | PT::consistent,
+    PT::consistent
+  };
+  // One tracer per (perturbation, problem type) pair, problem type varying fastest.
+  const Int nperturb = 6, nprob = 4;
+  for (Int k = 0; k < nperturb*nprob; ++k) {
+    const Int perturb = k / nprob, ti = k % nprob;
+    Tracer t;
+    t.idx = k;
+    t.problem_type = pts[ti];
+    t.perturbation_type = perturb;
+    t.safe_should_hold = true;
+    t.no_change_should_hold = perturb == 0;
+    t.local_should_hold = perturb < 4 && (t.problem_type & PT::shapepreserve);
+    t.write = perturb == 2 && ti == 2;
+    tracers_.push_back(t);
+  }
+}
+
+static Real urand () { return rand() / (1.0 + (Real) RAND_MAX); } // Value in [0, 1).
+
+void TestRandomized::generate_rho (Values& v) {
+  // Draw each cell's rhom from [0.5, 2).
+  Real* const rhom = v.rhom();
+  for (Int i = 0, n = v.ncells(); i < n; ++i)
+    rhom[i] = 0.5 + 1.5*urand();
+}
+
+void TestRandomized::generate_Q (const Tracer& t, Values& v) {
+  Real* const rhom = v.rhom();
+  Real* const Qm_min = v.Qm_min(t.idx);
+  Real* const Qm = v.Qm(t.idx);
+  Real* const Qm_max = v.Qm_max(t.idx);
+  Real* const Qm_prev = v.Qm_prev(t.idx);
+  for (Int i = 0, n = v.ncells(); i < n; ++i) {
+    // Draw q_min, then q_max >= q_min, then q between them, in that order.
+    const Real q_min = 0.1 + 0.8*urand();
+    const Real q_max = std::min<Real>(1, q_min + (0.9 - q_min)*urand());
+    const Real q = q_min + (q_max - q_min)*urand();
+    // Check correctness up to FP.
+    cedr_assert(q_min >= 0 &&
+                q_max <= 1 + 10*std::numeric_limits<Real>::epsilon() &&
+                q_min <= q && q <= q_max);
+    Qm_min[i] = q_min*rhom[i];
+    Qm_max[i] = q_max*rhom[i];
+    // Clamp against FP error; Qm_prev starts out equal to the unperturbed Qm.
+    Qm_prev[i] = Qm[i] = std::max<Real>(Qm_min[i], std::min(Qm_max[i], q*rhom[i]));
+  }
+}
+
+static void gen_rand_perm (const size_t n, std::vector<Int>& p) {
+  // Start from the identity, then apply n swaps of randomly chosen pairs.
+  p.resize(n);
+  for (size_t i = 0; i < n; ++i) p[i] = i;
+  for (size_t nswap = 0; nswap < n; ++nswap) {
+    const int a = urand()*n, b = urand()*n;
+    std::swap(p[a], p[b]);
+  }
+}
+
+// Permuting the Qm array, even just on a rank as long as there is > 1 cell,
+// produces a problem likely requiring considerable reconstruction, which
+// reconstruction assuredly satisfies the properties. But because this is a
+// local operation only, it doesn't test the 1 cell/rank case.
+void TestRandomized::permute_Q (const Tracer& t, Values& v) {
+  const Int N = v.ncells();
+  Real* const Qm = v.Qm(t.idx);
+  std::vector<Int> perm;
+  gen_rand_perm(N, perm);
+  // Read from a snapshot so that entries already overwritten are not reused.
+  const std::vector<Real> snapshot(Qm, Qm + N);
+  for (Int dst = 0; dst < N; ++dst)
+    Qm[dst] = snapshot[perm[dst]];
+}
+
+void TestRandomized
+::add_const_to_Q (const Tracer& t, Values& v,
+                  // Move 0 < alpha <= 1 of the way to the QLT or safety
+                  // feasibility bound.
+                  const Real& alpha,
+                  // Whether the modification should be done in a
+                  // mass-conserving way.
+                  const bool conserve_mass,
+                  // Only safety problem is feasible.
+                  const bool safety_problem) {
+  // Some of these reductions aren't used at present. Might add more test
+  // options later that use them.
+  Real rhom, Qm, Qm_max; {
+    Real Qm_sum_lcl[3] = {0};
+    for (Int i = 0; i < v.ncells(); ++i) {
+      Qm_sum_lcl[0] += v.rhom()[i];
+      Qm_sum_lcl[1] += v.Qm(t.idx)[i];
+      Qm_sum_lcl[2] += v.Qm_max(t.idx)[i];
+    }
+    Real Qm_sum_gbl[3] = {0};
+    mpi::all_reduce(*p_, Qm_sum_lcl, Qm_sum_gbl, 3, MPI_SUM);
+    rhom = Qm_sum_gbl[0]; Qm = Qm_sum_gbl[1]; Qm_max = Qm_sum_gbl[2];
+  }
+  Real Qm_max_safety = 0;
+  if (safety_problem) { // Every rank must join the all_reduce below, even one with no cells.
+    Real q_safety_lcl = std::numeric_limits<Real>::lowest(); // Neutral element for MPI_MAX.
+    for (Int i = 0; i < v.ncells(); ++i)
+      q_safety_lcl = std::max(q_safety_lcl, v.Qm_max(t.idx)[i] / v.rhom()[i]);
+    Real q_safety_gbl = 0;
+    mpi::all_reduce(*p_, &q_safety_lcl, &q_safety_gbl, 1, MPI_MAX);
+    Qm_max_safety = q_safety_gbl*rhom;
+  }
+  const Real dQm = safety_problem ?
+    ((Qm_max - Qm) + alpha * (Qm_max_safety - Qm_max)) / ncells_ :
+    alpha * (Qm_max - Qm) / ncells_;
+  for (Int i = 0; i < v.ncells(); ++i)
+    v.Qm(t.idx)[i] += dQm;
+  // Now permute Qm so that it's a little more interesting.
+  permute_Q(t, v);
+  // Adjust Qm_prev. Qm_prev is used to test the PT::conserve case, and also
+  // simply to record the correct total mass. The modification above modified
+  // Q's total mass. If conserve_mass, then Qm_prev needs to be made to sum to
+  // the same new mass. If ! conserve_mass, we want Qm_prev to be modified in
+  // an interesting way, so that PT::conserve doesn't trivially undo the mod
+  // that was made above when the root fixes the mass discrepancy.
+  const Real
+    relax = 0.9,
+    dQm_prev = (conserve_mass ? dQm :
+                (safety_problem ?
+                 ((Qm_max - Qm) + relax*alpha * (Qm_max_safety - Qm_max)) / ncells_ :
+                 relax*alpha * (Qm_max - Qm) / ncells_));
+  for (Int i = 0; i < v.ncells(); ++i)
+    v.Qm_prev(t.idx)[i] += dQm_prev;
+}
+
+void TestRandomized::perturb_Q (const Tracer& t, Values& v) {
+  // QLT is naturally mass conserving. But if QLT isn't being asked to impose
+  // mass conservation, then the caller better have a conservative
+  // method. Here, we model that by saying that Qm_prev and Qm should sum to
+  // the same mass.
+  const bool cm = ! (t.problem_type & Tracer::PT::conserve);
+  // For the edge cases, we cannot be exactly on the edge and still expect the
+  // q-limit checks to pass to machine precision. Thus, back away from the
+  // edge by an amount that bounds the error in the global mass due to FP,
+  // assuming each cell's mass is O(1).
+  const Real edg = 1 - ncells_*std::numeric_limits<Real>::epsilon();
+  const Int pt = t.perturbation_type;
+  // Type 0 (and any unlisted type) leaves Q alone, to test that QLT doesn't
+  // make any changes if none is needed.
+  if (pt == 1) {
+    permute_Q(t, v);
+  } else if (pt >= 2 && pt <= 5) {
+    // Even types move halfway to the bound and odd types to its edge; types 4
+    // and 5 select the safety problem.
+    const Real alpha = pt % 2 == 0 ? 0.5 : edg;
+    add_const_to_Q(t, v, alpha, cm, pt >= 4);
+  }
+}
+
+std::string TestRandomized::get_tracer_name (const Tracer& t) {
+  // The name is "t" followed by the tracer's index.
+  std::stringstream name; name << "t" << t.idx;
+  return name.str();
+}
+
+void TestRandomized::init_writer () { // Opens out_QLT.py on the root and gathers every rank's cell indices there.
+  if (p_->amroot()) {
+    w_ = std::make_shared<Writer>();
+    w_->fh = std::unique_ptr<FILE, cedr::util::FILECloser>(fopen("out_QLT.py", "w")); // NOTE(review): the fopen result is not checked here.
+    int n = gcis_.size();
+    w_->ngcis.resize(p_->size());
+    mpi::gather(*p_, &n, 1, w_->ngcis.data(), 1, p_->root()); // Number of cells on each rank.
+    w_->displs.resize(p_->size() + 1);
+    w_->displs[0] = 0;
+    for (size_t i = 0; i < w_->ngcis.size(); ++i)
+      w_->displs[i+1] = w_->displs[i] + w_->ngcis[i]; // Running sum: each rank's offset in the gathered arrays.
+    cedr_assert(w_->displs.back() == ncells_);
+    w_->gcis.resize(ncells_);
+    mpi::gatherv(*p_, gcis_.data(), gcis_.size(), w_->gcis.data(), w_->ngcis.data(),
+                 w_->displs.data(), p_->root());
+  } else { // Non-root ranks take part in the same two gathers with null receive arguments.
+    int n = gcis_.size();
+    mpi::gather(*p_, &n, 1, static_cast<int*>(nullptr), 0, p_->root());
+    Long* Lnull = nullptr;
+    const int* inull = nullptr;
+    mpi::gatherv(*p_, gcis_.data(), gcis_.size(), Lnull, inull, inull, p_->root());
+  }
+  write_inited_ = true;
+}
+
+void TestRandomized
+::gather_field (const Real* Qm_lcl, std::vector<Real>& Qm_gbl,
+                std::vector<Real>& wrk) {
+  if ( ! p_->amroot()) { // Non-root ranks only send; their receive-side arguments are null.
+    Real* no_buf = nullptr;
+    const int* no_ints = nullptr;
+    mpi::gatherv(*p_, Qm_lcl, gcis_.size(), no_buf, no_ints, no_ints, p_->root());
+    return;
+  }
+  Qm_gbl.resize(ncells_);
+  wrk.resize(ncells_);
+  mpi::gatherv(*p_, Qm_lcl, gcis_.size(), wrk.data(), w_->ngcis.data(),
+               w_->displs.data(), p_->root());
+  // Place each gathered value at the index recorded for it in w_->gcis.
+  for (Int k = 0; k < ncells_; ++k) Qm_gbl[w_->gcis[k]] = wrk[k];
+}
+
+void TestRandomized
+::write_field (const std::string& tracer_name, const std::string& field_name,
+               const std::vector<Real>& Qm) {
+  if ( ! p_->amroot()) return; // Only the root writes.
+  FILE* const out = w_->fh.get();
+  fprintf(out, "  s.%s.%s = [", tracer_name.c_str(), field_name.c_str());
+  for (size_t i = 0; i < Qm.size(); ++i) fprintf(out, "%1.15e, ", Qm[i]);
+  fprintf(out, "]\n");
+}
+
+void TestRandomized::write_pre (const Tracer& t, Values& v) {
+  if ( ! t.write) return;
+  std::vector<Real> f, wrk;
+  // Gather one local field onto the root and append it to the output file.
+  const auto dump = [&] (const std::string& owner, const char* field, const Real* lcl) {
+    gather_field(lcl, f, wrk);
+    write_field(owner, field, f);
+  };
+  if ( ! write_inited_) {
+    // First call: set up the writer, start the Python function, and record rhom.
+    init_writer();
+    if (w_)
+      fprintf(w_->fh.get(),
+              "def getsolns():\n"
+              "  class Struct:\n"
+              "    pass\n"
+              "  s = Struct()\n"
+              "  s.all = Struct()\n");
+    dump("all", "rhom", v.rhom());
+  }
+  const auto name = get_tracer_name(t);
+  if (w_) fprintf(w_->fh.get(), "  s.%s = Struct()\n", name.c_str());
+  dump(name, "Qm_min", v.Qm_min(t.idx));
+  dump(name, "Qm_orig", v.Qm_prev(t.idx));
+  dump(name, "Qm_pre", v.Qm(t.idx));
+  dump(name, "Qm_max", v.Qm_max(t.idx));
+}
+
+void TestRandomized::write_post (const Tracer& t, Values& v) {
+  if (t.write) { // Gather this tracer's current Qm and write it as field Qm_qlt.
+    std::vector<Real> Qm, wrk;
+    gather_field(v.Qm(t.idx), Qm, wrk);
+    write_field(get_tracer_name(t), "Qm_qlt", Qm);
+  }
+}
+
+Int TestRandomized // Check bound, no-change, and mass properties; returns the violations counted on this rank.
+::check (const std::string& cdr_name, const mpi::Parallel& p,
+         const std::vector<Tracer>& ts, const Values& v) {
+  static const bool details = false; // Set to true to print each violation through pr().
+  static const Real ulp3 = 3*std::numeric_limits<Real>::epsilon(); // Slack for the safety check.
+  Int nerr = 0;
+  std::vector<Real> lcl_mass(2*ts.size()), q_min_lcl(ts.size()), q_max_lcl(ts.size()); // lcl_mass: [prev, current] per tracer.
+  std::vector<Int> t_ok(ts.size(), 1), local_violated(ts.size(), 0);
+  for (size_t ti = 0; ti < ts.size(); ++ti) {
+    const auto& t = ts[ti];
+
+    cedr_assert(t.safe_should_hold);
+    const bool safe_only = ! t.local_should_hold; // Local bounds may be violated; only safety is checked.
+    const Int n = v.ncells();
+    const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx),
+      * Qm_max = v.Qm_max(t.idx), * Qm_prev = v.Qm_prev(t.idx);
+
+    q_min_lcl[ti] = 1; // Reduction seeds; these presume q = Qm/rhom lies in [0, 1] -- TODO confirm.
+    q_max_lcl[ti] = 0;
+    for (Int i = 0; i < n; ++i) {
+      const bool lv = (Qm[i] < Qm_min[i] || Qm[i] > Qm_max[i]); // Cell-local bound violation.
+      if (lv) local_violated[ti] = 1;
+      if ( ! safe_only && lv) {
+        // If this fails at ~ machine eps, check r2l_nl_adjust_bounds code in
+        // solve_node_problem.
+        if (details)
+          pr("check q " << t.str() << ": " << Qm[i] << " " <<
+             (Qm[i] < Qm_min[i] ? Qm[i] - Qm_min[i] : Qm[i] - Qm_max[i]));
+        t_ok[ti] = false;
+        ++nerr;
+      }
+      if (t.no_change_should_hold && Qm[i] != Qm_prev[i]) { // Exact (bitwise) equality is required.
+        if (details)
+          pr("Q should be unchanged but is not: " << Qm_prev[i] << " changed to " <<
+             Qm[i] << " in " << t.str());
+        t_ok[ti] = false;
+        ++nerr;
+      }
+      lcl_mass[2*ti    ] += Qm_prev[i]; // Mass before ...
+      lcl_mass[2*ti + 1] += Qm[i];      // ... and after, on this rank.
+      q_min_lcl[ti] = std::min(q_min_lcl[ti], Qm_min[i]/rhom[i]);
+      q_max_lcl[ti] = std::max(q_max_lcl[ti], Qm_max[i]/rhom[i]);
+    }
+  }
+
+  std::vector<Real> q_min_gbl(ts.size(), 0), q_max_gbl(ts.size(), 0);
+  mpi::all_reduce(p, q_min_lcl.data(), q_min_gbl.data(), q_min_lcl.size(), MPI_MIN); // Global extrema of the
+  mpi::all_reduce(p, q_max_lcl.data(), q_max_gbl.data(), q_max_lcl.size(), MPI_MAX); // bound ratios, on all ranks.
+
+  for (size_t ti = 0; ti < ts.size(); ++ti) {
+    // Check safety problem. If local_should_hold and it does, then the safety
+    // problem is by construction also solved (since it's a relaxation of the
+    // local problem).
+    const auto& t = ts[ti];
+    const bool safe_only = ! t.local_should_hold;
+    if (safe_only) {
+      const Int n = v.ncells();
+      const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx),
+        * Qm_max = v.Qm_max(t.idx);
+      const Real q_min = q_min_gbl[ti], q_max = q_max_gbl[ti];
+      for (Int i = 0; i < n; ++i) {
+        if (Qm[i] < q_min*rhom[i]*(1 - ulp3) || // Below the global lower bound, with ulp3 slack, or
+            Qm[i] > q_max*rhom[i]*(1 + ulp3)) { // above the global upper bound.
+          if (details)
+            pr("check q (safety) " << t.str() << ": " << q_min*rhom[i] << " "
+               << Qm_min[i] << " " << Qm[i] << " " << Qm_max[i] << " "
+               << q_max*rhom[i] << " | " << (Qm[i] < q_min*rhom[i] ?
+                                             Qm[i] - q_min*rhom[i] :
+                                             Qm[i] - q_max*rhom[i]));
+          t_ok[ti] = false;
+          ++nerr;
+        }
+      }        
+    }
+  }
+
+  std::vector<Real> glbl_mass(2*ts.size(), 0);
+  mpi::reduce(p, lcl_mass.data(), glbl_mass.data(), lcl_mass.size(), MPI_SUM,
+              p.root()); // The reduced values are only read on the root below.
+  std::vector<Int> t_ok_gbl(ts.size(), 0);
+  mpi::reduce(p, t_ok.data(), t_ok_gbl.data(), t_ok.size(), MPI_MIN, p.root()); // 0 if any rank failed.
+  // Right now we're not using these:
+  std::vector<Int> local_violated_gbl(ts.size(), 0);
+  mpi::reduce(p, local_violated.data(), local_violated_gbl.data(),
+              local_violated.size(), MPI_MAX, p.root());
+
+  if (p.amroot()) { // Mass check and FAIL reporting happen on the root only.
+    const Real tol = 1e3*std::numeric_limits<Real>::epsilon();
+    for (size_t ti = 0; ti < ts.size(); ++ti) {
+      // Check mass conservation.
+      const Real desired_mass = glbl_mass[2*ti], actual_mass = glbl_mass[2*ti+1],
+        rd = cedr::util::reldif(desired_mass, actual_mass);
+      const bool mass_failed = rd > tol;
+      if (mass_failed) {
+        ++nerr; // Counted on root only, so root's return value can exceed the others'.
+        t_ok_gbl[ti] = false;
+      }
+      if ( ! t_ok_gbl[ti]) {
+        std::cout << "FAIL " << cdr_name << ": " << ts[ti].str();
+        if (mass_failed) std::cout << " mass re " << rd;
+        std::cout << "\n";
+      }
+    }
+  }
+
+  return nerr;
+}
+  
+TestRandomized // Stores the arguments only; init() does the real setup.
+::TestRandomized (const std::string& name, const mpi::Parallel::Ptr& p,
+                  const Int& ncells, const bool verbose) // NOTE: verbose is currently unused.
+  : cdr_name_(name), p_(p), ncells_(ncells), write_inited_(false)
+{}
+
+void TestRandomized::init () { // Second-phase setup; it calls the subclass's virtual hooks.
+  init_numbering();      // Subclass hook: fill gcis_.
+  init_tracers_vector(); // Fill tracers_ before the CDR sees it.
+  init_tracers();        // Subclass hook: initialize the CDR's tracers from tracers_.
+}
+
+Int TestRandomized::run (const Int nrepeat, const bool write) { // Generate data, run the CDR, check the result.
+  const Int nt = tracers_.size(), nlclcells = gcis_.size(); // Tracer count; cells owned by this rank.
+
+  Values v(nt, nlclcells);
+  generate_rho(v);
+  for (const auto& t : tracers_) {
+    generate_Q(t, v);
+    perturb_Q(t, v);
+  }
+
+  if (write) // Dump the inputs before the CDR touches them.
+    for (const auto& t : tracers_)
+      write_pre(t, v);
+
+  CDR& cdr = get_cdr();
+  { // rhom is set once, outside the trial loop.
+    Real* rhom = v.rhom();
+    for (Int i = 0; i < nlclcells; ++i)
+      cdr.set_rhom(i, 0, rhom[i]);
+  }
+  for (Int trial = 0; trial <= nrepeat; ++trial) { // Note <=: run_impl is called nrepeat + 1 times.
+    for (Int ti = 0; ti < nt; ++ti) { // Every trial reloads the same inputs from v.
+      Real* Qm_min = v.Qm_min(ti), * Qm = v.Qm(ti), * Qm_max = v.Qm_max(ti),
+        * Qm_prev = v.Qm_prev(ti);
+      for (Int i = 0; i < nlclcells; ++i)
+        cdr.set_Qm(i, ti, Qm[i], Qm_min[i], Qm_max[i], Qm_prev[i]);
+    }
+
+    run_impl(trial);
+  }
+
+  for (Int ti = 0; ti < nt; ++ti) { // Copy the last trial's result back into v.
+    Real* Qm = v.Qm(ti);
+    for (Int i = 0; i < nlclcells; ++i)
+      Qm[i] = cdr.get_Qm(i, ti);
+  }
+
+  if (write)
+    for (const auto& t : tracers_)
+      write_post(t, v);
+  return check(cdr_name_, *p_, tracers_, v); // The error count from check().
+}
+
+} // namespace test
+} // namespace cedr
diff --git a/cedr/cedr_test_randomized.hpp b/cedr/cedr_test_randomized.hpp
new file mode 100644
index 0000000..dd4f54d
--- /dev/null
+++ b/cedr/cedr_test_randomized.hpp
@@ -0,0 +1,128 @@
+#ifndef INCLUDE_CEDR_TEST_RANDOMIZED_HPP
+#define INCLUDE_CEDR_TEST_RANDOMIZED_HPP
+
+#include "cedr_cdr.hpp"
+#include "cedr_mpi.hpp"
+#include "cedr_util.hpp"
+
+namespace cedr {
+namespace test {
+
+class TestRandomized { // Base class for randomized CDR tests; a subclass supplies the CDR and numbering.
+public:
+  TestRandomized(const std::string& cdr_name, const mpi::Parallel::Ptr& p,
+                 const Int& ncells, const bool verbose = false); // verbose is not used by the constructor.
+
+  // The subclass should call this, probably in its constructor.
+  void init();
+
+  Int run(const Int nrepeat = 1, const bool write=false); // Runs nrepeat + 1 trials; returns check()'s error count.
+
+private:
+  const std::string cdr_name_; // Label printed in FAIL messages.
+
+protected:
+  struct Tracer { // One test tracer: which problem it poses and which properties must hold.
+    typedef ProblemType PT;
+    
+    Int idx; // Tracer index into Values (used as v.Qm(idx), etc.).
+    Int problem_type;
+    Int perturbation_type;
+    bool no_change_should_hold, safe_should_hold, local_should_hold; // Properties verified by check().
+    bool write; // Whether write_pre/write_post output this tracer.
+
+    std::string str() const;
+
+    Tracer ()
+      : idx(-1), problem_type(-1), perturbation_type(-1), no_change_should_hold(false),
+        safe_should_hold(true), local_should_hold(true), write(false)
+    {}
+  };
+
+  struct Values { // Packed storage: [rhom | per tracer: Qm_min, Qm, Qm_max, Qm_prev], each ncells long.
+    Values (const Int ntracers, const Int ncells)
+      : ncells_(ncells), v_((4*ntracers + 1)*ncells)
+    {}
+    Int ncells () const { return ncells_; }
+    Real* rhom () { return v_.data(); }
+    Real* Qm_min  (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti    ); }
+    Real* Qm      (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 1); }
+    Real* Qm_max  (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 2); }
+    Real* Qm_prev (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 3); }
+    const Real* rhom () const { return const_cast<Values*>(this)->rhom(); } // Const overloads forward to the above.
+    const Real* Qm_min  (const Int& ti) const
+    { return const_cast<Values*>(this)->Qm_min (ti); }
+    const Real* Qm      (const Int& ti) const
+    { return const_cast<Values*>(this)->Qm     (ti); }
+    const Real* Qm_max  (const Int& ti) const
+    { return const_cast<Values*>(this)->Qm_max (ti); }
+    const Real* Qm_prev (const Int& ti) const
+    { return const_cast<Values*>(this)->Qm_prev(ti); }
+  private:
+    Int ncells_;
+    std::vector<Real> v_;
+  };
+
+  // For solution output, if requested.
+  struct Writer {
+    std::unique_ptr<FILE, cedr::util::FILECloser> fh; // Output file; closed by FILECloser.
+    std::vector<Int> ngcis;  // Number of i'th rank's gcis_ array.
+    std::vector<Long> gcis;  // Global cell indices packed by rank's gcis_ vector.
+    std::vector<int> displs; // Cumsum of above.
+    ~Writer();
+  };
+
+  const mpi::Parallel::Ptr p_;
+  const Int ncells_; // The ncells constructor argument; run() sizes local data by gcis_.size() instead.
+  // Global mesh entity IDs, 1-1 with reduction array index or QLT leaf node.
+  std::vector<Long> gcis_;
+  std::vector<Tracer> tracers_;
+
+  // Tell this class the CDR.
+  virtual CDR& get_cdr() = 0;
+
+  // Fill gcis_.
+  virtual void init_numbering() = 0;
+
+  // Using tracers_, the vector of Tracers, initialize the CDR's tracers.
+  virtual void init_tracers() = 0;
+
+  virtual void run_impl(const Int trial) = 0; // Called by run() for trial = 0, ..., nrepeat.
+
+private:
+  // For optional output.
+  bool write_inited_;
+  std::shared_ptr<Writer> w_; // Only on root.
+
+  void init_tracers_vector();
+
+  void add_const_to_Q(
+    const Tracer& t, Values& v,
+    // Move 0 < alpha <= 1 of the way to the QLT or safety feasibility bound.
+    const Real& alpha,
+    // Whether the modification should be done in a mass-conserving way.
+    const bool conserve_mass,
+    // Only safety problem is feasible.
+    const bool safety_problem);
+
+  void perturb_Q(const Tracer& t, Values& v);
+  void init_writer();
+  void gather_field(const Real* Qm_lcl, std::vector<Real>& Qm_gbl,
+                    std::vector<Real>& wrk);
+  void write_field(const std::string& tracer_name, const std::string& field_name,
+                   const std::vector<Real>& Qm);
+  void write_pre(const Tracer& t, Values& v);  // Output the fields before the CDR is applied.
+  void write_post(const Tracer& t, Values& v); // Output Qm after the CDR is applied.
+
+  static void generate_rho(Values& v);
+  static void generate_Q(const Tracer& t, Values& v);
+  static void permute_Q(const Tracer& t, Values& v);
+  static std::string get_tracer_name(const Tracer& t);
+  static Int check(const std::string& cdr_name, const mpi::Parallel& p,
+                   const std::vector<Tracer>& ts, const Values& v); // Returns the number of violations found.
+};
+
+} // namespace test
+} // namespace cedr
+
+#endif
diff --git a/cedr/cedr_util.cpp b/cedr/cedr_util.cpp
new file mode 100644
index 0000000..3854888
--- /dev/null
+++ b/cedr/cedr_util.cpp
@@ -0,0 +1,23 @@
+#include "cedr_util.hpp"
+
+namespace cedr {
+namespace util {
+
+bool eq (const std::string& a, const char* const b1, const char* const b2) {
+  // True if a is b1, is b2 (when b2 is given), or is b1 prefixed with a single '-'.
+  return a == b1 || (b2 != nullptr && a == b2) || a == "-" + std::string(b1);
+}
+
+Real urand () { return std::rand() / (static_cast<Real>(RAND_MAX) + 1.0); }
+
+Real reldif (const Real* a, const Real* b, const Int n) {
+  Real num = 0, den = 0; // 1-norm of a - b and of the reference a.
+  for (Int i = 0; i < n; ++i) {
+    num += std::abs(a[i] - b[i]);
+    den += std::abs(a[i]);
+  }
+  return num == 0 ? 0 : num/den; // Equal (or empty) input gives 0, not 0/0 = NaN.
+}
+
+}
+}
diff --git a/cedr/cedr_util.hpp b/cedr/cedr_util.hpp
new file mode 100644
index 0000000..87f5e2b
--- /dev/null
+++ b/cedr/cedr_util.hpp
@@ -0,0 +1,90 @@
+#ifndef INCLUDE_CEDR_UTIL_HPP
+#define INCLUDE_CEDR_UTIL_HPP
+
+#include <sstream>
+
+#include "cedr_kokkos.hpp"
+#include "cedr_mpi.hpp"
+
+namespace cedr {
+namespace util {
+
+template <typename T> KOKKOS_INLINE_FUNCTION constexpr
+T square (const T& x) { return x*x; }
+
+bool eq(const std::string& a, const char* const b1, const char* const b2 = 0);
+
+// Uniform rand in [0, 1).
+Real urand();
+
+#define pr(m) do {                                      \
+    int _pid_ = 0;                                      \
+    MPI_Comm_rank(MPI_COMM_WORLD, &_pid_);              \
+    std::stringstream _ss_;                             \
+    _ss_.precision(15);                                 \
+    _ss_ << "pid " << _pid_ << " " << m << std::endl;   \
+    std::cerr << _ss_.str();                            \
+  } while (0)
+#define pr0(m) do { /* Like pr, but only rank 0 prints. */ \
+    int _pid_ = 0; MPI_Comm_rank(MPI_COMM_WORLD, &_pid_); \
+    if (_pid_ != 0) break;                              \
+    std::stringstream _ss_;                             \
+    _ss_ << "pid " << _pid_ << " " << m << std::endl;   \
+    std::cerr << _ss_.str();                            \
+  } while (0)
+#define prc(m) pr(#m << " | " << (m))
+#define pr0c(m) pr0(#m << " | " << (m))
+#define puf(m) "(" << #m << " " << (m) << ")"
+#define pu(m) << " " << puf(m)
+template <typename T> // Print "name = [ v0 v1 ...];" with 15 significant digits, through pr().
+void prarr (const std::string& name, const T* const v, const size_t n) {
+  std::stringstream out;
+  out.precision(15);
+  out << name << " = [";
+  for (const T* it = v; it != v + n; ++it) out << " " << *it;
+  out << "];";
+  pr(out.str());
+}
+#define mprarr(m) cedr::util::prarr(#m, m.data(), m.size())
+
+#ifndef NDEBUG
+# define cedr_assert(condition) do {                                    \
+    if ( ! (condition)) {                                               \
+      std::stringstream _ss_;                                           \
+      _ss_ << __FILE__ << ":" << __LINE__ << ": FAIL:\n" << #condition  \
+        << "\n";                                                        \
+        throw std::logic_error(_ss_.str());                             \
+    }                                                                   \
+  } while (0)
+# define cedr_kernel_assert(condition) do {     \
+    if ( ! (condition))                         \
+      Kokkos::abort(#condition);                \
+  } while (0)
+#else
+# define cedr_assert(condition)
+# define cedr_kernel_assert(condition)
+#endif
+#define cedr_throw_if(condition, message) do {                          \
+    if (condition) {                                                    \
+      std::stringstream _ss_;                                           \
+      _ss_ << __FILE__ << ":" << __LINE__ << ": The condition:\n"       \
+           << #condition "\nled to the exception\n" << message << "\n"; \
+        throw std::logic_error(_ss_.str());                             \
+    }                                                                   \
+  } while (0)
+#define cedr_kernel_throw_if(condition, message) do {                   \
+    if (condition)                                                      \
+      Kokkos::abort(#condition " led to the exception\n" message);      \
+  } while (0)
+
+inline Real reldif (const Real a, const Real b) // |b - a|/max(|a|, |b|); 0 when a == b (also for 0, 0).
+{ return a == b ? 0 : std::abs(b - a)/std::max(std::abs(a), std::abs(b)); }
+
+Real reldif(const Real* a, const Real* b, const Int n);
+
+struct FILECloser { void operator() (FILE* fh) { fclose(fh); } }; // Deleter, e.g. for std::unique_ptr<FILE, FILECloser>.
+
+}
+}
+
+#endif
diff --git a/cedr/make_qltcpp.sh b/cedr/make_qltcpp.sh
new file mode 100644
index 0000000..c0267c7
--- /dev/null
+++ b/cedr/make_qltcpp.sh
@@ -0,0 +1,10 @@
+# Usage (the file names below are relative, so run it in the cedr directory): bash make_qltcpp.sh
+# Example build: mpicxx -Wall -pedantic -fopenmp -std=c++11 -I/home/ambradl/lib/kokkos/cpu/include qlt.cpp -L/home/ambradl/lib/kokkos/cpu/lib -lkokkos -ldl
+# Example run: OMP_PROC_BIND=false OMP_NUM_THREADS=2 mpirun -np 14 ./a.out -t
+# Concatenate the listed headers and then sources into qlt.cpp, each preceded by a "//>> <file>" marker.
+(for f in cedr_kokkos.hpp cedr.hpp cedr_mpi.hpp cedr_util.hpp cedr_cdr.hpp cedr_qlt.hpp cedr_caas.hpp cedr_caas_inl.hpp cedr_local.hpp cedr_mpi_inl.hpp cedr_local_inl.hpp cedr_qlt_inl.hpp cedr_test_randomized.hpp cedr_test.hpp cedr_util.cpp cedr_local.cpp cedr_mpi.cpp cedr_qlt.cpp cedr_caas.cpp cedr_test_randomized.cpp cedr_test_1d_transport.cpp cedr_test.cpp; do
+    echo "//>> $f"
+    cat $f
+    echo ""
+done) > qlt.cpp
+sed sV'#include "cedr'V'//#include "cedr'V -i qlt.cpp # Comment out the #include "cedr..." lines; the contents are already inlined.
diff --git a/siqk/CMakeLists.txt b/siqk/CMakeLists.txt
new file mode 100644
index 0000000..9ff299c
--- /dev/null
+++ b/siqk/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_executable (siqk_test siqk_test.cpp)
+set_target_properties (siqk_test PROPERTIES
+  COMPILE_FLAGS "${COMPOSE_COMPILE_FLAGS}"
+  LINK_FLAGS "${COMPOSE_LINK_FLAGS}")
+target_include_directories (siqk_test PRIVATE ${COMPOSE_INCLUDES})
+target_link_libraries (siqk_test PRIVATE ${COMPOSE_LIBRARIES})
+# Stage the test runner in this directory's build tree, where the tests below run by default.
+configure_file (siqk_runtests.py siqk_runtests.py)
+# The last argument (0 or 1) is forwarded to the runner; presumably it selects the area or cube case.
+add_test (NAME siqk-test-area
+  COMMAND python siqk_runtests.py $<TARGET_FILE:siqk_test> 0)
+add_test (NAME siqk-test-cube
+  COMMAND python siqk_runtests.py $<TARGET_FILE:siqk_test> 1)
diff --git a/siqk/readme.txt b/siqk/readme.txt
new file mode 100644
index 0000000..261c49a
--- /dev/null
+++ b/siqk/readme.txt
@@ -0,0 +1,13 @@
+For clarity, suppose your C++ compiler is g++-4.8 in what follows. But it
+can be something else.
+
+1. Get and install the standalone Kokkos TPL:
+
+$ git clone https://github.com/kokkos/kokkos.git
+$ ./kokkos/generate_makefile.bash --with-openmp --ldflags=-fPIC --prefix=/path/to/desired/installation --compiler=g++-4.8
+
+2. cp an existing make.inc.* file to one for your machine, say,
+make.inc.mymachine. Edit it with machine-specific information. Then
+    $ ln -s make.inc.mymachine make.inc
+    $ make -j8
+    $ ./siqk_runtests.py
diff --git a/siqk/siqk.hpp b/siqk/siqk.hpp
new file mode 100644
index 0000000..f71b94b
--- /dev/null
+++ b/siqk/siqk.hpp
@@ -0,0 +1,10 @@
+#ifndef INCLUDE_SIQK_HPP
+#define INCLUDE_SIQK_HPP
+
+#include "siqk_geometry.hpp"
+#include "siqk_search.hpp"
+#include "siqk_intersect.hpp"
+#include "siqk_quadrature.hpp"
+#include "siqk_sqr.hpp"
+
+#endif
diff --git a/siqk/siqk_defs.hpp b/siqk/siqk_defs.hpp
new file mode 100644
index 0000000..9c3cbd0
--- /dev/null
+++ b/siqk/siqk_defs.hpp
@@ -0,0 +1,230 @@
+#ifndef INCLUDE_SIQK_DEFS_HPP
+#define INCLUDE_SIQK_DEFS_HPP
+
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+
+#include <Kokkos_Core.hpp>
+
+#ifdef SIQK_TIME
+# include <unistd.h>
+# include <sys/time.h>
+# include <sys/resource.h>
+#endif
+
+// Always want this for GPU.
+#define SIQK_NONRECURSIVE
+
+#ifdef KOKKOS_HAVE_CUDA
+# define KOKKOS_CONSTANT __constant__ __device__
+#else
+# define KOKKOS_CONSTANT
+#endif
+
+namespace siqk {
+namespace ko = Kokkos;
+#define pr(m) do {                              \
+    std::stringstream _ss_;                     \
+    _ss_ << m << std::endl;                     \
+    std::cerr << _ss_.str();                    \
+  } while (0)
+#define prc(m) pr(#m << " | " << (m))
+#define puf(m)"(" << #m << " " << (m) << ")"
+#define pu(m) << " " << puf(m)
+template<typename T> // Debug print to stderr: name, ": ", then " v[i]" for each entry, then a newline.
+static void prarr (const std::string& name, const T* const v, const size_t n) {
+  std::cerr << name << ": ";
+  for (const T* it = v; it != v + n; ++it) std::cerr << " " << *it;
+  std::cerr << "\n";
+}
+
+#define SIQK_THROW_IF(condition, message) do {                          \
+    if (condition) {                                                    \
+      std::stringstream _ss_;                                           \
+      _ss_ << __FILE__ << ":" << __LINE__ << ": The condition:\n" << #condition \
+        "\nled to the exception\n" << message << "\n";                  \
+      throw std::logic_error(_ss_.str());                               \
+    }                                                                   \
+  } while (0)
+
+#define SIQK_STDERR_IF(condition, message) do { \
+  try { SIQK_THROW_IF(condition, message); } \
+  catch (const std::logic_error& e) { std::cerr << e.what(); } \
+} while (0)
+
+#ifdef SIQK_TIME
+static timeval tic () { // Start a timer: returns the current gettimeofday() time.
+  timeval t;
+  gettimeofday(&t, 0);
+  return t;
+}
+static double calc_et (const timeval& t1, const timeval& t2) { // Elapsed time t2 - t1, in seconds.
+  static const double us = 1.0e6; // Microseconds per second.
+  return (t2.tv_sec * us + t2.tv_usec - t1.tv_sec * us - t1.tv_usec) / us;
+}
+static double toc (const timeval& t1) { // Seconds elapsed since the tic() value t1.
+  Kokkos::fence(); // Calls Kokkos::fence() before the end time is read.
+  timeval t;
+  gettimeofday(&t, 0);
+  return calc_et(t1, t);
+}
+static double get_memusage () { // Peak resident set size of this process.
+  static const double scale = 1.0 / (1 << 10); // KB -> MB; assumes ru_maxrss is in kilobytes (Linux; macOS reports bytes).
+  rusage ru;
+  getrusage(RUSAGE_SELF, &ru);
+  return ru.ru_maxrss*scale;
+}
+#else // Timing disabled: same names, trivial bodies, so call sites compile unchanged.
+inline int tic () { return 0; }
+inline double toc (const int&) { return 0; }
+inline double get_memusage () { return 0; }
+#endif
+static void print_times (const std::string& name, const double* const parts, // Print the total, memory use, and each part.
+                         const int nparts) {
+#ifdef SIQK_TIME // Without SIQK_TIME both overloads are no-ops.
+  double total = 0; for (int i = 0; i < nparts; ++i) total += parts[i];
+  printf("%20s %1.3e s %7.1f MB", name.c_str(), total, get_memusage());
+  for (int i = 0; i < nparts; ++i) printf(" %1.3e s", parts[i]);
+  printf("\n");
+#endif
+}
+static void print_times (const std::string& name, const double total) { // Same, for a single total time.
+#ifdef SIQK_TIME
+   printf("%20s %1.3e s %5.1f MB\n", name.c_str(), total, get_memusage());
+#endif
+}
+
+KOKKOS_INLINE_FUNCTION static void error (const char* const msg) // Abort through Kokkos with msg.
+{ ko::abort(msg); }
+
+KOKKOS_INLINE_FUNCTION static void message (const char* const msg) // Print msg and a newline with printf.
+{ printf("%s\n", msg); }
+
+typedef int Int;
+typedef double Real;
+
+#ifdef KOKKOS_HAVE_CUDA
+typedef ko::LayoutLeft Layout;
+#else
+typedef ko::LayoutRight Layout;
+#endif
+
+// SIQK's array types.
+typedef ko::View<Real*[3], Layout> Vec3s;
+typedef ko::View<const Real*[3], Layout> ConstVec3s;
+typedef ko::View<Real*[6], Layout> Vec6s;
+typedef ko::View<const Real*[6], Layout> ConstVec6s;
+typedef ko::View<Real*[3], ko::LayoutRight, ko::MemoryTraits<ko::Unmanaged> > RawVec3s;
+typedef ko::View<const Real*[3], ko::LayoutRight, ko::MemoryTraits<ko::Unmanaged> > RawConstVec3s;
+typedef ko::View<Real*, ko::LayoutRight, ko::MemoryTraits<ko::Unmanaged> > RawArray;
+typedef ko::View<const Real*, ko::LayoutRight, ko::MemoryTraits<ko::Unmanaged> > RawConstArray;
+typedef ko::View<Int**, Layout> Idxs;
+typedef ko::View<const Int**, Layout> ConstIdxs;
+typedef ko::View<Int*[8], Layout> Nodes;
+typedef ko::View<const Int*[8], Layout> ConstNodes;
+
+// Decorator for a View. UnmanagedView<ViewType> gives the same view as
+// ViewType, except the memory is unmanaged.
+template <typename ViewT>
+using UnmanagedView = ko::View<
+  typename ViewT::data_type, typename ViewT::array_layout,
+  typename ViewT::device_type, ko::MemoryTraits<ko::Unmanaged> >;
+
+// Get the host or device version of the array.
+template <typename VT, typename ES> struct InExeSpace {
+  typedef VT type;
+};
+template <typename VT> struct InExeSpace<VT, ko::HostSpace> {
+  typedef typename VT::HostMirror type;
+};
+
+#ifdef KOKKOS_HAVE_CUDA
+// A 1D slice of an array.
+template <typename VT> KOKKOS_FORCEINLINE_FUNCTION
+ko::View<typename VT::value_type*, ko::LayoutStride, typename VT::device_type,
+         ko::MemoryTraits<ko::Unmanaged> >
+slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); }
+// An explicitly const 1D slice of an array.
+template <typename VT> KOKKOS_FORCEINLINE_FUNCTION
+ko::View<typename VT::const_value_type*, ko::LayoutStride, typename VT::device_type,
+         ko::MemoryTraits<ko::Unmanaged> >
+const_slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); }
+#else // Non-CUDA build: row i as a raw pointer; the arithmetic assumes contiguous rows of length extent(1).
+template <typename VT> KOKKOS_FORCEINLINE_FUNCTION
+typename VT::value_type*
+slice (const VT& v, Int i) { return v.data() + v.extent(1)*i; }
+// Pointer-to-const variant of the same row access.
+template <typename VT> KOKKOS_FORCEINLINE_FUNCTION
+typename VT::const_value_type*
+const_slice (const VT& v, Int i) { return v.data() + v.extent(1)*i; }
+#endif
+
+// Number of slices in a 2D array, where each row is a slice.
+template <typename A2D> KOKKOS_FORCEINLINE_FUNCTION
+Int nslices (const A2D& a) { return static_cast<Int>(a.extent(0)); }
+
+// Number of entries in a 2D array's row.
+template <typename A2D> KOKKOS_FORCEINLINE_FUNCTION
+Int szslice (const A2D& a) { return static_cast<Int>(a.extent(1)); }
+
+template <typename V, typename CV>
+KOKKOS_INLINE_FUNCTION
+static void copy (V dst, CV src, const Int n) {
+  for (Int i = 0; i < n; ++i) dst[i] = src[i];
+}
+
+template <typename DV, typename SV> // Resize d to s's extents, then deep_copy s into d.
+void resize_and_copy (DV& d, const SV& s,
+                      typename std::enable_if<DV::rank == 1>::type* = 0) { // Overload for rank-1 views.
+  ko::resize(d, nslices(s));
+  ko::deep_copy(d, s);
+}
+
+template <typename DV, typename SV>
+void resize_and_copy (
+  DV& d, const SV& s,
+  typename std::enable_if<DV::rank == 2 && DV::rank_dynamic == 1>::type* = 0) // Rank 2, one runtime extent.
+{
+  ko::resize(d, nslices(s)); // Only the first extent is passed to resize.
+  ko::deep_copy(d, s);
+}
+
+template <typename DV, typename SV>
+void resize_and_copy (
+  DV& d, const SV& s,
+  typename std::enable_if<DV::rank == 2 && DV::rank_dynamic == 2>::type* = 0) // Rank 2, both extents runtime.
+{
+  ko::resize(d, nslices(s), szslice(s));
+  ko::deep_copy(d, s);
+}
+
+template <typename DV, typename SA> // Fill view d with the first n entries of the indexable s.
+void hm_resize_and_copy (DV& d, const SA& s, const Int n) {
+  ko::resize(d, n);
+  auto d_hm = ko::create_mirror_view(d); // Mirror view of d, written with plain indexing below.
+  for (Int i = 0; i < n; ++i) d_hm[i] = s[i];
+  ko::deep_copy(d, d_hm); // Copy the filled mirror into d.
+}
+
+// GPU-friendly replacements for std::min/max.
+template <typename T> KOKKOS_INLINE_FUNCTION
+const T& min (const T& a, const T& b) { return a < b ? a : b; }
+template <typename T> KOKKOS_INLINE_FUNCTION
+const T& max (const T& a, const T& b) { return a > b ? a : b; }
+template <typename T> KOKKOS_INLINE_FUNCTION
+void swap (T& a, T&b) {
+  T tmp = a;
+  a = b;
+  b = tmp;
+}
+template <typename T> KOKKOS_INLINE_FUNCTION constexpr T square (const T& x) { return x*x; }
+
+template<typename T> KOKKOS_INLINE_FUNCTION // Returns 1, -1, or 0 (converted to T) for a > 0, a < 0, otherwise.
+T sign (const T& a) { return a > 0 ? 1 : (a < 0 ? -1 : 0); }
+
+} // namespace siqk
+
+#endif // INCLUDE_SIQK_DEFS_HPP
diff --git a/siqk/siqk_geometry.hpp b/siqk/siqk_geometry.hpp
new file mode 100644
index 0000000..9ad9ecd
--- /dev/null
+++ b/siqk/siqk_geometry.hpp
@@ -0,0 +1,310 @@
+#ifndef INCLUDE_SIQK_GEOMETRY_HPP
+#define INCLUDE_SIQK_GEOMETRY_HPP
+
+#include "siqk_defs.hpp"
+#include "siqk_quadrature.hpp"
+
+namespace siqk {
+
+// Vectors and points are 2D. Thus, if you're working on planes in 3D, project
+// to a 2D space before calling these.
+struct PlaneGeometry {
+  template <typename V> KOKKOS_INLINE_FUNCTION
+  static void scale (const Real& a, V v) {
+    v[0] *= a; v[1] *= a;
+  }
+  template <typename CV> KOKKOS_INLINE_FUNCTION
+  static Real dot_c_amb (const CV c, const CV a, const CV b) {
+    return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]);
+  }
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static void combine (const CV u, const CV v, const Real& a, V x) {
+    const Real& oma = 1 - a;
+    x[0] = oma*u[0] + a*v[0];
+    x[1] = oma*u[1] + a*v[1];
+  }
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static void axpy (const Real& a, const CV x, V y) {
+    y[0] += a*x[0];
+    y[1] += a*x[1];
+  }
+
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static void edge_normal (const CV e1, const CV e2, V en) {
+    en[0] = e1[1] - e2[1];
+    en[1] = e2[0] - e1[0];
+  }
+
+  template <typename CV> KOKKOS_INLINE_FUNCTION
+  static bool inside (const CV v, const CV e1, const CV en) {
+    return dot_c_amb(en, v, e1) >= 0;
+  }
+
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static void intersect (const CV v1, const CV v2, const CV e1, const CV en, // Point where v1->v2 meets the edge line.
+                         V intersection) {
+    Real a; { // a parameterizes the segment: v(a) = (1 - a) v1 + a v2.
+      const Real
+        num = dot_c_amb(en, e1, v1), // en' (e1 - v1)
+        den = dot_c_amb(en, v2, v1); // en' (v2 - v1)
+      a = num == 0 || den == 0 ? 0 : num/den; // Zero numerator or denominator: use v1.
+      a = a < 0 ? 0 : a > 1 ? 1 : a; // Clamp so the result stays on the segment.
+    }
+    combine(v1, v2, a, intersection);
+  }
+
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static bool output (const CV v, Int& no, const V vo) {
+#ifdef SIQK_DEBUG
+    if (no >= nslices(vo)) {
+      std::stringstream ss;
+      ss << "output: No room in vo; vo.n() is " << nslices(vo) << " but no is "
+         << no << "\n";
+      message(ss.str().c_str());
+    }
+#endif
+    if (no >= nslices(vo)) return false;
+    vo(no,0) = v[0];
+    vo(no,1) = v[1];
+    ++no;
+    return true;
+  }
+
+  //todo Handle non-convex case.
+  template <typename CV2s>
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_area (const TriangleQuadrature& , const CV2s& v,
+                         const Int n) {
+    return calc_area_formula(v, n);
+  }
+
+  template <typename CV2s>
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_area_formula (const CV2s& v, const Int n) { // Area of the n-vertex polygon v.
+    Real area = 0;
+    for (Int i = 1, ilim = n - 1; i < ilim; ++i) // Triangle fan (0, i, i+1); empty when n < 3.
+      area += calc_tri_jacobian(slice(v,0), slice(v,i), slice(v,i+1)); // Signed: twice the triangle's area.
+    return 0.5*area;
+  }
+
+  template <typename CV, typename CA>
+  KOKKOS_INLINE_FUNCTION
+  static void bary2coord (const CV v1, const CV v2, const CV v3, const CA alpha,
+                          Real u[2]) {
+    for (Int k = 0; k < 2; ++k) u[k] = 0;
+    axpy(alpha[0], v1, u);
+    axpy(alpha[1], v2, u);
+    axpy(alpha[2], v3, u);
+  }
+
+  template <typename CV>
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_tri_jacobian (const CV v1, const CV v2, const CV v3) {
+    Real r1[2], r2[2];
+    r1[0] = v2[0] - v1[0];
+    r1[1] = v2[1] - v1[1];
+    r2[0] = v3[0] - v1[0];
+    r2[1] = v3[1] - v1[1];
+    const Real a = r1[0]*r2[1] - r1[1]*r2[0];
+    return a;
+  }
+};
+
+// All inputs and outputs are relative to the unit-radius sphere. Vectors and
+// points are 3D.
+struct SphereGeometry {
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static void cross (const CV a, const CV b, V c) {
+    c[0] = a[1]*b[2] - a[2]*b[1];
+    c[1] = a[2]*b[0] - a[0]*b[2];
+    c[2] = a[0]*b[1] - a[1]*b[0];
+  }
+  template <typename CV> KOKKOS_INLINE_FUNCTION
+  static Real dot (const CV a, const CV b) {
+    return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
+  }
+  template <typename CV> KOKKOS_INLINE_FUNCTION
+  static Real norm2 (const CV v) {
+    return dot(v, v);
+  }
+  template <typename V> KOKKOS_INLINE_FUNCTION
+  static void scale (const Real& a, V v) {
+    v[0] *= a; v[1] *= a; v[2] *= a;
+  }
+  template <typename V> KOKKOS_INLINE_FUNCTION
+  static void normalize (V v) {
+    scale(1.0/std::sqrt(norm2(v)), v);
+  }
+  template <typename CV> KOKKOS_INLINE_FUNCTION
+  static Real dot_c_amb (const CV c, const CV a, const CV b) {
+    return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]) + c[2]*(a[2] - b[2]);
+  }
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static void axpy (const Real& a, const CV x, V y) {
+    y[0] += a*x[0];
+    y[1] += a*x[1];
+    y[2] += a*x[2];
+  }
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static void axpbyz (const Real& a, const CV x, const Real& b, const CV y,
+                      V z) {
+    z[0] = a*x[0] + b*y[0];
+    z[1] = a*x[1] + b*y[1];
+    z[2] = a*x[2] + b*y[2];
+  }
+  template <typename V, typename CV> KOKKOS_INLINE_FUNCTION
+  static void copy (V d, const CV s) {
+    d[0] = s[0];
+    d[1] = s[1];
+    d[2] = s[2];
+  }
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static void combine (const CV u, const CV v, const Real& a, V x) {
+    const Real& oma = 1 - a;
+    x[0] = oma*u[0] + a*v[0];
+    x[1] = oma*u[1] + a*v[1];
+    x[2] = oma*u[2] + a*v[2];
+  }
+
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static void edge_normal (const CV a, const CV b, V en) {
+    cross(a, b, en);
+    normalize(en);
+  }
+
+  // Is v inside the line anchored at a with inward-facing normal n?
+  template <typename CV> KOKKOS_INLINE_FUNCTION
+  static bool inside (const CV v, const CV a, const CV n) {
+    return dot_c_amb(n, v, a) >= 0;
+  }
+
+  /* Let
+       en = edge normal
+       e1 = edge starting point
+       d = en' e1
+       v(a) = (1 - a) v1 + a v2.
+     Solve en' v(a) = d for a:
+       a = (en' (e1 - v1)) / (en' (v2 - v1)).
+     Then uvec(v(a)) is the intersection point on the unit sphere. Assume
+     intersection exists. (Already filtered by 'inside'.)
+  */
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static void intersect (const CV v1, const CV v2, const CV e1, const CV en,
+                         V intersection) {
+    Real a; {
+      const Real
+        num = dot_c_amb(en, e1, v1), // en' (e1 - v1)
+        den = dot_c_amb(en, v2, v1); // en' (v2 - v1)
+      a = num == 0 || den == 0 ? 0 : num/den; // Zero numerator or denominator: use v1.
+      a = a < 0 ? 0 : a > 1 ? 1 : a; // Clamp so v(a) stays between v1 and v2.
+    }
+    combine(v1, v2, a, intersection);
+    normalize(intersection); // uvec(v(a)): scale the chord point to unit length.
+  }
+
+  template <typename CV, typename V> KOKKOS_INLINE_FUNCTION
+  static bool output (const CV v, Int& no, V vo) {
+#ifdef SIQK_DEBUG
+    if (no >= nslices(vo)) {
+      std::stringstream ss;
+      ss << "output: No room in vo; vo.n() is " << nslices(vo) << " but no is "
+         << no << "\n";
+      message(ss.str().c_str());
+    }
+#endif
+    if (no >= nslices(vo)) return false;
+    vo(no,0) = v[0];
+    vo(no,1) = v[1];
+    vo(no,2) = v[2];
+    ++no;
+    return true;
+  }
+
+  //todo Handle non-convex case.
+  // This uses a terrible formula, but it's just for testing.
+  template <typename CV3s>
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_area_formula (const CV3s& v, const Int n) { // Sum of spherical triangle areas over a fan.
+    Real area = 0;
+    for (Int i = 1, ilim = n - 1; i < ilim; ++i) { // Triangle (0, i, i+1).
+      const Real a = calc_arc_length(slice(v,0), slice(v,i)); // The three side lengths, as arcs.
+      const Real b = calc_arc_length(slice(v,i), slice(v,i+1));
+      const Real c = calc_arc_length(slice(v,i+1), slice(v,0));
+      const Real s = 0.5*(a + b + c); // Semi-perimeter.
+      const Real d = (std::tan(0.5*s)*std::tan(0.5*(s-a))*
+                      std::tan(0.5*(s-b))*std::tan(0.5*(s-c)));
+      if (d <= 0) continue; // Skip triangles for which the product is not positive.
+      area += 4*std::atan(std::sqrt(d)); // L'Huilier: tan(E/4)^2 = d, where E is the triangle's area.
+    }
+    return area;
+  }
+  template <typename CV> KOKKOS_INLINE_FUNCTION // Angle between a and b; both are expected to be unit vectors.
+  static Real calc_arc_length (const CV a, const CV b) {
+    const Real d = dot(a, b);
+    if (d >= 1) return 0; // Roundoff can push d outside [-1, 1], where acos is NaN; clamp both ends.
+    return std::acos(d < -1 ? Real(-1) : d);
+  }
+
+  template <typename CV3s>
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_area (const TriangleQuadrature& q, const CV3s& v, // Area of the n-vertex polygon v by quadrature.
+                         const Int n) {
+    Real area = 0, u[3]; // u is scratch for calc_tri_jacobian's sphere point.
+    for (Int i = 1, ilim = n - 1; i < ilim; ++i) { // Triangle fan (0, i, i+1).
+      Real a = 0;
+      RawConstVec3s coord;
+      RawConstArray weight;
+      q.get_coef(8, coord, weight); // Presumably 8 selects the quadrature order -- confirm in TriangleQuadrature.
+      for (Int k = 0, klim = nslices(coord); k < klim; ++k) {
+        const Real jac = calc_tri_jacobian(slice(v,0), slice(v,i), slice(v,i+1),
+                                           slice(coord, k), u);
+        a += weight[k]*jac; // Weighted sum of the Jacobian over the quadrature points.
+      }
+      area += 0.5*a;
+    }
+    return area;
+  }
+
+  template <typename CV, typename CA>
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_tri_jacobian (const CV v1, const CV v2, const CV v3,
+                                 const CA alpha, Real u[3]) {
+    // V(:,i) is vertex i of the spherical triangle on the unit sphere. The
+    // coefs
+    //     alpha = [a1, a2, 1 - a1 - a2]'
+    //           = [1 0; 0 1; -1 -1] [a1, a2]'
+    //           = alpha_a a
+    // (barycentric coords) give the location
+    //     v = V alpha
+    // on the planar triangle, and u = uvec(v) is the point on the unit sphere.
+    //   For a planar tri in 3D, the jacobian is
+    //     v_a = v_alpha alpha_a
+    //         = V [1 0; 0 1; -1 -1]
+    //     J = norm(cross(v_a(:,1), v_a(:,2))).
+    // For a spherical tri with the same vertices,
+    //     u = v/(v' v)^{1/2}
+    //     u_a = u_alpha alpha_a
+    //         = (v'v)^{-1/2} (I - u u') V alpha_a
+    //         = (v'v)^{-1/2} (I - u u') v_a
+    //     J = norm(cross(u_a(:,1), u_a(:,2))).
+    for (Int k = 0; k < 3; ++k) u[k] = 0;
+    axpy(alpha[0], v1, u);
+    axpy(alpha[1], v2, u);
+    axpy(alpha[2], v3, u);
+    const auto oovn = 1/std::sqrt(norm2(u));
+    scale(oovn, u);
+    Real u_a[3][3];
+    axpbyz(1, v1, -1, v3, u_a[0]);
+    axpbyz(1, v2, -1, v3, u_a[1]);
+    for (int i = 0; i < 2; ++i) {
+      axpy(-dot(u, u_a[i]), u, u_a[i]);
+      scale(oovn, u_a[i]);
+    }
+    cross(u_a[0], u_a[1], u_a[2]);
+    return std::sqrt(norm2(u_a[2]));
+  }
+};
+
+} // namespace siqk
+
+#endif // INCLUDE_SIQK_GEOMETRY_HPP
diff --git a/siqk/siqk_intersect.hpp b/siqk/siqk_intersect.hpp
new file mode 100644
index 0000000..6fcde7b
--- /dev/null
+++ b/siqk/siqk_intersect.hpp
@@ -0,0 +1,338 @@
+#ifndef INCLUDE_SIQK_INTERSECT_HPP
+#define INCLUDE_SIQK_INTERSECT_HPP
+
+#include "siqk_defs.hpp"
+#include "siqk_geometry.hpp"
+#include "siqk_search.hpp"
+#include "siqk_quadrature.hpp"
+
+namespace siqk {
+
+// Sutherland-Hodgman polygon clipping algorithm. Follow Foley, van Dam,
+// Feiner, Hughes Fig 3.49.
+namespace sh {
+/* A mesh is described by the following arrays:
+       p: 3 x #nodes, the array of vertices.
+       e: max(#verts) x #elems, the array of element base-0 indices.
+       nml: 3 x #edges, the array of edge normals.
+       en: max(#verts) x #elems, the array of edge-normal base-0 indices.
+     e. e indexes p. e(i,j) == -1 in column j indicates that i:end are not used.
+     nml. As a mesh is refined, cancellation error makes an edge normal based
+   off of an element's vertices increasingly inaccurate. Roughly, if an edge
+   subtends angle phi of the sphere, -log10(phi/(2 pi)) digits are lost in the
+   edge normal. Therefore, we compute edge normals offline, since in certain
+   meshes, they can be computed by an accurate means. E.g., in a cubed-sphere
+   mesh, the whole line of a square face can be used to compute the edge
+   normal. Furthermore, there are far fewer unique edge normals than edges.
+ */
+template <typename ES = ko::DefaultExecutionSpace>
+struct Mesh {
+  typename InExeSpace<ConstVec3s, ES>::type p, nml;
+  typename InExeSpace<ConstIdxs, ES>::type e, en;
+
+  Mesh () {}
+  // Fill each array from host mesh m by way of a non-const temporary.
+  Mesh (const Mesh<ko::HostSpace>& m) {
+    typedef typename InExeSpace<Vec3s, ES>::type V3;
+    typedef typename InExeSpace<Idxs, ES>::type Ix;
+    { V3 t; resize_and_copy(t, m.p); p = t; }
+    { V3 t; resize_and_copy(t, m.nml); nml = t; }
+    { Ix t; resize_and_copy(t, m.e); e = t; }
+    { Ix t; resize_and_copy(t, m.en); en = t; }
+  }
+};
+
+// Generally not a user routine. Clips the polygon vi(:,0:ni-1) against one
+// clip edge, emitting the clipped vertex list through geo::output(., no, vo).
+// Returns false as soon as geo::output reports failure, and true otherwise.
+template <typename geo, typename CV3s, typename V3s, typename CV>
+KOKKOS_INLINE_FUNCTION
+bool clip_against_edge (
+  // Input vertex list.
+  const CV3s& vi, const Int ni,
+  // Output vertex list.
+  V3s& vo, Int& no,
+  // One point of the clip edge.
+  const CV ce1,
+  // Clip edge's inward-facing normal.
+  const CV cen)
+{
+  Real x[3]; // Crossing point of segment (prev, cur) with the clip edge.
+  no = 0;
+  // Start with the closing segment, from the last vertex to the first.
+  auto prev = const_slice(vi, ni-1);
+  for (Int j = 0; j < ni; ++j) {
+    auto cur = const_slice(vi,j);
+    const bool cur_in = geo::inside(cur, ce1, cen);
+    const bool prev_in = geo::inside(prev, ce1, cen);
+    if (cur_in != prev_in) {
+      // The segment crosses the clip edge, so emit the crossing point first.
+      geo::intersect(prev, cur, ce1, cen, x);
+      if ( ! geo::output(x, no, vo)) return false;
+    }
+    // Keep the current vertex only if it is on the inward side.
+    if (cur_in && ! geo::output(cur, no, vo)) return false;
+    prev = cur;
+  }
+  return true;
+}
+
+// Efficient user routine that uses the mesh data structure.
+//todo An optimization would be to have 2 clip_against_edge routines. One would
+// handle the special case of the first vertex list being in (p,e) format.
+template <typename geo, typename MeshT, typename CV3s, typename V3s>
+KOKKOS_INLINE_FUNCTION
+bool clip_against_poly (
+  // Clip mesh. m.e(:,cp_e) is the element, and m.en(:,cp_e) is the
+  // corresponding list of normal indices.
+  const MeshT& m, const Int cp_e,
+  // A list of vertices describing the polygon to clip. The vertices must be in
+  // a convention-determined order, such as CCW. vi(:,0:ni-1) are valid entries.
+  const CV3s& vi, const Int ni,
+  // On output, vo(:,0:no-1) are vertices of the clipped polygon. no is 0 if
+  // there is no intersection.
+  V3s& vo, Int& no,
+  // Workspace. Both vo and wrk must be large enough to hold all generated
+  // vertices. If they are not, false is returned.
+  V3s& wrk)
+{
+  const auto e = slice(m.e, cp_e);
+  const auto en = slice(m.en, cp_e);
+
+  Int nv = szslice(m.e); // Number of vertices in clip polygon.
+  while (nv > 0 && e[nv-1] == -1) --nv; // Stops at 0 if every entry is -1.
+
+  no = 0;
+  // An element with no valid vertices has no clip edges: no intersection.
+  if (nv <= 0) return true;
+
+  // Ping-pong between the two buffers, one clip edge per pass. The first
+  // destination is chosen so that the final pass writes the caller's vo.
+  V3s* bufs[] = { &vo, &wrk };
+  Int cnts[] = { 0, 0 };
+  Int dst = nv % 2 == 0 ? 1 : 0;
+
+  if ( ! clip_against_edge<geo>(vi, ni, *bufs[dst], cnts[dst],
+                                const_slice(m.p, e[0]),
+                                const_slice(m.nml, en[0])))
+    return false;
+  if ( ! cnts[dst]) return true;
+
+  for (Int ie = 1; ie < nv; ++ie) {
+    const Int src = dst;
+    dst = 1 - src;
+    if ( ! clip_against_edge<geo>(*bufs[src], cnts[src], *bufs[dst], cnts[dst],
+                                  const_slice(m.p, e[ie]),
+                                  const_slice(m.nml, en[ie])))
+      return false;
+    if ( ! cnts[dst]) return true;
+  }
+
+  no = cnts[dst];
+  return true;
+}
+
+// Not used for real stuff; just a convenient version for testing. In this
+// version, clip_poly is a list of clip polygon vertices. This is instead of the
+// mesh data structure.
+template <typename geo, typename CV3s_CP, typename CV3s_CEN, typename CV3s_VI,
+          typename V3s>
+KOKKOS_INLINE_FUNCTION
+bool clip_against_poly (
+  // Clip polygon.
+  const CV3s_CP& clip_poly,
+  // Clip polygon edges' inward-facing normals.
+  const CV3s_CEN& clip_edge_normals,
+  const CV3s_VI& vi, const Int ni, // Polygon to clip: vi(:,0:ni-1).
+  V3s& vo, Int& no, // Clipped polygon: vo(:,0:no-1); no is 0 if none.
+  V3s& wrk) // Workspace; false is returned if vo or wrk is too small.
+{
+  const Int nv = nslices(clip_poly); // Number of clip edges.
+  no = 0;
+  // An empty clip polygon has no clip edges: report no intersection.
+  if (nv <= 0) return true;
+
+  // Ping-pong between the two buffers, one clip edge per pass. The first
+  // destination is chosen so that the final pass writes the caller's vo.
+  V3s* bufs[] = { &vo, &wrk };
+  Int cnts[] = { 0, 0 };
+  Int dst = nv % 2 == 0 ? 1 : 0;
+
+  if ( ! clip_against_edge<geo>(vi, ni, *bufs[dst], cnts[dst],
+                                const_slice(clip_poly, 0),
+                                const_slice(clip_edge_normals, 0)))
+    return false;
+  if ( ! cnts[dst]) return true;
+
+  for (Int ie = 1; ie < nv; ++ie) {
+    const Int src = dst;
+    dst = 1 - src;
+    if ( ! clip_against_edge<geo>(*bufs[src], cnts[src], *bufs[dst], cnts[dst],
+                                  const_slice(clip_poly, ie),
+                                  const_slice(clip_edge_normals, ie)))
+      return false;
+    if ( ! cnts[dst]) return true;
+  }
+
+  no = cnts[dst];
+  return true;
+}
+} // namespace sh
+
+namespace test {
+static constexpr Int max_nvert = 20; // Size of each AreaOTFunctor vertex list.
+static constexpr Int max_hits = 25; // Size of hits_. Covers at least a 2-halo.
+
+// In practice, we want to form high-quality normals using information about the
+// mesh. This test helper instead passes each edge's two end points to
+// geo::edge_normal and stores one normal per element edge, with en(ip,iv)
+// giving the index in nml of the normal for edge iv of element ip.
+template <typename geo>
+void fill_normals (sh::Mesh<ko::HostSpace>& m) {
+  const Int nelem = nslices(m.e), max_nv = szslice(m.e);
+  // Count number of edges: one per entry ahead of an element's first -1.
+  Int ne = 0;
+  for (Int ip = 0; ip < nelem; ++ip)
+    for (Int iv = 0; iv < max_nv && m.e(ip,iv) != -1; ++iv)
+      ++ne;
+  // Fill.
+  Idxs::HostMirror en("en", nelem, max_nv);
+  ko::deep_copy(en, -1);
+  Vec3s::HostMirror nml("nml", ne);
+  Int ie = 0;
+  for (Int ip = 0; ip < nelem; ++ip)
+    for (Int iv = 0; iv < max_nv && m.e(ip,iv) != -1; ++iv, ++ie) {
+      // The edge ends at the next valid vertex, wrapping to vertex 0 after the
+      // element's last valid entry.
+      const bool is_last = iv+1 == max_nv || m.e(ip,iv+1) == -1;
+      const Int iv_next = is_last ? 0 : iv+1;
+      geo::edge_normal(slice(m.p, m.e(ip, iv)), slice(m.p, m.e(ip, iv_next)),
+                       slice(nml, ie));
+      en(ip,iv) = ie;
+    }
+  m.en = en;
+  m.nml = nml;
+}
+
+// Accumulates the area of polygon k of (p,e) clipped against clip-mesh elements.
+//todo The current approach is to do redundant clips so that the hits buffer can
+// be small and static. Need to think about this.
+template <typename geo>
+class AreaOTFunctor {
+  const TriangleQuadrature quad_; // Passed to geo::calc_area.
+  const sh::Mesh<>& cm_; // Clip mesh.
+  const ConstVec3s& p_;
+  const ConstIdxs& e_;
+  const Int k_; // Index into (p,e).
+  //todo More efficient method that also works on GPU.
+  Int hits_[max_hits]; // Clip elements already found to overlap polygon k_.
+  Int nh_; // Number of valid entries in hits_.
+  Real area_; // Sum of the overlap areas found so far.
+
+public:
+  KOKKOS_INLINE_FUNCTION
+  AreaOTFunctor (const sh::Mesh<>& cm, const ConstVec3s& p, const ConstIdxs& e,
+                 const Int& k)
+    : cm_(cm), p_(p), e_(e), k_(k), nh_(0), area_(0)
+  {}
+
+  KOKKOS_INLINE_FUNCTION void operator() (const Int mesh_elem_idx) {
+    // Check whether we've clipped against this polygon before and there was a
+    // non-0 intersection.
+    for (Int i = 0; i < nh_; ++i)
+      if (hits_[i] == mesh_elem_idx) return;
+    // We have not, so do the intersection.
+    Int no = 0;
+    {
+      // In, out, and workspace vertex lists share one stack buffer.
+      Real buf[9*max_nvert];
+      RawVec3s
+        vi(buf, max_nvert),
+        vo(buf + 3*max_nvert, max_nvert),
+        wrk(buf + 6*max_nvert, max_nvert);
+      Int ni = 0;
+      for (Int i = 0; i < szslice(e_); ++i) {
+        if (e_(k_,i) == -1) break;
+        copy(slice(vi, i), slice(p_, e_(k_,i)), 3);
+        ++ni;
+      }
+      // false means a vertex list overflowed; don't silently drop the overlap.
+      if ( ! sh::clip_against_poly<geo>(cm_, mesh_elem_idx, vi, ni, vo, no, wrk))
+        Kokkos::abort("max_nvert is too small.");
+      if (no) area_ += geo::calc_area(quad_, vo, no);
+    }
+    if (no) {
+      // Non-0 intersection, so record.
+      if (nh_ == max_hits) Kokkos::abort("max_hits is too small.");
+      hits_[nh_++] = mesh_elem_idx;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION const Real& area () const { return area_; }
+};
+
+template <typename geo, typename OctreeT>
+class TestAreaOTKernel {
+  const sh::Mesh<> cm_; // Clip mesh.
+  const OctreeT ot_;
+  mutable ConstVec3s p_; // (p_,e_): the polygons to clip.
+  mutable ConstIdxs e_;
+
+public:
+  typedef Real value_type;
+
+  // Store the clip mesh and octree, and fill (p_,e_) from the host arrays.
+  TestAreaOTKernel (const sh::Mesh<ko::HostSpace>& cm,
+                    const ConstVec3s::HostMirror& p_hm,
+                    const ConstIdxs::HostMirror& e_hm, const OctreeT& ot)
+    : cm_(cm), ot_(ot)
+  {
+    Vec3s p_tmp; resize_and_copy(p_tmp, p_hm); p_ = p_tmp;
+    Idxs e_tmp; resize_and_copy(e_tmp, e_hm); e_ = e_tmp;
+  }
+
+  // Clip the k'th polygon in (p,e) against mesh cm and add the result to area.
+  KOKKOS_INLINE_FUNCTION void operator() (const Int k, Real& area) const {
+    Real bb[6]; // Clipped element bounding box.
+    OctreeT::calc_bb(p_, slice(e_, k), szslice(e_), bb);
+    // Apply the clip functor to the list of possible overlaps.
+    AreaOTFunctor<geo> functor(cm_, p_, e_, k);
+    //todo Team threads.
+    ot_.apply(bb, functor);
+    area += functor.area();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join (volatile value_type& dst, volatile value_type const& src) const
+  { dst += src; } // Sum the partial areas.
+};
+
+template <typename geo> Real test_area_ot (
+  const ConstVec3s::HostMirror& cp, const ConstIdxs::HostMirror& ce,
+  const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e)
+{
+  typedef Octree<geo, 10> OctreeT;
+
+  // Clip mesh and edge normal calculation. (In practice, we'd like to use
+  // higher-quality edge normals.)
+  sh::Mesh<ko::HostSpace> clip_mesh;
+  clip_mesh.p = cp; clip_mesh.e = ce;
+  fill_normals<geo>(clip_mesh);
+
+  Real times[2] = {0}; // [0]: octree construction; [1]: the reduction.
+  auto timer = tic();
+  OctreeT octree(cp, ce); // Oct-tree over the clip mesh.
+  times[0] = toc(timer);
+
+  Real total_area = 0;
+  TestAreaOTKernel<geo, OctreeT> kernel(clip_mesh, p, e, octree);
+  timer = tic();
+  ko::parallel_reduce(nslices(e), kernel, total_area);
+  times[1] = toc(timer);
+  print_times("test_area_ot", times, 2);
+  return total_area;
+}
+} // namespace test
+} // namespace siqk
+
+#endif // INCLUDE_SIQK_INTERSECT_HPP
diff --git a/siqk/siqk_quadrature.hpp b/siqk/siqk_quadrature.hpp
new file mode 100644
index 0000000..42164ad
--- /dev/null
+++ b/siqk/siqk_quadrature.hpp
@@ -0,0 +1,613 @@
+#ifndef INCLUDE_SIQK_QUADRATURE_HPP
+#define INCLUDE_SIQK_QUADRATURE_HPP
+
+#include "siqk_defs.hpp"
+
+namespace siqk {
+
+/* For the TRISYM entries, see, e.g.,
+     Triangular quadrature to use for integration: Dunavant, D.A. "High Degree
+     Efficient Symmetrical Gaussian Quadrature Rules for the Triangle."
+     Int. J. Numer. Meth. Eng., 21 (1985), pp 1129-1148.
+   and
+     Zhang, Linbo, Tao Cui, and Hui Liu. "A set of symmetric quadrature rules on
+     triangles and tetrahedra." J. of Computational Mathematics (2009): 89-96.
+   For the TRITAYLOR, see
+     Day, David M. and Mark A. Taylor, "A new 11 point degree 6 cubature formula
+     for the triangle", PAMM 7 (2007)
+   and
+     Taylor, Mark A., Beth A. Wingate, and Len P. Bos. "A cardinal function
+     algorithm for computing multivariate quadrature points." SIAM Journal on
+     Numerical Analysis 45.1 (2007): 193-205.
+*/
+
+// The symmetric order-12 quadrature rule gives 1 fewer digit of conservation
+// than, e.g., the order-14 one, so switch to the Taylor et al rule. Part of the
+// problem I think is that the Dunavant table results from double precision
+// computations (rather than quad) and is recorded to perhaps one fewer digit
+// than might have been available. But I can't find a table with an extra
+// digit. The Taylor et al. rule has one fewer coordinate, so that's also an
+// advantage. The loss of symmetry in the coordinates I think is not relevant to
+// this application.
+#define SIQK_USE_TRITAY12
+
+#define SIQK_QUADRATURE_TRISYM_ORDER4_COORD                  \
+  {0.108103018168070, 0.445948490915965, 0.445948490915965,  \
+   0.445948490915965, 0.108103018168070, 0.445948490915965,  \
+   0.445948490915965, 0.445948490915965, 0.108103018168070,  \
+   0.816847572980458, 0.091576213509771, 0.091576213509771,  \
+   0.091576213509771, 0.816847572980458, 0.091576213509771,  \
+   0.091576213509771, 0.091576213509771, 0.816847572980458}
+#define SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT                 \
+  {0.223381589678011, 0.223381589678011, 0.223381589678011,  \
+   0.109951743655322, 0.109951743655322, 0.109951743655322}
+
+#define SIQK_QUADRATURE_TRISYM_ORDER8_COORD                  \
+  {0.333333333333333, 0.333333333333333, 0.333333333333333,  \
+   0.081414823414554, 0.459292588292723, 0.459292588292723,  \
+   0.459292588292723, 0.081414823414554, 0.459292588292723,  \
+   0.459292588292723, 0.459292588292723, 0.081414823414554,  \
+   0.658861384496480, 0.170569307751760, 0.170569307751760,  \
+   0.170569307751760, 0.658861384496480, 0.170569307751760,  \
+   0.170569307751760, 0.170569307751760, 0.658861384496480,  \
+   0.898905543365938, 0.050547228317031, 0.050547228317031,  \
+   0.050547228317031, 0.898905543365938, 0.050547228317031,  \
+   0.050547228317031, 0.050547228317031, 0.898905543365938,  \
+   0.008394777409958, 0.263112829634638, 0.728492392955404,  \
+   0.008394777409958, 0.728492392955404, 0.263112829634638,  \
+   0.263112829634638, 0.008394777409958, 0.728492392955404,  \
+   0.263112829634638, 0.728492392955404, 0.008394777409958,  \
+   0.728492392955404, 0.263112829634638, 0.008394777409958,  \
+   0.728492392955404, 0.008394777409958, 0.263112829634638}
+#define SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT                 \
+  {0.144315607677787, 0.095091634267285, 0.095091634267285,  \
+   0.095091634267285, 0.103217370534718, 0.103217370534718,  \
+   0.103217370534718, 0.032458497623198, 0.032458497623198,  \
+   0.032458497623198, 0.027230314174435, 0.027230314174435,  \
+   0.027230314174435, 0.027230314174435, 0.027230314174435,  \
+   0.027230314174435}
+
+#define SIQK_QUADRATURE_TRISYM_ORDER12_COORD                 \
+  {0.023565220452390, 0.488217389773805, 0.488217389773805,  \
+   0.488217389773805, 0.023565220452390, 0.488217389773805,  \
+   0.488217389773805, 0.488217389773805, 0.023565220452390,  \
+   0.120551215411079, 0.439724392294460, 0.439724392294460,  \
+   0.439724392294460, 0.120551215411079, 0.439724392294460,  \
+   0.439724392294460, 0.439724392294460, 0.120551215411079,  \
+   0.457579229975768, 0.271210385012116, 0.271210385012116,  \
+   0.271210385012116, 0.457579229975768, 0.271210385012116,  \
+   0.271210385012116, 0.271210385012116, 0.457579229975768,  \
+   0.744847708916828, 0.127576145541586, 0.127576145541586,  \
+   0.127576145541586, 0.744847708916828, 0.127576145541586,  \
+   0.127576145541586, 0.127576145541586, 0.744847708916828,  \
+   0.957365299093576, 0.021317350453210, 0.021317350453210,  \
+   0.021317350453210, 0.957365299093576, 0.021317350453210,  \
+   0.021317350453210, 0.021317350453210, 0.957365299093576,  \
+   0.115343494534698, 0.275713269685514, 0.608943235779788,  \
+   0.115343494534698, 0.608943235779788, 0.275713269685514,  \
+   0.275713269685514, 0.115343494534698, 0.608943235779788,  \
+   0.275713269685514, 0.608943235779788, 0.115343494534698,  \
+   0.608943235779788, 0.115343494534698, 0.275713269685514,  \
+   0.608943235779788, 0.275713269685514, 0.115343494534698,  \
+   0.022838332222257, 0.281325580989940, 0.695836086787803,  \
+   0.022838332222257, 0.695836086787803, 0.281325580989940,  \
+   0.281325580989940, 0.022838332222257, 0.695836086787803,  \
+   0.281325580989940, 0.695836086787803, 0.022838332222257,  \
+   0.695836086787803, 0.022838332222257, 0.281325580989940,  \
+   0.695836086787803, 0.281325580989940, 0.022838332222257,  \
+   0.025734050548330, 0.116251915907597, 0.858014033544073,  \
+   0.025734050548330, 0.858014033544073, 0.116251915907597,  \
+   0.116251915907597, 0.025734050548330, 0.858014033544073,  \
+   0.116251915907597, 0.858014033544073, 0.025734050548330,  \
+   0.858014033544073, 0.025734050548330, 0.116251915907597,  \
+   0.858014033544073, 0.116251915907597, 0.025734050548330}
+#define SIQK_QUADRATURE_TRISYM_ORDER12_WEIGHT                \
+  {0.025731066440455, 0.025731066440455, 0.025731066440455,  \
+   0.043692544538038, 0.043692544538038, 0.043692544538038,  \
+   0.062858224217885, 0.062858224217885, 0.062858224217885,  \
+   0.034796112930709, 0.034796112930709, 0.034796112930709,  \
+   0.006166261051559, 0.006166261051559, 0.006166261051559,  \
+   0.040371557766381, 0.040371557766381, 0.040371557766381,  \
+   0.040371557766381, 0.040371557766381, 0.040371557766381,  \
+   0.022356773202303, 0.022356773202303, 0.022356773202303,  \
+   0.022356773202303, 0.022356773202303, 0.022356773202303,  \
+   0.017316231108659, 0.017316231108659, 0.017316231108659,  \
+   0.017316231108659, 0.017316231108659, 0.017316231108659}
+
+#define SIQK_QUADRATURE_TRISYM_ORDER14_COORD \
+  {0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, \
+   0.0099797608064584319986778382371995, 0.0099797608064584319986778382371995, 0.9800404783870830804914930922677740, \
+   0.0099797608064584319986778382371995, 0.9800404783870830804914930922677740, 0.0099797608064584319986778382371995, \
+   0.9800404783870830804914930922677740, 0.0099797608064584319986778382371995, 0.0099797608064584319986778382371995, \
+   0.4799778935211884145495275788562139, 0.4799778935211884145495275788562139, 0.0400442129576231709009448422875721, \
+   0.4799778935211884145495275788562139, 0.0400442129576231709009448422875721, 0.4799778935211884145495275788562139, \
+   0.0400442129576231709009448422875721, 0.4799778935211884145495275788562139, 0.4799778935211884145495275788562139, \
+   0.1538119591769669114444951674158801, 0.1538119591769669114444951674158801, 0.6923760816460662326221608964260668, \
+   0.1538119591769669114444951674158801, 0.6923760816460662326221608964260668, 0.1538119591769669114444951674158801, \
+   0.6923760816460662326221608964260668, 0.1538119591769669114444951674158801, 0.1538119591769669114444951674158801, \
+   0.0740234771169878125185448425327195, 0.0740234771169878125185448425327195, 0.8519530457660243749629103149345610, \
+   0.0740234771169878125185448425327195, 0.8519530457660243749629103149345610, 0.0740234771169878125185448425327195, \
+   0.8519530457660243749629103149345610, 0.0740234771169878125185448425327195, 0.0740234771169878125185448425327195, \
+   0.1303546825033299882967696703417460, 0.1303546825033299882967696703417460, 0.7392906349933400234064606593165081, \
+   0.1303546825033299882967696703417460, 0.7392906349933400234064606593165081, 0.1303546825033299882967696703417460, \
+   0.7392906349933400234064606593165081, 0.1303546825033299882967696703417460, 0.1303546825033299882967696703417460, \
+   0.2306172260266531326422523306973744, 0.2306172260266531326422523306973744, 0.5387655479466937347154953386052512, \
+   0.2306172260266531326422523306973744, 0.5387655479466937347154953386052512, 0.2306172260266531326422523306973744, \
+   0.5387655479466937347154953386052512, 0.2306172260266531326422523306973744, 0.2306172260266531326422523306973744, \
+   0.4223320834191477968211358984262915, 0.4223320834191477968211358984262915, 0.1553358331617044063577282031474169, \
+   0.4223320834191477968211358984262915, 0.1553358331617044063577282031474169, 0.4223320834191477968211358984262915, \
+   0.1553358331617044063577282031474169, 0.4223320834191477968211358984262915, 0.4223320834191477968211358984262915, \
+   0.7862373859346609705767150444444269, 0.1906163600319009110428680742188590, 0.0231462540334381183804168813367141, \
+   0.7862373859346609705767150444444269, 0.0231462540334381183804168813367141, 0.1906163600319009110428680742188590, \
+   0.1906163600319009110428680742188590, 0.7862373859346609705767150444444269, 0.0231462540334381183804168813367141, \
+   0.1906163600319009110428680742188590, 0.0231462540334381183804168813367141, 0.7862373859346609705767150444444269, \
+   0.0231462540334381183804168813367141, 0.7862373859346609705767150444444269, 0.1906163600319009110428680742188590, \
+   0.0231462540334381183804168813367141, 0.1906163600319009110428680742188590, 0.7862373859346609705767150444444269, \
+   0.6305521436606074114905595706659369, 0.3623231377435471300962888108188054, 0.0071247185958454584131516185152577, \
+   0.6305521436606074114905595706659369, 0.0071247185958454584131516185152577, 0.3623231377435471300962888108188054, \
+   0.3623231377435471300962888108188054, 0.6305521436606074114905595706659369, 0.0071247185958454584131516185152577, \
+   0.3623231377435471300962888108188054, 0.0071247185958454584131516185152577, 0.6305521436606074114905595706659369, \
+   0.0071247185958454584131516185152577, 0.6305521436606074114905595706659369, 0.3623231377435471300962888108188054, \
+   0.0071247185958454584131516185152577, 0.3623231377435471300962888108188054, 0.6305521436606074114905595706659369, \
+   0.6265773298563063198329814440512564, 0.2907712058836673940653838599246228, 0.0826514642600262861016346960241208, \
+   0.6265773298563063198329814440512564, 0.0826514642600262861016346960241208, 0.2907712058836673940653838599246228, \
+   0.2907712058836673940653838599246228, 0.6265773298563063198329814440512564, 0.0826514642600262861016346960241208, \
+   0.2907712058836673940653838599246228, 0.0826514642600262861016346960241208, 0.6265773298563063198329814440512564, \
+   0.0826514642600262861016346960241208, 0.6265773298563063198329814440512564, 0.2907712058836673940653838599246228, \
+   0.0826514642600262861016346960241208, 0.2907712058836673940653838599246228, 0.6265773298563063198329814440512564, \
+   0.9142099849296254632236014003865421, 0.0711657108777507679819862573822320, 0.0146243041926237687944123422312259, \
+   0.9142099849296254632236014003865421, 0.0146243041926237687944123422312259, 0.0711657108777507679819862573822320, \
+   0.0711657108777507679819862573822320, 0.9142099849296254632236014003865421, 0.0146243041926237687944123422312259, \
+   0.0711657108777507679819862573822320, 0.0146243041926237687944123422312259, 0.9142099849296254632236014003865421, \
+   0.0146243041926237687944123422312259, 0.9142099849296254632236014003865421, 0.0711657108777507679819862573822320, \
+   0.0146243041926237687944123422312259, 0.0711657108777507679819862573822320, 0.9142099849296254632236014003865421}
+#define SIQK_QUADRATURE_TRISYM_ORDER14_WEIGHT \
+  {0.0585962852260285965710906452841300,0.0017351512297252675524200649093132,0.0017351512297252675524200649093132, \
+   0.0017351512297252675524200649093132,0.0261637825586145227052536910150593,0.0261637825586145227052536910150593, \
+   0.0261637825586145227052536910150593,0.0039197292424018289128118119890587,0.0039197292424018289128118119890587, \
+   0.0039197292424018289128118119890587,0.0122473597569408669538670864085361,0.0122473597569408669538670864085361, \
+   0.0122473597569408669538670864085361,0.0281996285032579604989955157634540,0.0281996285032579604989955157634540, \
+   0.0281996285032579604989955157634540,0.0508870871859594883779287499692146,0.0508870871859594883779287499692146, \
+   0.0508870871859594883779287499692146,0.0504534399016036000373830461285252,0.0504534399016036000373830461285252, \
+   0.0504534399016036000373830461285252,0.0170636442122334523741056244716674,0.0170636442122334523741056244716674, \
+   0.0170636442122334523741056244716674,0.0170636442122334523741056244716674,0.0170636442122334523741056244716674, \
+   0.0170636442122334523741056244716674,0.0096834664255066003890615178306689,0.0096834664255066003890615178306689, \
+   0.0096834664255066003890615178306689,0.0096834664255066003890615178306689,0.0096834664255066003890615178306689, \
+   0.0096834664255066003890615178306689,0.0363857559284850029523994408009457,0.0363857559284850029523994408009457, \
+   0.0363857559284850029523994408009457,0.0363857559284850029523994408009457,0.0363857559284850029523994408009457, \
+   0.0363857559284850029523994408009457,0.0069646633735184126576256424812073,0.0069646633735184126576256424812073, \
+   0.0069646633735184126576256424812073,0.0069646633735184126576256424812073,0.0069646633735184126576256424812073, \
+   0.0069646633735184126576256424812073}
+
+#define SIQK_QUADRATURE_TRISYM_ORDER20_COORD \
+  {0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, \
+   0.2158743059329919777855621987328050, 0.2158743059329919777855621987328050, 0.5682513881340160999400268337922171, \
+   0.2158743059329919777855621987328050, 0.5682513881340160999400268337922171, 0.2158743059329919777855621987328050, \
+   0.5682513881340160999400268337922171, 0.2158743059329919777855621987328050, 0.2158743059329919777855621987328050, \
+   0.0753767665297472716501303580116655, 0.0753767665297472716501303580116655, 0.8492464669405054289441636683477554, \
+   0.0753767665297472716501303580116655, 0.8492464669405054289441636683477554, 0.0753767665297472716501303580116655, \
+   0.8492464669405054289441636683477554, 0.0753767665297472716501303580116655, 0.0753767665297472716501303580116655, \
+   0.0103008281372217926769030427180951, 0.0103008281372217926769030427180951, 0.9793983437255564528101103860535659, \
+   0.0103008281372217926769030427180951, 0.9793983437255564528101103860535659, 0.0103008281372217926769030427180951, \
+   0.9793983437255564528101103860535659, 0.0103008281372217926769030427180951, 0.0103008281372217926769030427180951, \
+   0.4936022112987001886352800283930264, 0.4936022112987001886352800283930264, 0.0127955774025996227294399432139471, \
+   0.4936022112987001886352800283930264, 0.0127955774025996227294399432139471, 0.4936022112987001886352800283930264, \
+   0.0127955774025996227294399432139471, 0.4936022112987001886352800283930264, 0.4936022112987001886352800283930264, \
+   0.4615509381069253236340443891094765, 0.4615509381069253236340443891094765, 0.0768981237861493527319112217810471, \
+   0.4615509381069253236340443891094765, 0.0768981237861493527319112217810471, 0.4615509381069253236340443891094765, \
+   0.0768981237861493527319112217810471, 0.4615509381069253236340443891094765, 0.4615509381069253236340443891094765, \
+   0.3286214064242369836676971317501739, 0.4293405702582103744546770940360148, 0.2420380233175526418776257742138114, \
+   0.3286214064242369836676971317501739, 0.2420380233175526418776257742138114, 0.4293405702582103744546770940360148, \
+   0.4293405702582103744546770940360148, 0.3286214064242369836676971317501739, 0.2420380233175526418776257742138114, \
+   0.4293405702582103744546770940360148, 0.2420380233175526418776257742138114, 0.3286214064242369836676971317501739, \
+   0.2420380233175526418776257742138114, 0.3286214064242369836676971317501739, 0.4293405702582103744546770940360148, \
+   0.2420380233175526418776257742138114, 0.4293405702582103744546770940360148, 0.3286214064242369836676971317501739, \
+   0.2604803617865687481724989993381314, 0.1015775342809694392620656344661256, 0.6379421039324617570542841349379160, \
+   0.2604803617865687481724989993381314, 0.6379421039324617570542841349379160, 0.1015775342809694392620656344661256, \
+   0.1015775342809694392620656344661256, 0.2604803617865687481724989993381314, 0.6379421039324617570542841349379160, \
+   0.1015775342809694392620656344661256, 0.6379421039324617570542841349379160, 0.2604803617865687481724989993381314, \
+   0.6379421039324617570542841349379160, 0.2604803617865687481724989993381314, 0.1015775342809694392620656344661256, \
+   0.6379421039324617570542841349379160, 0.1015775342809694392620656344661256, 0.2604803617865687481724989993381314, \
+   0.1370742358464553112273875967730419, 0.7100659730011301684626801034028176, 0.1528597911524145480655079154530540, \
+   0.1370742358464553112273875967730419, 0.1528597911524145480655079154530540, 0.7100659730011301684626801034028176, \
+   0.7100659730011301684626801034028176, 0.1370742358464553112273875967730419, 0.1528597911524145480655079154530540, \
+   0.7100659730011301684626801034028176, 0.1528597911524145480655079154530540, 0.1370742358464553112273875967730419, \
+   0.1528597911524145480655079154530540, 0.1370742358464553112273875967730419, 0.7100659730011301684626801034028176, \
+   0.1528597911524145480655079154530540, 0.7100659730011301684626801034028176, 0.1370742358464553112273875967730419, \
+   0.1467269458722997854671632467216114, 0.4985454776784148389623396724346094, 0.3547275764492854310816483121016063, \
+   0.1467269458722997854671632467216114, 0.3547275764492854310816483121016063, 0.4985454776784148389623396724346094, \
+   0.4985454776784148389623396724346094, 0.1467269458722997854671632467216114, 0.3547275764492854310816483121016063, \
+   0.4985454776784148389623396724346094, 0.3547275764492854310816483121016063, 0.1467269458722997854671632467216114, \
+   0.3547275764492854310816483121016063, 0.1467269458722997854671632467216114, 0.4985454776784148389623396724346094, \
+   0.3547275764492854310816483121016063, 0.4985454776784148389623396724346094, 0.1467269458722997854671632467216114, \
+   0.0269989777425532900823057502748270, 0.0491867226725819992050325879517914, 0.9238142995848647176515555656806100, \
+   0.0269989777425532900823057502748270, 0.9238142995848647176515555656806100, 0.0491867226725819992050325879517914, \
+   0.0491867226725819992050325879517914, 0.0269989777425532900823057502748270, 0.9238142995848647176515555656806100, \
+   0.0491867226725819992050325879517914, 0.9238142995848647176515555656806100, 0.0269989777425532900823057502748270, \
+   0.9238142995848647176515555656806100, 0.0269989777425532900823057502748270, 0.0491867226725819992050325879517914, \
+   0.9238142995848647176515555656806100, 0.0491867226725819992050325879517914, 0.0269989777425532900823057502748270, \
+   0.0618717859336170294959345028473763, 0.7796601465405693653920593533257488, 0.1584680675258135496008549125690479, \
+   0.0618717859336170294959345028473763, 0.1584680675258135496008549125690479, 0.7796601465405693653920593533257488, \
+   0.7796601465405693653920593533257488, 0.0618717859336170294959345028473763, 0.1584680675258135496008549125690479, \
+   0.7796601465405693653920593533257488, 0.1584680675258135496008549125690479, 0.0618717859336170294959345028473763, \
+   0.1584680675258135496008549125690479, 0.0618717859336170294959345028473763, 0.7796601465405693653920593533257488, \
+   0.1584680675258135496008549125690479, 0.7796601465405693653920593533257488, 0.0618717859336170294959345028473763, \
+   0.0477243674276219970176171614184568, 0.3704915391495476328920233299868414, 0.5817840934228304394792985476669855, \
+   0.0477243674276219970176171614184568, 0.5817840934228304394792985476669855, 0.3704915391495476328920233299868414, \
+   0.3704915391495476328920233299868414, 0.0477243674276219970176171614184568, 0.5817840934228304394792985476669855, \
+   0.3704915391495476328920233299868414, 0.5817840934228304394792985476669855, 0.0477243674276219970176171614184568, \
+   0.5817840934228304394792985476669855, 0.0477243674276219970176171614184568, 0.3704915391495476328920233299868414, \
+   0.5817840934228304394792985476669855, 0.3704915391495476328920233299868414, 0.0477243674276219970176171614184568, \
+   0.1206005151863643737319975457467081, 0.8633469487547525966775197048264090, 0.0160525360588830157126949416124262, \
+   0.1206005151863643737319975457467081, 0.0160525360588830157126949416124262, 0.8633469487547525966775197048264090, \
+   0.8633469487547525966775197048264090, 0.1206005151863643737319975457467081, 0.0160525360588830157126949416124262, \
+   0.8633469487547525966775197048264090, 0.0160525360588830157126949416124262, 0.1206005151863643737319975457467081, \
+   0.0160525360588830157126949416124262, 0.1206005151863643737319975457467081, 0.8633469487547525966775197048264090, \
+   0.0160525360588830157126949416124262, 0.8633469487547525966775197048264090, 0.1206005151863643737319975457467081, \
+   0.0026971477967097875517998861738533, 0.0561949381877454995359855161041196, 0.9411079140155447220195128466002643, \
+   0.0026971477967097875517998861738533, 0.9411079140155447220195128466002643, 0.0561949381877454995359855161041196, \
+   0.0561949381877454995359855161041196, 0.0026971477967097875517998861738533, 0.9411079140155447220195128466002643, \
+   0.0561949381877454995359855161041196, 0.9411079140155447220195128466002643, 0.0026971477967097875517998861738533, \
+   0.9411079140155447220195128466002643, 0.0026971477967097875517998861738533, 0.0561949381877454995359855161041196, \
+   0.9411079140155447220195128466002643, 0.0561949381877454995359855161041196, 0.0026971477967097875517998861738533, \
+   0.0030156332779423624702863637736527, 0.2086750067484213488899769117779215, 0.7883093599736362699914593576977495, \
+   0.0030156332779423624702863637736527, 0.7883093599736362699914593576977495, 0.2086750067484213488899769117779215, \
+   0.2086750067484213488899769117779215, 0.0030156332779423624702863637736527, 0.7883093599736362699914593576977495, \
+   0.2086750067484213488899769117779215, 0.7883093599736362699914593576977495, 0.0030156332779423624702863637736527, \
+   0.7883093599736362699914593576977495, 0.0030156332779423624702863637736527, 0.2086750067484213488899769117779215, \
+   0.7883093599736362699914593576977495, 0.2086750067484213488899769117779215, 0.0030156332779423624702863637736527, \
+   0.0299053757884570198255502759820956, 0.7211512409120340860724240883428138, 0.2489433832995089357353890591184609, \
+   0.0299053757884570198255502759820956, 0.2489433832995089357353890591184609, 0.7211512409120340860724240883428138, \
+   0.7211512409120340860724240883428138, 0.0299053757884570198255502759820956, 0.2489433832995089357353890591184609, \
+   0.7211512409120340860724240883428138, 0.2489433832995089357353890591184609, 0.0299053757884570198255502759820956, \
+   0.2489433832995089357353890591184609, 0.0299053757884570198255502759820956, 0.7211512409120340860724240883428138, \
+   0.2489433832995089357353890591184609, 0.7211512409120340860724240883428138, 0.0299053757884570198255502759820956, \
+   0.0067566542224609888248054723192126, 0.6400554419405418693500564586429391, 0.3531879038369971635091815187479369, \
+   0.0067566542224609888248054723192126, 0.3531879038369971635091815187479369, 0.6400554419405418693500564586429391, \
+   0.6400554419405418693500564586429391, 0.0067566542224609888248054723192126, 0.3531879038369971635091815187479369, \
+   0.6400554419405418693500564586429391, 0.3531879038369971635091815187479369, 0.0067566542224609888248054723192126, \
+   0.3531879038369971635091815187479369, 0.0067566542224609888248054723192126, 0.6400554419405418693500564586429391, \
+   0.3531879038369971635091815187479369, 0.6400554419405418693500564586429391, 0.0067566542224609888248054723192126}
+#define SIQK_QUADRATURE_TRISYM_ORDER20_WEIGHT /* 88 weights; initializes TriangleQuadrature::trisym_order20_weight_[88] */ \
+  {0.0125376079944966561247055025773989,0.0274718698764242139076507953632245,0.0274718698764242139076507953632245, \
+   0.0274718698764242139076507953632245,0.0097652722770514236577676925321612,0.0097652722770514236577676925321612, \
+   0.0097652722770514236577676925321612,0.0013984195353918234608348036829284,0.0013984195353918234608348036829284, \
+   0.0013984195353918234608348036829284,0.0092921026251851831373462786700657,0.0092921026251851831373462786700657, \
+   0.0092921026251851831373462786700657,0.0165778760323669269172164320025331,0.0165778760323669269172164320025331, \
+   0.0165778760323669269172164320025331,0.0206677623486650786921448030852844,0.0206677623486650786921448030852844, \
+   0.0206677623486650786921448030852844,0.0206677623486650786921448030852844,0.0206677623486650786921448030852844, \
+   0.0206677623486650786921448030852844,0.0208222355211545064046507746979842,0.0208222355211545064046507746979842, \
+   0.0208222355211545064046507746979842,0.0208222355211545064046507746979842,0.0208222355211545064046507746979842, \
+   0.0208222355211545064046507746979842,0.0095686384198490608693488113090098,0.0095686384198490608693488113090098, \
+   0.0095686384198490608693488113090098,0.0095686384198490608693488113090098,0.0095686384198490608693488113090098, \
+   0.0095686384198490608693488113090098,0.0244527709689724634389840218773315,0.0244527709689724634389840218773315, \
+   0.0244527709689724634389840218773315,0.0244527709689724634389840218773315,0.0244527709689724634389840218773315, \
+   0.0244527709689724634389840218773315,0.0031557306306305341579709899946238,0.0031557306306305341579709899946238, \
+   0.0031557306306305341579709899946238,0.0031557306306305341579709899946238,0.0031557306306305341579709899946238, \
+   0.0031557306306305341579709899946238,0.0121367963653212975611017654387069,0.0121367963653212975611017654387069, \
+   0.0121367963653212975611017654387069,0.0121367963653212975611017654387069,0.0121367963653212975611017654387069, \
+   0.0121367963653212975611017654387069,0.0149664801438864486504698447788542,0.0149664801438864486504698447788542, \
+   0.0149664801438864486504698447788542,0.0149664801438864486504698447788542,0.0149664801438864486504698447788542, \
+   0.0149664801438864486504698447788542,0.0063275933217777392825187376956819,0.0063275933217777392825187376956819, \
+   0.0063275933217777392825187376956819,0.0063275933217777392825187376956819,0.0063275933217777392825187376956819, \
+   0.0063275933217777392825187376956819,0.0013425603120636958685146788994302,0.0013425603120636958685146788994302, \
+   0.0013425603120636958685146788994302,0.0013425603120636958685146788994302,0.0013425603120636958685146788994302, \
+   0.0013425603120636958685146788994302,0.0027760769163475539772489852907711,0.0027760769163475539772489852907711, \
+   0.0027760769163475539772489852907711,0.0027760769163475539772489852907711,0.0027760769163475539772489852907711, \
+   0.0027760769163475539772489852907711,0.0107398444741849414391099415411190,0.0107398444741849414391099415411190, \
+   0.0107398444741849414391099415411190,0.0107398444741849414391099415411190,0.0107398444741849414391099415411190, \
+   0.0107398444741849414391099415411190,0.0053678057381874528034004789844857,0.0053678057381874528034004789844857, \
+   0.0053678057381874528034004789844857,0.0053678057381874528034004789844857,0.0053678057381874528034004789844857, \
+   0.0053678057381874528034004789844857}
+
+#define SIQK_QUADRATURE_TRITAY_ORDER6_COORD /* 11 points x 3 coordinates; initializes TriangleQuadrature::tritay_order6_coord_[33] */ \
+  {4.724686653264358e-02, 5.725498667747682e-02, 8.954981467898796e-01, \
+   4.280913872509884e-02, 8.953626400245792e-01, 6.182822125032195e-02, \
+   2.921805130458027e-01, 6.844757484565146e-01, 2.334373849768268e-02, \
+   8.712234683377076e-01, 6.874625591502949e-02, 6.003027574726293e-02, \
+   5.086198608278325e-02, 6.156762055758400e-01, 3.334618083413767e-01, \
+   2.128646728100595e-01, 6.279461411977890e-01, 1.591891859921515e-01, \
+   2.817957679526839e-01, 6.290913834186361e-02, 6.552950937054525e-01, \
+   6.225041026512227e-01, 6.837821192050995e-02, 3.091176854282673e-01, \
+   7.604403244598745e-02, 2.875294583743921e-01, 6.364265091796204e-01, \
+   5.941924379444020e-01, 3.287835564131346e-01, 7.702400564246337e-02, \
+   3.353648085404556e-01, 3.122904050136449e-01, 3.523447864458995e-01}
+
+#define SIQK_QUADRATURE_TRITAY_ORDER6_WEIGHT /* 11 weights; initializes TriangleQuadrature::tritay_order6_weight_[11] */ \
+  {3.806807185295551e-02, 3.837935530775279e-02, 4.620045674456197e-02, \
+   5.346758944419899e-02, 8.375582696574595e-02, 1.016448330255167e-01, \
+   1.018615244613670e-01, 1.114218316600018e-01, 1.120094502629461e-01, \
+   1.247875714375583e-01, 1.884034888373949e-01}
+
+#define SIQK_QUADRATURE_TRITAY_ORDER12_COORD /* 32 points x 3 coordinates; used only when SIQK_USE_TRITAY12 is defined */ \
+  {7.26510255160501828e-02, 9.27348974483949817e-01, 0.00000000000000000e+00, \
+   2.11790731803609689e-02, 2.35517332495786824e-02, 9.55269193570060349e-01, \
+   1.41841115784669236e-01, 5.40914911362088088e-17, 8.58158884215330708e-01, \
+   1.15143666726236216e-02, 9.45475073220970907e-01, 4.30105601064054710e-02, \
+   2.77555756156289135e-17, 1.54064601626856063e-01, 8.45935398373143910e-01, \
+   3.72684680767588483e-01, -1.88694080537681499e-16, 6.27315319232411683e-01, \
+   9.43134911146902510e-01, 2.71109713562557482e-02, 2.97541174968417414e-02, \
+   8.44725347421859452e-01, 1.46044961672175677e-01, 9.22969090596487129e-03, \
+   8.23277107647898521e-01, 2.11522233831219000e-02, 1.55570668968979586e-01, \
+   6.21586880750877868e-01, 1.45665147883470222e-02, 3.63846604460775103e-01, \
+   2.21919501597089841e-02, 7.88601719223131714e-01, 1.89206330617159302e-01, \
+   2.27722111443204644e-01, 7.49189739790679599e-01, 2.30881487661157569e-02, \
+   7.38137544226065284e-02, 7.18714961015890358e-02, 8.54314749475804436e-01, \
+   6.43364629415364875e-01, 3.32129083947645065e-01, 2.45062866369900600e-02, \
+   2.28091126376529507e-02, 3.61181591189672080e-01, 6.16009296172674969e-01, \
+   6.63093778446759319e-01, 2.43458133948799671e-01, 9.34480876044410103e-02, \
+   2.51456820638045198e-02, 5.81689214740147453e-01, 3.93165103196048027e-01, \
+   4.29837040104380730e-01, 5.44446676271925334e-01, 2.57162836236939363e-02, \
+   9.40413011410586863e-02, 8.26003314017559997e-01, 7.99553848413813162e-02, \
+   7.94010795132135239e-01, 1.16386499067277244e-01, 8.96027058005875177e-02, \
+   7.83496599417470019e-02, 2.03768481077729741e-01, 7.17881858980523258e-01, \
+   2.25505520049374242e-01, 6.44132203822605637e-02, 7.10081259568365097e-01, \
+   6.43800731623786371e-01, 9.54285858105846096e-02, 2.60770682565629019e-01, \
+   5.43837635808460451e-01, 2.44982965093490213e-01, 2.11179399098049336e-01, \
+   4.32112641877997194e-01, 7.05667243440369213e-02, 4.97320633777965815e-01, \
+   2.55495747579340349e-01, 6.19381257362555782e-01, 1.25122995058103870e-01, \
+   1.22162380966293838e-01, 6.27682615680314027e-01, 2.50155003353392136e-01, \
+   4.47861373562203791e-01, 4.22605657433460014e-01, 1.29532969004336196e-01, \
+   4.09354529674576528e-01, 2.10785259391403995e-01, 3.79860210934019449e-01, \
+   1.24718320885524481e-01, 4.08963804491244809e-01, 4.66317874623230710e-01, \
+   2.28197277938737758e-01, 2.13777432530059680e-01, 5.58025289531202562e-01, \
+   2.88796329020881648e-01, 4.09786577770025306e-01, 3.01417093209092990e-01}
+
+#define SIQK_QUADRATURE_TRITAY_ORDER12_WEIGHT /* 32 weights; used only when SIQK_USE_TRITAY12 is defined */ \
+  {4.888049814660050e-03, 6.675900027367356e-03, 6.845534654343699e-03, \
+   7.119751436080721e-03, 7.714492373624846e-03, 9.654708742436301e-03, \
+   1.050932673560249e-02, 1.068084365762828e-02, 1.848368581123072e-02, \
+   1.854548042160657e-02, 2.062000411968213e-02, 2.168508541701153e-02, \
+   2.249074619915818e-02, 2.490407320150775e-02, 2.509917342768508e-02, \
+   2.794373431987983e-02, 2.814555860521331e-02, 2.816965445973000e-02, \
+   3.052917241207244e-02, 3.057527760403899e-02, 3.957360579297199e-02, \
+   4.128188739546268e-02, 4.593784216579169e-02, 4.749957532530720e-02, \
+   4.814880503690738e-02, 5.096492487678762e-02, 5.335208304882109e-02, \
+   5.414687261316752e-02, 5.943783395113540e-02, 5.998970732710617e-02, \
+   6.316454642265663e-02, 7.522206260332436e-02}
+
+#define SIQK_QUADRATURE_TRITAY_ORDER16_COORD /* 55 points x 3 coordinates; initializes TriangleQuadrature::tritay_order16_coord_[165] */ \
+  {2.22044604925031308e-16, 1.00000000000000022e+00, -4.44089209850062616e-16, \
+   1.72652007459386422e-16, -1.72652007459386422e-16, 1.00000000000000000e+00, \
+   9.99999999999999556e-01, 1.67697146066824836e-16, 2.76392063783237780e-16, \
+   5.51287671788707190e-02, 9.39886358357719054e-01, 4.98487446341022711e-03, \
+   6.97876983249687277e-03, 5.43806683058353502e-02, 9.38640561861667777e-01, \
+   9.37963548813877668e-01, 9.39400491638755185e-03, 5.26424462697347786e-02, \
+   3.66619396286766500e-02, 1.64345086362403456e-02, 9.46903551735083004e-01, \
+   1.67139052970596280e-02, 9.46948726986246103e-01, 3.63373677166942688e-02, \
+   9.42217145243293808e-01, 4.26604005767651506e-02, 1.51224541799410417e-02, \
+   1.18395699389696601e-01, 1.22269495438720680e-02, 8.69377351066431325e-01, \
+   1.21386193179034985e-02, 8.67369652104666988e-01, 1.20491728577429513e-01, \
+   1.38549201074093298e-01, 8.45674402138906656e-01, 1.57763967870000466e-02, \
+   1.56119497522677064e-02, 1.39575963210261389e-01, 8.44812087037470905e-01, \
+   8.54716865118515079e-01, 1.31782174323082840e-01, 1.35009605584020809e-02, \
+   8.38676993516376368e-01, 1.57955126300247592e-02, 1.45527493853598866e-01, \
+   2.47883957465546700e-01, 7.36546288443630570e-01, 1.55697540908227294e-02, \
+   2.48047467521941595e-01, 1.39688430330388181e-02, 7.37983689445019575e-01, \
+   1.54489124190416716e-02, 2.54789518603903087e-01, 7.29761568977055242e-01, \
+   1.40536794130045051e-02, 7.31638652255490185e-01, 2.54307668331505310e-01, \
+   7.14650647525855276e-01, 1.57253728950845356e-02, 2.69623979579060202e-01, \
+   7.19291320004516122e-01, 2.66230284364682601e-01, 1.44783956308012773e-02, \
+   7.34816524385439873e-02, 8.67350406521407824e-01, 5.91679410400481887e-02, \
+   6.23723757982518195e-02, 7.41493666956614256e-02, 8.63478257506086755e-01, \
+   5.64947509640178147e-01, 1.59285948360033090e-02, 4.19123895523818568e-01, \
+   4.03471605078646045e-01, 1.56061028067777056e-02, 5.80922292114576355e-01, \
+   3.93065372986517114e-01, 5.91009481748388743e-01, 1.59251452650941427e-02, \
+   1.58528135007360294e-02, 4.03477149688871994e-01, 5.80670036810391865e-01, \
+   1.55759225172019677e-02, 5.69474562852597677e-01, 4.14949514630200356e-01, \
+   8.56028762075832783e-01, 6.78493700650298209e-02, 7.61218678591373960e-02, \
+   5.57652171741686020e-01, 4.26596859027159547e-01, 1.57509692311544325e-02, \
+   1.58711917968908656e-01, 6.70982507889701790e-02, 7.74189831242121151e-01, \
+   1.65257027288124081e-01, 7.52831023147951472e-01, 8.19119495639244466e-02, \
+   6.69143759151381579e-02, 7.75372778355688519e-01, 1.57712845729173323e-01, \
+   8.06983742470389620e-02, 1.68907315778736744e-01, 7.50394309974224294e-01, \
+   7.60435265981276642e-01, 1.68733583291941547e-01, 7.08311507267818108e-02, \
+   7.41575866479260215e-01, 8.21244708436324466e-02, 1.76299662677107338e-01, \
+   2.90354968333863872e-01, 6.28870536334479868e-01, 8.07744953316562597e-02, \
+   6.13421339495847429e-01, 8.11413015265752130e-02, 3.05437358977577345e-01, \
+   8.03401946048588056e-02, 2.96911206508048198e-01, 6.22748598887093108e-01, \
+   2.98521053628375943e-01, 7.67542314170573392e-02, 6.24724714954566718e-01, \
+   7.65491844989589776e-02, 6.22302233384477099e-01, 3.01148582116563923e-01, \
+   6.11711534686959046e-01, 3.10378628805096313e-01, 7.79098365079446409e-02, \
+   4.57714874646253878e-01, 8.19218215186586080e-02, 4.60363303835087556e-01, \
+   4.46142332818981191e-01, 4.71702266501346945e-01, 8.21554006796718639e-02, \
+   8.15831550859882348e-02, 4.54660341525047307e-01, 4.63756503388964458e-01, \
+   1.87663085257486151e-01, 1.70109133923693812e-01, 6.42227780818820149e-01, \
+   1.69570213325764829e-01, 6.40600432948674525e-01, 1.89829353725560646e-01, \
+   6.34777673094082173e-01, 1.91226758371660088e-01, 1.73995568534257739e-01, \
+   3.31577016252400436e-01, 1.88531576707023696e-01, 4.79891407040575868e-01, \
+   1.87871344418995001e-01, 4.77292995769074468e-01, 3.34835659811930531e-01, \
+   1.91505318098148747e-01, 3.12697462175977048e-01, 4.95797219725874205e-01, \
+   3.11122038514993648e-01, 4.96122594594562871e-01, 1.92755366890443480e-01, \
+   4.91017887987217960e-01, 1.92880531286706181e-01, 3.16101580726075804e-01, \
+   4.74506574489367838e-01, 3.36004145381649799e-01, 1.89489280128982363e-01, \
+   3.31914842734057136e-01, 3.33728055084797526e-01, 3.34357102181145338e-01}
+
+#define SIQK_QUADRATURE_TRITAY_ORDER16_WEIGHT /* 55 weights; initializes TriangleQuadrature::tritay_order16_weight_[55] */ \
+  {3.101299925557040e-04, 3.157587355864167e-04, 3.543300779435999e-04, \
+   2.758185808404191e-03, 3.134620382788961e-03, 3.926570441300832e-03,  \
+   4.727574193224073e-03, 4.891225563554369e-03, 4.993082174472287e-03,  \
+   6.877690940807241e-03, 7.048958902004150e-03, 7.482343216857858e-03,  \
+   7.804875180599580e-03, 7.884184667408244e-03, 8.789727319135741e-03,  \
+   1.020569201350139e-02, 1.047814393079899e-02, 1.053567064989013e-02,  \
+   1.088233801010153e-02, 1.111442043493028e-02, 1.120933468410323e-02,  \
+   1.150613084965787e-02, 1.184069512498871e-02, 1.287323216839533e-02,  \
+   1.289784008040242e-02, 1.290361638049960e-02, 1.301716160293398e-02,  \
+   1.328840708045580e-02, 1.328923809154386e-02, 1.337661646188983e-02,  \
+   1.878939033204372e-02, 1.915329470976454e-02, 1.924248475126509e-02,  \
+   1.948099129262171e-02, 1.973020557737488e-02, 2.061823890489025e-02,  \
+   2.564362192416913e-02, 2.582028209673193e-02, 2.591150211345546e-02,  \
+   2.642639940905077e-02, 2.692527865136344e-02, 2.709476646596388e-02,  \
+   2.923685732222178e-02, 2.964315841816427e-02, 2.971791383743251e-02,  \
+   3.159001279314883e-02, 3.164634225766622e-02, 3.203536808857846e-02,  \
+   4.060202979591518e-02, 4.072187567651760e-02, 4.073396006206902e-02,  \
+   4.075252740422450e-02, 4.075823324694786e-02, 4.084655298115641e-02,  \
+   4.616091672652638e-02}
+
+#define SIQK_QUADRATURE_TRITAY_ORDER18_COORD /* 66 points x 3 coordinates; initializes TriangleQuadrature::tritay_order18_coord_[198] */ \
+  {7.07029890425770434e-03, 1.16731059668412299e-02, 9.81256595128901066e-01, \
+   1.18506636748826333e-02, 9.81003085838793698e-01, 7.14625048632366866e-03, \
+   9.77787974953233552e-01, 1.06966317091697870e-02, 1.15153933375966612e-02, \
+   1.21952425108865503e-02, 9.38247698355045179e-01, 4.95570591340682709e-02, \
+   5.03248860967756076e-02, 1.26627518417214337e-02, 9.37012362061502957e-01, \
+   9.28052601109434661e-01, 5.98109409983804755e-02, 1.21364578921848640e-02, \
+   9.24985307647630872e-01, 1.37363297926722354e-02, 6.12783625596968889e-02, \
+   6.29343769992106727e-02, 9.22952795940546356e-01, 1.41128270602429717e-02, \
+   1.46695353279870377e-02, 6.33107354992695215e-02, 9.22019729172743441e-01, \
+   8.38221442443636167e-01, 1.17265100334603151e-02, 1.50052047522903520e-01, \
+   1.20132291087278187e-02, 1.55472058732347040e-01, 8.32514712158925141e-01, \
+   1.53147795225895278e-01, 8.34329388898221724e-01, 1.25228158758829977e-02, \
+   1.26364459307456434e-02, 8.50163803195673196e-01, 1.37199750873581161e-01, \
+   1.39355658599882609e-01, 1.28816350521976618e-02, 8.47762706347919726e-01, \
+   8.35267146700183760e-01, 1.51080160895878751e-01, 1.36526924039374886e-02, \
+   4.12764350243855882e-01, 1.01917879216578220e-02, 5.77043861834486305e-01, \
+   1.19773841073520515e-02, 2.81337239930327110e-01, 7.06685375962320839e-01, \
+   2.75105559050908943e-01, 7.12437462850100567e-01, 1.24569780989904899e-02, \
+   7.11523343775096961e-01, 2.76302525086338957e-01, 1.21741311385640816e-02, \
+   5.69603491897309744e-01, 1.09658368560618374e-02, 4.19430671246628417e-01, \
+   1.11273414647166669e-02, 4.28911051788389452e-01, 5.59961606746893992e-01, \
+   5.66810345010056338e-01, 4.21542055511477942e-01, 1.16475994784657200e-02, \
+   4.17052309556705914e-01, 5.71125859044442907e-01, 1.18218313988511792e-02, \
+   1.15242148311881509e-02, 5.82686827051090317e-01, 4.05788958117721532e-01, \
+   7.14440844241883699e-01, 1.30567806713246960e-02, 2.72502375086791593e-01, \
+   2.64452707580261070e-01, 1.30760400963919332e-02, 7.22471252323346969e-01, \
+   1.33578918342581732e-02, 7.26343706240674458e-01, 2.60298401925067369e-01, \
+   8.68135265415298840e-01, 6.87230068637382230e-02, 6.31417277209629368e-02, \
+   6.27086061132897665e-02, 8.65230210152941437e-01, 7.20611837337687966e-02, \
+   7.60967385052684769e-02, 6.48599071037368607e-02, 8.59043354390994662e-01, \
+   6.27716704398273706e-02, 1.48349494336207116e-01, 7.88878835223965513e-01, \
+   7.88170460224977831e-01, 6.24359898395942040e-02, 1.49393549935427972e-01, \
+   1.47224894550839758e-01, 7.87136901173502213e-01, 6.56382042756580297e-02, \
+   4.22525938278520530e-01, 5.19104921609511785e-02, 5.25563569560528299e-01, \
+   7.74048614563915161e-01, 1.54312992744383953e-01, 7.16383926917008862e-02, \
+   6.76067776910891149e-01, 2.61784274560294683e-01, 6.21479485288141675e-02, \
+   6.74530572355868108e-02, 7.66725787281281046e-01, 1.65821155483132143e-01, \
+   6.17776557233678525e-02, 2.58210367662733586e-01, 6.80011976613898561e-01, \
+   1.74941863707076289e-01, 6.79065925147429861e-02, 7.57151543778180725e-01, \
+   5.84917884088599349e-02, 5.29357827480425258e-01, 4.12150384110714807e-01, \
+   6.72145076162932620e-01, 6.66036150484161232e-02, 2.61251308788651271e-01, \
+   5.51208842356557649e-01, 5.85675461899432051e-02, 3.90223611453499153e-01, \
+   2.98183807982819626e-01, 6.44535360410836422e-02, 6.37362655976096759e-01, \
+   2.61427822878740113e-01, 6.74813842915130246e-01, 6.37583342061296410e-02, \
+   5.82159599068178268e-02, 3.91460231036876105e-01, 5.50323809056306068e-01, \
+   6.75570147429912504e-02, 6.48770149230717630e-01, 2.83672836026291120e-01, \
+   5.44832625703827067e-01, 3.94649822040802345e-01, 6.05175522553705880e-02, \
+   3.99787267113028255e-01, 5.39013715193329634e-01, 6.11990176936421104e-02, \
+   1.51078277618042822e-01, 1.62789508278475825e-01, 6.86132214103481353e-01, \
+   1.61959533146025403e-01, 6.81243632264066146e-01, 1.56796834589908451e-01, \
+   6.78965449795995379e-01, 1.54283287802020219e-01, 1.66751262401984401e-01, \
+   4.97246831616064200e-01, 2.52272775044453668e-01, 2.50480393339482132e-01, \
+   2.45792781854977660e-01, 2.54798153240703207e-01, 4.99409064904319133e-01, \
+   2.75839635471827105e-01, 1.48558054919434857e-01, 5.75602309608738039e-01, \
+   1.41286303940196589e-01, 2.93023960643619241e-01, 5.65689735416184170e-01, \
+   5.75308715344231558e-01, 2.80899127230990808e-01, 1.43792157424777634e-01, \
+   2.66045287116412177e-01, 4.82098959297083796e-01, 2.51855753586504028e-01, \
+   2.89515501140379161e-01, 5.64187824544361005e-01, 1.46296674315259834e-01, \
+   4.20272276953932211e-01, 1.30769964434388403e-01, 4.48957758611679414e-01, \
+   5.51913339122326096e-01, 1.47969222194756778e-01, 3.00117438682917126e-01, \
+   1.54754368775656848e-01, 5.63868422294592553e-01, 2.81377208929750600e-01, \
+   1.38678912478906013e-01, 4.36115742879047474e-01, 4.25205344642046457e-01, \
+   3.79754605982586757e-01, 3.60326393528548949e-01, 2.59919000488864349e-01, \
+   4.32257322202306393e-01, 4.22418833467425037e-01, 1.45323844330268570e-01, \
+   2.50087546338060018e-01, 3.71900183305238496e-01, 3.78012270356701430e-01, \
+   3.73879170813181227e-01, 2.41364500692846234e-01, 3.84756328493972566e-01}
+#define SIQK_QUADRATURE_TRITAY_ORDER18_WEIGHT /* 66 weights; initializes TriangleQuadrature::tritay_order18_weight_[66] */ \
+  {1.258287849322552e-03, 1.263672600361209e-03, 1.663464766659172e-03, \
+   4.075174606270012e-03, 4.306776287080819e-03, 4.389337308965301e-03,  \
+   4.854979278083793e-03, 5.123310595743368e-03, 5.419884417037201e-03,  \
+   6.469269508792310e-03, 6.816991179147562e-03, 6.923866407332497e-03,  \
+   6.971077005242425e-03, 7.206069998379916e-03, 7.685172776701560e-03,  \
+   8.124490112628030e-03, 8.485915214007324e-03, 8.504426621066338e-03,  \
+   8.547676033732530e-03, 8.694442727954849e-03, 8.727198121935910e-03,  \
+   8.920337864331938e-03, 8.922343193968446e-03, 8.952316877617482e-03,  \
+   9.062987810035171e-03, 9.239241944101240e-03, 9.289678218556065e-03,  \
+   1.016085758882769e-02, 1.068858309045880e-02, 1.159584270491392e-02,  \
+   1.372133554295597e-02, 1.451509611701859e-02, 1.472613692527127e-02,  \
+   1.497181258145377e-02, 1.535134740593910e-02, 1.626316829313562e-02,  \
+   1.639421042530506e-02, 1.656173375959963e-02, 1.730837634372872e-02,  \
+   1.735406869880698e-02, 1.736860247019273e-02, 1.742643812271074e-02,  \
+   1.743007805929840e-02, 1.777357849874442e-02, 1.800914981913493e-02,  \
+   1.814631429213930e-02, 1.909488510415974e-02, 1.961264000589436e-02,  \
+   2.413550629437514e-02, 2.449560607831186e-02, 2.486104169360984e-02,  \
+   2.535328684929062e-02, 2.548859970214835e-02, 2.606800318335970e-02,  \
+   2.617304374623586e-02, 2.622203417758513e-02, 2.637298224112941e-02,  \
+   2.647245318638137e-02, 2.711977972504153e-02, 2.717351017096441e-02,  \
+   2.735502743194343e-02, 2.786441729563326e-02, 2.888671321165472e-02,  \
+   2.926968908113495e-02, 3.045196253398069e-02, 3.186369822247498e-02}
+
+// Tabulated triangle quadrature rules. Every instance carries its own copy of
+// the tables: coordinate arrays hold 3 values per point, weight arrays 1.
+class TriangleQuadrature {
+  const Real trisym_order4_coord_  [ 18] = SIQK_QUADRATURE_TRISYM_ORDER4_COORD;
+  const Real trisym_order4_weight_ [  6] = SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT;
+  const Real tritay_order6_coord_  [ 33] = SIQK_QUADRATURE_TRITAY_ORDER6_COORD;
+  const Real tritay_order6_weight_ [ 11] = SIQK_QUADRATURE_TRITAY_ORDER6_WEIGHT;
+  const Real trisym_order8_coord_  [ 48] = SIQK_QUADRATURE_TRISYM_ORDER8_COORD;
+  const Real trisym_order8_weight_ [ 16] = SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT;
+#ifdef SIQK_USE_TRITAY12
+  const Real tritay_order12_coord_ [ 96] = SIQK_QUADRATURE_TRITAY_ORDER12_COORD;
+  const Real tritay_order12_weight_[ 32] = SIQK_QUADRATURE_TRITAY_ORDER12_WEIGHT;
+#else
+  const Real trisym_order12_coord_ [ 99] = SIQK_QUADRATURE_TRISYM_ORDER12_COORD;
+  const Real trisym_order12_weight_[ 33] = SIQK_QUADRATURE_TRISYM_ORDER12_WEIGHT;
+#endif
+  const Real trisym_order14_coord_ [138] = SIQK_QUADRATURE_TRISYM_ORDER14_COORD;
+  const Real trisym_order14_weight_[ 46] = SIQK_QUADRATURE_TRISYM_ORDER14_WEIGHT;
+  const Real tritay_order16_coord_ [165] = SIQK_QUADRATURE_TRITAY_ORDER16_COORD;
+  const Real tritay_order16_weight_[ 55] = SIQK_QUADRATURE_TRITAY_ORDER16_WEIGHT;
+  const Real tritay_order18_coord_ [198] = SIQK_QUADRATURE_TRITAY_ORDER18_COORD;
+  const Real tritay_order18_weight_[ 66] = SIQK_QUADRATURE_TRITAY_ORDER18_WEIGHT;
+  const Real trisym_order20_coord_ [264] = SIQK_QUADRATURE_TRISYM_ORDER20_COORD;
+  const Real trisym_order20_weight_[ 88] = SIQK_QUADRATURE_TRISYM_ORDER20_WEIGHT;
+
+public:
+  KOKKOS_INLINE_FUNCTION TriangleQuadrature () {}
+
+  // Point coord and weight at the tables for the requested order (4, 6, 8, 12,
+  // 14, 16, 18, 20). Any other order calls ko::abort; outputs are not written.
+  KOKKOS_INLINE_FUNCTION
+  void get_coef (const int order, RawConstVec3s& coord,
+                 RawConstArray& weight) const {
+    const Real *xs = nullptr, *ws = nullptr;  // selected coordinate/weight tables
+    int npts = 0;                             // number of points in the rule
+    switch (order) {
+    case 4:
+      xs = trisym_order4_coord_; ws = trisym_order4_weight_; npts = 6;
+      break;
+    case 6:
+      xs = tritay_order6_coord_; ws = tritay_order6_weight_; npts = 11;
+      break;
+    case 8:
+      xs = trisym_order8_coord_; ws = trisym_order8_weight_; npts = 16;
+      break;
+    case 12:
+#ifdef SIQK_USE_TRITAY12
+      xs = tritay_order12_coord_; ws = tritay_order12_weight_; npts = 32;
+#else
+      xs = trisym_order12_coord_; ws = trisym_order12_weight_; npts = 33;
+#endif
+      break;
+    case 14:
+      xs = trisym_order14_coord_; ws = trisym_order14_weight_; npts = 46;
+      break;
+    case 16:
+      xs = tritay_order16_coord_; ws = tritay_order16_weight_; npts = 55;
+      break;
+    case 18:
+      xs = tritay_order18_coord_; ws = tritay_order18_weight_; npts = 66;
+      break;
+    case 20:
+      xs = trisym_order20_coord_; ws = trisym_order20_weight_; npts = 88;
+      break;
+    default:
+      ko::abort("TriangleQuadrature::get_coef: order not supported.");
+      return;  // leave coord and weight untouched if abort returns
+    }
+    coord = RawConstVec3s(xs, npts);
+    weight = RawConstArray(ws, npts);
+  }
+};
+
+} // namespace siqk
+
+#endif // INCLUDE_SIQK_QUADRATURE_HPP
diff --git a/siqk/siqk_runtests.py b/siqk/siqk_runtests.py
new file mode 100755
index 0000000..8a05d9a
--- /dev/null
+++ b/siqk/siqk_runtests.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+
+import os, sys
+
+quick = True
+exe = sys.argv[1]
+testno = int(sys.argv[2])
+
+stride = 2
+biggest = 1111
+
+xlates = [4.2*10**f for f in range(-17, 0, stride)]
+xlates.append(0)
+
+ylates = [0]
+
+angles = xlates
+
+fails = []
+cnt = 0
+
+if testno == 0:
+    for n in [4, 50, 511, biggest]:
+        if quick and n > 50: break
+        for angle in angles:
+            for xlate in xlates:
+                for ylate in ylates:
+                    cmd = ('OMP_NUM_THREADS=8 {exe:s} --testno 0 --xlate {xlate:1.15e} --ylate {ylate:1.14e} --angle {angle:1.15e} -n {n:d}'.
+                           format(exe=exe, xlate=xlate, ylate=ylate, angle=angle, n=n))
+                    stat = os.system(cmd + ' |& grep PASSED &> /dev/null')
+                    if stat:
+                        fails.append(cmd)
+                    else:
+                        cnt += 1
+        print len(fails)
+
+elif testno == 1:
+    for n in [4, 20, 40, 79]:
+        if quick and n > 20: break
+        for angle in angles:
+            cmd = ('OMP_NUM_THREADS=8 {exe:s} --testno 1 --angle {angle:1.15e} -n {n:d}'.
+                   format(exe=exe, angle=angle, n=n))
+            stat = os.system(cmd + ' |& grep PASSED &> /dev/null')
+            if stat:
+                fails.append(cmd)
+            else:
+                cnt += 1
+        print len(fails)
+    
+if len(fails) > 0:
+    print 'FAILED'
+    for f in fails:
+        print f
+    sys.exit(-1)
+else:
+    print 'PASSED ({0:d})'.format(cnt)
+    sys.exit(0)
diff --git a/siqk/siqk_search.hpp b/siqk/siqk_search.hpp
new file mode 100644
index 0000000..1e517e3
--- /dev/null
+++ b/siqk/siqk_search.hpp
@@ -0,0 +1,378 @@
+#ifndef INCLUDE_SIQK_SEARCH_HPP
+#define INCLUDE_SIQK_SEARCH_HPP
+
+#include "siqk_defs.hpp"
+#include "siqk_geometry.hpp"
+#include <vector>
+
+namespace siqk {
+
+// Oct-tree. Might do something else better suited to the sphere later.
+template <typename Geo, Int max_depth_ = 10>
+class Octree {
+public:
+  enum { max_depth = max_depth_ };
+  typedef Real BoundingBox[6];
+
+  struct Options {
+    // Do not go beyond max_depth_ depth, including the root and leaf. With this
+    // constraint, try to go deep enough so that a leaf has no more than
+    // max_nelem elements.
+    Int max_nelem;
+    Options () : max_nelem(8) {}
+  };
+
+  // Bounding box for a cluster of points ps (possibly vertices).
+  template <typename CV3s, typename BB>
+  static void calc_bb (const CV3s& ps, const Int np, BB bb) {
+    if (np == 0) return; // No points: bb is left untouched.
+    // Seed the lower corner bb[0:3] and upper corner bb[3:6] with point 0.
+    for (Int d = 0; d < 3; ++d) bb[d] = bb[d+3] = ps(0,d);
+    // Grow the box over the remaining points, then pad it (see pad_bb).
+    for (Int ip = 1; ip < np; ++ip)
+      for (Int d = 0; d < 3; ++d) {
+        bb[d] = min(bb[d], ps(ip,d));
+        bb[d+3] = max(bb[d+3], ps(ip,d));
+      }
+    pad_bb(bb);
+  }
+
+  // Bounding box ebb of one element; e lists up to ne vertex indices into p.
+  template <typename CV3s, typename CIV, typename BB>
+  KOKKOS_INLINE_FUNCTION
+  static void calc_bb (const CV3s& p, const CIV e, const Int ne, BB ebb) {
+    for (Int j = 0; j < 3; ++j) ebb[j] = ebb[j+3] = p(e[0], j);
+    for (Int i = 1; i < ne; ++i) {
+      if (e[i] < 0) break; // Any <0 entry ends the vertex list, not just -1.
+      for (Int j = 0; j < 3; ++j) {
+        ebb[j] = min(ebb[j], p(e[i], j));
+        ebb[j+3] = max(ebb[j+3], p(e[i], j));
+      }
+    }
+    pad_bb(ebb);
+  }
+
+  // A box built from the vertices of a spherical polygon can miss the part of
+  // the sphere that bulges out between them; enlarge it to cover that part.
+  // For PlaneGeometry no padding is applied and bb is left unchanged.
+  template <typename BB>
+  KOKKOS_INLINE_FUNCTION
+  static void pad_bb (BB bb) {
+    if (std::is_same<Geo, PlaneGeometry>::value) return;
+    // Half the length of the box's diagonal, limited to the circle's radius.
+    Real diag2 = 0;
+    for (Int d = 0; d < 3; ++d) diag2 += square(bb[d+3] - bb[d]);
+    const Real hl = min(1.0, 0.5*std::sqrt(diag2));
+    // Max distance from a chord of length 2 hl to the unit circle:
+    //     hl = sin theta
+    //    pad = 1 - cos theta = 1 - sqrt(1 - sin^2 theta) = 1 - sqrt(1 - hl^2).
+    const Real pad = 1 - std::sqrt(1 - square(hl));
+    for (Int d = 0; d < 3; ++d) { bb[d] -= pad; bb[d+3] += pad; }
+  }
+
+  template <typename CV3s>
+  static void calc_bb (const CV3s& ps, BoundingBox bb) {
+    calc_bb(ps, nslices(ps), bb); // Bound all nslices(ps) points.
+  }
+
+  template <typename CV3s, typename CIs, typename V6s>
+  static void calc_bb (const CV3s& p, const CIs& e, V6s& ebbs) {
+    assert(nslices(ebbs) == nslices(e)); // One bounding box per element.
+    for (Int k = 0, klim = nslices(e); k < klim; ++k)
+      calc_bb(p, slice(e, k), szslice(e), slice(ebbs, k)); // Fills ebbs(k,:).
+  }
+
+  // p is a 3xNp array of points. e is a KxNe array of elements. An entry <0 is
+  // ignored. All <0 entries must be at the end of an element's list.
+  Octree (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e,
+          const Options& o) {
+    init(p, e, o);
+  }
+  Octree (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) {
+    init(p, e, Options()); // Default-constructed options.
+  }
+
+  Octree() {} // Builds nothing; call init later.
+  // Build the tree over elements e with vertices p, using default Options.
+  void init (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) {
+    const Options default_options;
+    init(p, e, default_options);
+  }
+
+  // Apply f to every element in leaf nodes with which bb overlaps. f must have
+  // function
+  //     void operator(const Int element).
+  template <typename CV, typename Functor>
+  KOKKOS_INLINE_FUNCTION
+  void apply (const CV bb, Functor& f) const {
+    if (nslices(nodes_) == 0) {
+      // No interior nodes: the root is the only leaf, holding every element.
+      for (Int i = 0; i < offsets_[1]; ++i) f(elems_[i]);
+      return;
+    }
+#ifdef SIQK_NONRECURSIVE
+    // Non-recursive impl.
+    {
+      // Stack. Frame sp holds a node index sni[sp], the next child to visit
+      // si[sp], and the node's bounding box in the 6 entries at snbb + 8*sp.
+      Real snbb[8*max_depth_];
+      Int sni[max_depth_], si[max_depth_];
+      Int sp = 0;
+      // Args for top-level call. bb_ has 6 entries, so copy exactly 6.
+      copy(snbb, bb_, 6);
+      sni[sp] = 0;
+      si[sp] = 0;
+      while (sp >= 0) {
+        // Get stack frame's (nbb, ni, current i) values.
+        const Int i = si[sp];
+        if (i == 8) {
+          --sp;
+          continue;
+        }
+        // Advance the frame's stored child index; 'i' keeps the current one.
+        ++si[sp];
+        const Int ni = sni[sp];
+        const Real* const nbb = snbb + 8*sp;
+        // Can use the next stack frame's bb space for a child bb.
+        Real* const child_bb = snbb + 8*(sp+1);
+        fill_child_bb(nbb, i, child_bb);
+        if ( ! do_bb_overlap(child_bb, bb)) continue;
+        Int e = nodes_(ni,i);
+        if (e < 0) {
+          // Leaf, so apply functor to each element.
+          e = std::abs(e + 1);
+          for (Int k = offsets_[e]; k < offsets_[e+1]; ++k)
+            f(elems_[k]);
+        } else if (e > 0) {
+          // Recurse.
+          ++sp;
+          sni[sp] = e;
+          si[sp] = 0;
+        }
+      }
+    }
+#else
+    apply_r(0, bb_, bb, f);
+#endif
+  }
+
+private:
+  /* Each node in the oct-tree contains 8 integers, stored in 'nodes'.
+
+     >0 is an index into 'nodes', pointing to a child node.
+
+     A <=0 entry in 'nodes' indicates a leaf node. If 0, there are no elements
+     in the leaf. If <0, the negative of the entry minus 1 is the index of an
+     offset array indexing 'elems'.
+
+     Each segment of 'elems' contains a list of element indices covered by a
+     leaf node. Element indices refer to the list of elements the caller
+     provides during oct-tree construction.
+  */
+
+  // Static data structures holding the completed octree.
+  //   nodes(:,i) is a list. The list includes children of node i (>0) and leaf
+  // node data (<=0).
+  //todo Make these const once ready to do full GPU stuff.
+  Nodes nodes_;
+  // A leaf node corresponding to entry -k covers elements
+  //     elems[offsets[k-1] : offsets[k]-1].
+  ko::View<int*> offsets_, elems_;
+  // Root node's bounding box.
+  BoundingBox bb_;
+
+  // Dynamic data structures for construction phase.
+  class IntList { // Append-only list in caller-provided storage; unchecked.
+    Int* const data_; // Storage passed to the constructor.
+    Int n_;           // Number of entries pushed since construction or reset.
+  public:
+    IntList (Int* const buf) : data_(buf), n_(0) {}
+    void reset () { n_ = 0; }
+    void push (const Int& i) { data_[n_] = i; ++n_; }
+    Int* data () { return data_; }
+    Int n () const { return n_; }
+    const Int& operator[] (const Int& i) const { return data_[i]; }
+  };
+
+  class DynIntList { // Growable list of Int backed by std::vector.
+    std::vector<Int> v_;
+  public:
+    DynIntList () {}
+    void push (const Int& i) { v_.push_back(i); }
+    Int& back () { return v_.back(); }
+    // Non-const access grows the list (new entries are 0) so that i is valid.
+    Int& operator[] (const size_t i) {
+      if (v_.size() <= i) v_.resize(i+1);
+      return v_[i];
+    }
+    const Int& operator[] (const size_t i) const { return v_[i]; } // Unchecked.
+    Int n () const { return static_cast<Int>(v_.size()); }
+    const Int* data () const { return v_.data(); }
+  };
+
+  // Opposite index slot convention: (slot, node) rather than (node, slot).
+  class DynNodes {
+    std::vector<Int> buf_; // 8 consecutive entries per node.
+  public:
+    Int n () const { return static_cast<Int>(buf_.size()) >> 3; }
+    const Int* data () const { return buf_.data(); }
+    Int& operator() (const Int& r, const Int& c) { // Grows to include node c.
+      const size_t end_c = (c+1) << 3;
+      if (buf_.size() <= end_c) buf_.resize(end_c);
+      const DynNodes& me = *this;
+      return const_cast<Int&>(me(r, c));
+    }
+    const Int& operator() (const Int& r, const Int& c) const {
+      const Int k = (c << 3) + r;
+      assert(k >= 0);
+      assert(k < (Int) buf_.size());
+      return buf_[k];
+    }
+  };
+
+  void init (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e,
+             const Options& o) {
+    const Int ne = nslices(e);
+    if (ne == 0) return; // No elements: nothing is built.
+    // Bounding box of the whole tree, from all points.
+    calc_bb(p, bb_);
+    // Bounding box of each element.
+    Vec6s::HostMirror ebbs("ebbs", ne);
+    calc_bb(p, e, ebbs);
+    // Static element lists for work. Each level has active work space. The
+    // root's list holds every element; wrk is the space after it.
+    std::vector<Int> buf(max_depth_*ne);
+    IntList es(buf.data()), wrk(buf.data() + ne);
+    for (Int i = 0; i < ne; ++i) es.push(i);
+    // Dynamic element lists.
+    DynIntList offsets, elems;
+    offsets[0] = 0;
+    DynNodes nodes; // Dynamic node data structure.
+    // Recurse from the root (depth 1). The return value is not needed; if it
+    // is 0 and nodes.n() == 0, 'apply' detects as much.
+    init_r(1, bb_, ebbs, o, es, wrk, offsets, elems, nodes);
+    // Build the static data structures.
+    if (elems.n() == 0) return;
+    init_static_ds(nodes, offsets, elems);
+  }
+
+  Int init_r (const Int depth, // Tree's depth at this point, including root.
+              const BoundingBox& nbb, // My bounding box.
+              const ConstVec6s::HostMirror& ebbs, // All elements' bounding boxes.
+              const Options& o, // Options controlling construct of the tree.
+              IntList& es, // List of elements in my bounding box.
+              IntList& wrk, // Work space to store working element lists.
+              DynIntList& offsets, // Offsets into elems.
+              DynIntList& elems, // Elements belonging to leaf nodes.
+              DynNodes& nodes) // Dynamic nodes data structure.
+  {
+    const Int my_idx = nodes.n(); // My node index.
+    // Decide what to do.
+    if (es.n() == 0) {
+      // I have no elements, so return 0 to indicate I'm a leaf node containing
+      // nothing.
+      return 0;
+    } else if (es.n() <= o.max_nelem || depth == max_depth_) {
+      // I'm a leaf node with elements. Store my list of elements and return the
+      // storage location, encoded as a negative number (-1 for the first leaf).
+      const Int os = offsets.back();
+      offsets.push(os + es.n());
+      for (Int i = 0, n = es.n(); i < n; ++i)
+        elems[os + i] = es[i];
+      return 1 - offsets.n();
+    } else {
+      // I'm not a leaf node.
+      nodes(0, my_idx) = 0; // Insert myself into the nodes array.
+      for (Int ic = 0; ic < 8; ++ic) {
+        BoundingBox child_bb;
+        fill_child_bb(nbb, ic, child_bb);
+        // Find the elements that are in this child's bb.
+        IntList ces(wrk.data());
+        for (Int i = 0, n = es.n(); i < n; ++i)
+          if (do_bb_overlap(child_bb, slice(ebbs, es[i])))
+            ces.push(es[i]);
+        // Create some work space: the part of wrk after this child's list.
+        IntList cwrk(wrk.data() + ces.n());
+        // Recurse.
+        const Int child_idx = init_r(depth+1, child_bb, ebbs, o, ces, cwrk,
+                                     offsets, elems, nodes);
+        nodes(ic, my_idx) = child_idx;
+      }
+      return my_idx;
+    }
+  }
+
+  // Copy the dynamic construction-phase structures to the static views.
+  void init_static_ds (const DynNodes& nodes, const DynIntList& offsets,
+                       const DynIntList& elems) {
+    ko::resize(nodes_, nodes.n());
+    auto nodes_hm = ko::create_mirror_view(nodes_);
+    // DynNodes is indexed (slot, node); nodes_ is indexed (node, slot).
+    for (Int i = 0; i < nodes.n(); ++i)
+      for (Int j = 0; j < 8; ++j)
+        nodes_hm(i,j) = nodes(j,i);
+    ko::deep_copy(nodes_, nodes_hm);
+    hm_resize_and_copy(offsets_, offsets, offsets.n());
+    hm_resize_and_copy(elems_, elems, elems.n());
+  }
+
+  // Using parent bb p, fill child bb c, with child_idx in 0:7. Children 0:3
+  // are the lower half in z and 4:7 the upper half. Within each half, the
+  // (x,y) quadrants go (lo,lo), (hi,lo), (hi,hi), (lo,hi).
+  template <typename CBB, typename BB>
+  KOKKOS_INLINE_FUNCTION
+  static void fill_child_bb (const CBB& p, const Int& child_idx, BB& c) {
+    if (child_idx < 0 || child_idx > 7) {
+      // impossible
+      error("fill_child_bb: The impossible has happened.");
+      return;
+    }
+    const Int quad = child_idx & 3;
+    // For each axis, does the child take the upper half of the parent?
+    const bool hi[3] = { quad == 1 || quad == 2, quad >= 2, child_idx >= 4 };
+    for (Int d = 0; d < 3; ++d) {
+      // The parent's midpoint along axis d splits it into the two halves.
+      const Real mid = 0.5*(p[d] + p[d+3]);
+      c[d]   = hi[d] ? mid : p[d];
+      c[d+3] = hi[d] ? p[d+3] : mid;
+    }
+  }
+
+  // Do bounding boxes a and b overlap? True iff they overlap on all 3 axes.
+  template <typename BB>
+  KOKKOS_INLINE_FUNCTION
+  static bool do_bb_overlap (const BoundingBox a, const BB b) {
+    bool overlap = true;
+    for (Int d = 0; d < 3 && overlap; ++d)
+      overlap = do_lines_overlap(a[d], a[d+3], b[d], b[d+3]);
+    return overlap;
+  }
+
+  KOKKOS_INLINE_FUNCTION // Intervals [a1,a2] and [b1,b2]; touching counts.
+  static bool do_lines_overlap (const Real& a1, const Real& a2,
+                                const Real& b1, const Real& b2) {
+    return (a2 < b1) ? false : ! (a1 > b2);
+  }
+
+  // Recursive worker for apply: ni is a node index and nbb its bounding box.
+  template <typename CV, typename Functor> KOKKOS_INLINE_FUNCTION
+  void apply_r (const Int ni, const BoundingBox& nbb, const CV bb,
+                Functor& f) const {
+    for (Int ic = 0; ic < 8; ++ic) {
+      BoundingBox cbb;
+      fill_child_bb(nbb, ic, cbb);
+      if ( ! do_bb_overlap(cbb, bb)) continue;
+      const Int entry = nodes_(ni,ic);
+      if (entry > 0) {
+        apply_r(entry, cbb, bb, f); // Interior child node.
+      } else if (entry < 0) { // Leaf with elements (0 marks an empty leaf).
+        const Int os = std::abs(entry + 1);
+        for (Int k = offsets_[os]; k < offsets_[os+1]; ++k) f(elems_[k]);
+      }
+    }
+  }
+};
+
+} // namespace siqk
+
+#endif // INCLUDE_SIQK_SEARCH_HPP
diff --git a/siqk/siqk_sqr.hpp b/siqk/siqk_sqr.hpp
new file mode 100644
index 0000000..c50f710
--- /dev/null
+++ b/siqk/siqk_sqr.hpp
@@ -0,0 +1,267 @@
+#ifndef INCLUDE_SIQK_SQR_HPP
+#define INCLUDE_SIQK_SQR_HPP
+
+#include "siqk_defs.hpp"
+#include "siqk_intersect.hpp"
+
+namespace siqk {
+namespace sqr { // spherical quadrilateral <-> reference square
+/* Let p be a 3x4 matrix with p(:,i) the i'th vertex in a spherical quad in CCW
+   order. Let (a,b) be coordinates in the unit square [0,1]^2. (The routines
+   below take (a,b) in [-1,1]^2 and first map them to [0,1]^2.) (a,b) = (0,0)
+   corresponds to p(:,1); (1,0) is p(:,2); (1,1) is p(:,3); (0,1) is p(:,4).
+     The map from reference square to bilinear quad can be written
+       T = p*[ 1 -1 1 -1
+              -1  1 0  0
+              -1  0 0  1
+               1  0 0  0]';
+       f(a,b) = T(:,1)*a*b + T(:,2)*a + T(:,3)*b + T(:,4);
+   The map to the sphere is then completed with
+       g(a,b) = norm(f(a,b))
+       q = f(a,b) / g(a,b).
+   The Jacobian matrix for q is given by
+       q_a = f_a/g - (f g_a)/g^2
+       g_a = g_f f_a
+   and similarly for q_b.
+*/
+
+namespace impl {
+// Compute T(i,:), i.e., component i of the four coefficient vectors of the
+// bilinear map: t1 multiplies a*b, t2 a, t3 b, and t4 is the constant.
+template <typename ConstVec3sT, typename Quad>
+KOKKOS_INLINE_FUNCTION
+void calc_T_row (const ConstVec3sT& p, const Quad& e, const Int i,
+                 Real& t1, Real& t2, Real& t3, Real& t4) {
+  const Real v0 = p(e[0],i), v1 = p(e[1],i), v2 = p(e[2],i), v3 = p(e[3],i);
+  t4 = v0; t3 = v3 - v0; t2 = v1 - v0;
+  t1 = (v2 - t2) - v3;
+}
+
+// Compute q = T(:,1)*a*b + T(:,2)*a + T(:,3)*b + T(:,4), (a,b) in [-1,1]^2.
+template <typename ConstVec3sT, typename Quad>
+KOKKOS_INLINE_FUNCTION
+void calc_ref_to_bilinear (const ConstVec3sT& p, const Quad& e,
+                           Real a, Real b, Real q[3]) {
+  // Shift from [-1,1] to the [0,1] coordinates in which T is written.
+  const Real s = 0.5*(a + 1), t = 0.5*(b + 1);
+  for (Int d = 0; d < 3; ++d) {
+    Real c_st, c_s, c_t, c_1;
+    impl::calc_T_row(p, e, d, c_st, c_s, c_t, c_1);
+    q[d] = c_st*s*t + c_s*s + c_t*t + c_1;
+  }
+}
+
+// The residual function is r(a,b) = f(a,b)/g(a,b) - q.
+template <typename ConstVec3sT, typename Quad>
+KOKKOS_INLINE_FUNCTION
+void calc_residual (const ConstVec3sT& p, const Quad& e, const Real a,
+                    const Real b, const Real q[3], Real r[3]) {
+  calc_ref_to_bilinear(p, e, a, b, r); // r holds f(a,b) for now.
+  const Real g = std::sqrt(SphereGeometry::norm2(r));
+  // Normalize f and subtract the target point.
+  for (Int d = 0; d < 3; ++d) { r[d] /= g; r[d] -= q[d]; }
+}
+
+// Compute the Jacobian matrix of the residual function: Jacobian(ref square ->
+// sphere). J is 3x2, stored by column: J[0:3] is q_a and J[3:6] is q_b.
+//   TODO Consider rewriting this in terms of the p=1 basis isoparametric
+// interpolation formulation. Better performance? See
+// calc_isoparametric_jacobian in slmmir.cpp.
+template <typename ConstVec3sT, typename Quad>
+KOKKOS_INLINE_FUNCTION
+void calc_Jacobian (const ConstVec3sT& p, const Quad& e, Real a, Real b,
+                    Real J[6]) {
+  a = 0.5*(a + 1); // Shift (a,b) from [-1,1] to [0,1]; the derivatives below
+  b = 0.5*(b + 1); // are with respect to these shifted coordinates.
+  Real r[3];
+  for (Int i = 0; i < 3; ++i) {
+    Real t1, t2, t3, t4;
+    calc_T_row(p, e, i, t1, t2, t3, t4);
+    r[  i] = t1*a*b + t2*a + t3*b + t4; // f
+    J[  i] = t1*b + t2;                 // f_a
+    J[3+i] = t1*a + t3;                 // f_b
+  }
+  Real rtJ[2] = {0}; // rtJ[j] = f . (column j of the bilinear Jacobian).
+  for (Int j = 0; j < 2; ++j) {
+    const Real* const Jj = J + 3*j;
+    for (Int i = 0; i < 3; ++i)
+      rtJ[j] += r[i]*Jj[i];
+  }
+  const Real rnorm2 = SphereGeometry::norm2(r), rnorm = std::sqrt(rnorm2);
+  for (Int j = 0; j < 2; ++j) {
+    Real* const Jj = J + 3*j;
+    for (Int i = 0; i < 3; ++i)
+      Jj[i] = (Jj[i] - r[i]*rtJ[j]/rnorm2)/rnorm; // q_x = f_x/g - f (f.f_x)/g^3
+  }
+}
+
+// Solve J dx = r (3 equations, 2 unknowns) by QR. J is overwritten.
+KOKKOS_INLINE_FUNCTION
+void solve_Jxr (Real J[6], const Real r[3], Real dx[2]) {
+  // QR factorization: J -> J [n1 a; 0 n2].
+  const Real n1 = std::sqrt(SphereGeometry::norm2(J));
+  SphereGeometry::scale(1/n1, J); // Column 0 now has unit length.
+  const Real a = SphereGeometry::dot(J, J+3);
+  SphereGeometry::axpy(-a, J, J+3); // Remove column 0's part from column 1.
+  const Real n2 = std::sqrt(SphereGeometry::norm2(J+3));
+  SphereGeometry::scale(1/n2, J+3);
+  // r -> Q' r.
+  Real Qtr[2] = {0};
+  for (Int j = 0; j < 2; ++j) {
+    const Real* const Jj = J + 3*j;
+    for (Int i = 0; i < 3; ++i)
+      Qtr[j] += Jj[i]*r[i];
+  }
+  // dx = R \ (Q' r). The factor 2 converts to the [-1,1] coordinates.
+  dx[1] = 2*(Qtr[1] / n2);
+  dx[0] = 2*((Qtr[0] - a*dx[1]) / n1);
+}
+} // namespace impl
+
+struct Info {
+  bool success;     // True if the last computed residual met the tolerance.
+  Int n_iterations; // Value of the Newton loop counter on exit.
+};
+
+template <typename ConstVec3sT, typename Quad>
+KOKKOS_INLINE_FUNCTION
+void calc_ref_to_sphere (
+  // The spherical quad containing the point: vertices p, vertex indices e.
+  const ConstVec3sT& p, const Quad& e,
+  // Reference coordinates (a,b), nominally in [-1,1]^2.
+  const Real a, const Real b,
+  // Output: the point on the sphere.
+  Real q[3])
+{
+  impl::calc_ref_to_bilinear(p, e, a, b, q); // Point on the bilinear quad.
+  SphereGeometry::normalize(q);
+}
+
+template <typename ConstVec3sT, typename Quad>
+KOKKOS_INLINE_FUNCTION
+void calc_sphere_to_ref (
+  // The spherical quad containing the point.
+  const ConstVec3sT& p, const Quad& e,
+  // The point on the sphere.
+  const Real q[3],
+  // Output: (a,b) in [-1,1]
+  Real& a, Real& b,
+  // Optional info output.
+  Info* const info = nullptr,
+  // Max number of iterations before returning with failure.
+  const Int max_its = 10,
+  // Tolerance for Newton iteration, applied to the norm of the residual.
+  const Real tol = 1e2*std::numeric_limits<Real>::epsilon())
+{
+  const Real tol2 = square(tol); // Squared, to compare against norm2.
+  Real rnorm2 = 1;
+  a = b = 0; // Initial guess: the center of the reference square.
+  Int it = 1;
+  while (it <= max_its) { // Newton's method.
+    Real r[3], J[6], dx[2];
+    impl::calc_residual(p, e, a, b, q, r);
+    rnorm2 = SphereGeometry::norm2(r);
+    if (rnorm2 <= tol2) break;
+    impl::calc_Jacobian(p, e, a, b, J);
+    impl::solve_Jxr(J, r, dx);
+    a -= dx[0]; b -= dx[1];
+    ++it;
+  }
+  // Report on the last residual computed and the loop counter at exit.
+  if (info) {
+    info->success = rnorm2 <= tol2;
+    info->n_iterations = it;
+  }
+}
+
+// Ref coords, packed (x,y), CCW, starting from (-1,-1).
+KOKKOS_INLINE_FUNCTION
+const Real* get_ref_vertices () {
+  static const Real xy[] = {-1,-1,  1,-1,  1,1,  -1,1};
+  return xy;
+}
+
+namespace test {
+struct Info { // Reduction result: summed and max iteration counts, #failures.
+  Int sum_nits, max_nits, nfails;
+};
+
+class TestSphereToRefKernel {
+  const Real a_test[9] = {-0.1, -1e-16, 0, 1e-15, 0.1, 0.7, 1, 1-1e-14, 1.1};
+  const Int n_a_test = sizeof(a_test)/sizeof(*a_test);
+
+  const Real tol_;
+  mutable ConstVec3s p_;
+  mutable ConstIdxs e_;
+
+public:
+  typedef Info value_type;
+
+  TestSphereToRefKernel (const ConstVec3s::HostMirror& p_hm,
+                         const ConstIdxs::HostMirror& e_hm,
+                         const Real tol = 1e1*std::numeric_limits<Real>::epsilon())
+    : tol_(tol)
+  {
+    { Vec3s p; resize_and_copy(p, p_hm); p_ = p; }
+    { Idxs e; resize_and_copy(e, e_hm); e_ = e; }
+  }
+
+  // Number of test cases: every (a,b) pair from a_test for every element.
+  Int n () const { return nslices(e_)*square(n_a_test); }
+  const Real& tol () const { return tol_; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const Int k, value_type& jinfo) const {
+    const Int
+      ei = k / square(n_a_test),
+      ij = k % square(n_a_test),
+      i = ij / n_a_test,
+      j = ij % n_a_test;
+    const Real a_t = a_test[i], b_t = a_test[j];
+    Real q[3];
+    sqr::calc_ref_to_sphere(p_, slice(e_, ei), a_t, b_t, q);
+    Real a, b;
+    sqr::Info info;
+    sqr::calc_sphere_to_ref(p_, slice(e_, ei), q, a, b, &info, 100, tol_);
+    const Real err = std::sqrt(square(a_t - a) + square(b_t - b));
+    // tol_ bounds the Newton residual, not the error in (a,b), so allow slack.
+    if ( ! info.success || err > 1e4*tol_) {
+      jinfo.nfails++;
+      printf("calc_sphere_to_ref ei %d i %d j %d: nits %d re %1.1e\n",
+             ei, i, j, info.n_iterations, err);
+    }
+    jinfo.sum_nits += info.n_iterations;
+    jinfo.max_nits = max(jinfo.max_nits, info.n_iterations);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init (value_type& info) const { // const, like operator() and join.
+    info.sum_nits = 0;
+    info.max_nits = 0;
+    info.nfails = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join (volatile value_type& dst, volatile value_type const& src) const {
+    dst.max_nits = max(dst.max_nits, src.max_nits);
+    dst.sum_nits += src.sum_nits;
+    dst.nfails += src.nfails;
+  }
+};
+
+inline Int test_sphere_to_ref (const ConstVec3s::HostMirror& p,
+                               const ConstIdxs::HostMirror& e) {
+  TestSphereToRefKernel k(p, e);
+  Info info;
+  auto t = tic();
+  ko::parallel_reduce(k.n(), k, info); // Run all k.n() cases; reduce to info.
+  const auto et = toc(t);
+  fprintf(stderr, "sqr: #fails %d #iterations mean %1.1f max %d\n",
+          info.nfails, (Real) info.sum_nits / k.n(), info.max_nits);
+  print_times("test_sphere_to_ref", et);
+  return info.nfails; // 0 means every case passed.
+}
+} // namespace test
+} // namespace sqr
+} // namespace siqk
+
+#endif // INCLUDE_SIQK_SQR_HPP
diff --git a/siqk/siqk_test.cpp b/siqk/siqk_test.cpp
new file mode 100644
index 0000000..e08ab2a
--- /dev/null
+++ b/siqk/siqk_test.cpp
@@ -0,0 +1,519 @@
+// ko=/home/ambradl/lib/kokkos/cpu; mycpp -I$ko/include -L$ko/lib -fopenmp unit_test.cpp -lkokkos -ldl -Wall -pedantic -DSIQK_TIME
+// ./a.out -m | grep "mat=1" > foo.m
+// >> msik('draw_unit_test0', 'foo');
+
+#include <limits>
+
+#include "siqk.hpp"
+using namespace siqk;
+
+#define INSTANTIATE_PLANE
+
+//> Code that will likely be moved to library files.
+
+template <typename CV3s> // Print points p as Matlab text assigning to 'name'.
+void write_matlab (const std::string& name, const CV3s& p) {
+  printf("mat=1; %s = [", name.c_str());
+  for (Int i = 0, np = nslices(p); i < np; ++i)
+    printf(" %1.15e %1.15e %1.15e;", p(i,0), p(i,1), p(i,2));
+  printf("].';\n");
+}
+
+template <typename CV3s, typename CIs> // Print name.p and 1-based name.n.
+void write_matlab (const std::string& name, const CV3s& p, const CIs& e) {
+  printf("mat=1; %s.p = [", name.c_str());
+  for (Int i = 0, np = nslices(p); i < np; ++i)
+    printf(" %1.15e %1.15e %1.15e;", p(i,0), p(i,1), p(i,2));
+  printf("].';\n");
+  printf("mat=1; %s.n = [", name.c_str());
+  for (Int i = 0, ne = nslices(e); i < ne; ++i)
+    printf(" %d %d %d %d;", e(i,0)+1, e(i,1)+1, e(i,2)+1, e(i,3)+1);
+  printf("].';\n");
+}
+
+static void make_planar_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e,
+                              const Int n) {
+  const Real d = std::sqrt(0.5); // The mesh covers [-d,d]^2 in the z = 0 plane.
+  const Int nv = n+1;            // Vertices per grid line.
+  ko::resize(e, n*n, 4);
+  ko::resize(p, nv*nv);
+  for (Int iy = 0; iy < nv; ++iy)
+    for (Int ix = 0; ix < nv; ++ix) {
+      const Int iv = nv*iy + ix;
+      p(iv,0) = 2*(static_cast<Real>(ix)/n - 0.5)*d;
+      p(iv,1) = 2*(static_cast<Real>(iy)/n - 0.5)*d;
+      p(iv,2) = 0;
+    }
+  // Quad (ix,iy) lists its 4 vertices starting from grid vertex (ix,iy).
+  for (Int iy = 0; iy < n; ++iy)
+    for (Int ix = 0; ix < n; ++ix) {
+      const Int ie = n*iy + ix, ll = nv*iy + ix;
+      e(ie,0) = ll;      e(ie,1) = ll + 1;
+      e(ie,3) = ll + nv; e(ie,2) = ll + nv + 1;
+    }
+}
+
+// Row-major R: the rotation by 'angle' about 'axis' (Rodrigues' formula). The
+// axis need not be normalized.
+inline void form_rotation (const Real axis[3], const Real angle, Real r[9]) {
+  const Real nrm = std::sqrt(SphereGeometry::norm2(axis));
+  const Real x = axis[0] / nrm, y = axis[1] / nrm, z = axis[2] / nrm;
+  const Real c = std::cos(angle), s = std::sin(angle), omc = 1 - c;
+  r[0] = c + x*x*omc;
+  r[1] = x*y*omc - z*s;
+  r[2] = x*z*omc + y*s;
+  r[3] = y*x*omc + z*s;
+  r[4] = c + y*y*omc;
+  r[5] = y*z*omc - x*s;
+  r[6] = z*x*omc - y*s;
+  r[7] = z*y*omc + x*s;
+  r[8] = c + z*z*omc;
+}
+
+template <typename V>
+static void rotate (const Real R[9], V p) {
+  // p <- R p with R row-major. Copy p first since it is overwritten in place.
+  const Real in[3] = {p[0], p[1], p[2]};
+  for (Int i = 0; i < 3; ++i)
+    p[i] = R[3*i]*in[0] + R[3*i+1]*in[1] + R[3*i+2]*in[2];
+}
+
+template <typename V>
+static void translate (const Real xlate[3], V p) {
+  for (Int i = 0; i < 3; ++i) p[i] += xlate[i]; // p <- p + xlate, in place.
+}
+
+static void transform_planar_mesh (const Real R[9], const Real xlate[3],
+                                   Vec3s::HostMirror& p) {
+  for (Int i = 0, np = nslices(p); i < np; ++i) { // p_i <- R p_i + xlate
+    auto pi = slice(p, i);
+    rotate(R, pi); translate(xlate, pi);
+  }
+}
+
+// Remove vertices marked unused and adjust numbering. A vertex is marked by
+// having the value 'unused' as its first coordinate.
+static void remove_unused_vertices (Vec3s::HostMirror& p, Idxs::HostMirror& e,
+                                    const Real unused) {
+  const Int np = nslices(p);
+  // nrm_upto[i] is the number of unused vertices among 0:i, inclusive. Hence a
+  // kept vertex originally numbered i is renumbered i - nrm_upto[i].
+  std::vector<Int> nrm_upto(np, 0);
+  Int rmcnt = 0;
+  for (Int i = 0; i < np; ++i) {
+    if (p(i,0) == unused) ++rmcnt;
+    nrm_upto[i] = rmcnt;
+  }
+  // Renumber the vertices referenced by e.
+  for (Int ei = 0; ei < nslices(e); ++ei)
+    for (Int k = 0; k < szslice(e); ++k)
+      e(ei,k) -= nrm_upto[e(ei,k)];
+  // Compact p: save a copy, shrink p, and copy back only the kept vertices.
+  Vec3s::HostMirror pc("copy", np);
+  ko::deep_copy(pc, p);
+  ko::resize(p, np - rmcnt);
+  // j is the next free slot in the compacted p.
+  for (Int i = 0, j = 0; i < np; ++i) {
+    // Skip a vertex that carries the marker.
+    if (pc(i,0) == unused) continue;
+    for (Int k = 0; k < szslice(pc); ++k) p(j,k) = pc(i,k);
+    ++j;
+  }
+}
+
+// A very simple cube-sphere mesh with nxn elements per face. At least for now
+// I'm not bothering with making the elements well proportioned.
+void make_cubesphere_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e,
+                           const Int n) {
+  // Transformations taking the reference planar mesh to each of the six faces.
+  const Real d = std::sqrt(0.5);
+  static Real R[6][9] = {{ 1, 0, 0, 0, 0, 0, 0, 1, 0},  // face 0, -y
+                         { 0, 0, 0, 1, 0, 0, 0, 1, 0},  //      1, +x
+                         {-1, 0, 0, 0, 0, 0, 0, 1, 0},  //      2, +y
+                         { 0, 0, 0,-1, 0, 0, 0, 1, 0},  //      3, -x
+                         { 1, 0, 0, 0, 1, 0, 0, 0, 0},  //      4, +z
+                         {-1, 0, 0, 0, 1, 0, 0, 0, 0}}; //      5, -z
+  static Real xlate[6][3] = {{ 0,-d, 0}, { d, 0, 0}, { 0, d, 0},
+                             {-d, 0, 0}, { 0, 0, d}, { 0, 0,-d}};
+  // Construct 6 uncoupled faces.
+  Vec3s::HostMirror ps[6];
+  Vec3s::HostMirror& p_ref = ps[0];
+  Idxs::HostMirror es[6];
+  Idxs::HostMirror& e_ref = es[0];
+  make_planar_mesh(p_ref, e_ref, n);
+  ko::resize(e, 6*nslices(e_ref), 4);
+  ko::resize(p, 6*nslices(p_ref));
+  for (Int i = 1; i < 6; ++i) {
+    ko::resize(es[i], nslices(e_ref), 4);
+    ko::deep_copy(es[i], e_ref);
+    ko::resize(ps[i], nslices(p_ref));
+    ko::deep_copy(ps[i], p_ref);
+    transform_planar_mesh(R[i], xlate[i], ps[i]);
+  }
+  transform_planar_mesh(R[0], xlate[0], ps[0]);
+  // Pack (p,e), accounting for equivalent vertices. For the moment, keep the p
+  // slot for an equivalent vertex to make node numbering simpler, but make the
+  // value bogus so we know if there's a problem in the numbering.
+  const Real unused = -2;
+  ko::deep_copy(p, unused);
+  Int p_base = 0, e_base = 0;
+  { // -y face
+    const Vec3s::HostMirror& fp = ps[0];
+    Idxs::HostMirror& fe = es[0];
+    for (Int j = 0; j < nslices(fp); ++j)
+      for (Int k = 0; k < 3; ++k) p(j,k) = fp(j,k);
+    for (Int j = 0; j < nslices(fe); ++j)
+      for (Int k = 0; k < 4; ++k) e(j,k) = fe(j,k);
+    p_base += nslices(p_ref);
+    e_base += nslices(e_ref);
+  }
+  for (Int fi = 1; fi <= 2; ++fi) { // +x, +y faces
+    const Vec3s::HostMirror& fp = ps[fi];
+    Idxs::HostMirror& fe = es[fi];
+    for (Int j = 0; j < nslices(fp); ++j) {
+      if (j % (n+1) == 0) continue; // equiv vertex
+      for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k);
+    }
+    for (Int j = 0; j < nslices(fe); ++j) {
+      for (Int k = 0; k < 4; ++k) fe(j,k) += p_base;
+      // Left 2 vertices of left elem on face fi equiv to right 2 vertices of
+      // right elem on face fi-1. Write to the face, then copy to e, so that
+      // other faces can use these updated data.
+      if (j % n == 0) {
+        fe(j,0) = es[fi-1](j+n-1,1);
+        fe(j,3) = es[fi-1](j+n-1,2);
+      }
+      for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k);
+    }
+    p_base += nslices(p_ref);
+    e_base += nslices(e_ref);
+  }
+  { // -x face
+    const Vec3s::HostMirror& fp = ps[3];
+    Idxs::HostMirror& fe = es[3];
+    for (Int j = 0; j < nslices(fp); ++j) {
+      if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue;
+      for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k);
+    }
+    for (Int j = 0; j < nslices(fe); ++j) {
+      for (Int k = 0; k < 4; ++k) fe(j,k) += p_base;
+      if (j % n == 0) { // Left edge: shared with the +y face.
+        fe(j,0) = es[2](j+n-1,1);
+        fe(j,3) = es[2](j+n-1,2);
+      }
+      if ((j+1) % n == 0) { // Not 'else if': with n == 1, j is on both edges.
+        fe(j,1) = es[0]((j+1)-n,0);
+        fe(j,2) = es[0]((j+1)-n,3);
+      }
+      for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k);
+    }
+    p_base += nslices(p_ref);
+    e_base += nslices(e_ref);
+  }
+  { // +z face
+    const Vec3s::HostMirror& fp = ps[4];
+    Idxs::HostMirror& fe = es[4];
+    for (Int j = n+1; j < nslices(fp) - (n+1); ++j) {
+      if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue;
+      for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k);
+    }
+    for (Int j = 0; j < nslices(fe); ++j)
+      for (Int k = 0; k < 4; ++k) fe(j,k) += p_base;
+    for (Int j = 0; j < n; ++j) { // -y
+      fe(j,0) = es[0](n*(n-1)+j,3);
+      fe(j,1) = es[0](n*(n-1)+j,2);
+    }
+    for (Int j = 0; j < n; ++j) { // +y
+      fe(n*(n-1)+j,2) = es[2](n*n-1-j,3);
+      fe(n*(n-1)+j,3) = es[2](n*n-1-j,2);
+    }
+    for (Int j = 0, i3 = 0; j < nslices(fe); j += n, ++i3) { // -x
+      fe(j,0) = es[3](n*n-1-i3,2);
+      fe(j,3) = es[3](n*n-1-i3,3);
+    }
+    for (Int j = n-1, i1 = 0; j < nslices(fe); j += n, ++i1) { // +x
+      fe(j,1) = es[1](n*(n-1)+i1,3);
+      fe(j,2) = es[1](n*(n-1)+i1,2);
+    }
+    for (Int j = 0; j < nslices(fe); ++j)
+      for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k);
+    p_base += nslices(p_ref);
+    e_base += nslices(e_ref);
+  }
+  { // -z face
+    const Vec3s::HostMirror& fp = ps[5];
+    Idxs::HostMirror& fe = es[5];
+    for (Int j = n+1; j < nslices(fp) - (n+1); ++j) {
+      if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue;
+      for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k);
+    }
+    for (Int j = 0; j < nslices(fe); ++j)
+      for (Int k = 0; k < 4; ++k) fe(j,k) += p_base;
+    for (Int j = 0; j < n; ++j) { // -y
+      fe(j,0) = es[0](n-1-j,1);
+      fe(j,1) = es[0](n-1-j,0);
+    }
+    for (Int j = 0; j < n; ++j) { // +y
+      fe(n*(n-1)+j,2) = es[2](j,1);
+      fe(n*(n-1)+j,3) = es[2](j,0);
+    }
+    for (Int j = 0, i3 = 0; j < nslices(fe); j += n, ++i3) { // +x face, es[1]
+      fe(j,0) = es[1](i3,0);
+      fe(j,3) = es[1](i3,1);
+    }
+    for (Int j = n-1, i1 = 0; j < nslices(fe); j += n, ++i1) { // -x face, es[3]
+      fe(j,1) = es[3](n-1-i1,1);
+      fe(j,2) = es[3](n-1-i1,0);
+    }
+    for (Int j = 0; j < nslices(fe); ++j)
+      for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k);
+  }
+  // Now go back and remove the unused vertices and adjust the numbering.
+  remove_unused_vertices(p, e, unused);
+  // Project to the unit sphere.
+  for (Int i = 0; i < nslices(p); ++i)
+    SphereGeometry::normalize(slice(p, i));
+}
+
+// Average the vertices of element ei into ctr. The element's row of e is read
+// up to its first negative entry (or its full width).
+void calc_elem_ctr (const Vec3s::HostMirror& p, const Idxs::HostMirror& e,
+                    const Int ei, Real ctr[3]) {
+  ctr[0] = ctr[1] = ctr[2] = 0;
+  Int nvert = 0;
+  for ( ; nvert < szslice(e) && e(ei,nvert) >= 0; ++nvert)
+    for (Int d = 0; d < 3; ++d) ctr[d] += p(e(ei,nvert),d);
+  // Not guarded: an element with no vertices divides by zero here.
+  for (Int d = 0; d < 3; ++d) ctr[d] /= nvert;
+}
+
+// Count the fan triangles (corners 0, ti+1, ti+2) of every element whose normal
+// has a nonpositive dot product with the element's center; 0 means none do.
+Int check_elem_normal_against_sphere (const Vec3s::HostMirror& p,
+                                      const Idxs::HostMirror& e) {
+  Int nbad = 0;
+  const Int ntri_max = szslice(e) - 2;
+  for (Int ei = 0; ei < nslices(e); ++ei) {
+    // The vertex average serves as the ray through the element's center.
+    Real outward[3];
+    calc_elem_ctr(p, e, ei, outward);
+    for (Int ti = 0; ti < ntri_max && e(ei,ti+2) >= 0; ++ti) {
+      // edge[j] = corner ti+j+1 minus corner 0, assuming axpy(a,x,y): y += a*x.
+      Real edge[2][3], tri_normal[3];
+      for (Int j = 0; j < 2; ++j) {
+        SphereGeometry::copy(edge[j], slice(p, e(ei,ti+j+1)));
+        SphereGeometry::axpy(-1, slice(p, e(ei,0)), edge[j]);
+      }
+      SphereGeometry::cross(edge[0], edge[1], tri_normal);
+      // A dot product of exactly zero also counts as an error.
+      if (SphereGeometry::dot(tri_normal, outward) <= 0) ++nbad;
+    }
+  }
+  return nbad;
+}
+
+//> Unit test code.
+
+struct Input {  // Command-line options of the unit test driver.
+  Int testno;  // --testno: 0 runs test_area, 1 runs test_cube (see run())
+  Int n;  // -n: size argument handed to the mesh generators
+  Real angle, xlate, ylate;  // --angle, --xlate, --ylate: mesh perturbation
+  bool write_matlab, geo_sphere;  // -m sets the first, --plane clears the second
+
+  Input(Int argc, char** argv);  // parses argv, then prints the settings
+  void print(std::ostream& os) const;  // writes the settings to os
+};
+
+// Overwrite every point's third coordinate with 1, then normalize the point.
+static void project_onto_sphere (Vec3s::HostMirror& p) {
+  for (Int i = 0; i < nslices(p); ++i) {
+    p(i,2) = 1; SphereGeometry::normalize(slice(p, i));
+  }
+}
+
+// Apply a linear map built from cos/sin of angle, then shift by (xlate, ylate).
+static void perturb_mesh (Vec3s::HostMirror& p, const Real angle,
+                          const Real xlate, const Real ylate) {
+  const Real c = std::cos(angle), s = std::sin(angle);
+  for (Int i = 0; i < nslices(p); ++i) {
+    const Real px = p(i,0), py = p(i,1);
+    p(i,0) =  c*px - s*py + xlate;
+    p(i,1) = -s*px + c*py + ylate;  // -s in both rows: not a pure rotation
+  }
+}
+
+// Apply to every point of p the 9-entry matrix form_rotation fills in.
+static void
+rotate_mesh (Vec3s::HostMirror& p, const Real axis[3], const Real angle) {
+  Real rot[9];
+  form_rotation(axis, angle, rot);
+  for (Int ip = 0; ip < nslices(p); ++ip) rotate(rot, slice(p, ip));
+}
+
+// Copy four points of p into poly: indices 0, n, last, and last - n, where n
+// is floor(sqrt(#points - 1)) (the cells per side if p is an (n+1)^2 grid).
+static void fill_quad (const ConstVec3s::HostMirror& p,
+                       Vec3s::HostMirror& poly) {
+  const Int last = nslices(p) - 1, n = static_cast<int>(std::sqrt(last));
+  const Int corner[] = {0, n, last, last - n};
+  for (Int k = 0; k < 4; ++k) copy(slice(poly, k), slice(p, corner[k]), 3);
+}
+
+// Reference area for test_area(): the corner quad of p clipped against the
+// corner quad of cp. ce and e are unused. wm also dumps the polygons.
+template <typename Geo>
+static Real calc_true_area (
+  const ConstVec3s::HostMirror& cp, const ConstIdxs::HostMirror& ce,
+  const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e,
+  const bool wm)
+{
+  Vec3s::HostMirror clip_poly("clip_poly", 4), poly("poly", 4), nml("nml", 4);
+  fill_quad(cp, clip_poly);
+  fill_quad(p, poly);
+  for (Int k = 0; k < 4; ++k)  // one normal per clip edge k -> (k+1) mod 4
+    Geo::edge_normal(slice(clip_poly, k), slice(clip_poly, (k+1) % 4),
+                     slice(nml, k));
+  Vec3s::HostMirror isect("vo", test::max_nvert);
+  Int nisect;
+  { // Scope the scratch view so it lives only for the duration of the clip.
+    Vec3s::HostMirror wrk("wrk", test::max_nvert);
+    sh::clip_against_poly<Geo>(clip_poly, nml, poly, 4, isect, nisect, wrk);
+  }
+  if (wm) {
+    write_matlab("clip_poly", clip_poly);
+    write_matlab("poly", poly);
+    const std::pair<Int,Int> used(0, nisect);
+    write_matlab("intersection", ko::subview(isect, used, ko::ALL()));
+  }
+  return Geo::calc_area_formula(isect, nisect);
+}
+
+// Post-process mesh points for Geo: only SphereGeometry has work to do.
+template <typename Geo> void finalize_mesh (Vec3s::HostMirror&) {}
+template <> void finalize_mesh<SphereGeometry> (Vec3s::HostMirror& p)
+{ project_onto_sphere(p); }
+
+// Overlap a planar mesh with a perturbed copy of itself and compare the area
+// test::test_area_ot reports with calc_true_area's. Returns 0 on agreement.
+template <typename Geo>
+static Int
+test_area (const Int n, const Real angle, const Real xlate, const Real ylate,
+           const bool wm) {
+  Vec3s::HostMirror cp, p;
+  Idxs::HostMirror ce, e;
+  make_planar_mesh(cp, ce, n);
+  resize_and_copy(p, cp);
+  resize_and_copy(e, ce);
+  perturb_mesh(p, angle, xlate, ylate);
+  // The perturbation is applied first; both meshes are finalized afterwards.
+  finalize_mesh<Geo>(cp);
+  finalize_mesh<Geo>(p);
+  const Real area_ref = calc_true_area<Geo>(cp, ce, p, e, wm);
+  const Real area_ot = test::test_area_ot<Geo>(cp, ce, p, e);
+  const Real relerr = std::abs(area_ot - area_ref)/area_ref;
+  fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n",
+          area_ref, area_ot, relerr);
+  if (wm) {
+    write_matlab("cm", cp, ce);
+    write_matlab("m", p, e);
+  }
+  return relerr < 1e-8 ? 0 : 1;  // a NaN relerr also fails
+}
+
+// Cubed-sphere test. Returns the error count: misoriented triangles, plus 1
+// if the overlap area is not 4*pi to relative error 1e-8, plus the result of
+// sqr::test::test_sphere_to_ref.
+static Int test_cube (const Input& in) {
+  Vec3s::HostMirror cp, p;
+  Idxs::HostMirror ce, e;
+  make_cubesphere_mesh(cp, ce, in.n);
+  resize_and_copy(p, cp);
+  resize_and_copy(e, ce);
+
+  // Orientation check on the mesh as generated.
+  const Int nbad = check_elem_normal_against_sphere(cp, ce);
+  if (nbad) std::cerr << "FAIL: check_elem_normal_against_sphere\n";
+  Int nerr = nbad;
+  // Rotate the copy and overlap it with the original; the overlap mesh's area
+  // should be that of the sphere.
+  const Real axis[] = {0.1, -0.3, 0.2};
+  rotate_mesh(p, axis, in.angle);
+  const Real area = test::test_area_ot<SphereGeometry>(cp, ce, p, e);
+  const Real area_true = 4*M_PI;
+  const Real relerr = std::abs(area - area_true)/area_true;
+  fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n",
+          area_true, area, relerr);
+  nerr += relerr < 1e-8 ? 0 : 1;
+  // Test ref square <-> spherical quad transformations.
+  nerr += sqr::test::test_sphere_to_ref(p, e);
+  if (in.write_matlab) {
+    write_matlab("cm", cp, ce);
+    write_matlab("m", p, e);
+  }
+  return nerr;
+}
+
+// Dispatch on in.testno: 0 runs test_area<Geo>, 1 runs test_cube (which uses
+// SphereGeometry whatever Geo is).
+template <typename Geo>
+Int run (const Input& in) {
+  if (in.testno == 0)
+    return test_area<Geo>(in.n, in.angle, in.xlate, in.ylate, in.write_matlab);
+  if (in.testno == 1)
+    return test_cube(in);
+  // Any other test number is reported as one failure.
+  return 1;
+}
+
+// True if a is b1, b2 (when given), or b1 with one more leading '-'.
+inline bool
+eq (const std::string& a, const char* const b1, const char* const b2 = 0) {
+  return a == b1 || (b2 && a == b2) || a == "-" + std::string(b1);
+}
+
+// Parse argv over the defaults. Unknown or value-less tokens are ignored.
+Input::Input (Int argc, char** argv)
+  : testno(0), n(25), angle(M_PI*1e-1), xlate(1e-1), ylate(1e-1),
+    write_matlab(false), geo_sphere(true) {
+  for (Int i = 1; i < argc; ++i) {
+    const std::string token(argv[i]);
+    if (eq(token, "-m", "--write-matlab")) write_matlab = true;
+    else if (eq(token, "--plane")) geo_sphere = false;
+    else if (i+1 >= argc) break;  // options below read argv[i+1]; argv[argc] is null
+    else if (eq(token, "--testno")) testno = atoi(argv[++i]);
+    else if (eq(token, "-n")) n = atoi(argv[++i]);
+    else if (eq(token, "--xlate")) xlate = atof(argv[++i]);
+    else if (eq(token, "--ylate")) ylate = atof(argv[++i]);
+    else if (eq(token, "--angle")) angle = atof(argv[++i]);
+  }
+  print(std::cout);
+}
+
+// Write the settings to os, one per line.
+void Input::print (std::ostream& os) const {
+  os << "testno " << testno << "\n" << "n (-n): " << n << "\n";
+  os << "write matlab (-m): " << write_matlab << "\n";
+  os << "planar geometry (--plane): " << ! geo_sphere << "\n";
+  os << "angle (--angle): " << angle << "\n";
+  os << "xlate (--xlate): " << xlate << "\n";
+  os << "ylate (--ylate): " << ylate << "\n";
+}
+
+// Test driver: reports PASSED or FAILED on stderr and exits nonzero on failure.
+int main (int argc, char** argv) {
+  Kokkos::initialize(argc, argv);
+  Int nerr = 0;
+  { // Scope Input so that it is destroyed before Kokkos is finalized.
+    Input in(argc, argv);
+    if (in.geo_sphere) nerr += run<SphereGeometry>(in);
+    else
+#ifdef INSTANTIATE_PLANE
+      nerr += run<PlaneGeometry>(in);
+#else
+      Kokkos::abort("PlaneGeometry not instantiated.");
+#endif
+    std::cerr << (nerr ? "FAIL" : "PASS") << "ED\n";
+  }
+  Kokkos::finalize();
+  return nerr ? 1 : 0;
+}