diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..175ace3 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,81 @@ +cmake_minimum_required (VERSION 3.5) + +project (compose CXX) +set (CMAKE_CXX_STANDARD 11) + +function (prc var) + message ("${var}: ${${var}}") +endfunction () + +find_package (MPI REQUIRED) + +if (Kokkos_DIR) + include (${Kokkos_DIR}/kokkos.cmake) + set (Kokkos_INCLUDE ${Kokkos_DIR}/include) +else () + message (FATAL_ERROR "COMPOSE requires Kokkos_DIR") +endif () + +set (SOURCES + cedr/cedr_caas.cpp + cedr/cedr_local.cpp + cedr/cedr_mpi.cpp + cedr/cedr_qlt.cpp + cedr/cedr_test.cpp + cedr/cedr_test_1d_transport.cpp + cedr/cedr_test_randomized.cpp + cedr/cedr_util.cpp) + +set (HEADERS + cedr/cedr.hpp + cedr/cedr_caas.hpp + cedr/cedr_caas_inl.hpp + cedr/cedr_cdr.hpp + cedr/cedr_kokkos.hpp + cedr/cedr_local.hpp + cedr/cedr_local_inl.hpp + cedr/cedr_mpi.hpp + cedr/cedr_mpi_inl.hpp + cedr/cedr_qlt.hpp + cedr/cedr_qlt_inl.hpp + cedr/cedr_test.hpp + cedr/cedr_test_randomized.hpp + cedr/cedr_util.hpp + siqk/siqk.hpp + siqk/siqk_defs.hpp + siqk/siqk_geometry.hpp + siqk/siqk_intersect.hpp + siqk/siqk_quadrature.hpp + siqk/siqk_search.hpp + siqk/siqk_sqr.hpp) + +if (NOT COMPOSE_TEST_MPIRUN) + set (COMPOSE_TEST_MPIRUN mpirun) +endif () +if (NOT COMPOSE_TEST_NRANK) + set (COMPOSE_TEST_NRANK 8) +endif () + +set (COMPOSE_COMPILE_FLAGS "${MPI_COMPILE_FLAGS} ${KOKKOS_CXXFLAGS} ${CMAKE_CXX_FLAGS}") +set (COMPOSE_LINK_FLAGS "${MPI_LINK_FLAGS} ${KOKKOS_LDFLAGS}") +set (COMPOSE_INCLUDES "${Kokkos_INCLUDE}") +set (COMPOSE_LIBRARIES ${MPI_LIBRARIES} ${KOKKOS_LIBS}) + +prc(MPI_COMPILE_FLAGS) +prc(MPI_LINK_FLAGS) +prc(MPI_LIBRARIES) +add_library (${PROJECT_NAME} ${SOURCES}) +set_target_properties (${PROJECT_NAME} PROPERTIES + COMPILE_FLAGS ${COMPOSE_COMPILE_FLAGS} + LINK_FLAGS ${COMPOSE_LINK_FLAGS}) +target_include_directories (${PROJECT_NAME} PUBLIC cedr siqk) +target_include_directories (${PROJECT_NAME} PRIVATE siqk cedr) 
+target_include_directories (${PROJECT_NAME} PUBLIC ${COMPOSE_INCLUDES}) +target_link_libraries (${PROJECT_NAME} ${COMPOSE_LIBRARIES}) + +install (TARGETS ${PROJECT_NAME} ARCHIVE DESTINATION lib) +install (FILES ${HEADERS} DESTINATION include/compose) + +enable_testing () +add_subdirectory(siqk) +add_subdirectory(cedr) diff --git a/README.md b/README.md index 8dec62e..165fdd3 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,22 @@ # COMPOSE -Compact Multi-moment Performance-Portable Semi-Lagrangian methods for non-hydrostatic dynamics +Compact Multi-moment Performance-Portable Semi-Lagrangian methods + +COMPOSE provides libraries for semi-Lagrangian transport and, together or +separately, property preservation. + +CEDR: Communication-Efficient Constrained Density Reconstructors. +SIQK: Spherical Polygon Intersection and Quadrature. + +First, install Kokkos: + https://github.com/kokkos/kokkos +For example, in a typical environment using OpenMP, a simple build line is: + ./kokkos/generate_makefile.bash --with-serial --with-openmp --prefix=/path/to/my/libs --compiler=g++ + make -j8 install + +Second, configure, build, and test COMPOSE: + cmake \ + -D Kokkos_DIR=/path/to/my/kokkos/install \ + -D CMAKE_INSTALL_PREFIX=/path/to/my/compose/install \ + /path/to/compose/repo + make -j8 + ctest diff --git a/cedr/CMakeLists.txt b/cedr/CMakeLists.txt new file mode 100644 index 0000000..f0f5c88 --- /dev/null +++ b/cedr/CMakeLists.txt @@ -0,0 +1,15 @@ +add_executable (cedr_test cedr_test.cpp) +set_target_properties (cedr_test PROPERTIES + COMPILE_FLAGS ${COMPOSE_COMPILE_FLAGS} + LINK_FLAGS ${COMPOSE_LINK_FLAGS}) + +target_include_directories (cedr_test PRIVATE ${COMPOSE_INCLUDES}) +target_link_libraries (cedr_test ${PROJECT_NAME} ${COMPOSE_LIBRARIES}) + +add_test (NAME cedr-test-unit + COMMAND $ -t) +add_test (NAME cedr-test-unit-mpi + COMMAND ${COMPOSE_TEST_MPIRUN} -np ${COMPOSE_TEST_NRANK} + $ -t --proc-random -nc 111 -nt 11) +add_test (NAME cedr-test-t1d + COMMAND $ -t -t1d 
-nc 111) diff --git a/cedr/cedr.hpp b/cedr/cedr.hpp new file mode 100644 index 0000000..dea74e7 --- /dev/null +++ b/cedr/cedr.hpp @@ -0,0 +1,31 @@ +#ifndef INCLUDE_CEDR_HPP +#define INCLUDE_CEDR_HPP + +#include "cedr_kokkos.hpp" + +// Communication-Efficient Constrained Density Reconstructors +namespace cedr { +typedef int Int; +typedef long int Long; +typedef std::size_t Size; +typedef double Real; + +// CDRs in general implement +// * tracer mass, Qm, conservation; +// * mixing ratio, q, shape preservation, either local bound preservation or +// dynamic range preservation; and +// * tracer consistency, which follows from dynamic range preservation or +// stronger (including local bound preservation) with rhom coming from the +// dynamics. +// +// One can solve a subset of these. +// If !conserve, then the CDR does not alter the tracer mass, but it does not +// correct for any failure in mass conservation in the field given to it. +// If consistent but !shapepreserve, then the CDR solves the dynamic range +// preservation problem rather than the local bound preservation problem. +struct ProblemType { + enum : Int { conserve = 1, shapepreserve = 1 << 1, consistent = 1 << 2 }; +}; +} + +#endif diff --git a/cedr/cedr_caas.cpp b/cedr/cedr_caas.cpp new file mode 100644 index 0000000..2694024 --- /dev/null +++ b/cedr/cedr_caas.cpp @@ -0,0 +1,214 @@ +#include "cedr_caas.hpp" +#include "cedr_util.hpp" +#include "cedr_test_randomized.hpp" + +namespace cedr { +namespace caas { + +template +CAAS::CAAS (const mpi::Parallel::Ptr& p, const Int nlclcells) + : p_(p), nlclcells_(nlclcells), nrhomidxs_(0), need_conserve_(false) +{ + cedr_throw_if(nlclcells == 0, "CAAS does not support 0 cells on a rank."); + tracer_decls_ = std::make_shared >(); +} + +template +void CAAS::declare_tracer(int problem_type, const Int& rhomidx) { + cedr_throw_if( ! (problem_type & ProblemType::shapepreserve), + "CAAS is a WIP; ! 
shapepreserve is not supported yet."); + cedr_throw_if(rhomidx > 0, "rhomidx > 0 is not supported yet."); + tracer_decls_->push_back(Decl(problem_type, rhomidx)); + if (problem_type & ProblemType::conserve) + need_conserve_ = true; + nrhomidxs_ = std::max(nrhomidxs_, rhomidx+1); +} + +template +void CAAS::end_tracer_declarations () { + cedr_throw_if(tracer_decls_->size() == 0, "#tracers is 0."); + cedr_throw_if(nrhomidxs_ == 0, "#rhomidxs is 0."); + probs_ = IntList("CAAS probs", static_cast(tracer_decls_->size())); + t2r_ = IntList("CAAS t2r", static_cast(tracer_decls_->size())); + for (Int i = 0; i < probs_.extent_int(0); ++i) { + probs_(i) = (*tracer_decls_)[i].probtype; + t2r_(i) = (*tracer_decls_)[i].rhomidx; + } + tracer_decls_ = nullptr; + // (rho, Qm, Qm_min, Qm_max, [Qm_prev]) + const Int e = need_conserve_ ? 1 : 0; + d_ = RealList("CAAS data", nlclcells_ * ((3+e)*probs_.size() + 1)); + const auto nslots = 4*probs_.size(); + // (e'Qm_clip, e'Qm, e'Qm_min, e'Qm_max, [e'Qm_prev]) + send_ = RealList("CAAS send", nslots); + recv_ = RealList("CAAS recv", nslots); +} + +template +int CAAS::get_problem_type (const Int& tracer_idx) const { + cedr_assert(tracer_idx >= 0 && tracer_idx < probs_.extent_int(0)); + return probs_[tracer_idx]; +} + +template +Int CAAS::get_num_tracers () const { + return probs_.extent_int(0); +} + +template +void CAAS::reduce_locally () { + const Int nt = probs_.size(); + Int k = 0; + Int os = nlclcells_; + // Qm_clip + for ( ; k < nt; ++k) { + Real Qm_sum = 0, Qm_clip_sum = 0; + for (Int i = 0; i < nlclcells_; ++i) { + const Real Qm = d_(os+i); + Qm_sum += (probs_(k) & ProblemType::conserve ? 
+ d_(os + nlclcells_*3*nt + i) /* Qm_prev */ : + Qm); + const Real Qm_min = d_(os + nlclcells_* nt + i); + const Real Qm_max = d_(os + nlclcells_*2*nt + i); + const Real Qm_clip = cedr::impl::min(Qm_max, cedr::impl::max(Qm_min, Qm)); + Qm_clip_sum += Qm_clip; + d_(os+i) = Qm_clip; + } + send_( k) = Qm_clip_sum; + send_(nt + k) = Qm_sum; + os += nlclcells_; + } + k += nt; + // Qm_min, Qm_max + for ( ; k < 4*nt; ++k) { + Real accum = 0; + for (Int i = 0; i < nlclcells_; ++i) + accum += d_(os+i); + send_(k) = accum; + os += nlclcells_; + } +} + +template +void CAAS::reduce_globally () { + int err = mpi::all_reduce(*p_, send_.data(), recv_.data(), send_.size(), MPI_SUM); + cedr_throw_if(err != MPI_SUCCESS, + "CAAS::reduce_globally MPI_Allreduce returned " << err); +} + +template +void CAAS::finish_locally () { + const Int nt = probs_.size(); + Int os = nlclcells_; + for (Int k = 0; k < nt; ++k) { + const Real Qm_clip_sum = recv_( k); + const Real Qm_sum = recv_(nt + k); + const Real m = Qm_sum - Qm_clip_sum; + if (m < 0) { + const Real Qm_min_sum = recv_(2*nt + k); + Real fac = Qm_clip_sum - Qm_min_sum; + if (fac > 0) { + fac = m/fac; + for (Int i = 0; i < nlclcells_; ++i) { + const Real Qm_min = d_(os + nlclcells_* nt + i); + Real& Qm = d_(os+i); + Qm += fac*(Qm - Qm_min); + } + } + } else if (m > 0) { + const Real Qm_max_sum = recv_(3*nt + k); + Real fac = Qm_max_sum - Qm_clip_sum; + if (fac > 0) { + fac = m/fac; + for (Int i = 0; i < nlclcells_; ++i) { + const Real Qm_max = d_(os + nlclcells_*2*nt + i); + Real& Qm = d_(os+i); + Qm += fac*(Qm_max - Qm); + } + } + } + os += nlclcells_; + } +} + +template +void CAAS::run () { + reduce_locally(); + reduce_globally(); + finish_locally(); +} + +namespace test { +struct TestCAAS : public cedr::test::TestRandomized { + typedef CAAS CAAST; + + TestCAAS (const mpi::Parallel::Ptr& p, const Int& ncells, const bool verbose) + : TestRandomized("CAAS", p, ncells, verbose), + p_(p) + { + const auto np = p->size(), rank = p->rank(); 
+ nlclcells_ = ncells / np; + const Int todo = ncells - nlclcells_ * np; + if (rank < todo) ++nlclcells_; + caas_ = std::make_shared(p, nlclcells_); + init(); + } + + CDR& get_cdr () override { return *caas_; } + + void init_numbering () override { + const auto np = p_->size(), rank = p_->rank(); + Int start = 0; + for (Int lrank = 0; lrank < rank; ++lrank) + start += get_nllclcells(ncells_, np, lrank); + gcis_.resize(nlclcells_); + for (Int i = 0; i < nlclcells_; ++i) + gcis_[i] = start + i; + } + + void init_tracers () override { + // CAAS doesn't yet support everything, so remove a bunch of the tracers. + std::vector tracers; + Int idx = 0; + for (auto& t : tracers_) { + if ( ! (t.problem_type & ProblemType::shapepreserve) || + ! t.local_should_hold) + continue; + t.idx = idx++; + tracers.push_back(t); + caas_->declare_tracer(t.problem_type, 0); + } + tracers_ = tracers; + caas_->end_tracer_declarations(); + } + + void run_impl (const Int trial) override { + caas_->run(); + } + +private: + mpi::Parallel::Ptr p_; + Int nlclcells_; + CAAST::Ptr caas_; + + static Int get_nllclcells (const Int& ncells, const Int& np, const Int& rank) { + Int nlclcells = ncells / np; + const Int todo = ncells - nlclcells * np; + if (rank < todo) ++nlclcells; + return nlclcells; + } +}; + +Int unittest (const mpi::Parallel::Ptr& p) { + const auto np = p->size(); + Int nerr = 0; + for (Int nlclcells : {1, 2, 4, 11}) { + Long ncells = np*nlclcells; + if (ncells > np) ncells -= np/2; + nerr += TestCAAS(p, ncells, false).run(1, false); + } + return nerr; +} +} // namespace test +} // namespace caas +} // namespace cedr diff --git a/cedr/cedr_caas.hpp b/cedr/cedr_caas.hpp new file mode 100644 index 0000000..849636f --- /dev/null +++ b/cedr/cedr_caas.hpp @@ -0,0 +1,75 @@ +#ifndef INCLUDE_CEDR_CAAS_HPP +#define INCLUDE_CEDR_CAAS_HPP + +#include "cedr_cdr.hpp" + +namespace cedr { +// ClipAndAssuredSum. 
+namespace caas { + +template +class CAAS : public CDR { +public: + typedef typename cedr::impl::DeviceType::type Device; + typedef CAAS Me; + typedef std::shared_ptr Ptr; + +public: + CAAS(const mpi::Parallel::Ptr& p, const Int nlclcells); + + void declare_tracer(int problem_type, const Int& rhomidx) override; + + void end_tracer_declarations() override; + + int get_problem_type(const Int& tracer_idx) const override; + + Int get_num_tracers() const override; + + // lclcellidx is trivial; it is the user's index for the cell. + KOKKOS_INLINE_FUNCTION + void set_rhom(const Int& lclcellidx, const Int& rhomidx, const Real& rhom) override; + + KOKKOS_INLINE_FUNCTION + void set_Qm(const Int& lclcellidx, const Int& tracer_idx, + const Real& Qm, const Real& Qm_min, const Real& Qm_max, + const Real Qm_prev = -1) override; + + void run() override; + + KOKKOS_INLINE_FUNCTION + Real get_Qm(const Int& lclcellidx, const Int& tracer_idx) override; + +private: + typedef Kokkos::View RealList; + typedef cedr::impl::Unmanaged UnmanagedRealList; + typedef Kokkos::View IntList; + + struct Decl { + int probtype; + Int rhomidx; + Decl (const int probtype_, const Int rhomidx_) + : probtype(probtype_), rhomidx(rhomidx_) {} + }; + + mpi::Parallel::Ptr p_; + + Int nlclcells_, nrhomidxs_; + std::shared_ptr > tracer_decls_; + bool need_conserve_; + IntList probs_, t2r_; + RealList d_, send_, recv_; + + void reduce_locally(); + void reduce_globally(); + void finish_locally(); +}; + +namespace test { +Int unittest(const mpi::Parallel::Ptr& p); +} // namespace test +} // namespace caas +} // namespace cedr + +#include "cedr_caas_inl.hpp" + +#endif diff --git a/cedr/cedr_caas_inl.hpp b/cedr/cedr_caas_inl.hpp new file mode 100644 index 0000000..f1a64fd --- /dev/null +++ b/cedr/cedr_caas_inl.hpp @@ -0,0 +1,42 @@ +#ifndef INCLUDE_CEDR_CAAS_INL_HPP +#define INCLUDE_CEDR_CAAS_INL_HPP + +#include "cedr_util.hpp" + +namespace cedr { +// ClipAndAssuredSum. 
+namespace caas { + +template KOKKOS_INLINE_FUNCTION +void CAAS::set_rhom (const Int& lclcellidx, const Int& rhomidx, const Real& rhom) { + cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_); + cedr_kernel_assert(rhomidx >= 0 && rhomidx < nrhomidxs_); + d_(lclcellidx) = rhom; +} + +template KOKKOS_INLINE_FUNCTION +void CAAS +::set_Qm (const Int& lclcellidx, const Int& tracer_idx, + const Real& Qm, const Real& Qm_min, const Real& Qm_max, + const Real Qm_prev) { + cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_); + cedr_kernel_assert(tracer_idx >= 0 && tracer_idx < probs_.extent_int(0)); + const Int nt = probs_.size(); + d_((1 + tracer_idx)*nlclcells_ + lclcellidx) = Qm; + d_((1 + nt + tracer_idx)*nlclcells_ + lclcellidx) = Qm_min; + d_((1 + 2*nt + tracer_idx)*nlclcells_ + lclcellidx) = Qm_max; + if (need_conserve_) + d_((1 + 3*nt + tracer_idx)*nlclcells_ + lclcellidx) = Qm_prev; +} + +template KOKKOS_INLINE_FUNCTION +Real CAAS::get_Qm (const Int& lclcellidx, const Int& tracer_idx) { + cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_); + cedr_kernel_assert(tracer_idx >= 0 && tracer_idx < probs_.extent_int(0)); + return d_((1 + tracer_idx)*nlclcells_ + lclcellidx); +} + +} // namespace caas +} // namespace cedr + +#endif diff --git a/cedr/cedr_cdr.hpp b/cedr/cedr_cdr.hpp new file mode 100644 index 0000000..6dad452 --- /dev/null +++ b/cedr/cedr_cdr.hpp @@ -0,0 +1,65 @@ +#ifndef INCLUDE_CEDR_CDR_HPP +#define INCLUDE_CEDR_CDR_HPP + +#include "cedr_mpi.hpp" + +namespace cedr { +// Constrained Density Reconstructor interface. +struct CDR { + typedef std::shared_ptr Ptr; + + virtual void print(std::ostream& os) const {} + + // Set up QLT tracer metadata. Call declare_tracer in order of the tracer + // index in the caller's numbering. Once end_tracer_declarations is called, it + // is an error to call declare_tracer again. + // Associate the tracer with a rhom index. 
In many problems, there will be + // only one rhom, so rhomidx is always 0. + // It is an error to call this function from a parallel region. + virtual void declare_tracer(int problem_type, const Int& rhomidx) = 0; + + // It is an error to call this function from a parallel region. + virtual void end_tracer_declarations() = 0; + + virtual int get_problem_type(const Int& tracer_idx) const = 0; + + virtual Int get_num_tracers() const = 0; + + // set_{rhom,Qm}: Set cell values prior to running the QLT algorithm. + // + // Notation: + // rho: Total density. + // Q: Tracer density. + // q: Tracer mixing ratio = Q/rho. + // *m: Mass corresponding to the density; results from an integral over a + // region, such as a cell. + // Some CDRs have a nontrivial local <-> global cell index map. For these + // CDRs, lclcellidx may be nontrivial. For others, the caller should provide + // the index into the local cell. + // + // set_rhom must be called before set_Qm. + virtual void set_rhom( + const Int& lclcellidx, const Int& rhomidx, + // Current total mass in this cell. + const Real& rhom) = 0; + + virtual void set_Qm( + const Int& lclcellidx, const Int& tracer_idx, + // Current tracer mass in this cell. + const Real& Qm, + // Minimum and maximum permitted tracer mass in this cell. + const Real& Qm_min, const Real& Qm_max, + // If mass conservation is requested, provide the previous Qm, which will be + // summed to give the desired global mass. + const Real Qm_prev = -1) = 0; + + // Run the QLT algorithm with the values set by set_{rhom,Qm}. It is an error to + // call this function from a parallel region. + virtual void run() = 0; + + // Get a cell's tracer mass Qm after the QLT algorithm has run. 
+ virtual Real get_Qm(const Int& lclcellidx, const Int& tracer_idx) = 0; +}; +} // namespace cedr + +#endif diff --git a/cedr/cedr_kokkos.hpp b/cedr/cedr_kokkos.hpp new file mode 100644 index 0000000..ec25b02 --- /dev/null +++ b/cedr/cedr_kokkos.hpp @@ -0,0 +1,53 @@ +#ifndef INCLUDE_CEDR_KOKKOS_HPP +#define INCLUDE_CEDR_KOKKOS_HPP + +#include + +namespace cedr { +namespace impl { +template +using MemoryTraits = Kokkos::MemoryTraits< + MemoryTraitsType::Unmanaged | MemoryTraitsType::RandomAccess | + MemoryTraitsType::Atomic | flag>; + +template +using Unmanaged = Kokkos::View< + typename View::data_type, typename View::array_layout, + typename View::device_type, MemoryTraits >; +template +using Const = Kokkos::View< + typename View::const_data_type, typename View::array_layout, + typename View::device_type, typename View::memory_traits>; +template +using ConstUnmanaged = Const >; + +template +struct DeviceType { + typedef Kokkos::Device type; +}; + +#ifdef KOKKOS_HAVE_CUDA +typedef Kokkos::Device DefaultDeviceType; + +template <> struct DeviceType { + typedef DefaultDeviceType type; +}; +#else +typedef Kokkos::Device DefaultDeviceType; +#endif + +// GPU-friendly replacements for std::*. +template KOKKOS_INLINE_FUNCTION +const T& min (const T& a, const T& b) { return a < b ? a : b; } +template KOKKOS_INLINE_FUNCTION +const T& max (const T& a, const T& b) { return a > b ? a : b; } +template KOKKOS_INLINE_FUNCTION +void swap (T& a, T& b) { const T tmp = a; a = b; b = tmp; } +} +} + +#endif diff --git a/cedr/cedr_local.cpp b/cedr/cedr_local.cpp new file mode 100644 index 0000000..15167e5 --- /dev/null +++ b/cedr/cedr_local.cpp @@ -0,0 +1,223 @@ +#include "cedr_local.hpp" +#include "cedr_local_inl.hpp" + +namespace cedr { +namespace local { +namespace test { +// Check the first-order optimality conditions. Return true if OK, false +// otherwise. If quiet, don't print anything. 
+bool check_1eq_bc_qp_foc ( + const char* label, const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, const Real* y, const Real* x, const bool verbose) +{ + auto& os = std::cout; + bool ok = true; + Real xtmp; + // Check the bound constraints. + for (Int i = 0; i < n; ++i) + if (x[i] < (xtmp = xlo[i])) { + if (verbose) + os << "x[" << i << "] = " << x[i] + << " but x[i] - xlo[i] = " << (x[i] - xtmp) << "\n"; + ok = false; + } + for (Int i = 0; i < n; ++i) + if (x[i] > (xtmp = xhi[i])) { + if (verbose) + os << "x[" << i << "] = " << x[i] + << " but xhi[i] - x[i] = " << (xtmp - x[i]) << "\n"; + ok = false; + } + // Check the equality constraint. + Real r = 0; + for (Int i = 0; i < n; ++i) + r += a[i]*x[i]; + r -= b; + if (std::abs(r) > impl::calc_r_tol(b, a, y, n)) { + if (verbose) + os << "r = " << r << "\n"; + ok = false; + } + // Check the gradient is 0 when projected into the constraints. Compute + // g = W (x - y) + // g_reduced = g - C ((C'C) \ (C'g)) + // where + // IA = I(:,A) + // C = [IA a], + // and A is the active set. + const Real padtol = 1e5*std::numeric_limits::epsilon(); + Real lambda = 0, den = 0; + for (Int i = 0; i < n; ++i) { + const Real pad = padtol*(xhi[i] - xlo[i]); + if (xlo[i] + pad <= x[i] && x[i] <= xhi[i] - pad) { + const Real gi = w[i]*(x[i] - y[i]); + lambda += a[i]*gi; + den += a[i]*a[i]; + } + } + lambda /= den; + Real normg = 0, normy = 0; + for (Int i = 0; i < n; ++i) { + normy += cedr::util::square(y[i]); + const Real pad = padtol*(xhi[i] - xlo[i]); + if (xlo[i] + pad <= x[i] && x[i] <= xhi[i] - pad) + normg += cedr::util::square(w[i]*(x[i] - y[i]) - a[i]*lambda); + } + normy = std::sqrt(normy); + normg = std::sqrt(normg); + const Real gtol = 1e4*std::numeric_limits::epsilon()*normy; + if (normg > gtol) { + if (verbose) + os << "norm(g) = " << normg << " gtol = " << gtol << "\n"; + ok = false; + } + // Check the gradient at the active boundaries. 
+ for (Int i = 0; i < n; ++i) { + const bool onlo = x[i] == xlo[i]; + const bool onhi = onlo ? false : x[i] == xhi[i]; + if (onlo || onhi) { + const Real rg = w[i]*(x[i] - y[i]) - a[i]*lambda; + if (onlo && rg < -gtol) { + if (verbose) + os << "onlo but rg = " << rg << "\n"; + ok = false; + } else if (onhi && rg > gtol) { + if (verbose) + os << "onhi but rg = " << rg << "\n"; + ok = false; + } + } + } + if ( ! ok && verbose) + os << "label: " << label << "\n"; + return ok; +} +} // namespace test + +Int unittest () { + bool verbose = true; + Int nerr = 0; + + Int n; + static const Int N = 16; + Real w[N], a[N], b, xlo[N], xhi[N], y[N], x[N], al, au; + + auto run = [&] () { + const Int info = solve_1eq_bc_qp(n, w, a, b, xlo, xhi, y, x); + const bool ok = test::check_1eq_bc_qp_foc( + "unittest", n, w, a, b, xlo, xhi, y, x, verbose); + if ( ! ok) ++nerr; + + if (n == 2) { + // This version never returns 0. + Real x2[2]; + const Int info2 = solve_1eq_bc_qp_2d(w, a, b, xlo, xhi, y, x2); + if (info2 != 1 && (info == 0 || info == 1)) { + if (verbose) pr(puf(info) pu(info2)); + ++nerr; + } + const Real rd = cedr::util::reldif(x, x2, 2); + if (rd > 1e4*std::numeric_limits::epsilon()) { + if (verbose) + printf("%1.1e | y %1.15e %1.15e | x %1.15e %1.15e | " + "x2 %1.15e %1.15e | l %1.15e %1.15e | u %1.15e %1.15e\n", + rd, y[0], y[1], x[0], x[1], x2[0], x2[1], + xlo[0], xlo[1], xhi[0], xhi[1]); + ++nerr; + } + } + + caas(n, a, b, xlo, xhi, y, x); + Real m = 0, den = 0; + for (Int i = 0; i < n; ++i) { + m += a[i]*x[i]; + den += std::abs(a[i]*x[i]); + if (x[i] < xlo[i]) ++nerr; + else if (x[i] > xhi[i]) ++nerr; + } + const Real rd = std::abs(b - m)/den; + if (rd > 1e3*std::numeric_limits::epsilon()) { + if (verbose) pr(puf(rd) pu(n) pu(b) pu(m)); + ++nerr; + } + }; + + auto gena = [&] () { + for (Int i = 0; i < n; ++i) + a[i] = 0.1 + cedr::util::urand(); + }; + auto genw = [&] () { + for (Int i = 0; i < n; ++i) + w[i] = 0.1 + cedr::util::urand(); + }; + auto genbnds = [&] () { + 
al = au = 0; + for (Int i = 0; i < n; ++i) { + xlo[i] = cedr::util::urand() - 0.5; + al += a[i]*xlo[i]; + xhi[i] = xlo[i] + cedr::util::urand(); + au += a[i]*xhi[i]; + } + }; + auto genb = [&] (const bool in) { + if (in) { + const Real alpha = cedr::util::urand(); + b = alpha*al + (1 - alpha)*au; + } else { + if (cedr::util::urand() > 0.5) + b = au + 0.01 + cedr::util::urand(); + else + b = al - 0.01 - cedr::util::urand(); + } + }; + auto geny = [&] (const bool in) { + if (in) { + for (Int i = 0; i < n; ++i) { + const Real alpha = cedr::util::urand(); + y[i] = alpha*xlo[i] + (1 - alpha)*xhi[i]; + } + } else if (cedr::util::urand() > 0.2) { + for (Int i = 1; i < n; i += 2) { + const Real alpha = cedr::util::urand(); + y[i] = alpha*xlo[i] + (1 - alpha)*xhi[i]; + cedr_assert(y[i] >= xlo[i] && y[i] <= xhi[i]); + } + for (Int i = 0; i < n; i += 4) + y[i] = xlo[i] - cedr::util::urand(); + for (Int i = 2; i < n; i += 4) + y[i] = xhi[i] + cedr::util::urand(); + } else { + for (Int i = 0; i < n; i += 2) + y[i] = xlo[i] - cedr::util::urand(); + for (Int i = 1; i < n; i += 2) + y[i] = xhi[i] + cedr::util::urand(); + } + }; + auto b4y = [&] () { + b = 0; + for (Int i = 0; i < n; ++i) + b += a[i]*y[i]; + }; + + for (n = 2; n <= 16; ++n) { + const Int count = n == 2 ? 100 : 10; + for (Int i = 0; i < count; ++i) { + gena(); + genw(); + genbnds(); + genb(true); + geny(true); + run(); + b4y(); + run(); + genb(true); + geny(false); + run(); + } + } + + return nerr; +} + +} +} diff --git a/cedr/cedr_local.hpp b/cedr/cedr_local.hpp new file mode 100644 index 0000000..e6a6ba8 --- /dev/null +++ b/cedr/cedr_local.hpp @@ -0,0 +1,41 @@ +#ifndef INCLUDE_CEDR_LOCAL_HPP +#define INCLUDE_CEDR_LOCAL_HPP + +#include "cedr.hpp" +#include "cedr_kokkos.hpp" + +namespace cedr { +namespace local { + +// Solve +// min_x sum_i w(i) (x(i) - y(i))^2 +// st a' x = b +// xlo <= x <= xhi, +// a(i), w(i) > 0. 
Return 0 on success and x == y, 1 on success and x != y, -1 +// if infeasible, -2 if max_its hit with no solution. See Section 3 of Bochev, +// Ridzal, Shashkov, Fast optimization-based conservative remap of scalar fields +// through aggregate mass transfer. lambda is used in check_1eq_bc_qp_foc. +KOKKOS_INLINE_FUNCTION +Int solve_1eq_bc_qp(const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, + const Real* y, Real* x, const Int max_its = 100); + +KOKKOS_INLINE_FUNCTION +Int solve_1eq_bc_qp_2d(const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, + const Real* y, Real* x); + +// ClipAndAssuredSum. Does not check for feasibility. +KOKKOS_INLINE_FUNCTION +void caas(const Int n, const Real* a, const Real b, + const Real* xlo, const Real* xhi, + const Real* y, Real* x); + +Int unittest(); + +} +} + +#include "cedr_local_inl.hpp" + +#endif diff --git a/cedr/cedr_local_inl.hpp b/cedr/cedr_local_inl.hpp new file mode 100644 index 0000000..5c3c867 --- /dev/null +++ b/cedr/cedr_local_inl.hpp @@ -0,0 +1,303 @@ +#ifndef INCLUDE_CEDR_LOCAL_INL_HPP +#define INCLUDE_CEDR_LOCAL_INL_HPP + +#include "cedr_util.hpp" + +namespace cedr { +namespace local { + +namespace impl { +KOKKOS_INLINE_FUNCTION +Real calc_r_tol (const Real b, const Real* a, const Real* y, const Int n) { + Real ab = std::abs(b); + for (Int i = 0; i < n; ++i) ab = std::max(ab, std::abs(a[i]*y[i])); + return 1e1*std::numeric_limits::epsilon()*std::abs(ab); +} + +// Eval r at end points to check for feasibility, and also possibly a quick exit +// on a common case. Return -1 if infeasible, 1 if a corner is a solution, 0 if +// feasible and a corner is not. 
+KOKKOS_INLINE_FUNCTION +Int check_lu (const Int n, const Real* a, const Real& b, + const Real* xlo, const Real* xhi, const Real* y, const Real& r_tol, + Real* x) { + Real r = -b; + for (Int i = 0; i < n; ++i) { + x[i] = xlo[i]; + r += a[i]*x[i]; + } + if (std::abs(r) <= r_tol) return 1; + if (r > 0) return -1; + r = -b; + for (Int i = 0; i < n; ++i) { + x[i] = xhi[i]; + r += a[i]*x[i]; + } + if (std::abs(r) <= r_tol) return 1; + if (r < 0) return -1; + return 0; +} + +KOKKOS_INLINE_FUNCTION +void calc_r (const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, const Real* y, const Real& lambda, + Real* x, Real& r, Real& r_lambda) { + r = 0; + r_lambda = 0; + for (Int i = 0; i < n; ++i) { + const Real q = a[i]/w[i]; + const Real x_trial = y[i] + lambda*q; + Real xtmp; + if (x_trial < (xtmp = xlo[i])) + x[i] = xtmp; + else if (x_trial > (xtmp = xhi[i])) + x[i] = xtmp; + else { + x[i] = x_trial; + r_lambda += a[i]*q; + } + r += a[i]*x[i]; + } + r -= b; +} +} // namespace impl + +// 2D special case for efficiency. +KOKKOS_INLINE_FUNCTION +Int solve_1eq_bc_qp_2d (const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, + const Real* y, Real* x) { + const Real r_tol = impl::calc_r_tol(b, a, y, 2); + Int info = impl::check_lu(2, a, b, xlo, xhi, y, r_tol, x); + if (info != 0) return info; + + { // Check if the optimal point ignoring bound constraints is in bounds. + Real qmass = 0, dm = b; + for (int i = 0; i < 2; ++i) { + const Real qi = a[i]/w[i]; + qmass += a[i]*qi; + dm -= a[i]*y[i]; + } + const Real lambda = dm/qmass; + bool ok = true; + for (int i = 0; i < 2; ++i) { + x[i] = y[i] + lambda*(a[i]/w[i]); + if (x[i] < xlo[i] || x[i] > xhi[i]) { + ok = false; + break; + } + } + if (ok) return 1; + } + + // Solve for intersection of a'x = b, given by the parameterized line + // p(alpha) = x_base + alpha x_dir, + // with a bounding line. + + // Get parameterized line. 
+ Real x_base[2]; + for (int i = 0; i < 2; ++i) + x_base[i] = 0.5*b/a[i]; + Real x_dir[] = {-a[1], a[0]}; + + // Get the 4 alpha values. + Real alphas[4]; + alphas[0] = (xlo[1] - x_base[1])/x_dir[1]; // bottom + alphas[1] = (xhi[0] - x_base[0])/x_dir[0]; // right + alphas[2] = (xhi[1] - x_base[1])/x_dir[1]; // top + alphas[3] = (xlo[0] - x_base[0])/x_dir[0]; // left + + // Find the middle two in the sorted alphas. + Real min = alphas[0], max = min; + Int imin = 0, imax = 0; + for (Int i = 1; i < 4; ++i) { + const Real alpha = alphas[i]; + if (alpha < min) { min = alpha; imin = i; } + if (alpha > max) { max = alpha; imax = i; } + } + Int ais[2]; + Int cnt = 0; + for (Int i = 0; i < 4; ++i) + if (i != imin && i != imax) { + ais[cnt++] = i; + if (cnt == 2) break; + } + + Real objs[2]; + Real alpha_mid = 0; + for (Int j = 0; j < 2; ++j) { + const Real alpha = alphas[ais[j]]; + alpha_mid += alpha; + Real obj = 0; + for (Int i = 0; i < 2; ++i) { + x[i] = x_base[i] + alpha*x_dir[i]; + obj += w[i]*cedr::util::square(y[i] - x[i]); + } + objs[j] = obj; + } + + const Int ai = ais[objs[0] <= objs[1] ? 0 : 1]; + + info = 1; + Int clipidx = 0; + const Real alpha = alphas[ai]; + switch (ai) { + case 0: case 2: + x[0] = x_base[0] + alpha*x_dir[0]; + x[1] = ai == 0 ? xlo[1] : xhi[1]; + clipidx = 0; + break; + case 1: case 3: + x[0] = ai == 1 ? 
xhi[0] : xlo[0]; + x[1] = x_base[1] + alpha*x_dir[1]; + clipidx = 1; + break; + default: cedr_assert(0); info = -2; + } + x[clipidx] = cedr::impl::min(xhi[clipidx], cedr::impl::max(xlo[clipidx], x[clipidx])); + return info; +} + +KOKKOS_INLINE_FUNCTION +Int solve_1eq_bc_qp (const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, const Real* y, Real* x, + const Int max_its) { + const Real r_tol = impl::calc_r_tol(b, a, y, n); + Int info = impl::check_lu(n, a, b, xlo, xhi, y, r_tol, x); + if (info != 0) return info; + + for (int i = 0; i < n; ++i) + if (x[i] != y[i]) { + info = 1; + x[i] = y[i]; + } + + // In our use case, the caller has already checked (more cheaply) for a quick + // exit. +#if 0 + { // Check for a quick exit. + bool all_in = true; + Real r = 0; + for (Int i = 0; i < n; ++i) { + if (x[i] < xlo[i] || x[i] > xhi[i]) { + all_in = false; + break; + } + r += a[i]*x[i]; + } + if (all_in) { + r -= b; + if (std::abs(r) <= r_tol) + return info; + } + } +#endif + + const Real wall_dist = 1e-3; + + // Get lambda endpoints. + Real lamlo = 0, lamhi = 0; + for (Int i = 0; i < n; ++i) { + const Real rq = w[i]/a[i]; + const Real lamlo_i = rq*(xlo[i] - y[i]); + const Real lamhi_i = rq*(xhi[i] - y[i]); + if (i == 0) { + lamlo = lamlo_i; + lamhi = lamhi_i; + } else { + lamlo = cedr::impl::min(lamlo, lamlo_i); + lamhi = cedr::impl::max(lamhi, lamhi_i); + } + } + const Real lamlo_feas = lamlo, lamhi_feas = lamhi; + Real lambda = lamlo <= 0 && lamhi >= 0 ? 0 : lamlo; + + // Bisection-safeguarded Newton iteration for r(lambda) = 0. + bool prev_step_bisect = false; + Int nbisect = 0; + info = -2; + for (Int iteration = 0; iteration < max_its; ++iteration) { + // Compute x, r, r_lambda. + Real r, r_lambda; + impl::calc_r(n, w, a, b, xlo, xhi, y, lambda, x, r, r_lambda); + // Is r(lambda) - b sufficiently == 0? + if (std::abs(r) <= r_tol) { + info = 1; + break; + } + // Check if the lambda bounds are too close. 
+ if (nbisect > 64) { + if (lamhi == lamhi_feas || lamlo == lamlo_feas) { + // r isn't small enough and one lambda bound is on the feasibility + // limit. The QP must not be feasible. + info = -1; + break; + } + info = 1; + break; + } + // Adjust lambda bounds. + if (r > 0) + lamhi = lambda; + else + lamlo = lambda; + if (r_lambda != 0) { + // Newton step. + lambda -= r/r_lambda; + } else { + // Force bisection. + lambda = lamlo; + } + // Safeguard. The wall distance check assures progress, but use it only + // every other potential bisection. + const Real D = prev_step_bisect ? 0 : wall_dist*(lamhi - lamlo); + if (lambda - lamlo < D || lamhi - lambda < D) { + lambda = 0.5*(lamlo + lamhi); + ++nbisect; + prev_step_bisect = true; + } else { + prev_step_bisect = false; + } + } + + return info; +} + +KOKKOS_INLINE_FUNCTION +void caas (const Int n, const Real* a, const Real b, + const Real* xlo, const Real* xhi, + const Real* y, Real* x) { + Real dm = b; + for (Int i = 0; i < n; ++i) { + x[i] = cedr::impl::max(xlo[i], cedr::impl::min(xhi[i], y[i])); + dm -= a[i]*x[i]; + } + if (dm == 0) return; + if (dm > 0) { + Real fac = 0; + for (Int i = 0; i < n; ++i) + fac += a[i]*(xhi[i] - x[i]); + if (fac > 0) { + fac = dm/fac; + for (Int i = 0; i < n; ++i) + x[i] += fac*(xhi[i] - x[i]); + } + } else if (dm < 0) { + Real fac = 0; + for (Int i = 0; i < n; ++i) + fac += a[i]*(x[i] - xlo[i]); + if (fac > 0) { + fac = dm/fac; + for (Int i = 0; i < n; ++i) + x[i] += fac*(x[i] - xlo[i]); + } + } + // Clip again for numerics. 
+ for (Int i = 0; i < n; ++i) + x[i] = cedr::impl::max(xlo[i], cedr::impl::min(xhi[i], x[i])); +} + +} // namespace local +} // namespace cedr + +#endif diff --git a/cedr/cedr_mpi.cpp b/cedr/cedr_mpi.cpp new file mode 100644 index 0000000..1569a66 --- /dev/null +++ b/cedr/cedr_mpi.cpp @@ -0,0 +1,41 @@ +#include "cedr_mpi.hpp" + +namespace cedr { +namespace mpi { + +Parallel::Ptr make_parallel (MPI_Comm comm) { + return std::make_shared(comm); +} + +Int Parallel::size () const { + int sz = 0; + MPI_Comm_size(comm_, &sz); + return sz; +} + +Int Parallel::rank () const { + int pid = 0; + MPI_Comm_rank(comm_, &pid); + return pid; +} + +template <> MPI_Datatype get_type() { return MPI_INT; } +template <> MPI_Datatype get_type() { return MPI_DOUBLE; } +template <> MPI_Datatype get_type() { return MPI_LONG_INT; } + +int waitany (int count, MPI_Request* reqs, int* index, MPI_Status* stats) { + return MPI_Waitany(count, reqs, index, stats ? stats : MPI_STATUS_IGNORE); +} + +int waitall (int count, MPI_Request* reqs, MPI_Status* stats) { + return MPI_Waitall(count, reqs, stats ? 
stats : MPI_STATUS_IGNORE); +} + +bool all_ok (const Parallel& p, bool im_ok) { + int ok = im_ok, msg; + all_reduce(p, &ok, &msg, 1, MPI_LAND); + return static_cast(msg); +} + +} +} diff --git a/cedr/cedr_mpi.hpp b/cedr/cedr_mpi.hpp new file mode 100644 index 0000000..1f28594 --- /dev/null +++ b/cedr/cedr_mpi.hpp @@ -0,0 +1,77 @@ +#ifndef INCLUDE_CEDR_MPI_HPP +#define INCLUDE_CEDR_MPI_HPP + +#include + +#include + +#include "cedr.hpp" + +namespace cedr { +namespace mpi { + +class Parallel { + MPI_Comm comm_; +public: + typedef std::shared_ptr Ptr; + Parallel(MPI_Comm comm) : comm_(comm) {} + MPI_Comm comm () const { return comm_; } + Int size() const; + Int rank() const; + Int root () const { return 0; } + bool amroot () const { return rank() == root(); } +}; + +Parallel::Ptr make_parallel(MPI_Comm comm); + +template MPI_Datatype get_type(); + +template +int reduce(const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op, + int root); + +template +int all_reduce(const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op); + +template +int isend(const Parallel& p, const T* buf, int count, int dest, int tag, + MPI_Request* ireq); + +template +int irecv(const Parallel& p, T* buf, int count, int src, int tag, MPI_Request* ireq); + +int waitany(int count, MPI_Request* reqs, int* index, MPI_Status* stats = nullptr); + +int waitall(int count, MPI_Request* reqs, MPI_Status* stats = nullptr); + +template +int gather(const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, int recvcount, int root); + +template +int gatherv(const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, const int* recvcounts, const int* displs, int root); + +bool all_ok(const Parallel& p, bool im_ok); + +struct Op { + typedef std::shared_ptr Ptr; + + Op (MPI_User_function* function, bool commute) { + MPI_Op_create(function, static_cast(commute), &op_); + } + + ~Op () { MPI_Op_free(&op_); } + + const MPI_Op& get () const { return op_; } + +private: + MPI_Op 
op_; +}; + +} // namespace mpi +} // namespace cedr + +#include "cedr_mpi_inl.hpp" + +#endif diff --git a/cedr/cedr_mpi_inl.hpp b/cedr/cedr_mpi_inl.hpp new file mode 100644 index 0000000..e4f28db --- /dev/null +++ b/cedr/cedr_mpi_inl.hpp @@ -0,0 +1,59 @@ +#ifndef INCLUDE_CEDR_MPI_INL_HPP +#define INCLUDE_CEDR_MPI_INL_HPP + +namespace cedr { +namespace mpi { + +template +int reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op, + int root) { + MPI_Datatype dt = get_type(); + return MPI_Reduce(const_cast(sendbuf), rcvbuf, count, dt, op, root, p.comm()); +} + +template +int all_reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op) { + MPI_Datatype dt = get_type(); + return MPI_Allreduce(const_cast(sendbuf), rcvbuf, count, dt, op, p.comm()); +} + +template +int isend (const Parallel& p, const T* buf, int count, int dest, int tag, + MPI_Request* ireq) { + MPI_Datatype dt = get_type(); + MPI_Request ureq; + MPI_Request* req = ireq ? ireq : &ureq; + int ret = MPI_Isend(const_cast(buf), count, dt, dest, tag, p.comm(), req); + if ( ! ireq) MPI_Request_free(req); + return ret; +} + +template +int irecv (const Parallel& p, T* buf, int count, int src, int tag, MPI_Request* ireq) { + MPI_Datatype dt = get_type(); + MPI_Request ureq; + MPI_Request* req = ireq ? ireq : &ureq; + int ret = MPI_Irecv(buf, count, dt, src, tag, p.comm(), req); + if ( ! 
ireq) MPI_Request_free(req); + return ret; +} + +template +int gather (const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, int recvcount, int root) { + MPI_Datatype dt = get_type(); + return MPI_Gather(sendbuf, sendcount, dt, recvbuf, recvcount, dt, root, p.comm()); +} + +template +int gatherv (const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, const int* recvcounts, const int* displs, int root) { + MPI_Datatype dt = get_type(); + return MPI_Gatherv(sendbuf, sendcount, dt, recvbuf, recvcounts, displs, dt, root, + p.comm()); +} + +} // namespace mpi +} // namespace cedr + +#endif diff --git a/cedr/cedr_qlt.cpp b/cedr/cedr_qlt.cpp new file mode 100644 index 0000000..6a04f04 --- /dev/null +++ b/cedr/cedr_qlt.cpp @@ -0,0 +1,1159 @@ +#include "cedr_qlt.hpp" +#include "cedr_test_randomized.hpp" + +#include + +#include +#include + +#include +#include +#include +#include + +namespace cedr { +namespace qlt { + +class Timer { +public: + enum Op { tree, analyze, qltrun, qltrunl2r, qltrunr2l, snp, waitall, + total, NTIMERS }; + static inline void init () { +#ifdef QLT_TIME + for (int i = 0; i < NTIMERS; ++i) { + et_[i] = 0; + cnt_[i] = 0; + } +#endif + } + static inline void reset (const Op op) { +#ifdef QLT_TIME + et_[op] = 0; + cnt_[op] = 0; +#endif + } + static inline void start (const Op op) { +#ifdef QLT_TIME + gettimeofday(&t_start_[op], 0); + ++cnt_[op]; +#endif + } + static inline void stop (const Op op) { +#ifdef QLT_TIME + timeval t2; + gettimeofday(&t2, 0); + const timeval& t1 = t_start_[op]; + static const double us = 1.0e6; + et_[op] += (t2.tv_sec*us + t2.tv_usec - t1.tv_sec*us - t1.tv_usec)/us; +#endif + } +# define tpr(op) do { \ + printf("%-20s %10.3e %10.1f (%4d %10.3e)\n", \ + #op, et_[op], 100*et_[op]/tot, cnt_[op], et_[op]/cnt_[op]); \ + } while (0) + static void print () { +#ifdef QLT_TIME + const double tot = et_[total]; + tpr(tree); tpr(analyze); + tpr(qltrun); tpr(qltrunl2r); tpr(qltrunr2l); tpr(snp); tpr(waitall); + 
printf("%-20s %10.3e %10.1f\n", "total", tot, 100.0); +#endif + } +#undef tpr +private: +#ifdef QLT_TIME + static timeval t_start_[NTIMERS]; + static double et_[NTIMERS]; + static int cnt_[NTIMERS]; +#endif +}; +#ifdef QLT_TIME +timeval Timer::t_start_[Timer::NTIMERS]; +double Timer::et_[Timer::NTIMERS]; +int Timer::cnt_[Timer::NTIMERS]; +#endif + +namespace impl { +struct NodeSets { + typedef std::shared_ptr ConstPtr; + + enum : int { mpitag = 42 }; + + // A node in the tree that is relevant to this rank. + struct Node { + // Rank of the node. If the node is in a level, then its rank is my rank. If + // it's not in a level, then it is a comm partner of a node on this rank. + Int rank; + // Globally unique identifier; cellidx if leaf node, ie, if nkids == 0. + Int id; + // This node's parent, a comm partner, if such a partner is required. + const Node* parent; + // This node's kids, comm partners, if such partners are required. Parent + // and kid nodes are pruned relative to the full tree over the mesh to + // contain just the nodes that matter to this rank. + Int nkids; + const Node* kids[2]; + // Offset factor into bulk data. An offset is a unit; actual buffer sizes + // are multiples of this unit. + Int offset; + + Node () : rank(-1), id(-1), parent(nullptr), nkids(0), offset(-1) {} + }; + + // A level in the level schedule that is constructed to orchestrate + // communication. A node in a level depends only on nodes in lower-numbered + // levels (l2r) or higher-numbered (r2l). + // + // The communication patterns are as follows: + // > l2r + // MPI rcv into kids + // sum into node + // MPI send from node + // > r2l + // MPI rcv into node + // solve QP for kids + // MPI send from kids + struct Level { + struct MPIMetaData { + Int rank; // Rank of comm partner. + Int offset; // Offset to start of buffer for this comm. + Int size; // Size of this buffer in units of offsets. + }; + + // The nodes in the level. 
+ std::vector nodes; + // MPI information for this level. + std::vector me, kids; + // Have to keep requests separate so we can call waitall if we want to. + mutable std::vector me_req, kids_req; + }; + + // Levels. nodes[0] is level 0, the leaf level. + std::vector levels; + // Number of data slots this rank needs. Each node owned by this rank, plus + // kids on other ranks, have an associated slot. + Int nslots; + + // Allocate a node. The list node_mem_ is the mechanism for memory ownership; + // node_mem_ isn't used for anything other than owning nodes. + Node* alloc () { + node_mem_.push_front(Node()); + return &node_mem_.front(); + } + + void print(std::ostream& os) const; + +private: + std::list node_mem_; +}; + +void NodeSets::print (std::ostream& os) const { + std::stringstream ss; + if (levels.empty()) return; + const Int myrank = levels[0].nodes[0]->rank; + ss << "pid " << myrank << ":"; + ss << " #levels " << levels.size(); + for (size_t i = 0; i < levels.size(); ++i) { + const auto& lvl = levels[i]; + ss << "\n " << i << ": " << lvl.nodes.size(); + std::set ps, ks; + for (size_t j = 0; j < lvl.nodes.size(); ++j) { + const auto n = lvl.nodes[j]; + for (Int k = 0; k < n->nkids; ++k) + if (n->kids[k]->rank != myrank) + ks.insert(n->kids[k]->rank); + if (n->parent && n->parent->rank != myrank) + ps.insert(n->parent->rank); + } + ss << " |"; + for (const auto& e : ks) ss << " " << e; + if ( ! lvl.kids.empty()) ss << " (" << lvl.kids.size() << ") |"; + for (const auto& e : ps) ss << " " << e; + if ( ! lvl.me.empty()) ss << " (" << lvl.me.size() << ")"; + } + ss << "\n"; + os << ss.str(); +} + +// Find tree depth, assign ranks to non-leaf nodes, and init 'reserved'. 
+Int init_tree (const tree::Node::Ptr& node, Int& id) { + node->reserved = nullptr; + Int depth = 0; + for (Int i = 0; i < node->nkids; ++i) { + cedr_assert(node.get() == node->kids[i]->parent); + depth = std::max(depth, init_tree(node->kids[i], id)); + } + if (node->nkids) { + node->rank = node->kids[0]->rank; + node->cellidx = id++; + } else { + cedr_throw_if(node->cellidx < 0 || node->cellidx >= id, + "cellidx is " << node->cellidx << " but should be between " << + 0 << " and " << id); + } + return depth + 1; +} + +void level_schedule_and_collect ( + NodeSets& ns, const Int& my_rank, const tree::Node::Ptr& node, Int& level, + bool& need_parent_ns_node) +{ + cedr_assert(node->rank != -1); + level = -1; + bool make_ns_node = false; + for (Int i = 0; i < node->nkids; ++i) { + Int kid_level; + bool kid_needs_ns_node; + level_schedule_and_collect(ns, my_rank, node->kids[i], kid_level, + kid_needs_ns_node); + level = std::max(level, kid_level); + if (kid_needs_ns_node) make_ns_node = true; + } + ++level; + // Is parent node needed for isend? + const bool node_is_owned = node->rank == my_rank; + need_parent_ns_node = node_is_owned; + if (node_is_owned || make_ns_node) { + cedr_assert( ! node->reserved); + NodeSets::Node* ns_node = ns.alloc(); + // Levels hold only owned nodes. + if (node_is_owned) ns.levels[level].nodes.push_back(ns_node); + node->reserved = ns_node; + ns_node->rank = node->rank; + ns_node->id = node->cellidx; + ns_node->parent = nullptr; + if (node_is_owned) { + // If this node is owned, it needs to have information about all kids. + ns_node->nkids = node->nkids; + for (Int i = 0; i < node->nkids; ++i) { + const auto& kid = node->kids[i]; + if ( ! kid->reserved) { + // This kid isn't owned by this rank. But need it for irecv. 
+ NodeSets::Node* ns_kid; + kid->reserved = ns_kid = ns.alloc(); + ns_node->kids[i] = ns_kid; + cedr_assert(kid->rank != my_rank); + ns_kid->rank = kid->rank; + ns_kid->id = kid->cellidx; + ns_kid->parent = nullptr; // Not needed. + // The kid may have kids in the original tree, but in the tree pruned + // according to rank, it does not. + ns_kid->nkids = 0; + } else { + // This kid is owned by this rank, so fill in its parent pointer. + NodeSets::Node* ns_kid = static_cast(kid->reserved); + ns_node->kids[i] = ns_kid; + ns_kid->parent = ns_node; + } + } + } else { + // This node is not owned. Update the owned kids with its parent. + ns_node->nkids = 0; + for (Int i = 0; i < node->nkids; ++i) { + const auto& kid = node->kids[i]; + if (kid->reserved && kid->rank == my_rank) { + NodeSets::Node* ns_kid = static_cast(kid->reserved); + ns_node->kids[ns_node->nkids++] = ns_kid; + ns_kid->parent = ns_node; + } + } + } + } +} + +void level_schedule_and_collect (NodeSets& ns, const Int& my_rank, + const tree::Node::Ptr& tree) { + Int iunused; + bool bunused; + level_schedule_and_collect(ns, my_rank, tree, iunused, bunused); +} + +void consolidate (NodeSets& ns) { + auto levels = ns.levels; + ns.levels.clear(); + for (const auto& level : levels) + if ( ! level.nodes.empty()) + ns.levels.push_back(level); +} + +typedef std::pair RankNode; + +void init_offsets (const Int my_rank, std::vector& rns, + std::vector& mmds, Int& offset) { + // Set nodes on my rank to have rank -1 so that they sort first. + for (auto& rn : rns) + if (rn.first == my_rank) + rn.first = -1; + + // Sort so that all comms with a given rank are contiguous. Stable sort so + // that rns retains its order, in particular in the leaf node level. + std::stable_sort(rns.begin(), rns.end()); + + // Collect nodes into groups by rank and set up comm metadata for each group. 
+ Int prev_rank = -1; + for (auto& rn : rns) { + const Int rank = rn.first; + if (rank == -1) { + if (rn.second->offset == -1) + rn.second->offset = offset++; + continue; + } + if (rank != prev_rank) { + cedr_assert(rank > prev_rank); + prev_rank = rank; + mmds.push_back(NodeSets::Level::MPIMetaData()); + auto& mmd = mmds.back(); + mmd.rank = rank; + mmd.offset = offset; + mmd.size = 0; + } + ++mmds.back().size; + rn.second->offset = offset++; + } +} + +// Set up comm data. Consolidate so that there is only one message between me +// and another rank per level. Determine an offset for each node, to be +// multiplied by data-size factors later, for use in data buffers. +void init_comm (const Int my_rank, NodeSets& ns) { + ns.nslots = 0; + for (auto& lvl : ns.levels) { + Int nkids = 0; + for (const auto& n : lvl.nodes) + nkids += n->nkids; + + std::vector me(lvl.nodes.size()), kids(nkids); + for (size_t i = 0, mi = 0, ki = 0; i < lvl.nodes.size(); ++i) { + const auto& n = lvl.nodes[i]; + me[mi].first = n->parent ? n->parent->rank : my_rank; + me[mi].second = const_cast(n); + ++mi; + for (Int k = 0; k < n->nkids; ++k) { + kids[ki].first = n->kids[k]->rank; + kids[ki].second = const_cast(n->kids[k]); + ++ki; + } + } + + init_offsets(my_rank, me, lvl.me, ns.nslots); + lvl.me_req.resize(lvl.me.size()); + init_offsets(my_rank, kids, lvl.kids, ns.nslots); + lvl.kids_req.resize(lvl.kids.size()); + } +} + +// Analyze the tree to extract levels. Levels are run from 0 to #level - 1. Each +// level has nodes whose corresponding operations depend on only nodes in +// lower-indexed levels. This mechanism prevents deadlock in the general case of +// multiple cells per rank, with multiple ranks appearing in a subtree other +// than the root. +// In addition, the set of nodes collected into levels are just those owned by +// this rank, and those with which owned nodes must communicate. +// Once this function is done, the tree can be deleted. 
+NodeSets::ConstPtr analyze (const Parallel::Ptr& p, const Int& ncells, + const tree::Node::Ptr& tree) { + const auto nodesets = std::make_shared(); + cedr_assert( ! tree->parent); + Int id = ncells; + const Int depth = init_tree(tree, id); + nodesets->levels.resize(depth); + level_schedule_and_collect(*nodesets, p->rank(), tree); + consolidate(*nodesets); + init_comm(p->rank(), *nodesets); + return nodesets; +} + +// Check that the offsets are self consistent. +Int check_comm (const NodeSets::ConstPtr& ns) { + Int nerr = 0; + std::vector offsets(ns->nslots, 0); + for (const auto& lvl : ns->levels) + for (const auto& n : lvl.nodes) { + cedr_assert(n->offset < ns->nslots); + ++offsets[n->offset]; + for (Int i = 0; i < n->nkids; ++i) + if (n->kids[i]->rank != n->rank) + ++offsets[n->kids[i]->offset]; + } + for (const auto& e : offsets) + if (e != 1) ++nerr; + return nerr; +} + +// Check that there are the correct number of leaf nodes, and that their offsets +// all come first and are ordered the same as ns->levels[0]->nodes. +Int check_leaf_nodes (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns, + const Int ncells) { + Int nerr = 0; + cedr_assert( ! ns->levels.empty()); + cedr_assert( ! ns->levels[0].nodes.empty()); + Int my_nleaves = 0; + for (const auto& n : ns->levels[0].nodes) { + cedr_assert( ! n->nkids); + ++my_nleaves; + } + for (const auto& n : ns->levels[0].nodes) { + cedr_assert(n->offset < my_nleaves); + cedr_assert(n->id < ncells); + } + Int glbl_nleaves = 0; + mpi::all_reduce(*p, &my_nleaves, &glbl_nleaves, 1, MPI_SUM); + if (glbl_nleaves != ncells) + ++nerr; + return nerr; +} + +// Sum cellidx using the QLT comm pattern. +Int test_comm_pattern (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns, + const Int ncells) { + Int nerr = 0; + // Rank-wide data buffer. + std::vector data(ns->nslots); + // Sum this rank's cellidxs. + for (auto& n : ns->levels[0].nodes) + data[n->offset] = n->id; + // Leaves to root. 
+ for (size_t il = 0; il < ns->levels.size(); ++il) { + auto& lvl = ns->levels[il]; + // Set up receives. + for (size_t i = 0; i < lvl.kids.size(); ++i) { + const auto& mmd = lvl.kids[i]; + mpi::irecv(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag, + &lvl.kids_req[i]); + } + //todo Replace with simultaneous waitany and isend. + mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data()); + // Combine kids' data. + for (auto& n : lvl.nodes) { + if ( ! n->nkids) continue; + data[n->offset] = 0; + for (Int i = 0; i < n->nkids; ++i) + data[n->offset] += data[n->kids[i]->offset]; + } + // Send to parents. + for (size_t i = 0; i < lvl.me.size(); ++i) { + const auto& mmd = lvl.me[i]; + mpi::isend(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag, + &lvl.me_req[i]); + } + if (il+1 == ns->levels.size()) + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + } + // Root to leaves. + for (size_t il = ns->levels.size(); il > 0; --il) { + auto& lvl = ns->levels[il-1]; + // Get the global sum from parent. + for (size_t i = 0; i < lvl.me.size(); ++i) { + const auto& mmd = lvl.me[i]; + mpi::irecv(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag, + &lvl.me_req[i]); + } + //todo Replace with simultaneous waitany and isend. + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + // Pass to kids. + for (auto& n : lvl.nodes) { + if ( ! n->nkids) continue; + for (Int i = 0; i < n->nkids; ++i) + data[n->kids[i]->offset] = data[n->offset]; + } + // Send. + for (size_t i = 0; i < lvl.kids.size(); ++i) { + const auto& mmd = lvl.kids[i]; + mpi::isend(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag, + &lvl.kids_req[i]); + } + } + // Wait on sends to clean up. + for (size_t il = 0; il < ns->levels.size(); ++il) { + auto& lvl = ns->levels[il]; + if (il+1 < ns->levels.size()) + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data()); + } + { // Check that all leaf nodes have the right number. 
+ const Int desired_sum = (ncells*(ncells - 1)) / 2; + for (const auto& n : ns->levels[0].nodes) + if (data[n->offset] != desired_sum) ++nerr; + if (p->amroot()) { + std::cout << " " << data[ns->levels[0].nodes[0]->offset]; + std::cout.flush(); + } + } + return nerr; +} + +// Unit tests for NodeSets. +Int unittest (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns, + const Int ncells) { + Int nerr = 0; + nerr += check_comm(ns); + if (nerr) return nerr; + nerr += check_leaf_nodes(p, ns, ncells); + if (nerr) return nerr; + nerr += test_comm_pattern(p, ns, ncells); + if (nerr) return nerr; + return nerr; +} +} // namespace impl + +template +void QLT::init (const std::string& name, IntList& d, + typename IntList::HostMirror& h, size_t n) { + d = IntList("QLT " + name, n); + h = Kokkos::create_mirror_view(d); +} + +template +int QLT::MetaData::get_problem_type (const int& idx) { + return problem_type_[idx]; +} + +// icpc doesn't let us use problem_type_ here, even though it's constexpr. +template +int QLT::MetaData::get_problem_type_idx (const int& mask) { + switch (mask) { + case CPT::s: case CPT::st: return 0; + case CPT::cs: case CPT::cst: return 1; + case CPT::t: return 2; + case CPT::ct: return 3; + default: cedr_kernel_throw_if(true, "Invalid problem type."); return -1; + } +} + +template +int QLT::MetaData::get_problem_type_l2r_bulk_size (const int& mask) { + if (mask & ProblemType::conserve) return 4; + return 3; +} + +template +int QLT::MetaData::get_problem_type_r2l_bulk_size (const int& mask) { + if (mask & ProblemType::shapepreserve) return 1; + return 3; +} + +template +void QLT::MetaData::init (const MetaDataBuilder& mdb) { + const Int ntracers = mdb.trcr2prob.size(); + + Me::init("trcr2prob", a_d_.trcr2prob, a_h_.trcr2prob, ntracers); + std::copy(mdb.trcr2prob.begin(), mdb.trcr2prob.end(), a_h_.trcr2prob.data()); + Kokkos::deep_copy(a_d_.trcr2prob, a_h_.trcr2prob); + + Me::init("bidx2trcr", a_d_.bidx2trcr, a_h_.bidx2trcr, ntracers); + 
Me::init("trcr2bl2r", a_d_.trcr2bl2r, a_h_.trcr2bl2r, ntracers); + Me::init("trcr2br2l", a_d_.trcr2br2l, a_h_.trcr2br2l, ntracers); + a_h_.prob2trcrptr[0] = 0; + a_h_.prob2bl2r[0] = 1; // rho is at 0. + a_h_.prob2br2l[0] = 0; + for (Int pi = 0; pi < nprobtypes; ++pi) { + a_h_.prob2trcrptr[pi+1] = a_h_.prob2trcrptr[pi]; + const Int l2rbulksz = get_problem_type_l2r_bulk_size(get_problem_type(pi)); + const Int r2lbulksz = get_problem_type_r2l_bulk_size(get_problem_type(pi)); + for (Int ti = 0; ti < ntracers; ++ti) { + const auto problem_type = a_h_.trcr2prob[ti]; + if (problem_type != problem_type_[pi]) continue; + const auto tcnt = a_h_.prob2trcrptr[pi+1] - a_h_.prob2trcrptr[pi]; + a_h_.trcr2bl2r[ti] = a_h_.prob2bl2r[pi] + tcnt*l2rbulksz; + a_h_.trcr2br2l[ti] = a_h_.prob2br2l[pi] + tcnt*r2lbulksz; + a_h_.bidx2trcr[a_h_.prob2trcrptr[pi+1]++] = ti; + } + Int ni = a_h_.prob2trcrptr[pi+1] - a_h_.prob2trcrptr[pi]; + a_h_.prob2bl2r[pi+1] = a_h_.prob2bl2r[pi] + ni*l2rbulksz; + a_h_.prob2br2l[pi+1] = a_h_.prob2br2l[pi] + ni*r2lbulksz; + } + Kokkos::deep_copy(a_d_.bidx2trcr, a_h_.bidx2trcr); + Kokkos::deep_copy(a_d_.trcr2bl2r, a_h_.trcr2bl2r); + Kokkos::deep_copy(a_d_.trcr2br2l, a_h_.trcr2br2l); + + Me::init("trcr2bidx", a_d_.trcr2bidx, a_h_.trcr2bidx, ntracers); + for (Int ti = 0; ti < ntracers; ++ti) + a_h_.trcr2bidx(a_h_.bidx2trcr(ti)) = ti; + Kokkos::deep_copy(a_d_.trcr2bidx, a_h_.trcr2bidx); + + a_h = a_h_; + + // Won't default construct Unmanaged, so have to do pointer stuff and raw + // array copy explicitly. 
+ a_d.trcr2prob = a_d_.trcr2prob; + a_d.bidx2trcr = a_d_.bidx2trcr; + a_d.trcr2bidx = a_d_.trcr2bidx; + a_d.trcr2bl2r = a_d_.trcr2bl2r; + a_d.trcr2br2l = a_d_.trcr2br2l; + std::copy(a_h_.prob2trcrptr, a_h_.prob2trcrptr + nprobtypes + 1, + a_d.prob2trcrptr); + std::copy(a_h_.prob2bl2r, a_h_.prob2bl2r + nprobtypes + 1, a_d.prob2bl2r); + std::copy(a_h_.prob2br2l, a_h_.prob2br2l + nprobtypes + 1, a_d.prob2br2l); + cedr_assert(a_d.prob2trcrptr[nprobtypes] == ntracers); +} + +template +void QLT::BulkData::init (const MetaData& md, const Int& nslots) { + l2r_data_ = RealList("QLT l2r_data", md.a_h.prob2bl2r[md.nprobtypes]*nslots); + r2l_data_ = RealList("QLT r2l_data", md.a_h.prob2br2l[md.nprobtypes]*nslots); + l2r_data = l2r_data_; + r2l_data = r2l_data_; +} + +template +void QLT::init (const Parallel::Ptr& p, const Int& ncells, + const tree::Node::Ptr& tree) { + p_ = p; + Timer::start(Timer::analyze); + ns_ = impl::analyze(p, ncells, tree); + init_ordinals(); + Timer::stop(Timer::analyze); + mdb_ = std::make_shared(); +} + +template +void QLT::init_ordinals () { + for (const auto& n : ns_->levels[0].nodes) + gci2lci_[n->id] = n->offset; +} + +template +QLT::QLT (const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree) { + init(p, ncells, tree); + cedr_throw_if(nlclcells() == 0, "QLT does not support 0 cells on a rank."); +} + +template +void QLT::print (std::ostream& os) const { + ns_->print(os); +} + +// Number of cells owned by this rank. +template +Int QLT::nlclcells () const { return ns_->levels[0].nodes.size(); } + +// Cells owned by this rank, in order of local numbering. Thus, +// gci2lci(gcis[i]) == i. Ideally, the caller never actually calls gci2lci(), +// and instead uses the information from get_owned_glblcells to determine +// local cell indices. 
+template +void QLT::get_owned_glblcells (std::vector& gcis) const { + gcis.resize(ns_->levels[0].nodes.size()); + for (const auto& n : ns_->levels[0].nodes) + gcis[n->offset] = n->id; +} + +// For global cell index cellidx, i.e., the globally unique ordinal associated +// with a cell in the caller's tree, return this rank's local index for +// it. This is not an efficient operation. +template +Int QLT::gci2lci (const Int& gci) const { + const auto it = gci2lci_.find(gci); + if (it == gci2lci_.end()) { + pr(puf(gci)); + std::vector gcis; + get_owned_glblcells(gcis); + mprarr(gcis); + } + cedr_throw_if(it == gci2lci_.end(), "gci " << gci << " not in gci2lci map."); + return it->second; +} + +template +void QLT::declare_tracer (int problem_type, const Int& rhomidx) { + cedr_throw_if( ! mdb_, "end_tracer_declarations was already called; " + "it is an error to call declare_tracer now."); + cedr_throw_if(rhomidx > 0, "rhomidx > 0 is not supported yet."); + // For its exception side effect, and to get canonical problem type, since + // some possible problem types map to the same canonical one: + problem_type = md_.get_problem_type(md_.get_problem_type_idx(problem_type)); + mdb_->trcr2prob.push_back(problem_type); +} + +template +void QLT::end_tracer_declarations () { + md_.init(*mdb_); + mdb_ = nullptr; + bd_.init(md_, ns_->nslots); +} + +template +int QLT::get_problem_type (const Int& tracer_idx) const { + cedr_throw_if(tracer_idx < 0 || tracer_idx > md_.a_h.trcr2prob.extent_int(0), + "tracer_idx is out of bounds: " << tracer_idx); + return md_.a_h.trcr2prob[tracer_idx]; +} + +template +Int QLT::get_num_tracers () const { + return md_.a_h.trcr2prob.size(); +} + +template +void QLT::run () { + Timer::start(Timer::qltrunl2r); + using namespace impl; + // Number of data per slot. + const Int l2rndps = md_.a_d.prob2bl2r[md_.nprobtypes]; + const Int r2lndps = md_.a_d.prob2br2l[md_.nprobtypes]; + // Leaves to root. 
+ for (size_t il = 0; il < ns_->levels.size(); ++il) { + auto& lvl = ns_->levels[il]; + // Set up receives. + if (lvl.kids.size()) { + for (size_t i = 0; i < lvl.kids.size(); ++i) { + const auto& mmd = lvl.kids[i]; + mpi::irecv(*p_, &bd_.l2r_data(mmd.offset*l2rndps), mmd.size*l2rndps, mmd.rank, + NodeSets::mpitag, &lvl.kids_req[i]); + } + //todo Replace with simultaneous waitany and isend. + Timer::start(Timer::waitall); + mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data()); + Timer::stop(Timer::waitall); + } + // Combine kids' data. + //todo Kernelize, interacting with waitany todo above. + for (const auto& n : lvl.nodes) { + if ( ! n->nkids) continue; + cedr_kernel_assert(n->nkids == 2); + // Total density. + bd_.l2r_data(n->offset*l2rndps) = (bd_.l2r_data(n->kids[0]->offset*l2rndps) + + bd_.l2r_data(n->kids[1]->offset*l2rndps)); + // Tracers. + for (Int pti = 0; pti < md_.nprobtypes; ++pti) { + const Int problem_type = md_.get_problem_type(pti); + const bool sum_only = problem_type & ProblemType::shapepreserve; + const Int bsz = md_.get_problem_type_l2r_bulk_size(problem_type); + const Int bis = md_.a_d.prob2trcrptr[pti], bie = md_.a_d.prob2trcrptr[pti+1]; + for (Int bi = bis; bi < bie; ++bi) { + const Int bdi = md_.a_d.trcr2bl2r(md_.a_d.bidx2trcr(bi)); + Real* const me = &bd_.l2r_data(n->offset*l2rndps + bdi); + const Real* const k0 = &bd_.l2r_data(n->kids[0]->offset*l2rndps + bdi); + const Real* const k1 = &bd_.l2r_data(n->kids[1]->offset*l2rndps + bdi); + me[0] = sum_only ? k0[0] + k1[0] : cedr::impl::min(k0[0], k1[0]); + me[1] = k0[1] + k1[1] ; + me[2] = sum_only ? k0[2] + k1[2] : cedr::impl::max(k0[2], k1[2]); + if (bsz == 4) + me[3] = k0[3] + k1[3] ; + } + } + } + // Send to parents. 
+ if (lvl.me.size()) { + for (size_t i = 0; i < lvl.me.size(); ++i) { + const auto& mmd = lvl.me[i]; + mpi::isend(*p_, &bd_.l2r_data(mmd.offset*l2rndps), mmd.size*l2rndps, mmd.rank, + NodeSets::mpitag, &lvl.me_req[i]); + } + if (il+1 == ns_->levels.size()) { + Timer::start(Timer::waitall); + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + Timer::stop(Timer::waitall); + } + } + } + Timer::stop(Timer::qltrunl2r); Timer::start(Timer::qltrunr2l); + // Root. + if ( ! ns_->levels.empty() && ns_->levels.back().nodes.size() == 1 && + ! ns_->levels.back().nodes[0]->parent) { + const auto& n = ns_->levels.back().nodes[0]; + for (Int pti = 0; pti < md_.nprobtypes; ++pti) { + const Int problem_type = md_.get_problem_type(pti); + const Int bis = md_.a_d.prob2trcrptr[pti], bie = md_.a_d.prob2trcrptr[pti+1]; + for (Int bi = bis; bi < bie; ++bi) { + const Int l2rbdi = md_.a_d.trcr2bl2r(md_.a_d.bidx2trcr(bi)); + const Int r2lbdi = md_.a_d.trcr2br2l(md_.a_d.bidx2trcr(bi)); + // If QLT is enforcing global mass conservation, set the root's r2l Qm + // value to the l2r Qm_prev's sum; otherwise, copy the l2r Qm value to + // the r2l one. + const Int os = problem_type & ProblemType::conserve ? 3 : 1; + bd_.r2l_data(n->offset*r2lndps + r2lbdi) = + bd_.l2r_data(n->offset*l2rndps + l2rbdi + os); + if ( ! (problem_type & ProblemType::shapepreserve)) { + // We now know the global q_{min,max}. Start propagating it + // leafward. + bd_.r2l_data(n->offset*r2lndps + r2lbdi + 1) = + bd_.l2r_data(n->offset*l2rndps + l2rbdi + 0); + bd_.r2l_data(n->offset*r2lndps + r2lbdi + 2) = + bd_.l2r_data(n->offset*l2rndps + l2rbdi + 2); + } + } + } + } + // Root to leaves. 
+ for (size_t il = ns_->levels.size(); il > 0; --il) { + auto& lvl = ns_->levels[il-1]; + if (lvl.me.size()) { + for (size_t i = 0; i < lvl.me.size(); ++i) { + const auto& mmd = lvl.me[i]; + mpi::irecv(*p_, &bd_.r2l_data(mmd.offset*r2lndps), mmd.size*r2lndps, mmd.rank, + NodeSets::mpitag, &lvl.me_req[i]); + } + //todo Replace with simultaneous waitany and isend. + Timer::start(Timer::waitall); + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + Timer::stop(Timer::waitall); + } + // Solve QP for kids' values. + //todo Kernelize, interacting with waitany todo above. + Timer::start(Timer::snp); + for (const auto& n : lvl.nodes) { + if ( ! n->nkids) continue; + for (Int pti = 0; pti < md_.nprobtypes; ++pti) { + const Int problem_type = md_.get_problem_type(pti); + const Int bis = md_.a_d.prob2trcrptr[pti], bie = md_.a_d.prob2trcrptr[pti+1]; + for (Int bi = bis; bi < bie; ++bi) { + const Int l2rbdi = md_.a_d.trcr2bl2r(md_.a_d.bidx2trcr(bi)); + const Int r2lbdi = md_.a_d.trcr2br2l(md_.a_d.bidx2trcr(bi)); + cedr_assert(n->nkids == 2); + if ( ! (problem_type & ProblemType::shapepreserve)) { + // Pass q_{min,max} info along. l2r data are updated for use in + // solve_node_problem. r2l data are updated for use in isend. 
+ const Real q_min = bd_.r2l_data(n->offset*r2lndps + r2lbdi + 1); + const Real q_max = bd_.r2l_data(n->offset*r2lndps + r2lbdi + 2); + bd_.l2r_data(n->offset*l2rndps + l2rbdi + 0) = q_min; + bd_.l2r_data(n->offset*l2rndps + l2rbdi + 2) = q_max; + for (Int k = 0; k < 2; ++k) { + bd_.l2r_data(n->kids[k]->offset*l2rndps + l2rbdi + 0) = q_min; + bd_.l2r_data(n->kids[k]->offset*l2rndps + l2rbdi + 2) = q_max; + bd_.r2l_data(n->kids[k]->offset*r2lndps + r2lbdi + 1) = q_min; + bd_.r2l_data(n->kids[k]->offset*r2lndps + r2lbdi + 2) = q_max; + } + } + const auto& k0 = n->kids[0]; + const auto& k1 = n->kids[1]; + solve_node_problem( + problem_type, + bd_.l2r_data( n->offset*l2rndps), + &bd_.l2r_data( n->offset*l2rndps + l2rbdi), + bd_.r2l_data( n->offset*r2lndps + r2lbdi), + bd_.l2r_data(k0->offset*l2rndps), + &bd_.l2r_data(k0->offset*l2rndps + l2rbdi), + bd_.r2l_data(k0->offset*r2lndps + r2lbdi), + bd_.l2r_data(k1->offset*l2rndps), + &bd_.l2r_data(k1->offset*l2rndps + l2rbdi), + bd_.r2l_data(k1->offset*r2lndps + r2lbdi)); + } + } + } + Timer::stop(Timer::snp); + // Send. + if (lvl.kids.size()) + for (size_t i = 0; i < lvl.kids.size(); ++i) { + const auto& mmd = lvl.kids[i]; + mpi::isend(*p_, &bd_.r2l_data(mmd.offset*r2lndps), mmd.size*r2lndps, mmd.rank, + NodeSets::mpitag, &lvl.kids_req[i]); + } + } + // Wait on sends to clean up. 
+ for (size_t il = 0; il < ns_->levels.size(); ++il) { + auto& lvl = ns_->levels[il]; + if (il+1 < ns_->levels.size()) + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data()); + } + Timer::stop(Timer::qltrunr2l); +} + +template +constexpr Int QLT::MetaData::problem_type_[]; + +namespace test { +using namespace impl; + +class TestQLT : public cedr::test::TestRandomized { +public: + typedef QLT QLTT; + + TestQLT (const Parallel::Ptr& p, const tree::Node::Ptr& tree, + const Int& ncells, const bool verbose=false) + : TestRandomized("QLT", p, ncells, verbose), + qlt_(p, ncells, tree), tree_(tree) + { + if (verbose) qlt_.print(std::cout); + init(); + } + +private: + QLTT qlt_; + tree::Node::Ptr tree_; + + CDR& get_cdr () override { return qlt_; } + + void init_numbering () override { + init_numbering(tree_); + } + + void init_numbering (const tree::Node::Ptr& node) { + check(qlt_); + // TestQLT doesn't actually care about a particular ordering, as there is no + // geometry to the test problem. However, use *some* ordering to model what + // a real problem must do. + if ( ! 
node->nkids) { + if (node->rank == p_->rank()) + gcis_.push_back(node->cellidx); + return; + } + for (Int i = 0; i < node->nkids; ++i) + init_numbering(node->kids[i]); + } + + static void check (const QLTT& qlt) { + const Int n = qlt.nlclcells(); + std::vector gcis; + qlt.get_owned_glblcells(gcis); + cedr_assert(static_cast(gcis.size()) == n); + for (Int i = 0; i < n; ++i) + cedr_assert(qlt.gci2lci(gcis[i]) == i); + } + + void init_tracers () override { + for (const auto& t : tracers_) + qlt_.declare_tracer(t.problem_type, 0); + qlt_.end_tracer_declarations(); + cedr_assert(qlt_.get_num_tracers() == static_cast(tracers_.size())); + for (size_t i = 0; i < tracers_.size(); ++i) + cedr_assert(qlt_.get_problem_type(i) == (tracers_[i].problem_type | + ProblemType::consistent)); + } + + void run_impl (const Int trial) override { + MPI_Barrier(p_->comm()); + Timer::start(Timer::qltrun); + qlt_.run(); + MPI_Barrier(p_->comm()); + Timer::stop(Timer::qltrun); + if (trial == 0) { + Timer::reset(Timer::qltrun); + Timer::reset(Timer::qltrunl2r); + Timer::reset(Timer::qltrunr2l); + Timer::reset(Timer::waitall); + Timer::reset(Timer::snp); + } + } +}; + +// Test all QLT variations and situations. +Int test_qlt (const Parallel::Ptr& p, const tree::Node::Ptr& tree, const Int& ncells, + const Int nrepeat = 1, + // Diagnostic output for dev and illustration purposes. To be + // clear, no QLT unit test requires output to be checked; each + // checks in-memory data and returns a failure count. + const bool write = false, + const bool verbose = false) { + return TestQLT(p, tree, ncells, verbose).run(nrepeat, write); +} +} // namespace test + +// Tree for a 1-D periodic domain, for unit testing. +namespace oned { +struct Mesh { + struct ParallelDecomp { + enum Enum { + // The obvious distribution of ranks: 1 rank takes exactly 1 contiguous + // set of cell indices. + contiguous, + // For heavy-duty testing of QLT comm pattern, use a ridiculous assignment + // of ranks to cell indices. 
This forces the QLT tree to communicate, + // pack, and unpack in silly ways. + pseudorandom + }; + }; + + Mesh (const Int nc, const Parallel::Ptr& p, + const ParallelDecomp::Enum& parallel_decomp = ParallelDecomp::contiguous) { + init(nc, p, parallel_decomp); + } + + void init (const Int nc, const Parallel::Ptr& p, + const ParallelDecomp::Enum& parallel_decomp) { + nc_ = nc; + nranks_ = p->size(); + p_ = p; + pd_ = parallel_decomp; + cedr_throw_if(nranks_ > nc_, "#GIDs < #ranks is not supported."); + } + + Int ncell () const { return nc_; } + + const Parallel::Ptr& parallel () const { return p_; } + + Int rank (const Int& ci) const { + switch (pd_) { + case ParallelDecomp::contiguous: + return std::min(nranks_ - 1, ci / (nc_ / nranks_)); + default: { + const auto chunk = ci / nranks_; + return (ci + chunk) % nranks_; + } + } + } + + static Int unittest (const Parallel::Ptr& p) { + const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::pseudorandom, + Mesh::ParallelDecomp::contiguous }; + Int ne = 0; + for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id) { + Mesh m(std::max(42, 3*p->size()), p, dists[id]); + const Int nc = m.ncell(); + for (Int ci = 0; ci < nc; ++ci) + if (m.rank(ci) < 0 || m.rank(ci) >= p->size()) + ++ne; + } + return ne; + } + +private: + Int nc_, nranks_; + Parallel::Ptr p_; + ParallelDecomp::Enum pd_; +}; + +tree::Node::Ptr make_tree (const Mesh& m, const Int cs, const Int ce, + const tree::Node* parent, const bool imbalanced) { + const Int + cn = ce - cs, + cn0 = ( imbalanced && cn > 2 ? 
+ cn/3 : + cn/2 ); + tree::Node::Ptr n = std::make_shared(); + n->parent = parent; + if (cn == 1) { + n->nkids = 0; + n->rank = m.rank(cs); + n->cellidx = cs; + return n; + } + n->nkids = 2; + n->kids[0] = make_tree(m, cs, cs + cn0, n.get(), imbalanced); + n->kids[1] = make_tree(m, cs + cn0, ce, n.get(), imbalanced); + return n; +} + +tree::Node::Ptr make_tree (const Mesh& m, const bool imbalanced) { + return make_tree(m, 0, m.ncell(), nullptr, imbalanced); +} + +tree::Node::Ptr make_tree (const Parallel::Ptr& p, const Int& ncells, + const bool imbalanced) { + Mesh m(ncells, p); + return make_tree(m, imbalanced); +} + +namespace test { +void mark_cells (const tree::Node::Ptr& node, std::vector& cells) { + if ( ! node->nkids) { + ++cells[node->cellidx]; + return; + } + for (Int i = 0; i < node->nkids; ++i) + mark_cells(node->kids[i], cells); +} + +Int unittest (const Parallel::Ptr& p) { + const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::pseudorandom, + Mesh::ParallelDecomp::contiguous }; + Int ne = 0; + for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id) + for (bool imbalanced: {false, true}) { + Mesh m(std::max(42, 3*p->size()), p, Mesh::ParallelDecomp::pseudorandom); + tree::Node::Ptr tree = make_tree(m, imbalanced); + std::vector cells(m.ncell(), 0); + mark_cells(tree, cells); + for (Int i = 0; i < m.ncell(); ++i) + if (cells[i] != 1) ++ne; + } + return ne; +} +} // namespace test +} // namespace oned + +tree::Node::Ptr tree::make_tree_over_1d_mesh (const Parallel::Ptr& p, const Int& ncells, + const bool imbalanced) { + return oned::make_tree(oned::Mesh(ncells, p), imbalanced); +} + +namespace test { +Int unittest_NodeSets (const Parallel::Ptr& p) { + using Mesh = oned::Mesh; + const Int szs[] = { p->size(), 3*p->size() }; + const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::pseudorandom, + Mesh::ParallelDecomp::contiguous }; + Int nerr = 0; + for (size_t is = 0; is < sizeof(szs)/sizeof(*szs); ++is) + for (size_t id = 0; 
id < sizeof(dists)/sizeof(*dists); ++id) + for (bool imbalanced: {false, true}) { + Mesh m(szs[is], p, dists[id]); + tree::Node::Ptr tree = make_tree(m, imbalanced); + impl::NodeSets::ConstPtr nodesets = impl::analyze(p, m.ncell(), tree); + tree = nullptr; + nerr += impl::unittest(p, nodesets, m.ncell()); + } + return nerr; +} + +Int unittest_QLT (const Parallel::Ptr& p, const bool write_requested=false) { + using Mesh = oned::Mesh; + const Int szs[] = { p->size(), 2*p->size(), 7*p->size(), 21*p->size() }; + const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::contiguous, + Mesh::ParallelDecomp::pseudorandom }; + Int nerr = 0; + for (size_t is = 0, islim = sizeof(szs)/sizeof(*szs); is < islim; ++is) + for (size_t id = 0, idlim = sizeof(dists)/sizeof(*dists); id < idlim; ++id) + for (bool imbalanced: {false, true}) { + if (p->amroot()) { + std::cout << " (" << szs[is] << ", " << id << ", " << imbalanced << ")"; + std::cout.flush(); + } + Mesh m(szs[is], p, dists[id]); + tree::Node::Ptr tree = make_tree(m, imbalanced); + const bool write = (write_requested && m.ncell() < 3000 && + is == islim-1 && id == idlim-1); + nerr += test::test_qlt(p, tree, m.ncell(), 1, write); + } + return nerr; +} + +Int run_unit_and_randomized_tests (const Parallel::Ptr& p, const Input& in) { + Int nerr = 0; + if (in.unittest) { + Int ne; + ne = oned::Mesh::unittest(p); + if (ne && p->amroot()) std::cerr << "FAIL: Mesh::unittest()\n"; + nerr += ne; + ne = oned::test::unittest(p); + if (ne && p->amroot()) std::cerr << "FAIL: oned::unittest_tree()\n"; + nerr += ne; + ne = unittest_NodeSets(p); + if (ne && p->amroot()) std::cerr << "FAIL: oned::unittest_NodeSets()\n"; + nerr += ne; + ne = unittest_QLT(p, in.write); + if (ne && p->amroot()) std::cerr << "FAIL: oned::unittest_QLT()\n"; + nerr += ne; + if (p->amroot()) std::cout << "\n"; + } + // Performance test. + if (in.perftest && in.ncells > 0) { + oned::Mesh m(in.ncells, p, + (in.pseudorandom ? 
+ oned::Mesh::ParallelDecomp::pseudorandom : + oned::Mesh::ParallelDecomp::contiguous)); + Timer::init(); + Timer::start(Timer::total); Timer::start(Timer::tree); + tree::Node::Ptr tree = make_tree(m, false); + Timer::stop(Timer::tree); + test::test_qlt(p, tree, in.ncells, in.nrepeat, false, in.verbose); + Timer::stop(Timer::total); + if (p->amroot()) Timer::print(); + } + return nerr; +} + +} // namespace test +} // namespace qlt +} // namespace cedr + +#ifdef KOKKOS_HAVE_SERIAL +template class cedr::qlt::QLT; +#endif +#ifdef KOKKOS_HAVE_OPENMP +template class cedr::qlt::QLT; +#endif +#ifdef KOKKOS_HAVE_CUDA +template class cedr::qlt::QLT; +#endif diff --git a/cedr/cedr_qlt.hpp b/cedr/cedr_qlt.hpp new file mode 100644 index 0000000..e923600 --- /dev/null +++ b/cedr/cedr_qlt.hpp @@ -0,0 +1,225 @@ +#ifndef INCLUDE_CEDR_QLT_HPP +#define INCLUDE_CEDR_QLT_HPP + +#include + +#include +#include +#include +#include +#include + +#include "cedr_cdr.hpp" + +namespace cedr { +// QLT: Quasi-local tree-based non-iterative tracer density reconstructor for +// mass conservation, shape preservation, and tracer consistency. +namespace qlt { +using cedr::mpi::Parallel; + +namespace impl { class NodeSets; } + +namespace tree { +// The caller builds a tree of these nodes to pass to QLT. +struct Node { + typedef std::shared_ptr Ptr; + const Node* parent; // (Can't be a shared_ptr: would be a circular dependency.) + Int rank; // Owning rank. + Long cellidx; // If a leaf, the cell to which this node corresponds. + Int nkids; // 0 at leaf, 1 or 2 otherwise. + Node::Ptr kids[2]; + void* reserved; // For internal use. + Node () : parent(nullptr), rank(-1), cellidx(-1), nkids(0), reserved(nullptr) {} +}; + +// Utility to make a tree over a 1D mesh. For testing, it can be useful to +// create an imbalanced tree. 
+Node::Ptr make_tree_over_1d_mesh(const Parallel::Ptr& p, const Int& ncells, + const bool imbalanced = false); +} // namespace tree + +template +class QLT : public cedr::CDR { +public: + typedef typename cedr::impl::DeviceType::type Device; + typedef QLT Me; + typedef std::shared_ptr Ptr; + + // Set up QLT topology and communication data structures based on a tree. + QLT(const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree); + + void print(std::ostream& os) const override; + + // Number of cells owned by this rank. + Int nlclcells() const; + + // Cells owned by this rank, in order of local numbering. Thus, + // gci2lci(gcis[i]) == i. Ideally, the caller never actually calls gci2lci(), + // and instead uses the information from get_owned_glblcells to determine + // local cell indices. + void get_owned_glblcells(std::vector& gcis) const; + + // For global cell index cellidx, i.e., the globally unique ordinal associated + // with a cell in the caller's tree, return this rank's local index for + // it. This is not an efficient operation. + Int gci2lci(const Int& gci) const; + + void declare_tracer(int problem_type, const Int& rhomidx) override; + + void end_tracer_declarations() override; + + int get_problem_type(const Int& tracer_idx) const override; + + Int get_num_tracers() const override; + + // lclcellidx is gci2lci(cellidx). + KOKKOS_INLINE_FUNCTION + void set_rhom(const Int& lclcellidx, const Int& rhomidx, const Real& rhom) override; + + // lclcellidx is gci2lci(cellidx). 
+ KOKKOS_INLINE_FUNCTION + void set_Qm(const Int& lclcellidx, const Int& tracer_idx, + const Real& Qm, const Real& Qm_min, const Real& Qm_max, + const Real Qm_prev = -1) override; + + void run() override; + + KOKKOS_INLINE_FUNCTION + Real get_Qm(const Int& lclcellidx, const Int& tracer_idx) override; + +private: + typedef Kokkos::View IntList; + typedef cedr::impl::Const ConstIntList; + typedef cedr::impl::ConstUnmanaged ConstUnmanagedIntList; + + static void init(const std::string& name, IntList& d, + typename IntList::HostMirror& h, size_t n); + + struct MetaDataBuilder { + typedef std::shared_ptr Ptr; + std::vector trcr2prob; + }; + + struct MetaData { + enum : Int { nprobtypes = 4 }; + + template + struct Arrays { + // trcr2prob(i) is the ProblemType of tracer i. + IntListT trcr2prob; + // bidx2trcr(prob2trcrptr(i) : prob2trcrptr(i+1)-1) is the list of + // tracers having ProblemType index i. bidx2trcr is the permutation + // from the user's tracer index to the bulk data's ordering (bidx). + Int prob2trcrptr[nprobtypes+1]; + IntListT bidx2trcr; + // Inverse of bidx2trcr. + IntListT trcr2bidx; + // Points to the start of l2r bulk data for each problem type, within a + // slot. + Int prob2bl2r[nprobtypes + 1]; + // Point to the start of l2r bulk data for each tracer, within a slot. + IntListT trcr2bl2r; + // Same for r2l bulk data. + Int prob2br2l[nprobtypes + 1]; + IntListT trcr2br2l; + }; + + static int get_problem_type(const int& idx); + + // icpc doesn't let us use problem_type_ here, even though it's constexpr. + static int get_problem_type_idx(const int& mask); + + static int get_problem_type_l2r_bulk_size(const int& mask); + + static int get_problem_type_r2l_bulk_size(const int& mask); + + struct CPT { + // We could make the l2r buffer smaller by one entry, Qm. However, the + // l2r comm is more efficient if it's done with one buffer. Similarly, + // we separate the r2l data into a separate buffer for packing and MPI + // efficiency. 
+ // There are 7 possible problems. + // The only problem not supported is conservation alone. It makes very + // little sense to use QLT for conservation alone. + // The remaining 6 fall into 4 categories of details. These 4 categories + // are tracked by QLT; which of the original 6 problems being solved is + // not important. + enum { + // l2r: rhom, (Qm_min, Qm, Qm_max)*; l2r, r2l: Qm* + s = ProblemType::shapepreserve, + st = ProblemType::shapepreserve | ProblemType::consistent, + // l2r: rhom, (Qm_min, Qm, Qm_max, Qm_prev)*; l2r, r2l: Qm* + cs = ProblemType::conserve | s, + cst = ProblemType::conserve | st, + // l2r: rhom, (q_min, Qm, q_max)*; l2r, r2l: Qm* + t = ProblemType::consistent, + // l2r: rhom, (q_min, Qm, q_max, Qm_prev)*; l2r, r2l: Qm* + ct = ProblemType::conserve | t + }; + }; + + Arrays a_h; + Arrays a_d; + + void init(const MetaDataBuilder& mdb); + + private: + static constexpr Int problem_type_[] = { CPT::st, CPT::cst, CPT::t, CPT::ct }; + Arrays a_h_; + Arrays a_d_; + }; + + struct BulkData { + typedef Kokkos::View RealList; + typedef cedr::impl::Unmanaged UnmanagedRealList; + + UnmanagedRealList l2r_data, r2l_data; + + void init(const MetaData& md, const Int& nslots); + + private: + RealList l2r_data_, r2l_data_; + }; + +private: + void init(const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree); + + void init_ordinals(); + + KOKKOS_INLINE_FUNCTION + static void solve_node_problem(const Int problem_type, + const Real& rhom, const Real* pd, const Real& Qm, + const Real& rhom0, const Real* k0d, Real& Qm0, + const Real& rhom1, const Real* k1d, Real& Qm1); + +private: + Parallel::Ptr p_; + // Tree and communication topology. + std::shared_ptr ns_; + // Globally unique cellidx -> rank-local index. + std::map gci2lci_; + // Temporary to collect caller's tracer information prior to calling + // end_tracer_declarations(). + typename MetaDataBuilder::Ptr mdb_; + // Constructed in end_tracer_declarations(). 
+ MetaData md_; + BulkData bd_; +}; + +namespace test { +struct Input { + bool unittest, perftest, write; + Int ncells, ntracers, tracer_type, nrepeat; + bool pseudorandom, verbose; +}; + +Int run_unit_and_randomized_tests(const Parallel::Ptr& p, const Input& in); +} // namespace test +} // namespace qlt +} // namespace cedr + +// These are the definitions that must be visible in the calling translation +// unit, unless Cuda relocatable device code is enabled. +#include "cedr_qlt_inl.hpp" + +#endif diff --git a/cedr/cedr_qlt_inl.hpp b/cedr/cedr_qlt_inl.hpp new file mode 100644 index 0000000..fb9290f --- /dev/null +++ b/cedr/cedr_qlt_inl.hpp @@ -0,0 +1,161 @@ +#ifndef INCLUDE_CEDR_QLT_INL_HPP +#define INCLUDE_CEDR_QLT_INL_HPP + +#include + +#include "cedr_local.hpp" + +namespace cedr { +namespace qlt { + +template KOKKOS_INLINE_FUNCTION +void QLT::set_rhom (const Int& lclcellidx, const Int& rhomidx, const Real& rhom) { + const Int ndps = md_.a_d.prob2bl2r[md_.nprobtypes]; + bd_.l2r_data(ndps*lclcellidx) = rhom; +} + +template KOKKOS_INLINE_FUNCTION +void QLT::set_Qm (const Int& lclcellidx, const Int& tracer_idx, + const Real& Qm, + const Real& Qm_min, const Real& Qm_max, + const Real Qm_prev) { + const Int ndps = md_.a_d.prob2bl2r[md_.nprobtypes]; + Real* bd; { + const Int bdi = md_.a_d.trcr2bl2r(tracer_idx); + bd = &bd_.l2r_data(ndps*lclcellidx + bdi); + } + bd[1] = Qm; + { + const Int problem_type = md_.a_d.trcr2prob(tracer_idx); + if (problem_type & ProblemType::shapepreserve) { + bd[0] = Qm_min; + bd[2] = Qm_max; + } else if (problem_type & ProblemType::consistent) { + const Real rhom = bd_.l2r_data(ndps*lclcellidx); + bd[0] = Qm_min / rhom; + bd[2] = Qm_max / rhom; + } else { + cedr_kernel_throw_if(true, "set_Q: invalid problem_type."); + } + if (problem_type & ProblemType::conserve) { + cedr_kernel_throw_if(Qm_prev < -0.5, + "Qm_prev was not provided to set_Q."); + bd[3] = Qm_prev; + } + } +} + +template KOKKOS_INLINE_FUNCTION +Real QLT::get_Qm (const Int& 
lclcellidx, const Int& tracer_idx) { + const Int ndps = md_.a_d.prob2br2l[md_.nprobtypes]; + const Int bdi = md_.a_d.trcr2br2l(tracer_idx); + return bd_.r2l_data(ndps*lclcellidx + bdi); +} + +//todo Replace this and the calling code with ReconstructSafely. +KOKKOS_INLINE_FUNCTION +void r2l_nl_adjust_bounds (Real Qm_bnd[2], const Real rhom[2], Real Qm_extra) { + Real q[2]; + for (Int i = 0; i < 2; ++i) q[i] = Qm_bnd[i] / rhom[i]; + if (Qm_extra < 0) { + Int i0, i1; + if (q[0] >= q[1]) { i0 = 0; i1 = 1; } else { i0 = 1; i1 = 0; } + const Real Qm_gap = (q[i1] - q[i0])*rhom[i0]; + if (Qm_gap <= Qm_extra) { + Qm_bnd[i0] += Qm_extra; + return; + } + } else { + Int i0, i1; + if (q[0] <= q[1]) { i0 = 0; i1 = 1; } else { i0 = 1; i1 = 0; } + const Real Qm_gap = (q[i1] - q[i0])*rhom[i0]; + if (Qm_gap >= Qm_extra) { + Qm_bnd[i0] += Qm_extra; + return; + } + } + { // Have to adjust both. Adjust so that the q bounds are the same. This + // procedure assures that as long as rhom is conservative, then the + // adjustment never pushes q_{min,max} out of the safety bounds. + const Real Qm_tot = Qm_bnd[0] + Qm_bnd[1] + Qm_extra; + const Real rhom_tot = rhom[0] + rhom[1]; + const Real q_tot = Qm_tot / rhom_tot; + for (Int i = 0; i < 2; ++i) + Qm_bnd[i] = q_tot*rhom[i]; + } +} + +namespace impl { +KOKKOS_INLINE_FUNCTION +void solve_node_problem (const Real& rhom, const Real* pd, const Real& Qm, + const Real& rhom0, const Real* k0d, Real& Qm0, + const Real& rhom1, const Real* k1d, Real& Qm1) { + Real Qm_min_kids [] = {k0d[0], k1d[0]}; + Real Qm_orig_kids[] = {k0d[1], k1d[1]}; + Real Qm_max_kids [] = {k0d[2], k1d[2]}; + { // The ideal problem is not assuredly feasible. Test for feasibility. If not + // feasible, adjust bounds to solve the safety problem, which is assuredly + // feasible if the total density field rho is mass conserving (Q doesn't + // have to be mass conserving, of course; achieving mass conservation is one + // use for QLT). 
+ const Real Qm_min = pd[0], Qm_max = pd[2]; + const bool lo = Qm < Qm_min, hi = Qm > Qm_max; + if (lo || hi) { + // If the discrepancy is numerical noise, don't act on it. + const Real tol = 10*std::numeric_limits::epsilon(); + const Real discrepancy = lo ? Qm_min - Qm : Qm - Qm_max; + if (discrepancy > tol*Qm_max) { + const Real rhom_kids[] = {rhom0, rhom1}; + r2l_nl_adjust_bounds(lo ? Qm_min_kids : Qm_max_kids, + rhom_kids, + Qm - (lo ? Qm_min : Qm_max)); + } + } else { + // Quick exit if everything is OK as is. This is a speedup, and it also + // lets the subnode solver make ~1 ulp changes instead of having to keep x + // = y if y satisfies the conditions. Without this block, the + // no_change_should_hold tests can fail. + if (Qm == pd[1] && // Was our total tracer mass adjusted? + // Are the kids' problems feasible? + Qm_orig_kids[0] >= Qm_min_kids[0] && Qm_orig_kids[0] <= Qm_max_kids[0] && + Qm_orig_kids[1] >= Qm_min_kids[1] && Qm_orig_kids[1] <= Qm_max_kids[1]) { + // Don't need to do anything, so skip even the math-based quick exits in + // solve_node_problem. + Qm0 = Qm_orig_kids[0]; + Qm1 = Qm_orig_kids[1]; + return; + } + } + } + { // Solve the node's QP. + static const Real ones[] = {1, 1}; + const Real w[] = {1/rhom0, 1/rhom1}; + Real Qm_kids[2] = {k0d[1], k1d[1]}; + local::solve_1eq_bc_qp_2d(w, ones, Qm, Qm_min_kids, Qm_max_kids, + Qm_orig_kids, Qm_kids); + Qm0 = Qm_kids[0]; + Qm1 = Qm_kids[1]; + } +} +} // namespace impl + +template KOKKOS_INLINE_FUNCTION +void QLT::solve_node_problem (const Int problem_type, + const Real& rhom, const Real* pd, const Real& Qm, + const Real& rhom0, const Real* k0d, Real& Qm0, + const Real& rhom1, const Real* k1d, Real& Qm1) { + if ( ! 
(problem_type & ProblemType::shapepreserve)) { + Real mpd[3], mk0d[3], mk1d[3]; + mpd[0] = pd [0]*rhom ; mpd [1] = pd[1] ; mpd [2] = pd [2]*rhom ; + mk0d[0] = k0d[0]*rhom0; mk0d[1] = k0d[1]; mk0d[2] = k0d[2]*rhom0; + mk1d[0] = k1d[0]*rhom1; mk1d[1] = k1d[1]; mk1d[2] = k1d[2]*rhom1; + impl::solve_node_problem(rhom, mpd, Qm, rhom0, mk0d, Qm0, rhom1, mk1d, Qm1); + return; + } + impl::solve_node_problem(rhom, pd, Qm, rhom0, k0d, Qm0, rhom1, k1d, Qm1); +} + +} // namespace qlt +} // namespace cedr + +#endif diff --git a/cedr/cedr_test.cpp b/cedr/cedr_test.cpp new file mode 100644 index 0000000..ebb76f1 --- /dev/null +++ b/cedr/cedr_test.cpp @@ -0,0 +1,109 @@ +#include "cedr_qlt.hpp" +#include "cedr_caas.hpp" +#include "cedr_mpi.hpp" +#include "cedr_util.hpp" +#include "cedr_test.hpp" + +#include +#include + +namespace cedr { +struct InputParser { + qlt::test::Input qin; + test::transport1d::Input tin; + + class ArgAdvancer { + const int argc_; + char const* const* argv_; + int i_; + public: + ArgAdvancer (int argc, char** argv) : argc_(argc), argv_(argv), i_(1) {} + const char* advance () { + if (i_+1 >= argc_) cedr_throw_if(true, "Command line is missing an argument."); + return argv_[++i_]; + } + const char* token () const { return argv_[i_]; } + void incr () { ++i_; } + bool more () const { return i_ < argc_; } + }; + + InputParser (int argc, char** argv, const qlt::Parallel::Ptr& p) { + using util::eq; + qin.unittest = false; + qin.perftest = false; + qin.write = false; + qin.ncells = 0; + qin.ntracers = 1; + qin.tracer_type = 0; + qin.nrepeat = 1; + qin.pseudorandom = false; + qin.verbose = false; + tin.ncells = 0; + for (ArgAdvancer aa(argc, argv); aa.more(); aa.incr()) { + const char* token = aa.token(); + if (eq(token, "-t", "--unittest")) qin.unittest = true; + else if (eq(token, "-pt", "--perftest")) qin.perftest = true; + else if (eq(token, "-w", "--write")) qin.write = true; + else if (eq(token, "-nc", "--ncells")) qin.ncells = std::atoi(aa.advance()); + 
else if (eq(token, "-nt", "--ntracers")) qin.ntracers = std::atoi(aa.advance()); + else if (eq(token, "-tt", "--tracertype")) qin.tracer_type = std::atoi(aa.advance()); + else if (eq(token, "-nr", "--nrepeat")) qin.nrepeat = std::atoi(aa.advance()); + else if (eq(token, "--proc-random")) qin.pseudorandom = true; + else if (eq(token, "-v", "--verbose")) qin.verbose = true; + else if (eq(token, "-t1d", "--transport1dtest")) tin.ncells = 1; + else cedr_throw_if(true, "Invalid token " << token); + } + + if (tin.ncells) { + tin.ncells = qin.ncells; + tin.verbose = qin.verbose; + } + + cedr_throw_if(qin.tracer_type < 0 || qin.tracer_type >= 4, + "Tracer type is out of bounds [0, 3]."); + cedr_throw_if(qin.ntracers < 1, "Number of tracers is < 1."); + } + + void print (std::ostream& os) const { + os << "ncells " << qin.ncells + << " nrepeat " << qin.nrepeat; + if (qin.pseudorandom) os << " random"; + os << "\n"; + } +}; +} // namespace cedr + +int main (int argc, char** argv) { + int nerr = 0, retval = 0; + MPI_Init(&argc, &argv); + auto p = cedr::mpi::make_parallel(MPI_COMM_WORLD); + srand(p->rank()); + Kokkos::initialize(argc, argv); + try { + cedr::InputParser inp(argc, argv, p); + if (p->amroot()) inp.print(std::cout); + if (inp.qin.unittest) { + nerr += cedr::local::unittest(); + nerr += cedr::caas::test::unittest(p); + } + if (inp.qin.unittest || inp.qin.perftest) + nerr += cedr::qlt::test::run_unit_and_randomized_tests(p, inp.qin); + if (inp.tin.ncells > 0) + nerr += cedr::test::transport1d::run(p, inp.tin); + { + int gnerr; + cedr::mpi::reduce(*p, &nerr, &gnerr, 1, MPI_SUM, p->root()); + retval = gnerr != 0 ? -1 : 0; + if (p->amroot()) + std::cout << (gnerr != 0 ? 
"FAIL" : "PASS") << "\n"; + } + } catch (const std::exception& e) { + if (p->amroot()) + std::cerr << e.what(); + retval = -1; + } + Kokkos::finalize_all(); + if (nerr) prc(nerr); + MPI_Finalize(); + return retval; +} diff --git a/cedr/cedr_test.hpp b/cedr/cedr_test.hpp new file mode 100644 index 0000000..afa9b5c --- /dev/null +++ b/cedr/cedr_test.hpp @@ -0,0 +1,22 @@ +#ifndef INCLUDE_CEDR_TEST_HPP +#define INCLUDE_CEDR_TEST_HPP + +#include "cedr.hpp" +#include "cedr_mpi.hpp" + +namespace cedr { +namespace test { +namespace transport1d { + +struct Input { + Int ncells; + bool verbose; +}; + +Int run(const mpi::Parallel::Ptr& p, const Input& in); + +} // namespace transport1d +} // namespace test +} // namespace cedr + +#endif diff --git a/cedr/cedr_test_1d_transport.cpp b/cedr/cedr_test_1d_transport.cpp new file mode 100644 index 0000000..dbf2d9b --- /dev/null +++ b/cedr/cedr_test_1d_transport.cpp @@ -0,0 +1,320 @@ +#include "cedr_test.hpp" +#include "cedr_qlt.hpp" +#include "cedr_caas.hpp" + +#include + +namespace cedr { +namespace test { +namespace transport1d { + +namespace interp { +inline Real to_periodic_core (const Real& xl, const Real& xr, const Real& x) { + if (x >= xl && x <= xr) return x; + const Real w = xr - xl, xmxl = x - xl; + return x - w*std::floor(xmxl / w); +} + +inline Real get_slope (const Real x[2], const Real y[2]) { + return (y[1] - y[0]) / (x[1] - x[0]); +} + +inline void +get_cubic (Real dx, Real v1, Real s1, Real v2, Real s2, Real c[4]) { + Real dx2 = dx*dx; + Real dx3 = dx2*dx; + Real den = -dx3; + Real b1, b2; + c[2] = s1; + c[3] = v1; + b1 = v2 - dx*c[2] - c[3]; + b2 = s2 - c[2]; + c[0] = (2.0*b1 - dx*b2) / den; + c[1] = (-3.0*dx*b1 + dx2*b2) / den; +} + +void cubic_interp_periodic ( + const Real* const x, const Int nx, const Real* const y, + const Real* const xi, const Int nxi, Real* const yi, + Int* const dod) +{ + const int nc = nx - 1; +#ifdef _OPENMP +# pragma omp parallel for +#endif + for (Int j = 0; j < nxi; ++j) { + const Real 
xi_per = to_periodic_core(x[0], x[nc], xi[j]); + Int ip1 = std::upper_bound(x, x + nx, xi_per) - x; + // Handle numerical issues at boundaries. + if (ip1 == 0) ++ip1; + else if (ip1 == nx) --ip1; + const Int i = ip1 - 1; + // Domain of dependence. + Int* dodj = dod + 4*j; + for (Int k = 0; k < 4; ++k) + dodj[k] = (i - 1 + k + nc) % nc; + // Slopes. + const bool at_start = i == 0, at_end = i == nc - 1; + const Real smid = get_slope(x+i, y+i); + Real s1, s2; + if (at_start) { + const Real a = (x[nc] - x[nc-1]) / ((x[1] - x[0]) + (x[nc] - x[nc-1])); + s1 = (1 - a)*get_slope(x+nc-1, y+nc-1) + a*smid; + } else { + const Real a = (x[i] - x[i-1]) / (x[ip1] - x[i-1]); + s1 = (1 - a)*get_slope(x+i-1, y+i-1) + a*smid; + } + if (at_end) { + const Real a = (x[ip1] - x[i]) / ((x[ip1] - x[i]) + (x[1] - x[0])); + s2 = (1 - a)*smid + a*get_slope(x, y); + } else { + const Real a = (x[ip1] - x[i]) / (x[i+2] - x[i]); + s2 = (1 - a)*smid + a*get_slope(x+ip1, y+ip1); + } + // Interp. + Real c[4]; + get_cubic(x[ip1] - x[i], y[i], s1, y[ip1], s2, c); + const Real xij = xi_per - x[i]; + yi[j] = (((c[0]*xij + c[1])*xij) + c[2])*xij + c[3]; + } +} +} // namespace interp + +class PyWriter { + typedef std::unique_ptr FilePtr; + FilePtr fh_; +public: + PyWriter(const std::string& filename); + void write(const std::string& field_name, const std::vector& v) const; +}; + +PyWriter::PyWriter (const std::string& filename) { + fh_ = FilePtr(fopen((filename + ".py").c_str(), "w")); + fprintf(fh_.get(), "s = {};\n"); +} + +void PyWriter::write (const std::string& field_name, const std::vector& v) const { + fprintf(fh_.get(), "s['%s'] = [", field_name.c_str()); + for (const auto& e: v) + fprintf(fh_.get(), " %1.15e,", e); + fprintf(fh_.get(), "]\n"); +} + +struct InitialCondition { + enum Enum { sin, bell, rect, uniform }; + static std::string convert (const Enum& e) { + switch (e) { + case Enum::sin: return "sin"; + case Enum::bell: return "bell"; + case Enum::rect: return "rect"; + case 
Enum::uniform: return "uniform"; + } + cedr_throw_if(true, "InitialCondition::convert can't convert " << e); + } + static Enum convert (const std::string& s) { + using util::eq; + if (eq(s, "sin")) return Enum::sin; + if (eq(s, "bell")) return Enum::bell; + if (eq(s, "rect")) return Enum::rect; + if (eq(s, "uniform")) return Enum::uniform; + cedr_throw_if(true, "InitialCondition::convert can't convert " << s); + } + static Real eval (const Enum& ic, const Real x) { + switch (ic) { + case Enum::sin: return 0.1 + 0.8*0.5*(1 + std::sin(6*M_PI*x)); + case Enum::bell: return x < 0.5 ? std::sin(2*M_PI*x) : 0; + case Enum::rect: return x > 0.66 || x < 0.33 ? 0 : 1; + case Enum::uniform: return 0.42; + } + cedr_throw_if(true, "InitialCondition::eval can't convert " << ic); + } +}; + +class Problem1D { + std::vector xb_, xcp_, rwrk_; + std::vector iwrk_; + + void init_mesh (const Int ncells, const bool nonuniform_mesh) { + xb_.resize(ncells+1); + xcp_.resize(ncells+1); + xb_[0] = 0; + if (nonuniform_mesh) { + // Large-scale, continuous variation in cell size, plus a huge jump at the + // periodic boundary. + for (Int i = 1; i <= ncells; ++i) { + const Real x = cedr::util::square(Real(i) / ncells); + xb_[i] = 0.01 + sin(0.5*M_PI*x*x*x*x); + } + // Random local cell sizes. + for (Int i = 1; i <= ncells; ++i) + xb_[i] *= 0.3 + cedr::util::urand(); + // Cumsum. + for (Int i = 1; i <= ncells; ++i) + xb_[i] += xb_[i-1]; + // Normalize. 
+ for (Int i = 1; i <= ncells; ++i) + xb_[i] /= xb_[ncells]; + } else { + xb_.back() = 1; + for (Int i = 1; i < ncells; ++i) + xb_[i] = Real(i) / ncells; + } + for (Int i = 0; i < ncells; ++i) + xcp_[i] = 0.5*(xb_[i] + xb_[i+1]); + xcp_.back() = 1 + xcp_[0]; + } + + static void run_cdr (const Problem1D& p, CDR& cdr, + const Real* yp, Real* y, const Int* dods) { + const Int n = p.ncells(); + for (Int i = 0; i < n; ++i) { + const Int* dod = dods + 4*i; + Real min = yp[dod[0]], max = min; + for (Int j = 1; j < 4; ++j) { + const Real v = yp[dod[j]]; + min = std::min(min, v); + max = std::max(max, v); + } + const Real area_i = p.area(i); + cdr.set_Qm(i, 0, y[i]*area_i, min*area_i, max*area_i, yp[i]*area_i); + } + cdr.run(); + for (Int i = 0; i < n; ++i) + y[i] = cdr.get_Qm(i, 0) / p.area(i); + y[n] = y[0]; + } + + static void run_caas (const Problem1D& p, const Real* yp, Real* y, const Int* dods) { + const Int n = p.ncells(); + std::vector lo(n), up(n), w(n); + Real m = 0; + for (Int i = 0; i < n; ++i) { + const Int* dod = dods + 4*i; + Real min = yp[dod[0]], max = min; + for (Int j = 1; j < 4; ++j) { + const Real v = yp[dod[j]]; + min = std::min(min, v); + max = std::max(max, v); + } + const Real area_i = p.area(i); + lo[i] = min*area_i; + up[i] = max*area_i; + y[i] = std::max(min, std::min(max, y[i])); + m += (yp[i] - y[i])*area_i; + } + Real wsum = 0; + for (Int i = 0; i < n; ++i) { + w[i] = m >= 0 ? 
up[i] - y[i]*p.area(i) : y[i]*p.area(i) - lo[i]; + wsum += w[i]; + } + for (Int i = 0; i < n; ++i) + y[i] += (m/(wsum*p.area(i)))*w[i]; + } + +public: + Problem1D (const Int ncells, const bool nonuniform_mesh = false) { + init_mesh(ncells, nonuniform_mesh); + } + + Int ncells () const { return xb_.size() - 1; } + Real xb (const Int& i) const { return xb_[i]; } + Real xcp (const Int& i) const { return xcp_[i]; } + Real area (const Int& i) const { return xb_[i+1] - xb_[i]; } + + const std::vector get_xb () const { return xb_; } + const std::vector get_xcp () const { return xcp_; } + + void cycle (const Int& nsteps, const Real* y0, Real* yf, CDR* cdr = nullptr) { + const Int n = xcp_.size(); + rwrk_.resize(2*n); + iwrk_.resize(4*n); + Real* xcpi = rwrk_.data(); + Int* dod = iwrk_.data(); + + const Real xos = -1.0 / nsteps; + for (Int i = 0; i < n; ++i) + xcpi[i] = xcp_[i] + xos; + + Real* ys[] = {xcpi + n, yf}; + std::copy(y0, y0 + n, ys[0]); + for (Int ti = 0; ti < nsteps; ++ti) { + interp::cubic_interp_periodic(xcp_.data(), n, ys[0], + xcpi, n, ys[1], dod); + if (cdr) + run_cdr(*this, *cdr, ys[0], ys[1], dod); + else + run_caas(*this, ys[0], ys[1], dod); + std::swap(ys[0], ys[1]); + } + std::copy(ys[0], ys[0] + n, yf); + } +}; + +//todo Clean this up. Right now everything is hardcoded and kludgy. +// - optional write +// - some sort of brief quantitative output +// - better, more canonical IC +// - optional tree imbalance +// - optional mesh nonuniformity +// - parallel? 
+Int run (const mpi::Parallel::Ptr& parallel, const Input& in) { + cedr_throw_if(parallel->size() > 1, "run_1d_transport_test runs in serial only."); + Int nerr = 0; + + Problem1D p(in.ncells, false /* nonuniform_mesh */ ); + + auto tree = qlt::tree::make_tree_over_1d_mesh(parallel, in.ncells, + false /* imbalanced */); + typedef qlt::QLT QLTT; + QLTT qlt(parallel, in.ncells, tree); + + typedef caas::CAAS CAAST; + CAAST caas(parallel, in.ncells); + + CDR* cdrs[] = {&qlt, &caas}; + const int ncdrs = sizeof(cdrs)/sizeof(*cdrs); + + for (CDR* cdr : cdrs) { + cdr->declare_tracer(cedr::ProblemType::conserve | + cedr::ProblemType::shapepreserve, 0); + cdr->end_tracer_declarations(); + for (Int i = 0; i < in.ncells; ++i) + cdr->set_rhom(i, 0, p.area(i)); + cdr->print(std::cout); + } + + std::vector y0(in.ncells+1); + for (Int i = 0, nc = p.ncells(); i < nc; ++i) + y0[i] = (p.xcp(i) < 0.4 || p.xcp(i) > 0.9 ? + InitialCondition::eval(InitialCondition::sin, p.xcp(i)) : + InitialCondition::eval(InitialCondition::rect, p.xcp(i))); + y0.back() = y0[0]; + + PyWriter w("out_transport1d"); + w.write("xb", p.get_xb()); + w.write("xcp", p.get_xcp()); + w.write("y0", y0); + + std::vector yf(in.ncells+1); + const Int nsteps = Int(3.17*in.ncells); + const Int ncycles = 1; + + const char* names[] = {"yqlt", "ycaas"}; + for (int ic = 0; ic < ncdrs; ++ic) { + std::copy(y0.begin(), y0.end(), yf.begin()); + for (Int i = 0; i < ncycles; ++i) + p.cycle(nsteps, yf.data(), yf.data(), cdrs[ic]); + w.write(names[ic], yf); + } + + std::copy(y0.begin(), y0.end(), yf.begin()); + for (Int i = 0; i < ncycles; ++i) + p.cycle(nsteps, yf.data(), yf.data()); + w.write("ylcaas", yf); + + return nerr; +} + +} // namespace transport1d +} // namespace test +} // namespace cedr diff --git a/cedr/cedr_test_randomized.cpp b/cedr/cedr_test_randomized.cpp new file mode 100644 index 0000000..32ede9b --- /dev/null +++ b/cedr/cedr_test_randomized.cpp @@ -0,0 +1,439 @@ +#include "cedr_test_randomized.hpp" + +namespace 
cedr {
namespace test {

// Short human-readable summary of a tracer: its index, one letter per
// problem-type flag that is set (c, s, t), the perturbation type, and the two
// should-hold flags.
std::string TestRandomized::Tracer::str () const {
  std::stringstream ss;
  ss << "(ti " << idx;
  if (problem_type & PT::conserve) ss << " c";
  if (problem_type & PT::shapepreserve) ss << " s";
  if (problem_type & PT::consistent) ss << " t";
  ss << " pt " << perturbation_type << " ssh " << safe_should_hold
     << " lsh " << local_should_hold << ")";
  return ss.str();
}

// Write the final line of the output script, if a file is open.
TestRandomized::Writer::~Writer () {
  if ( ! fh) return;
  fprintf(fh.get(), " return s\n");
}

// Fill tracers_ with one tracer per (perturbation type, problem type) pair:
// 6 perturbation types times 4 problem types.
void TestRandomized::init_tracers_vector () {
  typedef Tracer::PT PT;
  static const Int pts[] = {
    PT::conserve | PT::shapepreserve | PT::consistent,
    PT::shapepreserve, // Test a noncanonical problem type.
    PT::conserve | PT::consistent,
    PT::consistent
  };
  static const Int npts = sizeof(pts)/sizeof(*pts);
  Int tracer_idx = 0;
  for (Int perturb = 0; perturb < 6; ++perturb)
    for (Int ti = 0; ti < npts; ++ti) {
      Tracer t;
      t.problem_type = pts[ti];
      const bool shapepreserve = t.problem_type & PT::shapepreserve;
      t.idx = tracer_idx++;
      t.perturbation_type = perturb;
      t.safe_should_hold = true;
      t.no_change_should_hold = perturb == 0;
      t.local_should_hold = perturb < 4 && shapepreserve;
      // Only one tracer is written to the output file.
      t.write = perturb == 2 && ti == 2;
      tracers_.push_back(t);
    }
}

// Uniform random number in [0, 1). Delegates to the shared utility so that the
// formula lives in one place.
static Real urand () { return cedr::util::urand(); }

// Fill rhom with random values in [0.5, 2).
void TestRandomized::generate_rho (Values& v) {
  Real* const r = v.rhom();
  const Int n = v.ncells();
  for (Int i = 0; i < n; ++i)
    r[i] = 0.5 + 1.5*urand();
}

// Generate, for tracer t, random per-cell bounds and a value between them,
// each scaled by the cell's rhom. Qm_prev is set equal to Qm.
void TestRandomized::generate_Q (const Tracer& t, Values& v) {
  Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx),
    * Qm_max = v.Qm_max(t.idx), * Qm_prev = v.Qm_prev(t.idx);
  const Int n = v.ncells();
  for (Int i = 0; i < n; ++i) {
    // The three draws are taken in this order: q_min, q_max, q.
    const Real q_min = 0.1 + 0.8*urand();
    const Real q_max = std::min<Real>(1, q_min + (0.9 - q_min)*urand());
    const Real q = q_min + (q_max - q_min)*urand();
    // Check correctness up to FP.
    cedr_assert(q_min >= 0 &&
                q_max <= 1 + 10*std::numeric_limits<Real>::epsilon() &&
                q_min <= q && q <= q_max);
    Qm_min[i] = q_min*rhom[i];
    Qm_max[i] = q_max*rhom[i];
    // Protect against FP error.
    Qm[i] = std::max(Qm_min[i], std::min(Qm_max[i], q*rhom[i]));
    // Set previous Qm to the current unperturbed value.
    Qm_prev[i] = Qm[i];
  }
}

// Fill p with a random permutation of 0..n-1, built by applying n swaps of
// randomly chosen pairs to the identity.
static void gen_rand_perm (const size_t n, std::vector<Int>& p) {
  p.resize(n);
  for (size_t i = 0; i < n; ++i)
    p[i] = static_cast<Int>(i);
  for (size_t i = 0; i < n; ++i) {
    // urand() < 1, so both indices are in [0, n). j is drawn before k.
    const size_t j = static_cast<size_t>(urand()*n);
    const size_t k = static_cast<size_t>(urand()*n);
    std::swap(p[j], p[k]);
  }
}

// Permuting the Qm array, even just on a rank as long as there is > 1 cell,
// produces a problem likely requiring considerable reconstruction, which
// reconstruction assuredly satisfies the properties. But because this is a
// local operation only, it doesn't test the 1 cell/rank case.
void TestRandomized::permute_Q (const Tracer& t, Values& v) {
  Real* const Qm = v.Qm(t.idx);
  const Int N = v.ncells();
  std::vector<Int> p;
  gen_rand_perm(N, p);
  const std::vector<Real> Qm_orig(Qm, Qm + N);
  for (Int i = 0; i < N; ++i)
    Qm[i] = Qm_orig[p[i]];
}

void TestRandomized
::add_const_to_Q (const Tracer& t, Values& v,
                  // Move 0 < alpha <= 1 of the way to the QLT or safety
                  // feasibility bound.
                  const Real& alpha,
                  // Whether the modification should be done in a
                  // mass-conserving way.
                  const bool conserve_mass,
                  // Only safety problem is feasible.
                  const bool safety_problem) {
  // Some of these reductions aren't used at present. Might add more test
  // options later that use them.
+ Real rhom, Qm, Qm_max; { + Real Qm_sum_lcl[3] = {0}; + for (Int i = 0; i < v.ncells(); ++i) { + Qm_sum_lcl[0] += v.rhom()[i]; + Qm_sum_lcl[1] += v.Qm(t.idx)[i]; + Qm_sum_lcl[2] += v.Qm_max(t.idx)[i]; + } + Real Qm_sum_gbl[3] = {0}; + mpi::all_reduce(*p_, Qm_sum_lcl, Qm_sum_gbl, 3, MPI_SUM); + rhom = Qm_sum_gbl[0]; Qm = Qm_sum_gbl[1]; Qm_max = Qm_sum_gbl[2]; + } + Real Qm_max_safety = 0; + if (safety_problem && v.ncells()) { + Real q_safety_lcl = v.Qm_max(t.idx)[0] / v.rhom()[0]; + for (Int i = 1; i < v.ncells(); ++i) + q_safety_lcl = std::max(q_safety_lcl, v.Qm_max(t.idx)[i] / v.rhom()[i]); + Real q_safety_gbl = 0; + mpi::all_reduce(*p_, &q_safety_lcl, &q_safety_gbl, 1, MPI_MAX); + Qm_max_safety = q_safety_gbl*rhom; + } + const Real dQm = safety_problem ? + ((Qm_max - Qm) + alpha * (Qm_max_safety - Qm_max)) / ncells_ : + alpha * (Qm_max - Qm) / ncells_; + for (Int i = 0; i < v.ncells(); ++i) + v.Qm(t.idx)[i] += dQm; + // Now permute Qm so that it's a little more interesting. + permute_Q(t, v); + // Adjust Qm_prev. Qm_prev is used to test the PT::conserve case, and also + // simply to record the correct total mass. The modification above modified + // Q's total mass. If conserve_mass, then Qm_prev needs to be made to sum to + // the same new mass. If ! conserve_mass, we want Qm_prev to be modified in + // an interesting way, so that PT::conserve doesn't trivially undo the mod + // that was made above when the root fixes the mass discrepancy. + const Real + relax = 0.9, + dQm_prev = (conserve_mass ? dQm : + (safety_problem ? + ((Qm_max - Qm) + relax*alpha * (Qm_max_safety - Qm_max)) / ncells_ : + relax*alpha * (Qm_max - Qm) / ncells_)); + for (Int i = 0; i < v.ncells(); ++i) + v.Qm_prev(t.idx)[i] += dQm_prev; +} + +void TestRandomized::perturb_Q (const Tracer& t, Values& v) { + // QLT is naturally mass conserving. But if QLT isn't being asked to impose + // mass conservation, then the caller better have a conservative + // method. 
Here, we model that by saying that Qm_prev and Qm should sum to + // the same mass. + const bool cm = ! (t.problem_type & Tracer::PT::conserve); + // For the edge cases, we cannot be exactly on the edge and still expect the + // q-limit checks to pass to machine precision. Thus, back away from the + // edge by an amount that bounds the error in the global mass due to FP, + // assuming each cell's mass is O(1). + const Real edg = 1 - ncells_*std::numeric_limits::epsilon(); + switch (t.perturbation_type) { + case 0: + // Do nothing, to test that QLT doesn't make any changes if none is + // needed. + break; + case 1: permute_Q(t, v); break; + case 2: add_const_to_Q(t, v, 0.5, cm, false); break; + case 3: add_const_to_Q(t, v, edg, cm, false); break; + case 4: add_const_to_Q(t, v, 0.5, cm, true ); break; + case 5: add_const_to_Q(t, v, edg, cm, true ); break; + } +} + +std::string TestRandomized::get_tracer_name (const Tracer& t) { + std::stringstream ss; + ss << "t" << t.idx; + return ss.str(); +} + +void TestRandomized::init_writer () { + if (p_->amroot()) { + w_ = std::make_shared(); + w_->fh = std::unique_ptr(fopen("out_QLT.py", "w")); + int n = gcis_.size(); + w_->ngcis.resize(p_->size()); + mpi::gather(*p_, &n, 1, w_->ngcis.data(), 1, p_->root()); + w_->displs.resize(p_->size() + 1); + w_->displs[0] = 0; + for (size_t i = 0; i < w_->ngcis.size(); ++i) + w_->displs[i+1] = w_->displs[i] + w_->ngcis[i]; + cedr_assert(w_->displs.back() == ncells_); + w_->gcis.resize(ncells_); + mpi::gatherv(*p_, gcis_.data(), gcis_.size(), w_->gcis.data(), w_->ngcis.data(), + w_->displs.data(), p_->root()); + } else { + int n = gcis_.size(); + mpi::gather(*p_, &n, 1, static_cast(nullptr), 0, p_->root()); + Long* Lnull = nullptr; + const int* inull = nullptr; + mpi::gatherv(*p_, gcis_.data(), gcis_.size(), Lnull, inull, inull, p_->root()); + } + write_inited_ = true; +} + +void TestRandomized +::gather_field (const Real* Qm_lcl, std::vector& Qm_gbl, + std::vector& wrk) { + if 
(p_->amroot()) { + Qm_gbl.resize(ncells_); + wrk.resize(ncells_); + mpi::gatherv(*p_, Qm_lcl, gcis_.size(), wrk.data(), w_->ngcis.data(), + w_->displs.data(), p_->root()); + for (Int i = 0; i < ncells_; ++i) + Qm_gbl[w_->gcis[i]] = wrk[i]; + } else { + Real* rnull = nullptr; + const int* inull = nullptr; + mpi::gatherv(*p_, Qm_lcl, gcis_.size(), rnull, inull, inull, p_->root()); + } +} + +void TestRandomized +::write_field (const std::string& tracer_name, const std::string& field_name, + const std::vector& Qm) { + if ( ! p_->amroot()) return; + fprintf(w_->fh.get(), " s.%s.%s = [", tracer_name.c_str(), field_name.c_str()); + for (const auto& e : Qm) + fprintf(w_->fh.get(), "%1.15e, ", e); + fprintf(w_->fh.get(), "]\n"); +} + +void TestRandomized::write_pre (const Tracer& t, Values& v) { + if ( ! t.write) return; + std::vector f, wrk; + if ( ! write_inited_) { + init_writer(); + if (w_) + fprintf(w_->fh.get(), + "def getsolns():\n" + " class Struct:\n" + " pass\n" + " s = Struct()\n" + " s.all = Struct()\n"); + gather_field(v.rhom(), f, wrk); + write_field("all", "rhom", f); + } + const auto name = get_tracer_name(t); + if (w_) + fprintf(w_->fh.get(), " s.%s = Struct()\n", name.c_str()); + gather_field(v.Qm_min(t.idx), f, wrk); + write_field(name, "Qm_min", f); + gather_field(v.Qm_prev(t.idx), f, wrk); + write_field(name, "Qm_orig", f); + gather_field(v.Qm(t.idx), f, wrk); + write_field(name, "Qm_pre", f); + gather_field(v.Qm_max(t.idx), f, wrk); + write_field(name, "Qm_max", f); +} + +void TestRandomized::write_post (const Tracer& t, Values& v) { + if ( ! 
t.write) return; + const auto name = get_tracer_name(t); + std::vector Qm, wrk; + gather_field(v.Qm(t.idx), Qm, wrk); + write_field(name, "Qm_qlt", Qm); +} + +Int TestRandomized +::check (const std::string& cdr_name, const mpi::Parallel& p, + const std::vector& ts, const Values& v) { + static const bool details = false; + static const Real ulp3 = 3*std::numeric_limits::epsilon(); + Int nerr = 0; + std::vector lcl_mass(2*ts.size()), q_min_lcl(ts.size()), q_max_lcl(ts.size()); + std::vector t_ok(ts.size(), 1), local_violated(ts.size(), 0); + for (size_t ti = 0; ti < ts.size(); ++ti) { + const auto& t = ts[ti]; + + cedr_assert(t.safe_should_hold); + const bool safe_only = ! t.local_should_hold; + const Int n = v.ncells(); + const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), + * Qm_max = v.Qm_max(t.idx), * Qm_prev = v.Qm_prev(t.idx); + + q_min_lcl[ti] = 1; + q_max_lcl[ti] = 0; + for (Int i = 0; i < n; ++i) { + const bool lv = (Qm[i] < Qm_min[i] || Qm[i] > Qm_max[i]); + if (lv) local_violated[ti] = 1; + if ( ! safe_only && lv) { + // If this fails at ~ machine eps, check r2l_nl_adjust_bounds code in + // solve_node_problem. + if (details) + pr("check q " << t.str() << ": " << Qm[i] << " " << + (Qm[i] < Qm_min[i] ? 
Qm[i] - Qm_min[i] : Qm[i] - Qm_max[i])); + t_ok[ti] = false; + ++nerr; + } + if (t.no_change_should_hold && Qm[i] != Qm_prev[i]) { + if (details) + pr("Q should be unchanged but is not: " << Qm_prev[i] << " changed to " << + Qm[i] << " in " << t.str()); + t_ok[ti] = false; + ++nerr; + } + lcl_mass[2*ti ] += Qm_prev[i]; + lcl_mass[2*ti + 1] += Qm[i]; + q_min_lcl[ti] = std::min(q_min_lcl[ti], Qm_min[i]/rhom[i]); + q_max_lcl[ti] = std::max(q_max_lcl[ti], Qm_max[i]/rhom[i]); + } + } + + std::vector q_min_gbl(ts.size(), 0), q_max_gbl(ts.size(), 0); + mpi::all_reduce(p, q_min_lcl.data(), q_min_gbl.data(), q_min_lcl.size(), MPI_MIN); + mpi::all_reduce(p, q_max_lcl.data(), q_max_gbl.data(), q_max_lcl.size(), MPI_MAX); + + for (size_t ti = 0; ti < ts.size(); ++ti) { + // Check safety problem. If local_should_hold and it does, then the safety + // problem is by construction also solved (since it's a relaxation of the + // local problem). + const auto& t = ts[ti]; + const bool safe_only = ! t.local_should_hold; + if (safe_only) { + const Int n = v.ncells(); + const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), + * Qm_max = v.Qm_max(t.idx); + const Real q_min = q_min_gbl[ti], q_max = q_max_gbl[ti]; + for (Int i = 0; i < n; ++i) { + if (Qm[i] < q_min*rhom[i]*(1 - ulp3) || + Qm[i] > q_max*rhom[i]*(1 + ulp3)) { + if (details) + pr("check q (safety) " << t.str() << ": " << q_min*rhom[i] << " " + << Qm_min[i] << " " << Qm[i] << " " << Qm_max[i] << " " + << q_max*rhom[i] << " | " << (Qm[i] < q_min*rhom[i] ? 
+ Qm[i] - q_min*rhom[i] : + Qm[i] - q_max*rhom[i])); + t_ok[ti] = false; + ++nerr; + } + } + } + } + + std::vector glbl_mass(2*ts.size(), 0); + mpi::reduce(p, lcl_mass.data(), glbl_mass.data(), lcl_mass.size(), MPI_SUM, + p.root()); + std::vector t_ok_gbl(ts.size(), 0); + mpi::reduce(p, t_ok.data(), t_ok_gbl.data(), t_ok.size(), MPI_MIN, p.root()); + // Right now we're not using these: + std::vector local_violated_gbl(ts.size(), 0); + mpi::reduce(p, local_violated.data(), local_violated_gbl.data(), + local_violated.size(), MPI_MAX, p.root()); + + if (p.amroot()) { + const Real tol = 1e3*std::numeric_limits::epsilon(); + for (size_t ti = 0; ti < ts.size(); ++ti) { + // Check mass conservation. + const Real desired_mass = glbl_mass[2*ti], actual_mass = glbl_mass[2*ti+1], + rd = cedr::util::reldif(desired_mass, actual_mass); + const bool mass_failed = rd > tol; + if (mass_failed) { + ++nerr; + t_ok_gbl[ti] = false; + } + if ( ! t_ok_gbl[ti]) { + std::cout << "FAIL " << cdr_name << ": " << ts[ti].str(); + if (mass_failed) std::cout << " mass re " << rd; + std::cout << "\n"; + } + } + } + + return nerr; +} + +TestRandomized +::TestRandomized (const std::string& name, const mpi::Parallel::Ptr& p, + const Int& ncells, const bool verbose) + : cdr_name_(name), p_(p), ncells_(ncells), write_inited_(false) +{} + +void TestRandomized::init () { + init_numbering(); + init_tracers_vector(); + init_tracers(); +} + +Int TestRandomized::run (const Int nrepeat, const bool write) { + const Int nt = tracers_.size(), nlclcells = gcis_.size(); + + Values v(nt, nlclcells); + generate_rho(v); + for (const auto& t : tracers_) { + generate_Q(t, v); + perturb_Q(t, v); + } + + if (write) + for (const auto& t : tracers_) + write_pre(t, v); + + CDR& cdr = get_cdr(); + { + Real* rhom = v.rhom(); + for (Int i = 0; i < nlclcells; ++i) + cdr.set_rhom(i, 0, rhom[i]); + } + for (Int trial = 0; trial <= nrepeat; ++trial) { + for (Int ti = 0; ti < nt; ++ti) { + Real* Qm_min = v.Qm_min(ti), * Qm = 
v.Qm(ti), * Qm_max = v.Qm_max(ti), + * Qm_prev = v.Qm_prev(ti); + for (Int i = 0; i < nlclcells; ++i) + cdr.set_Qm(i, ti, Qm[i], Qm_min[i], Qm_max[i], Qm_prev[i]); + } + + run_impl(trial); + } + + for (Int ti = 0; ti < nt; ++ti) { + Real* Qm = v.Qm(ti); + for (Int i = 0; i < nlclcells; ++i) + Qm[i] = cdr.get_Qm(i, ti); + } + + if (write) + for (const auto& t : tracers_) + write_post(t, v); + return check(cdr_name_, *p_, tracers_, v); +} + +} // namespace test +} // namespace cedr diff --git a/cedr/cedr_test_randomized.hpp b/cedr/cedr_test_randomized.hpp new file mode 100644 index 0000000..dd4f54d --- /dev/null +++ b/cedr/cedr_test_randomized.hpp @@ -0,0 +1,128 @@ +#ifndef INCLUDE_CEDR_TEST_RANDOMIZED_HPP +#define INCLUDE_CEDR_TEST_RANDOMIZED_HPP + +#include "cedr_cdr.hpp" +#include "cedr_mpi.hpp" +#include "cedr_util.hpp" + +namespace cedr { +namespace test { + +class TestRandomized { +public: + TestRandomized(const std::string& cdr_name, const mpi::Parallel::Ptr& p, + const Int& ncells, const bool verbose = false); + + // The subclass should call this, probably in its constructor. 
+ void init(); + + Int run(const Int nrepeat = 1, const bool write=false); + +private: + const std::string cdr_name_; + +protected: + struct Tracer { + typedef ProblemType PT; + + Int idx; + Int problem_type; + Int perturbation_type; + bool no_change_should_hold, safe_should_hold, local_should_hold; + bool write; + + std::string str() const; + + Tracer () + : idx(-1), problem_type(-1), perturbation_type(-1), no_change_should_hold(false), + safe_should_hold(true), local_should_hold(true), write(false) + {} + }; + + struct Values { + Values (const Int ntracers, const Int ncells) + : ncells_(ncells), v_((4*ntracers + 1)*ncells) + {} + Int ncells () const { return ncells_; } + Real* rhom () { return v_.data(); } + Real* Qm_min (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti ); } + Real* Qm (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 1); } + Real* Qm_max (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 2); } + Real* Qm_prev (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 3); } + const Real* rhom () const { return const_cast(this)->rhom(); } + const Real* Qm_min (const Int& ti) const + { return const_cast(this)->Qm_min (ti); } + const Real* Qm (const Int& ti) const + { return const_cast(this)->Qm (ti); } + const Real* Qm_max (const Int& ti) const + { return const_cast(this)->Qm_max (ti); } + const Real* Qm_prev (const Int& ti) const + { return const_cast(this)->Qm_prev(ti); } + private: + Int ncells_; + std::vector v_; + }; + + // For solution output, if requested. + struct Writer { + std::unique_ptr fh; + std::vector ngcis; // Number of i'th rank's gcis_ array. + std::vector gcis; // Global cell indices packed by rank's gcis_ vector. + std::vector displs; // Cumsum of above. + ~Writer(); + }; + + const mpi::Parallel::Ptr p_; + const Int ncells_; + // Global mesh entity IDs, 1-1 with reduction array index or QLT leaf node. + std::vector gcis_; + std::vector tracers_; + + // Tell this class the CDR. 
+ virtual CDR& get_cdr() = 0; + + // Fill gcis_. + virtual void init_numbering() = 0; + + // Using tracers_, the vector of Tracers, initialize the CDR's tracers. + virtual void init_tracers() = 0; + + virtual void run_impl(const Int trial) = 0; + +private: + // For optional output. + bool write_inited_; + std::shared_ptr w_; // Only on root. + + void init_tracers_vector(); + + void add_const_to_Q( + const Tracer& t, Values& v, + // Move 0 < alpha <= 1 of the way to the QLT or safety feasibility bound. + const Real& alpha, + // Whether the modification should be done in a mass-conserving way. + const bool conserve_mass, + // Only safety problem is feasible. + const bool safety_problem); + + void perturb_Q(const Tracer& t, Values& v); + void init_writer(); + void gather_field(const Real* Qm_lcl, std::vector& Qm_gbl, + std::vector& wrk); + void write_field(const std::string& tracer_name, const std::string& field_name, + const std::vector& Qm); + void write_pre(const Tracer& t, Values& v); + void write_post(const Tracer& t, Values& v); + + static void generate_rho(Values& v); + static void generate_Q(const Tracer& t, Values& v); + static void permute_Q(const Tracer& t, Values& v); + static std::string get_tracer_name(const Tracer& t); + static Int check(const std::string& cdr_name, const mpi::Parallel& p, + const std::vector& ts, const Values& v); +}; + +} // namespace test +} // namespace cedr + +#endif diff --git a/cedr/cedr_util.cpp b/cedr/cedr_util.cpp new file mode 100644 index 0000000..3854888 --- /dev/null +++ b/cedr/cedr_util.cpp @@ -0,0 +1,23 @@ +#include "cedr_util.hpp" + +namespace cedr { +namespace util { + +bool eq (const std::string& a, const char* const b1, const char* const b2) { + return (a == std::string(b1) || (b2 && a == std::string(b2)) || + a == std::string("-") + std::string(b1)); +} + +Real urand () { return std::rand() / ((Real) RAND_MAX + 1.0); } + +Real reldif (const Real* a, const Real* b, const Int n) { + Real num = 0, den = 0; + for (Int i 
= 0; i < n; ++i) { + num += std::abs(a[i] - b[i]); + den += std::abs(a[i]); + } + return num/den; +} + +} +} diff --git a/cedr/cedr_util.hpp b/cedr/cedr_util.hpp new file mode 100644 index 0000000..87f5e2b --- /dev/null +++ b/cedr/cedr_util.hpp @@ -0,0 +1,90 @@ +#ifndef INCLUDE_CEDR_UTIL_HPP +#define INCLUDE_CEDR_UTIL_HPP + +#include + +#include "cedr_kokkos.hpp" +#include "cedr_mpi.hpp" + +namespace cedr { +namespace util { + +template KOKKOS_INLINE_FUNCTION constexpr +T square (const T& x) { return x*x; } + +bool eq(const std::string& a, const char* const b1, const char* const b2 = 0); + +// Uniform rand in [0, 1). +Real urand(); + +#define pr(m) do { \ + int _pid_ = 0; \ + MPI_Comm_rank(MPI_COMM_WORLD, &_pid_); \ + std::stringstream _ss_; \ + _ss_.precision(15); \ + _ss_ << "pid " << _pid_ << " " << m << std::endl; \ + std::cerr << _ss_.str(); \ + } while (0) +#define pr0(m) do { \ + int _pid_; MPI_Comm_rank(MPI_COMM_WORLD, &_pid_); \ + if (_pid_ != 0) break; \ + std::stringstream _ss_; \ + _ss_ << "pid " << _pid_ << " " << m << std::endl; \ + std::cerr << _ss_.str(); \ + } while (0) +#define prc(m) pr(#m << " | " << (m)) +#define pr0c(m) pr0(#m << " | " << (m)) +#define puf(m) "(" << #m << " " << (m) << ")" +#define pu(m) << " " << puf(m) +template +void prarr (const std::string& name, const T* const v, const size_t n) { + std::stringstream ss; + ss.precision(15); + ss << name << " = ["; + for (size_t i = 0; i < n; ++i) ss << " " << v[i]; + ss << "];"; + pr(ss.str()); +} +#define mprarr(m) cedr::util::prarr(#m, m.data(), m.size()) + +#ifndef NDEBUG +# define cedr_assert(condition) do { \ + if ( ! (condition)) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": FAIL:\n" << #condition \ + << "\n"; \ + throw std::logic_error(_ss_.str()); \ + } \ + } while (0) +# define cedr_kernel_assert(condition) do { \ + if ( ! 
(condition)) \ + Kokkos::abort(#condition); \ + } while (0) +#else +# define cedr_assert(condition) +# define cedr_kernel_assert(condition) +#endif +#define cedr_throw_if(condition, message) do { \ + if (condition) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": The condition:\n" \ + << #condition "\nled to the exception\n" << message << "\n"; \ + throw std::logic_error(_ss_.str()); \ + } \ + } while (0) +#define cedr_kernel_throw_if(condition, message) do { \ + if (condition) \ + Kokkos::abort(#condition " led to the exception\n" message); \ + } while (0) + +inline Real reldif (const Real a, const Real b) +{ return std::abs(b - a)/std::max(std::abs(a), std::abs(b)); } + +Real reldif(const Real* a, const Real* b, const Int n); + +struct FILECloser { void operator() (FILE* fh) { fclose(fh); } }; + +} +} + +#endif diff --git a/cedr/make_qltcpp.sh b/cedr/make_qltcpp.sh new file mode 100644 index 0000000..c0267c7 --- /dev/null +++ b/cedr/make_qltcpp.sh @@ -0,0 +1,10 @@ +# bash make_qltcpp.sh +# mpicxx -Wall -pedantic -fopenmp -std=c++11 -I/home/ambradl/lib/kokkos/cpu/include qlt.cpp -L/home/ambradl/lib/kokkos/cpu/lib -lkokkos -ldl +# OMP_PROC_BIND=false OMP_NUM_THREADS=2 mpirun -np 14 ./a.out -t + +(for f in cedr_kokkos.hpp cedr.hpp cedr_mpi.hpp cedr_util.hpp cedr_cdr.hpp cedr_qlt.hpp cedr_caas.hpp cedr_caas_inl.hpp cedr_local.hpp cedr_mpi_inl.hpp cedr_local_inl.hpp cedr_qlt_inl.hpp cedr_test_randomized.hpp cedr_test.hpp cedr_util.cpp cedr_local.cpp cedr_mpi.cpp cedr_qlt.cpp cedr_caas.cpp cedr_test_randomized.cpp cedr_test_1d_transport.cpp cedr_test.cpp; do + echo "//>> $f" + cat $f + echo "" +done) > qlt.cpp +sed sV'#include "cedr'V'//#include "cedr'V -i qlt.cpp diff --git a/siqk/CMakeLists.txt b/siqk/CMakeLists.txt new file mode 100644 index 0000000..9ff299c --- /dev/null +++ b/siqk/CMakeLists.txt @@ -0,0 +1,13 @@ +add_executable (siqk_test siqk_test.cpp) +set_target_properties (siqk_test PROPERTIES + COMPILE_FLAGS ${COMPOSE_COMPILE_FLAGS} 
+ LINK_FLAGS ${COMPOSE_LINK_FLAGS}) +target_include_directories (siqk_test PRIVATE ${COMPOSE_INCLUDES}) +target_link_libraries (siqk_test ${COMPOSE_LIBRARIES}) + +configure_file (siqk_runtests.py siqk_runtests.py) + +add_test (NAME siqk-test-area + COMMAND python siqk_runtests.py $ 0) +add_test (NAME siqk-test-cube + COMMAND python siqk_runtests.py $ 1) diff --git a/siqk/readme.txt b/siqk/readme.txt new file mode 100644 index 0000000..261c49a --- /dev/null +++ b/siqk/readme.txt @@ -0,0 +1,13 @@ +For clarity, suppose your C++ compiler is g++-4.8 in what follows. But it +can be something else. + +1. Get and install the standalone Kokkos TPL: + +$ git clone https://github.com/kokkos/kokkos.git +$ ./kokkos/generate_makefile.bash --with-openmp --ldflags=-fPIC --prefix=/path/to/desired/installation --compiler=g++-4.8 + +2. cp an existing make.inc.* file to one for your machine, say, +make.inc.mymachine. Edit it with machine-specific information. Then + $ ln -s make.inc.mymachine make.inc + $ make -j8 + $ ./siqk_runtests.py diff --git a/siqk/siqk.hpp b/siqk/siqk.hpp new file mode 100644 index 0000000..f71b94b --- /dev/null +++ b/siqk/siqk.hpp @@ -0,0 +1,10 @@ +#ifndef INCLUDE_SIQK_HPP +#define INCLUDE_SIQK_HPP + +#include "siqk_geometry.hpp" +#include "siqk_search.hpp" +#include "siqk_intersect.hpp" +#include "siqk_quadrature.hpp" +#include "siqk_sqr.hpp" + +#endif diff --git a/siqk/siqk_defs.hpp b/siqk/siqk_defs.hpp new file mode 100644 index 0000000..9c3cbd0 --- /dev/null +++ b/siqk/siqk_defs.hpp @@ -0,0 +1,230 @@ +#ifndef INCLUDE_SIQK_DEFS_HPP +#define INCLUDE_SIQK_DEFS_HPP + +#include +#include +#include +#include +#include + +#include + +#ifdef SIQK_TIME +# include +# include +# include +#endif + +// Always want this for GPU.
+#define SIQK_NONRECURSIVE + +#ifdef KOKKOS_HAVE_CUDA +# define KOKKOS_CONSTANT __constant__ __device__ +#else +# define KOKKOS_CONSTANT +#endif + +namespace siqk { +namespace ko = Kokkos; +#define pr(m) do { \ + std::stringstream _ss_; \ + _ss_ << m << std::endl; \ + std::cerr << _ss_.str(); \ + } while (0) +#define prc(m) pr(#m << " | " << (m)) +#define puf(m)"(" << #m << " " << (m) << ")" +#define pu(m) << " " << puf(m) +template +static void prarr (const std::string& name, const T* const v, const size_t n) { + std::cerr << name << ": "; + for (size_t i = 0; i < n; ++i) std::cerr << " " << v[i]; + std::cerr << "\n"; +} + +#define SIQK_THROW_IF(condition, message) do { \ + if (condition) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": The condition:\n" << #condition \ + "\nled to the exception\n" << message << "\n"; \ + throw std::logic_error(_ss_.str()); \ + } \ + } while (0) + +#define SIQK_STDERR_IF(condition, message) do { \ + try { SIQK_THROW_IF(condition, message); } \ + catch (const std::logic_error& e) { std::cerr << e.what(); } \ +} while (0) + +#ifdef SIQK_TIME +static timeval tic () { + timeval t; + gettimeofday(&t, 0); + return t; +} +static double calc_et (const timeval& t1, const timeval& t2) { + static const double us = 1.0e6; + return (t2.tv_sec * us + t2.tv_usec - t1.tv_sec * us - t1.tv_usec) / us; +} +static double toc (const timeval& t1) { + Kokkos::fence(); + timeval t; + gettimeofday(&t, 0); + return calc_et(t1, t); +} +static double get_memusage () { + static const double scale = 1.0 / (1 << 10); // Memory in MB. 
+ rusage ru; + getrusage(RUSAGE_SELF, &ru); + return ru.ru_maxrss*scale; +} +#else +inline int tic () { return 0; } +inline double toc (const int&) { return 0; } +inline double get_memusage () { return 0; } +#endif +static void print_times (const std::string& name, const double* const parts, + const int nparts) { +#ifdef SIQK_TIME + double total = 0; for (int i = 0; i < nparts; ++i) total += parts[i]; + printf("%20s %1.3e s %7.1f MB", name.c_str(), total, get_memusage()); + for (int i = 0; i < nparts; ++i) printf(" %1.3e s", parts[i]); + printf("\n"); +#endif +} +static void print_times (const std::string& name, const double total) { +#ifdef SIQK_TIME + printf("%20s %1.3e s %5.1f MB\n", name.c_str(), total, get_memusage()); +#endif +} + +KOKKOS_INLINE_FUNCTION static void error (const char* const msg) +{ ko::abort(msg); } + +KOKKOS_INLINE_FUNCTION static void message (const char* const msg) +{ printf("%s\n", msg); } + +typedef int Int; +typedef double Real; + +#ifdef KOKKOS_HAVE_CUDA +typedef ko::LayoutLeft Layout; +#else +typedef ko::LayoutRight Layout; +#endif + +// SIQK's array types. +typedef ko::View Vec3s; +typedef ko::View ConstVec3s; +typedef ko::View Vec6s; +typedef ko::View ConstVec6s; +typedef ko::View > RawVec3s; +typedef ko::View > RawConstVec3s; +typedef ko::View > RawArray; +typedef ko::View > RawConstArray; +typedef ko::View Idxs; +typedef ko::View ConstIdxs; +typedef ko::View Nodes; +typedef ko::View ConstNodes; + +// Decorator for a View. UnmanagedView gives the same view as +// ViewType, except the memory is unmanaged. +template +using UnmanagedView = ko::View< + typename ViewT::data_type, typename ViewT::array_layout, + typename ViewT::device_type, ko::MemoryTraits >; + +// Get the host or device version of the array. +template struct InExeSpace { + typedef VT type; +}; +template struct InExeSpace { + typedef typename VT::HostMirror type; +}; + +#ifdef KOKKOS_HAVE_CUDA +// A 1D slice of an array. 
+template KOKKOS_FORCEINLINE_FUNCTION
+ko::View >
+slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); }
+// An explicitly const 1D slice of an array.
+template KOKKOS_FORCEINLINE_FUNCTION
+ko::View >
+const_slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); }
+#else // Without KOKKOS_HAVE_CUDA, a slice is the raw pointer v.data() + i*v.extent(1).
+template KOKKOS_FORCEINLINE_FUNCTION
+typename VT::value_type*
+slice (const VT& v, Int i) { return v.data() + v.extent(1)*i; }
+
+template KOKKOS_FORCEINLINE_FUNCTION
+typename VT::const_value_type*
+const_slice (const VT& v, Int i) { return v.data() + v.extent(1)*i; }
+#endif
+
+// Number of slices in a 2D array, where each row is a slice.
+template KOKKOS_FORCEINLINE_FUNCTION
+Int nslices (const A2D& a) { return static_cast(a.extent(0)); }
+
+// Number of entries in a 2D array's row.
+template KOKKOS_FORCEINLINE_FUNCTION
+Int szslice (const A2D& a) { return static_cast(a.extent(1)); }
+
+template
+KOKKOS_INLINE_FUNCTION
+static void copy (V dst, CV src, const Int n) { // Copies the first n entries of src into dst.
+  for (Int i = 0; i < n; ++i) dst[i] = src[i];
+}
+
+template
+void resize_and_copy (DV& d, const SV& s, // Resizes d to nslices(s), then deep-copies s into d.
+                      typename std::enable_if::type* = 0) { // NOTE(review): the enable_if condition is not visible in this extract.
+  ko::resize(d, nslices(s));
+  ko::deep_copy(d, s);
+}
+
+template
+void resize_and_copy (
+  DV& d, const SV& s,
+  typename std::enable_if::type* = 0)
+{
+  ko::resize(d, nslices(s));
+  ko::deep_copy(d, s);
+}
+
+template
+void resize_and_copy (
+  DV& d, const SV& s,
+  typename std::enable_if::type* = 0)
+{
+  ko::resize(d, nslices(s), szslice(s)); // This overload resizes both extents.
+  ko::deep_copy(d, s);
+}
+
+template
+void hm_resize_and_copy (DV& d, const SA& s, const Int n) { // Fills d with s[0..n-1] by way of a mirror view of d.
+  ko::resize(d, n);
+  auto d_hm = ko::create_mirror_view(d);
+  for (Int i = 0; i < n; ++i) d_hm[i] = s[i];
+  ko::deep_copy(d, d_hm);
+}
+
+// GPU-friendly replacements for std::min/max.
+template KOKKOS_INLINE_FUNCTION
+const T& min (const T& a, const T& b) { return a < b ? a : b; }
+template KOKKOS_INLINE_FUNCTION
+const T& max (const T& a, const T& b) { return a > b ? a : b; }
+template KOKKOS_INLINE_FUNCTION
+void swap (T& a, T&b) {
+  T tmp = a;
+  a = b;
+  b = tmp;
+}
+template KOKKOS_INLINE_FUNCTION constexpr T square (const T& x) { return x*x; }
+
+template KOKKOS_INLINE_FUNCTION
+T sign (const T& a) { return a > 0 ? 1 : (a < 0 ? -1 : 0); } // Returns 1, -1, or 0.
+
+} // namespace siqk
+
+#endif // INCLUDE_SIQK_DEFS_HPP
diff --git a/siqk/siqk_geometry.hpp b/siqk/siqk_geometry.hpp
new file mode 100644
index 0000000..9ad9ecd
--- /dev/null
+++ b/siqk/siqk_geometry.hpp
@@ -0,0 +1,310 @@
+#ifndef INCLUDE_SIQK_GEOMETRY_HPP
+#define INCLUDE_SIQK_GEOMETRY_HPP
+
+#include "siqk_defs.hpp"
+#include "siqk_quadrature.hpp"
+
+namespace siqk {
+
+// Vectors and points are 2D. Thus, if you're working on planes in 3D, project
+// to a 2D space before calling these.
+struct PlaneGeometry {
+  template KOKKOS_INLINE_FUNCTION
+  static void scale (const Real& a, V v) { // v := a v.
+    v[0] *= a; v[1] *= a;
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static Real dot_c_amb (const CV c, const CV a, const CV b) { // Returns c' (a - b).
+    return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]);
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static void combine (const CV u, const CV v, const Real& a, V x) { // x := (1 - a) u + a v.
+    const Real oma = 1 - a; // Held by value; there is no need to bind a reference to a temporary.
+    x[0] = oma*u[0] + a*v[0];
+    x[1] = oma*u[1] + a*v[1];
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static void axpy (const Real& a, const CV x, V y) { // y := y + a x.
+    y[0] += a*x[0];
+    y[1] += a*x[1];
+  }
+
+  template KOKKOS_INLINE_FUNCTION
+  static void edge_normal (const CV e1, const CV e2, V en) { // en := (e1[1] - e2[1], e2[0] - e1[0]); not normalized.
+    en[0] = e1[1] - e2[1];
+    en[1] = e2[0] - e1[0];
+  }
+
+  template KOKKOS_INLINE_FUNCTION
+  static bool inside (const CV v, const CV e1, const CV en) { // True if en' (v - e1) >= 0; points on the edge count as inside.
+    return dot_c_amb(en, v, e1) >= 0;
+  }
+
+  template KOKKOS_INLINE_FUNCTION
+  static void intersect (const CV v1, const CV v2, const CV e1, const CV en,
+                         V intersection) {
+    Real a; {
+      const Real
+        num = dot_c_amb(en, e1, v1),
+        den = dot_c_amb(en, v2, v1);
+      a = num == 0 || den == 0 ? 0 : num/den; // Avoid dividing by 0; a = 0 selects v1.
+      a = a < 0 ? 0 : a > 1 ? 1 : a; // Clamp so the result stays on the segment v1-v2.
+    }
+    combine(v1, v2, a, intersection);
+  }
+
+  template KOKKOS_INLINE_FUNCTION
+  static bool output (const CV v, Int& no, const V vo) { // Appends v as row no of vo and increments no; false if vo is full.
+#ifdef SIQK_DEBUG
+    if (no >= nslices(vo)) {
+      std::stringstream ss;
+      ss << "output: No room in vo; vo.n() is " << nslices(vo) << " but no is "
+         << no << "\n";
+      message(ss.str().c_str());
+    }
+#endif
+    if (no >= nslices(vo)) return false;
+    vo(no,0) = v[0];
+    vo(no,1) = v[1];
+    ++no;
+    return true;
+  }
+
+  //todo Handle non-convex case.
+  template
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_area (const TriangleQuadrature& , const CV2s& v, // The quadrature argument is unused here.
+                         const Int n) {
+    return calc_area_formula(v, n);
+  }
+
+  template
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_area_formula (const CV2s& v, const Int n) { // Sums a triangle fan anchored at vertex 0 over the first n vertices.
+    Real area = 0;
+    for (Int i = 1, ilim = n - 1; i < ilim; ++i)
+      area += calc_tri_jacobian(slice(v,0), slice(v,i), slice(v,i+1));
+    return 0.5*area;
+  }
+
+  template
+  KOKKOS_INLINE_FUNCTION
+  static void bary2coord (const CV v1, const CV v2, const CV v3, const CA alpha, // u := alpha[0] v1 + alpha[1] v2 + alpha[2] v3.
+                          Real u[2]) {
+    for (Int k = 0; k < 2; ++k) u[k] = 0;
+    axpy(alpha[0], v1, u);
+    axpy(alpha[1], v2, u);
+    axpy(alpha[2], v3, u);
+  }
+
+  template
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_tri_jacobian (const CV v1, const CV v2, const CV v3) { // 2D cross product of (v2 - v1) and (v3 - v1); signed.
+    Real r1[2], r2[2];
+    r1[0] = v2[0] - v1[0];
+    r1[1] = v2[1] - v1[1];
+    r2[0] = v3[0] - v1[0];
+    r2[1] = v3[1] - v1[1];
+    const Real a = r1[0]*r2[1] - r1[1]*r2[0];
+    return a;
+  }
+};
+
+// All inputs and outputs are relative to the unit-radius sphere. Vectors and
+// points are 3D.
+struct SphereGeometry {
+  template KOKKOS_INLINE_FUNCTION
+  static void cross (const CV a, const CV b, V c) { // c := a x b.
+    c[0] = a[1]*b[2] - a[2]*b[1];
+    c[1] = a[2]*b[0] - a[0]*b[2];
+    c[2] = a[0]*b[1] - a[1]*b[0];
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static Real dot (const CV a, const CV b) { // Returns a' b.
+    return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static Real norm2 (const CV v) { // Squared 2-norm, v' v.
+    return dot(v, v);
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static void scale (const Real& a, V v) { // v := a v.
+    v[0] *= a; v[1] *= a; v[2] *= a;
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static void normalize (V v) { // v := v / sqrt(v' v); no guard against a zero vector.
+    scale(1.0/std::sqrt(norm2(v)), v);
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static Real dot_c_amb (const CV c, const CV a, const CV b) { // Returns c' (a - b).
+    return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]) + c[2]*(a[2] - b[2]);
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static void axpy (const Real& a, const CV x, V y) { // y := y + a x.
+    y[0] += a*x[0];
+    y[1] += a*x[1];
+    y[2] += a*x[2];
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static void axpbyz (const Real& a, const CV x, const Real& b, const CV y, // z := a x + b y.
+                      V z) {
+    z[0] = a*x[0] + b*y[0];
+    z[1] = a*x[1] + b*y[1];
+    z[2] = a*x[2] + b*y[2];
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static void copy (V d, const CV s) { // d := s.
+    d[0] = s[0];
+    d[1] = s[1];
+    d[2] = s[2];
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static void combine (const CV u, const CV v, const Real& a, V x) { // x := (1 - a) u + a v.
+    const Real oma = 1 - a; // Held by value; there is no need to bind a reference to a temporary.
+    x[0] = oma*u[0] + a*v[0];
+    x[1] = oma*u[1] + a*v[1];
+    x[2] = oma*u[2] + a*v[2];
+  }
+
+  template KOKKOS_INLINE_FUNCTION
+  static void edge_normal (const CV a, const CV b, V en) { // en := (a x b) scaled to unit length.
+    cross(a, b, en);
+    normalize(en);
+  }
+
+  // Is v inside the line anchored at a with inward-facing normal n?
+  template KOKKOS_INLINE_FUNCTION
+  static bool inside (const CV v, const CV a, const CV n) {
+    return dot_c_amb(n, v, a) >= 0;
+  }
+
+  /* Let
+       en = edge normal
+       e1 = edge starting point
+       d = en' e1
+       v(a) = (1 - a) v1 + a v2.
+     Solve en' v(a) = d for a:
+       a = (en' (e1 - v1)) / (en' (v2 - v1)).
+     Then uvec(v(a)) is the intersection point on the unit sphere. Assume
+     intersection exists. (Already filtered by 'inside'.)
+  */
+  template KOKKOS_INLINE_FUNCTION
+  static void intersect (const CV v1, const CV v2, const CV e1, const CV en,
+                         V intersection) {
+    Real a; {
+      const Real
+        num = dot_c_amb(en, e1, v1),
+        den = dot_c_amb(en, v2, v1);
+      a = num == 0 || den == 0 ? 0 : num/den; // Avoid dividing by 0; a = 0 selects v1.
+      a = a < 0 ? 0 : a > 1 ? 1 : a; // Clamp so the point stays between v1 and v2.
+    }
+    combine(v1, v2, a, intersection);
+    normalize(intersection);
+  }
+
+  template KOKKOS_INLINE_FUNCTION
+  static bool output (const CV v, Int& no, V vo) { // Appends v as row no of vo and increments no; false if vo is full.
+#ifdef SIQK_DEBUG
+    if (no >= nslices(vo)) {
+      std::stringstream ss;
+      ss << "output: No room in vo; vo.n() is " << nslices(vo) << " but no is "
+         << no << "\n";
+      message(ss.str().c_str());
+    }
+#endif
+    if (no >= nslices(vo)) return false;
+    vo(no,0) = v[0];
+    vo(no,1) = v[1];
+    vo(no,2) = v[2];
+    ++no;
+    return true;
+  }
+
+  //todo Handle non-convex case.
+  // This uses a terrible formula, but it's just for testing.
+  template
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_area_formula (const CV3s& v, const Int n) { // Sums a triangle fan anchored at vertex 0 over the first n vertices.
+    Real area = 0;
+    for (Int i = 1, ilim = n - 1; i < ilim; ++i) {
+      const Real a = calc_arc_length(slice(v,0), slice(v,i));
+      const Real b = calc_arc_length(slice(v,i), slice(v,i+1));
+      const Real c = calc_arc_length(slice(v,i+1), slice(v,0));
+      const Real s = 0.5*(a + b + c); // Semi-perimeter of the triangle's arc lengths.
+      const Real d = (std::tan(0.5*s)*std::tan(0.5*(s-a))*
+                      std::tan(0.5*(s-b))*std::tan(0.5*(s-c)));
+      if (d <= 0) continue; // Skip triangles for which the sqrt below would be 0 or undefined.
+      area += 4*std::atan(std::sqrt(d));
+    }
+    return area;
+  }
+  template KOKKOS_INLINE_FUNCTION
+  static Real calc_arc_length (const CV a, const CV b) { // Angle between a and b, taken as unit vectors.
+    const Real d = dot(a, b);
+    if (d >= 1) return 0; // Roundoff can push d past 1, where acos is undefined.
+    return std::acos(d < -1 ? -1 : d); // Clamp at -1 too, so near-antipodal points give pi rather than NaN.
+  }
+
+  template
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_area (const TriangleQuadrature& q, const CV3s& v, // Quadrature-based area of the first n vertices, as a fan at vertex 0.
+                         const Int n) {
+    Real area = 0, u[3];
+    for (Int i = 1, ilim = n - 1; i < ilim; ++i) {
+      Real a = 0;
+      RawConstVec3s coord;
+      RawConstArray weight;
+      q.get_coef(8, coord, weight);
+      for (Int k = 0, klim = nslices(coord); k < klim; ++k) {
+        const Real jac = calc_tri_jacobian(slice(v,0), slice(v,i), slice(v,i+1),
+                                           slice(coord, k), u);
+        a += weight[k]*jac;
+      }
+      area += 0.5*a;
+    }
+    return area;
+  }
+
+  template
+  KOKKOS_INLINE_FUNCTION
+  static Real calc_tri_jacobian (const CV v1, const CV v2, const CV v3,
+                                 const CA alpha, Real u[3]) {
+    // V(:,i) is vertex i of the spherical triangle on the unit sphere. The
+    // coefs
+    //     alpha = [a1, a2, 1 - a1 - a2]'
+    //           = [1 0; 0 1; -1 -1] [a1, a2]'
+    //           = alpha_a a
+    // (barycentric coords) give the location
+    //     v = V alpha
+    // on the planar triangle, and u = uvec(v) is the point on the unit sphere.
+    //   For a planar tri in 3D, the jacobian is
+    //     v_a = v_alpha alpha_a
+    //         = V [1 0; 0 1; -1 -1]
+    //     J = norm(cross(v_a(:,1), v_a(:,2))).
+    // For a spherical tri with the same vertices,
+    //     u = v/(v' v)^{1/2}
+    //     u_a = u_alpha alpha_a
+    //         = (v'v)^{-1/2} (I - u u') V alpha_a
+    //         = (v'v)^{-1/2} (I - u u') v_a
+    //     J = norm(cross(u_a(:,1), u_a(:,2))).
+    for (Int k = 0; k < 3; ++k) u[k] = 0;
+    axpy(alpha[0], v1, u);
+    axpy(alpha[1], v2, u);
+    axpy(alpha[2], v3, u);
+    const auto oovn = 1/std::sqrt(norm2(u)); // 1/norm(v), computed before u is normalized.
+    scale(oovn, u);
+    Real u_a[3][3]; // Rows 0 and 1 hold the two tangent vectors; row 2 holds their cross product.
+    axpbyz(1, v1, -1, v3, u_a[0]);
+    axpbyz(1, v2, -1, v3, u_a[1]);
+    for (int i = 0; i < 2; ++i) {
+      axpy(-dot(u, u_a[i]), u, u_a[i]); // Apply (I - u u').
+      scale(oovn, u_a[i]);
+    }
+    cross(u_a[0], u_a[1], u_a[2]);
+    return std::sqrt(norm2(u_a[2]));
+  }
+};
+
+} // namespace siqk
+
+#endif // INCLUDE_SIQK_GEOMETRY_HPP
diff --git a/siqk/siqk_intersect.hpp b/siqk/siqk_intersect.hpp
new file mode 100644
index 0000000..6fcde7b
--- /dev/null
+++ b/siqk/siqk_intersect.hpp
@@ -0,0 +1,338 @@
+#ifndef INCLUDE_SIQK_INTERSECT_HPP
+#define INCLUDE_SIQK_INTERSECT_HPP
+
+#include "siqk_defs.hpp"
+#include "siqk_geometry.hpp"
+#include "siqk_search.hpp"
+#include "siqk_quadrature.hpp"
+
+namespace siqk {
+
+// Sutherland-Hodgman polygon clipping algorithm. Follow Foley, van Dam,
+// Feiner, Hughes Fig 3.49.
+namespace sh {
+/* A mesh is described by the following arrays:
+       p: 3 x #nodes, the array of vertices.
+       e: max(#verts) x #elems, the array of element base-0 indices.
+       nml: 3 x #edges, the array of edge normals.
+       en: max(#verts) x #elems, the array of edge-normal base-0 indices.
+     e. e indexes p. e(i,j) == -1 in column j indicates that j:end are not used.
+     nml. As a mesh is refined, cancellation error makes an edge normal based
+   off of an element's vertices increasingly inaccurate. Roughly, if an edge
+   subtends angle phi of the sphere, -log10(phi/(2 pi)) digits are lost in the
+   edge normal. Therefore, we compute edge normals offline, since in certain
+   meshes, they can be computed by an accurate means.
E.g., in a cubed-sphere
+   mesh, the whole line of a square face can be used to compute the edge
+   normal. Furthermore, there are far fewer unique edge normals than edges.
+ */
+template
+struct Mesh {
+  typename InExeSpace::type p, nml;
+  typename InExeSpace::type e, en;
+
+  Mesh () {}
+
+  Mesh (const Mesh& m) { // Copies each of m's four arrays into freshly sized arrays of this mesh's types.
+    typename InExeSpace::type tp, tnml;
+    typename InExeSpace::type te, ten;
+    resize_and_copy(tp, m.p); p = tp;
+    resize_and_copy(tnml, m.nml); nml = tnml;
+    resize_and_copy(te, m.e); e = te;
+    resize_and_copy(ten, m.en); en = ten;
+  }
+};
+
+// Generally not a user routine.
+template
+KOKKOS_INLINE_FUNCTION
+bool clip_against_edge (
+  // Input vertex list.
+  const CV3s& vi, const Int ni,
+  // Output vertex list.
+  V3s& vo, Int& no,
+  // One point of the clip edge.
+  const CV ce1,
+  // Clip edge's inward-facing normal.
+  const CV cen)
+{
+  Real intersection[3];
+  no = 0;
+  auto s = const_slice(vi, ni-1); // Start at the last vertex so the first (s,p) pair is the closing edge; needs ni >= 1.
+  for (Int j = 0; j < ni; ++j) {
+    auto p = const_slice(vi,j);
+    if (geo::inside(p, ce1, cen)) {
+      if (geo::inside(s, ce1, cen)) {
+        if ( ! geo::output(p, no, vo)) return false; // Both endpoints inside: keep p.
+      } else {
+        geo::intersect(s, p, ce1, cen, intersection); // Entering: emit the crossing point, then p.
+        if ( ! geo::output(intersection, no, vo)) return false;
+        if ( ! geo::output(p, no, vo)) return false;
+      }
+    } else if (geo::inside(s, ce1, cen)) {
+      geo::intersect(s, p, ce1, cen, intersection); // Leaving: emit only the crossing point.
+      if ( ! geo::output(intersection, no, vo)) return false;
+    }
+    s = p;
+  }
+  return true;
+}
+
+// Efficient user routine that uses the mesh data structure.
+//todo An optimization would be to have 2 clip_against_edge routines. One would
+// handle the special case of the first vertex list being in (p,e) format.
+template
+KOKKOS_INLINE_FUNCTION
+bool clip_against_poly (
+  // Clip mesh. m.e(:,cp_e) is the element, and m.en(:,cp_e) is the
+  // corresponding list of normal indices.
+  const MeshT& m, const Int cp_e,
+  // A list of vertices describing the polygon to clip. The vertices must be in
+  // a convention-determined order, such as CCW. vi(:,0:ni-1) are valid entries.
+  const CV3s& vi, const Int ni,
+  // On output, vo(:,0:no-1) are vertices of the clipped polygon. no is 0 if
+  // there is no intersection.
+  V3s& vo, Int& no,
+  // Workspace. Both vo and wrk must be large enough to hold all generated
+  // vertices. If they are not, false is returned.
+  V3s& wrk)
+{
+  Int nos[] = { 0, 0 };
+  V3s* vs[] = { &vo, &wrk };
+
+  const auto e = slice(m.e, cp_e);
+  const auto en = slice(m.en, cp_e);
+
+  auto nv = szslice(m.e); // Number of vertices in clip polygon.
+  while (nv > 0 && e[nv-1] == -1) --nv; // Drop trailing -1 padding; stop at 0 so the index cannot go negative.
+  no = 0;
+  if (nv < 2) return true; // Degenerate clip element: the loop below would index with -1, so report no intersection.
+  if (nv % 2 == 0) {
+    // Make sure the final vertex output list is in the caller's buffer.
+    swap(vs[0], vs[1]);
+    swap(nos[0], nos[1]);
+  }
+
+  if ( ! clip_against_edge(vi, ni, *vs[0], nos[0], const_slice(m.p, e[0]),
+                           const_slice(m.nml, en[0])))
+    return false;
+  if ( ! nos[0]) return true; // Nothing survived the first edge; no is already 0.
+
+  for (Int ie = 1, ielim = nv - 1; ; ++ie) {
+    if ( ! clip_against_edge(*vs[0], nos[0], *vs[1], nos[1],
+                             const_slice(m.p, e[ie]),
+                             const_slice(m.nml, en[ie])))
+      return false;
+    if ( ! nos[1]) return true;
+    if (ie == ielim) break;
+    swap(vs[0], vs[1]); // The output of this pass is the input of the next.
+    swap(nos[0], nos[1]);
+  }
+
+  no = nos[1];
+  return true;
+}
+
+// Not used for real stuff; just a convenient version for testing. In this
+// version, clip_poly is a list of clip polygon vertices. This is instead of the
+// mesh data structure.
+template
+KOKKOS_INLINE_FUNCTION
+bool clip_against_poly (
+  // Clip polygon.
+  const CV3s_CP& clip_poly,
+  // Clip polygon edges' inward-facing normals.
+  const CV3s_CEN& clip_edge_normals,
+  const CV3s_VI& vi, const Int ni,
+  V3s& vo, Int& no,
+  V3s& wrk)
+{
+  Int nos[] = { 0, 0 };
+  V3s* vs[] = { &vo, &wrk };
+
+  no = 0;
+  if (nslices(clip_poly) % 2 == 0) {
+    // Make sure the final vertex output list is in the caller's buffer.
+    swap(vs[0], vs[1]);
+    swap(nos[0], nos[1]);
+  }
+
+  if ( ! clip_against_edge(vi, ni, *vs[0], nos[0],
+                           const_slice(clip_poly, 0),
+                           const_slice(clip_edge_normals, 0)))
+    return false;
+  if ( ! nos[0]) return true;
+
+  for (Int ie = 1, ielim = nslices(clip_poly) - 1; ; ++ie) { // Requires nslices(clip_poly) >= 2.
+    if ( ! clip_against_edge(*vs[0], nos[0], *vs[1], nos[1],
+                             const_slice(clip_poly, ie),
+                             const_slice(clip_edge_normals, ie)))
+      return false;
+    if ( ! nos[1]) return true;
+    if (ie == ielim) break;
+    swap(vs[0], vs[1]);
+    swap(nos[0], nos[1]);
+  }
+
+  no = nos[1];
+  return true;
+}
+} // namespace sh
+
+namespace test {
+static constexpr Int max_nvert = 20;
+static constexpr Int max_hits = 25; // Covers at least a 2-halo.
+
+// In practice, we want to form high-quality normals using information about the
+// mesh.
+template
+void fill_normals (sh::Mesh& m) {
+  // Count number of edges.
+  Int ne = 0;
+  for (Int ip = 0; ip < nslices(m.e); ++ip)
+    for (Int iv = 0; iv < szslice(m.e); ++iv)
+      if (m.e(ip,iv) == -1) break; else ++ne;
+  // Fill.
+  Idxs::HostMirror en("en", nslices(m.e), szslice(m.e));
+  ko::deep_copy(en, -1);
+  Vec3s::HostMirror nml("nml", ne);
+  Int ie = 0;
+  for (Int ip = 0; ip < nslices(m.e); ++ip)
+    for (Int iv = 0; iv < szslice(m.e); ++iv)
+      if (m.e(ip,iv) == -1)
+        break;
+      else {
+        // Somewhat complicated next node index.
+        const Int iv_next = (iv+1 == szslice(m.e) ? 0 : // Wrap to 0 at the end of the row...
+                             (m.e(ip,iv+1) == -1 ? 0 : iv+1)); // ...or at the first unused (-1) entry.
+        geo::edge_normal(slice(m.p, m.e(ip, iv)), slice(m.p, m.e(ip, iv_next)),
+                         slice(nml, ie));
+        en(ip,iv) = ie; // One normal per edge, so no sharing between elements here.
+        ++ie;
+      }
+  m.en = en;
+  m.nml = nml;
+}
+
+//todo The current approach is to do redundant clips so that the hits buffer can
+// be small and static. Need to think about this.
+template
+class AreaOTFunctor {
+  const TriangleQuadrature quad_;
+  const sh::Mesh<>& cm_;
+  const ConstVec3s& p_;
+  const ConstIdxs& e_;
+  const Int k_; // Index into (p,e).
+  //todo More efficient method that also works on GPU.
+  Int hits_[max_hits];
+  Int nh_;
+  Real area_;
+
+public:
+  KOKKOS_INLINE_FUNCTION
+  AreaOTFunctor (const sh::Mesh<>& cm, const ConstVec3s& p, const ConstIdxs& e,
+                 const Int& k)
+    : cm_(cm), p_(p), e_(e), k_(k), nh_(0), area_(0)
+  {}
+
+  KOKKOS_INLINE_FUNCTION void operator() (const Int mesh_elem_idx) {
+    // Check whether we've clipped against this polygon before and there was a
+    // non-0 intersection.
+    for (Int i = 0; i < nh_; ++i)
+      if (hits_[i] == mesh_elem_idx)
+        return;
+    // We have not, so do the intersection.
+    Int no = 0;
+    {
+      // Area of all overlapping regions.
+      // In and out vertex lists.
+      Real buf[9*max_nvert];
+      RawVec3s
+        vi(buf, max_nvert),
+        vo(buf + 3*max_nvert, max_nvert),
+        wrk(buf + 6*max_nvert, max_nvert);
+      Int ni = 0;
+      for (Int i = 0; i < szslice(e_); ++i) {
+        if (e_(k_,i) == -1) break;
+        copy(slice(vi, i), slice(p_, e_(k_,i)), 3);
+        ++ni;
+      }
+      if ( ! sh::clip_against_poly(cm_, mesh_elem_idx, vi, ni, vo, no, wrk))
+        Kokkos::abort("max_nvert is too small."); // vo/wrk overflowed; do not silently drop this area.
+      if (no) area_ += geo::calc_area(quad_, vo, no);
+    }
+    if (no) {
+      // Non-0 intersection, so record.
+      if (nh_ == max_hits) Kokkos::abort("max_hits is too small.");
+      hits_[nh_++] = mesh_elem_idx;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION const Real& area () const { return area_; }
+};
+
+template
+class TestAreaOTKernel {
+  const sh::Mesh<> cm_;
+  const OctreeT ot_;
+  mutable ConstVec3s p_;
+  mutable ConstIdxs e_;
+
+public:
+  typedef Real value_type;
+
+  TestAreaOTKernel (const sh::Mesh& cm,
+                    const ConstVec3s::HostMirror& p_hm,
+                    const ConstIdxs::HostMirror& e_hm, const OctreeT& ot)
+    : cm_(cm), ot_(ot)
+  {
+    { Vec3s p; resize_and_copy(p, p_hm); p_ = p; } // Copy the host arrays into Vec3s/Idxs views, then keep const views.
+    { Idxs e; resize_and_copy(e, e_hm); e_ = e; }
+  }
+
+  // Clip the k'th polygon in (p,e) against mesh cm.
+  KOKKOS_INLINE_FUNCTION void operator() (const Int k, Real& area) const {
+    // Clipped element bounding box.
+    Real ebb[6];
+    OctreeT::calc_bb(p_, slice(e_, k), szslice(e_), ebb);
+    // Get list of possible overlaps.
+    AreaOTFunctor f(cm_, p_, e_, k);
+    //todo Team threads.
+    ot_.apply(ebb, f);
+    area += f.area();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join (volatile value_type& dst, volatile value_type const& src) const
+  { dst += src; }
+};
+
+template Real test_area_ot (
+  const ConstVec3s::HostMirror& cp, const ConstIdxs::HostMirror& ce,
+  const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e)
+{
+  typedef Octree OctreeT;
+
+  // Clip mesh and edge normal calculation. (In practice, we'd like to use
+  // higher-quality edge normals.)
+  sh::Mesh cm; cm.p = cp; cm.e = ce;
+  fill_normals(cm);
+
+  Real et[2] = {0}; // et[0]: octree construction time; et[1]: parallel_reduce time.
+  auto t = tic();
+  // Oct-tree over the clip mesh.
+  OctreeT ot(cp, ce);
+  et[0] = toc(t);
+
+  Real area = 0;
+  TestAreaOTKernel f(cm, p, e, ot);
+  t = tic();
+  ko::parallel_reduce(nslices(e), f, area);
+  et[1] = toc(t);
+  print_times("test_area_ot", et, 2);
+  return area;
+}
+} // namespace test
+} // namespace siqk
+
+#endif // INCLUDE_SIQK_INTERSECT_HPP
diff --git a/siqk/siqk_quadrature.hpp b/siqk/siqk_quadrature.hpp
new file mode 100644
index 0000000..42164ad
--- /dev/null
+++ b/siqk/siqk_quadrature.hpp
@@ -0,0 +1,613 @@
+#ifndef INCLUDE_SIQK_QUADRATURE_HPP
+#define INCLUDE_SIQK_QUADRATURE_HPP
+
+#include "siqk_defs.hpp"
+
+namespace siqk {
+
+/* For the TRISYM entries, see, e.g.,
+     Triangular quadrature to use for integration Dunavant, D.A. "High Degree
+     Efficient Symmetrical Gaussian Quadrature Rules for the Triangle."
+     J. Numer. Meth. Eng., 21, pp 1129-1148.
+   and
+     Zhang, Linbo, Tao Cui, and Hui Liu. "A set of symmetric quadrature rules on
+     triangles and tetrahedra." J. of Computational Mathematics (2009): 89-96.
+   For the TRITAYLOR, see
+     Day, David M. and Mark A. Taylor, "A new 11 point degree 6 cubature formula
+     for the triangle", PAMM 7 (2007)
+   and
+     Taylor, Mark A., Beth A. Wingate, and Len P. Bos. "A cardinal function
+     algorithm for computing multivariate quadrature points." SIAM Journal on
+     Numerical Analysis 45.1 (2007): 193-205.
+*/ + +// The symmetric order-12 quadrature rule gives 1 fewer digit of conservation +// than, e.g., the order-14 one, so switch to the Taylor et al rule. Part of the +// problem I think is that the Dunavant table results from double precision +// computations (rather than quad) and is recorded to perhaps one fewer digit +// than might have been available. But I can't find a table with an extra +// digit. The Taylor et al. rule has one fewer coordinate, so that's also an +// advantage. The loss of symmetry in the coordinates I think is not relevant to +// this application. +#define SIQK_USE_TRITAY12 + +#define SIQK_QUADRATURE_TRISYM_ORDER4_COORD \ + {0.108103018168070, 0.445948490915965, 0.445948490915965, \ + 0.445948490915965, 0.108103018168070, 0.445948490915965, \ + 0.445948490915965, 0.445948490915965, 0.108103018168070, \ + 0.816847572980458, 0.091576213509771, 0.091576213509771, \ + 0.091576213509771, 0.816847572980458, 0.091576213509771, \ + 0.091576213509771, 0.091576213509771, 0.816847572980458} +#define SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT \ + {0.223381589678011, 0.223381589678011, 0.223381589678011, \ + 0.109951743655322, 0.109951743655322, 0.109951743655322} + +#define SIQK_QUADRATURE_TRISYM_ORDER8_COORD \ + {0.333333333333333, 0.333333333333333, 0.333333333333333, \ + 0.081414823414554, 0.459292588292723, 0.459292588292723, \ + 0.459292588292723, 0.081414823414554, 0.459292588292723, \ + 0.459292588292723, 0.459292588292723, 0.081414823414554, \ + 0.658861384496480, 0.170569307751760, 0.170569307751760, \ + 0.170569307751760, 0.658861384496480, 0.170569307751760, \ + 0.170569307751760, 0.170569307751760, 0.658861384496480, \ + 0.898905543365938, 0.050547228317031, 0.050547228317031, \ + 0.050547228317031, 0.898905543365938, 0.050547228317031, \ + 0.050547228317031, 0.050547228317031, 0.898905543365938, \ + 0.008394777409958, 0.263112829634638, 0.728492392955404, \ + 0.008394777409958, 0.728492392955404, 0.263112829634638, \ + 0.263112829634638, 
0.008394777409958, 0.728492392955404, \ + 0.263112829634638, 0.728492392955404, 0.008394777409958, \ + 0.728492392955404, 0.263112829634638, 0.008394777409958, \ + 0.728492392955404, 0.008394777409958, 0.263112829634638} +#define SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT \ + {0.144315607677787, 0.095091634267285, 0.095091634267285, \ + 0.095091634267285, 0.103217370534718, 0.103217370534718, \ + 0.103217370534718, 0.032458497623198, 0.032458497623198, \ + 0.032458497623198, 0.027230314174435, 0.027230314174435, \ + 0.027230314174435, 0.027230314174435, 0.027230314174435, \ + 0.027230314174435} + +#define SIQK_QUADRATURE_TRISYM_ORDER12_COORD \ + {0.023565220452390, 0.488217389773805, 0.488217389773805, \ + 0.488217389773805, 0.023565220452390, 0.488217389773805, \ + 0.488217389773805, 0.488217389773805, 0.023565220452390, \ + 0.120551215411079, 0.439724392294460, 0.439724392294460, \ + 0.439724392294460, 0.120551215411079, 0.439724392294460, \ + 0.439724392294460, 0.439724392294460, 0.120551215411079, \ + 0.457579229975768, 0.271210385012116, 0.271210385012116, \ + 0.271210385012116, 0.457579229975768, 0.271210385012116, \ + 0.271210385012116, 0.271210385012116, 0.457579229975768, \ + 0.744847708916828, 0.127576145541586, 0.127576145541586, \ + 0.127576145541586, 0.744847708916828, 0.127576145541586, \ + 0.127576145541586, 0.127576145541586, 0.744847708916828, \ + 0.957365299093576, 0.021317350453210, 0.021317350453210, \ + 0.021317350453210, 0.957365299093576, 0.021317350453210, \ + 0.021317350453210, 0.021317350453210, 0.957365299093576, \ + 0.115343494534698, 0.275713269685514, 0.608943235779788, \ + 0.115343494534698, 0.608943235779788, 0.275713269685514, \ + 0.275713269685514, 0.115343494534698, 0.608943235779788, \ + 0.275713269685514, 0.608943235779788, 0.115343494534698, \ + 0.608943235779788, 0.115343494534698, 0.275713269685514, \ + 0.608943235779788, 0.275713269685514, 0.115343494534698, \ + 0.022838332222257, 0.281325580989940, 0.695836086787803, \ + 
0.022838332222257, 0.695836086787803, 0.281325580989940, \ + 0.281325580989940, 0.022838332222257, 0.695836086787803, \ + 0.281325580989940, 0.695836086787803, 0.022838332222257, \ + 0.695836086787803, 0.022838332222257, 0.281325580989940, \ + 0.695836086787803, 0.281325580989940, 0.022838332222257, \ + 0.025734050548330, 0.116251915907597, 0.858014033544073, \ + 0.025734050548330, 0.858014033544073, 0.116251915907597, \ + 0.116251915907597, 0.025734050548330, 0.858014033544073, \ + 0.116251915907597, 0.858014033544073, 0.025734050548330, \ + 0.858014033544073, 0.025734050548330, 0.116251915907597, \ + 0.858014033544073, 0.116251915907597, 0.025734050548330} +#define SIQK_QUADRATURE_TRISYM_ORDER12_WEIGHT \ + {0.025731066440455, 0.025731066440455, 0.025731066440455, \ + 0.043692544538038, 0.043692544538038, 0.043692544538038, \ + 0.062858224217885, 0.062858224217885, 0.062858224217885, \ + 0.034796112930709, 0.034796112930709, 0.034796112930709, \ + 0.006166261051559, 0.006166261051559, 0.006166261051559, \ + 0.040371557766381, 0.040371557766381, 0.040371557766381, \ + 0.040371557766381, 0.040371557766381, 0.040371557766381, \ + 0.022356773202303, 0.022356773202303, 0.022356773202303, \ + 0.022356773202303, 0.022356773202303, 0.022356773202303, \ + 0.017316231108659, 0.017316231108659, 0.017316231108659, \ + 0.017316231108659, 0.017316231108659, 0.017316231108659} + +#define SIQK_QUADRATURE_TRISYM_ORDER14_COORD \ + {0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, \ + 0.0099797608064584319986778382371995, 0.0099797608064584319986778382371995, 0.9800404783870830804914930922677740, \ + 0.0099797608064584319986778382371995, 0.9800404783870830804914930922677740, 0.0099797608064584319986778382371995, \ + 0.9800404783870830804914930922677740, 0.0099797608064584319986778382371995, 0.0099797608064584319986778382371995, \ + 0.4799778935211884145495275788562139, 0.4799778935211884145495275788562139, 
0.0400442129576231709009448422875721, \ + 0.4799778935211884145495275788562139, 0.0400442129576231709009448422875721, 0.4799778935211884145495275788562139, \ + 0.0400442129576231709009448422875721, 0.4799778935211884145495275788562139, 0.4799778935211884145495275788562139, \ + 0.1538119591769669114444951674158801, 0.1538119591769669114444951674158801, 0.6923760816460662326221608964260668, \ + 0.1538119591769669114444951674158801, 0.6923760816460662326221608964260668, 0.1538119591769669114444951674158801, \ + 0.6923760816460662326221608964260668, 0.1538119591769669114444951674158801, 0.1538119591769669114444951674158801, \ + 0.0740234771169878125185448425327195, 0.0740234771169878125185448425327195, 0.8519530457660243749629103149345610, \ + 0.0740234771169878125185448425327195, 0.8519530457660243749629103149345610, 0.0740234771169878125185448425327195, \ + 0.8519530457660243749629103149345610, 0.0740234771169878125185448425327195, 0.0740234771169878125185448425327195, \ + 0.1303546825033299882967696703417460, 0.1303546825033299882967696703417460, 0.7392906349933400234064606593165081, \ + 0.1303546825033299882967696703417460, 0.7392906349933400234064606593165081, 0.1303546825033299882967696703417460, \ + 0.7392906349933400234064606593165081, 0.1303546825033299882967696703417460, 0.1303546825033299882967696703417460, \ + 0.2306172260266531326422523306973744, 0.2306172260266531326422523306973744, 0.5387655479466937347154953386052512, \ + 0.2306172260266531326422523306973744, 0.5387655479466937347154953386052512, 0.2306172260266531326422523306973744, \ + 0.5387655479466937347154953386052512, 0.2306172260266531326422523306973744, 0.2306172260266531326422523306973744, \ + 0.4223320834191477968211358984262915, 0.4223320834191477968211358984262915, 0.1553358331617044063577282031474169, \ + 0.4223320834191477968211358984262915, 0.1553358331617044063577282031474169, 0.4223320834191477968211358984262915, \ + 0.1553358331617044063577282031474169, 
0.4223320834191477968211358984262915, 0.4223320834191477968211358984262915, \ + 0.7862373859346609705767150444444269, 0.1906163600319009110428680742188590, 0.0231462540334381183804168813367141, \ + 0.7862373859346609705767150444444269, 0.0231462540334381183804168813367141, 0.1906163600319009110428680742188590, \ + 0.1906163600319009110428680742188590, 0.7862373859346609705767150444444269, 0.0231462540334381183804168813367141, \ + 0.1906163600319009110428680742188590, 0.0231462540334381183804168813367141, 0.7862373859346609705767150444444269, \ + 0.0231462540334381183804168813367141, 0.7862373859346609705767150444444269, 0.1906163600319009110428680742188590, \ + 0.0231462540334381183804168813367141, 0.1906163600319009110428680742188590, 0.7862373859346609705767150444444269, \ + 0.6305521436606074114905595706659369, 0.3623231377435471300962888108188054, 0.0071247185958454584131516185152577, \ + 0.6305521436606074114905595706659369, 0.0071247185958454584131516185152577, 0.3623231377435471300962888108188054, \ + 0.3623231377435471300962888108188054, 0.6305521436606074114905595706659369, 0.0071247185958454584131516185152577, \ + 0.3623231377435471300962888108188054, 0.0071247185958454584131516185152577, 0.6305521436606074114905595706659369, \ + 0.0071247185958454584131516185152577, 0.6305521436606074114905595706659369, 0.3623231377435471300962888108188054, \ + 0.0071247185958454584131516185152577, 0.3623231377435471300962888108188054, 0.6305521436606074114905595706659369, \ + 0.6265773298563063198329814440512564, 0.2907712058836673940653838599246228, 0.0826514642600262861016346960241208, \ + 0.6265773298563063198329814440512564, 0.0826514642600262861016346960241208, 0.2907712058836673940653838599246228, \ + 0.2907712058836673940653838599246228, 0.6265773298563063198329814440512564, 0.0826514642600262861016346960241208, \ + 0.2907712058836673940653838599246228, 0.0826514642600262861016346960241208, 0.6265773298563063198329814440512564, \ + 
0.0826514642600262861016346960241208, 0.6265773298563063198329814440512564, 0.2907712058836673940653838599246228, \ + 0.0826514642600262861016346960241208, 0.2907712058836673940653838599246228, 0.6265773298563063198329814440512564, \ + 0.9142099849296254632236014003865421, 0.0711657108777507679819862573822320, 0.0146243041926237687944123422312259, \ + 0.9142099849296254632236014003865421, 0.0146243041926237687944123422312259, 0.0711657108777507679819862573822320, \ + 0.0711657108777507679819862573822320, 0.9142099849296254632236014003865421, 0.0146243041926237687944123422312259, \ + 0.0711657108777507679819862573822320, 0.0146243041926237687944123422312259, 0.9142099849296254632236014003865421, \ + 0.0146243041926237687944123422312259, 0.9142099849296254632236014003865421, 0.0711657108777507679819862573822320, \ + 0.0146243041926237687944123422312259, 0.0711657108777507679819862573822320, 0.9142099849296254632236014003865421} +#define SIQK_QUADRATURE_TRISYM_ORDER14_WEIGHT \ + {0.0585962852260285965710906452841300,0.0017351512297252675524200649093132,0.0017351512297252675524200649093132, \ + 0.0017351512297252675524200649093132,0.0261637825586145227052536910150593,0.0261637825586145227052536910150593, \ + 0.0261637825586145227052536910150593,0.0039197292424018289128118119890587,0.0039197292424018289128118119890587, \ + 0.0039197292424018289128118119890587,0.0122473597569408669538670864085361,0.0122473597569408669538670864085361, \ + 0.0122473597569408669538670864085361,0.0281996285032579604989955157634540,0.0281996285032579604989955157634540, \ + 0.0281996285032579604989955157634540,0.0508870871859594883779287499692146,0.0508870871859594883779287499692146, \ + 0.0508870871859594883779287499692146,0.0504534399016036000373830461285252,0.0504534399016036000373830461285252, \ + 0.0504534399016036000373830461285252,0.0170636442122334523741056244716674,0.0170636442122334523741056244716674, \ + 
0.0170636442122334523741056244716674,0.0170636442122334523741056244716674,0.0170636442122334523741056244716674, \ + 0.0170636442122334523741056244716674,0.0096834664255066003890615178306689,0.0096834664255066003890615178306689, \ + 0.0096834664255066003890615178306689,0.0096834664255066003890615178306689,0.0096834664255066003890615178306689, \ + 0.0096834664255066003890615178306689,0.0363857559284850029523994408009457,0.0363857559284850029523994408009457, \ + 0.0363857559284850029523994408009457,0.0363857559284850029523994408009457,0.0363857559284850029523994408009457, \ + 0.0363857559284850029523994408009457,0.0069646633735184126576256424812073,0.0069646633735184126576256424812073, \ + 0.0069646633735184126576256424812073,0.0069646633735184126576256424812073,0.0069646633735184126576256424812073, \ + 0.0069646633735184126576256424812073} + +#define SIQK_QUADRATURE_TRISYM_ORDER20_COORD \ + {0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, \ + 0.2158743059329919777855621987328050, 0.2158743059329919777855621987328050, 0.5682513881340160999400268337922171, \ + 0.2158743059329919777855621987328050, 0.5682513881340160999400268337922171, 0.2158743059329919777855621987328050, \ + 0.5682513881340160999400268337922171, 0.2158743059329919777855621987328050, 0.2158743059329919777855621987328050, \ + 0.0753767665297472716501303580116655, 0.0753767665297472716501303580116655, 0.8492464669405054289441636683477554, \ + 0.0753767665297472716501303580116655, 0.8492464669405054289441636683477554, 0.0753767665297472716501303580116655, \ + 0.8492464669405054289441636683477554, 0.0753767665297472716501303580116655, 0.0753767665297472716501303580116655, \ + 0.0103008281372217926769030427180951, 0.0103008281372217926769030427180951, 0.9793983437255564528101103860535659, \ + 0.0103008281372217926769030427180951, 0.9793983437255564528101103860535659, 0.0103008281372217926769030427180951, \ + 
0.9793983437255564528101103860535659, 0.0103008281372217926769030427180951, 0.0103008281372217926769030427180951, \ + 0.4936022112987001886352800283930264, 0.4936022112987001886352800283930264, 0.0127955774025996227294399432139471, \ + 0.4936022112987001886352800283930264, 0.0127955774025996227294399432139471, 0.4936022112987001886352800283930264, \ + 0.0127955774025996227294399432139471, 0.4936022112987001886352800283930264, 0.4936022112987001886352800283930264, \ + 0.4615509381069253236340443891094765, 0.4615509381069253236340443891094765, 0.0768981237861493527319112217810471, \ + 0.4615509381069253236340443891094765, 0.0768981237861493527319112217810471, 0.4615509381069253236340443891094765, \ + 0.0768981237861493527319112217810471, 0.4615509381069253236340443891094765, 0.4615509381069253236340443891094765, \ + 0.3286214064242369836676971317501739, 0.4293405702582103744546770940360148, 0.2420380233175526418776257742138114, \ + 0.3286214064242369836676971317501739, 0.2420380233175526418776257742138114, 0.4293405702582103744546770940360148, \ + 0.4293405702582103744546770940360148, 0.3286214064242369836676971317501739, 0.2420380233175526418776257742138114, \ + 0.4293405702582103744546770940360148, 0.2420380233175526418776257742138114, 0.3286214064242369836676971317501739, \ + 0.2420380233175526418776257742138114, 0.3286214064242369836676971317501739, 0.4293405702582103744546770940360148, \ + 0.2420380233175526418776257742138114, 0.4293405702582103744546770940360148, 0.3286214064242369836676971317501739, \ + 0.2604803617865687481724989993381314, 0.1015775342809694392620656344661256, 0.6379421039324617570542841349379160, \ + 0.2604803617865687481724989993381314, 0.6379421039324617570542841349379160, 0.1015775342809694392620656344661256, \ + 0.1015775342809694392620656344661256, 0.2604803617865687481724989993381314, 0.6379421039324617570542841349379160, \ + 0.1015775342809694392620656344661256, 0.6379421039324617570542841349379160, 
0.2604803617865687481724989993381314, \ + 0.6379421039324617570542841349379160, 0.2604803617865687481724989993381314, 0.1015775342809694392620656344661256, \ + 0.6379421039324617570542841349379160, 0.1015775342809694392620656344661256, 0.2604803617865687481724989993381314, \ + 0.1370742358464553112273875967730419, 0.7100659730011301684626801034028176, 0.1528597911524145480655079154530540, \ + 0.1370742358464553112273875967730419, 0.1528597911524145480655079154530540, 0.7100659730011301684626801034028176, \ + 0.7100659730011301684626801034028176, 0.1370742358464553112273875967730419, 0.1528597911524145480655079154530540, \ + 0.7100659730011301684626801034028176, 0.1528597911524145480655079154530540, 0.1370742358464553112273875967730419, \ + 0.1528597911524145480655079154530540, 0.1370742358464553112273875967730419, 0.7100659730011301684626801034028176, \ + 0.1528597911524145480655079154530540, 0.7100659730011301684626801034028176, 0.1370742358464553112273875967730419, \ + 0.1467269458722997854671632467216114, 0.4985454776784148389623396724346094, 0.3547275764492854310816483121016063, \ + 0.1467269458722997854671632467216114, 0.3547275764492854310816483121016063, 0.4985454776784148389623396724346094, \ + 0.4985454776784148389623396724346094, 0.1467269458722997854671632467216114, 0.3547275764492854310816483121016063, \ + 0.4985454776784148389623396724346094, 0.3547275764492854310816483121016063, 0.1467269458722997854671632467216114, \ + 0.3547275764492854310816483121016063, 0.1467269458722997854671632467216114, 0.4985454776784148389623396724346094, \ + 0.3547275764492854310816483121016063, 0.4985454776784148389623396724346094, 0.1467269458722997854671632467216114, \ + 0.0269989777425532900823057502748270, 0.0491867226725819992050325879517914, 0.9238142995848647176515555656806100, \ + 0.0269989777425532900823057502748270, 0.9238142995848647176515555656806100, 0.0491867226725819992050325879517914, \ + 0.0491867226725819992050325879517914, 
0.0269989777425532900823057502748270, 0.9238142995848647176515555656806100, \ + 0.0491867226725819992050325879517914, 0.9238142995848647176515555656806100, 0.0269989777425532900823057502748270, \ + 0.9238142995848647176515555656806100, 0.0269989777425532900823057502748270, 0.0491867226725819992050325879517914, \ + 0.9238142995848647176515555656806100, 0.0491867226725819992050325879517914, 0.0269989777425532900823057502748270, \ + 0.0618717859336170294959345028473763, 0.7796601465405693653920593533257488, 0.1584680675258135496008549125690479, \ + 0.0618717859336170294959345028473763, 0.1584680675258135496008549125690479, 0.7796601465405693653920593533257488, \ + 0.7796601465405693653920593533257488, 0.0618717859336170294959345028473763, 0.1584680675258135496008549125690479, \ + 0.7796601465405693653920593533257488, 0.1584680675258135496008549125690479, 0.0618717859336170294959345028473763, \ + 0.1584680675258135496008549125690479, 0.0618717859336170294959345028473763, 0.7796601465405693653920593533257488, \ + 0.1584680675258135496008549125690479, 0.7796601465405693653920593533257488, 0.0618717859336170294959345028473763, \ + 0.0477243674276219970176171614184568, 0.3704915391495476328920233299868414, 0.5817840934228304394792985476669855, \ + 0.0477243674276219970176171614184568, 0.5817840934228304394792985476669855, 0.3704915391495476328920233299868414, \ + 0.3704915391495476328920233299868414, 0.0477243674276219970176171614184568, 0.5817840934228304394792985476669855, \ + 0.3704915391495476328920233299868414, 0.5817840934228304394792985476669855, 0.0477243674276219970176171614184568, \ + 0.5817840934228304394792985476669855, 0.0477243674276219970176171614184568, 0.3704915391495476328920233299868414, \ + 0.5817840934228304394792985476669855, 0.3704915391495476328920233299868414, 0.0477243674276219970176171614184568, \ + 0.1206005151863643737319975457467081, 0.8633469487547525966775197048264090, 0.0160525360588830157126949416124262, \ + 
0.1206005151863643737319975457467081, 0.0160525360588830157126949416124262, 0.8633469487547525966775197048264090, \ + 0.8633469487547525966775197048264090, 0.1206005151863643737319975457467081, 0.0160525360588830157126949416124262, \ + 0.8633469487547525966775197048264090, 0.0160525360588830157126949416124262, 0.1206005151863643737319975457467081, \ + 0.0160525360588830157126949416124262, 0.1206005151863643737319975457467081, 0.8633469487547525966775197048264090, \ + 0.0160525360588830157126949416124262, 0.8633469487547525966775197048264090, 0.1206005151863643737319975457467081, \ + 0.0026971477967097875517998861738533, 0.0561949381877454995359855161041196, 0.9411079140155447220195128466002643, \ + 0.0026971477967097875517998861738533, 0.9411079140155447220195128466002643, 0.0561949381877454995359855161041196, \ + 0.0561949381877454995359855161041196, 0.0026971477967097875517998861738533, 0.9411079140155447220195128466002643, \ + 0.0561949381877454995359855161041196, 0.9411079140155447220195128466002643, 0.0026971477967097875517998861738533, \ + 0.9411079140155447220195128466002643, 0.0026971477967097875517998861738533, 0.0561949381877454995359855161041196, \ + 0.9411079140155447220195128466002643, 0.0561949381877454995359855161041196, 0.0026971477967097875517998861738533, \ + 0.0030156332779423624702863637736527, 0.2086750067484213488899769117779215, 0.7883093599736362699914593576977495, \ + 0.0030156332779423624702863637736527, 0.7883093599736362699914593576977495, 0.2086750067484213488899769117779215, \ + 0.2086750067484213488899769117779215, 0.0030156332779423624702863637736527, 0.7883093599736362699914593576977495, \ + 0.2086750067484213488899769117779215, 0.7883093599736362699914593576977495, 0.0030156332779423624702863637736527, \ + 0.7883093599736362699914593576977495, 0.0030156332779423624702863637736527, 0.2086750067484213488899769117779215, \ + 0.7883093599736362699914593576977495, 0.2086750067484213488899769117779215, 
0.0030156332779423624702863637736527, \ + 0.0299053757884570198255502759820956, 0.7211512409120340860724240883428138, 0.2489433832995089357353890591184609, \ + 0.0299053757884570198255502759820956, 0.2489433832995089357353890591184609, 0.7211512409120340860724240883428138, \ + 0.7211512409120340860724240883428138, 0.0299053757884570198255502759820956, 0.2489433832995089357353890591184609, \ + 0.7211512409120340860724240883428138, 0.2489433832995089357353890591184609, 0.0299053757884570198255502759820956, \ + 0.2489433832995089357353890591184609, 0.0299053757884570198255502759820956, 0.7211512409120340860724240883428138, \ + 0.2489433832995089357353890591184609, 0.7211512409120340860724240883428138, 0.0299053757884570198255502759820956, \ + 0.0067566542224609888248054723192126, 0.6400554419405418693500564586429391, 0.3531879038369971635091815187479369, \ + 0.0067566542224609888248054723192126, 0.3531879038369971635091815187479369, 0.6400554419405418693500564586429391, \ + 0.6400554419405418693500564586429391, 0.0067566542224609888248054723192126, 0.3531879038369971635091815187479369, \ + 0.6400554419405418693500564586429391, 0.3531879038369971635091815187479369, 0.0067566542224609888248054723192126, \ + 0.3531879038369971635091815187479369, 0.0067566542224609888248054723192126, 0.6400554419405418693500564586429391, \ + 0.3531879038369971635091815187479369, 0.6400554419405418693500564586429391, 0.0067566542224609888248054723192126} +#define SIQK_QUADRATURE_TRISYM_ORDER20_WEIGHT \ + {0.0125376079944966561247055025773989,0.0274718698764242139076507953632245,0.0274718698764242139076507953632245, \ + 0.0274718698764242139076507953632245,0.0097652722770514236577676925321612,0.0097652722770514236577676925321612, \ + 0.0097652722770514236577676925321612,0.0013984195353918234608348036829284,0.0013984195353918234608348036829284, \ + 0.0013984195353918234608348036829284,0.0092921026251851831373462786700657,0.0092921026251851831373462786700657, \ + 
0.0092921026251851831373462786700657,0.0165778760323669269172164320025331,0.0165778760323669269172164320025331, \ + 0.0165778760323669269172164320025331,0.0206677623486650786921448030852844,0.0206677623486650786921448030852844, \ + 0.0206677623486650786921448030852844,0.0206677623486650786921448030852844,0.0206677623486650786921448030852844, \ + 0.0206677623486650786921448030852844,0.0208222355211545064046507746979842,0.0208222355211545064046507746979842, \ + 0.0208222355211545064046507746979842,0.0208222355211545064046507746979842,0.0208222355211545064046507746979842, \ + 0.0208222355211545064046507746979842,0.0095686384198490608693488113090098,0.0095686384198490608693488113090098, \ + 0.0095686384198490608693488113090098,0.0095686384198490608693488113090098,0.0095686384198490608693488113090098, \ + 0.0095686384198490608693488113090098,0.0244527709689724634389840218773315,0.0244527709689724634389840218773315, \ + 0.0244527709689724634389840218773315,0.0244527709689724634389840218773315,0.0244527709689724634389840218773315, \ + 0.0244527709689724634389840218773315,0.0031557306306305341579709899946238,0.0031557306306305341579709899946238, \ + 0.0031557306306305341579709899946238,0.0031557306306305341579709899946238,0.0031557306306305341579709899946238, \ + 0.0031557306306305341579709899946238,0.0121367963653212975611017654387069,0.0121367963653212975611017654387069, \ + 0.0121367963653212975611017654387069,0.0121367963653212975611017654387069,0.0121367963653212975611017654387069, \ + 0.0121367963653212975611017654387069,0.0149664801438864486504698447788542,0.0149664801438864486504698447788542, \ + 0.0149664801438864486504698447788542,0.0149664801438864486504698447788542,0.0149664801438864486504698447788542, \ + 0.0149664801438864486504698447788542,0.0063275933217777392825187376956819,0.0063275933217777392825187376956819, \ + 0.0063275933217777392825187376956819,0.0063275933217777392825187376956819,0.0063275933217777392825187376956819, \ + 
0.0063275933217777392825187376956819,0.0013425603120636958685146788994302,0.0013425603120636958685146788994302, \ + 0.0013425603120636958685146788994302,0.0013425603120636958685146788994302,0.0013425603120636958685146788994302, \ + 0.0013425603120636958685146788994302,0.0027760769163475539772489852907711,0.0027760769163475539772489852907711, \ + 0.0027760769163475539772489852907711,0.0027760769163475539772489852907711,0.0027760769163475539772489852907711, \ + 0.0027760769163475539772489852907711,0.0107398444741849414391099415411190,0.0107398444741849414391099415411190, \ + 0.0107398444741849414391099415411190,0.0107398444741849414391099415411190,0.0107398444741849414391099415411190, \ + 0.0107398444741849414391099415411190,0.0053678057381874528034004789844857,0.0053678057381874528034004789844857, \ + 0.0053678057381874528034004789844857,0.0053678057381874528034004789844857,0.0053678057381874528034004789844857, \ + 0.0053678057381874528034004789844857} + +#define SIQK_QUADRATURE_TRITAY_ORDER6_COORD \ + {4.724686653264358e-02, 5.725498667747682e-02, 8.954981467898796e-01, \ + 4.280913872509884e-02, 8.953626400245792e-01, 6.182822125032195e-02, \ + 2.921805130458027e-01, 6.844757484565146e-01, 2.334373849768268e-02, \ + 8.712234683377076e-01, 6.874625591502949e-02, 6.003027574726293e-02, \ + 5.086198608278325e-02, 6.156762055758400e-01, 3.334618083413767e-01, \ + 2.128646728100595e-01, 6.279461411977890e-01, 1.591891859921515e-01, \ + 2.817957679526839e-01, 6.290913834186361e-02, 6.552950937054525e-01, \ + 6.225041026512227e-01, 6.837821192050995e-02, 3.091176854282673e-01, \ + 7.604403244598745e-02, 2.875294583743921e-01, 6.364265091796204e-01, \ + 5.941924379444020e-01, 3.287835564131346e-01, 7.702400564246337e-02, \ + 3.353648085404556e-01, 3.122904050136449e-01, 3.523447864458995e-01} + +#define SIQK_QUADRATURE_TRITAY_ORDER6_WEIGHT \ + {3.806807185295551e-02, 3.837935530775279e-02, 4.620045674456197e-02, \ + 5.346758944419899e-02, 8.375582696574595e-02, 
1.016448330255167e-01, \ + 1.018615244613670e-01, 1.114218316600018e-01, 1.120094502629461e-01, \ + 1.247875714375583e-01, 1.884034888373949e-01} + +#define SIQK_QUADRATURE_TRITAY_ORDER12_COORD \ + {7.26510255160501828e-02, 9.27348974483949817e-01, 0.00000000000000000e+00, \ + 2.11790731803609689e-02, 2.35517332495786824e-02, 9.55269193570060349e-01, \ + 1.41841115784669236e-01, 5.40914911362088088e-17, 8.58158884215330708e-01, \ + 1.15143666726236216e-02, 9.45475073220970907e-01, 4.30105601064054710e-02, \ + 2.77555756156289135e-17, 1.54064601626856063e-01, 8.45935398373143910e-01, \ + 3.72684680767588483e-01, -1.88694080537681499e-16, 6.27315319232411683e-01, \ + 9.43134911146902510e-01, 2.71109713562557482e-02, 2.97541174968417414e-02, \ + 8.44725347421859452e-01, 1.46044961672175677e-01, 9.22969090596487129e-03, \ + 8.23277107647898521e-01, 2.11522233831219000e-02, 1.55570668968979586e-01, \ + 6.21586880750877868e-01, 1.45665147883470222e-02, 3.63846604460775103e-01, \ + 2.21919501597089841e-02, 7.88601719223131714e-01, 1.89206330617159302e-01, \ + 2.27722111443204644e-01, 7.49189739790679599e-01, 2.30881487661157569e-02, \ + 7.38137544226065284e-02, 7.18714961015890358e-02, 8.54314749475804436e-01, \ + 6.43364629415364875e-01, 3.32129083947645065e-01, 2.45062866369900600e-02, \ + 2.28091126376529507e-02, 3.61181591189672080e-01, 6.16009296172674969e-01, \ + 6.63093778446759319e-01, 2.43458133948799671e-01, 9.34480876044410103e-02, \ + 2.51456820638045198e-02, 5.81689214740147453e-01, 3.93165103196048027e-01, \ + 4.29837040104380730e-01, 5.44446676271925334e-01, 2.57162836236939363e-02, \ + 9.40413011410586863e-02, 8.26003314017559997e-01, 7.99553848413813162e-02, \ + 7.94010795132135239e-01, 1.16386499067277244e-01, 8.96027058005875177e-02, \ + 7.83496599417470019e-02, 2.03768481077729741e-01, 7.17881858980523258e-01, \ + 2.25505520049374242e-01, 6.44132203822605637e-02, 7.10081259568365097e-01, \ + 6.43800731623786371e-01, 9.54285858105846096e-02, 
2.60770682565629019e-01, \ + 5.43837635808460451e-01, 2.44982965093490213e-01, 2.11179399098049336e-01, \ + 4.32112641877997194e-01, 7.05667243440369213e-02, 4.97320633777965815e-01, \ + 2.55495747579340349e-01, 6.19381257362555782e-01, 1.25122995058103870e-01, \ + 1.22162380966293838e-01, 6.27682615680314027e-01, 2.50155003353392136e-01, \ + 4.47861373562203791e-01, 4.22605657433460014e-01, 1.29532969004336196e-01, \ + 4.09354529674576528e-01, 2.10785259391403995e-01, 3.79860210934019449e-01, \ + 1.24718320885524481e-01, 4.08963804491244809e-01, 4.66317874623230710e-01, \ + 2.28197277938737758e-01, 2.13777432530059680e-01, 5.58025289531202562e-01, \ + 2.88796329020881648e-01, 4.09786577770025306e-01, 3.01417093209092990e-01} + +#define SIQK_QUADRATURE_TRITAY_ORDER12_WEIGHT \ + {4.888049814660050e-03, 6.675900027367356e-03, 6.845534654343699e-03, \ + 7.119751436080721e-03, 7.714492373624846e-03, 9.654708742436301e-03, \ + 1.050932673560249e-02, 1.068084365762828e-02, 1.848368581123072e-02, \ + 1.854548042160657e-02, 2.062000411968213e-02, 2.168508541701153e-02, \ + 2.249074619915818e-02, 2.490407320150775e-02, 2.509917342768508e-02, \ + 2.794373431987983e-02, 2.814555860521331e-02, 2.816965445973000e-02, \ + 3.052917241207244e-02, 3.057527760403899e-02, 3.957360579297199e-02, \ + 4.128188739546268e-02, 4.593784216579169e-02, 4.749957532530720e-02, \ + 4.814880503690738e-02, 5.096492487678762e-02, 5.335208304882109e-02, \ + 5.414687261316752e-02, 5.943783395113540e-02, 5.998970732710617e-02, \ + 6.316454642265663e-02, 7.522206260332436e-02} + +#define SIQK_QUADRATURE_TRITAY_ORDER16_COORD \ + {2.22044604925031308e-16, 1.00000000000000022e+00, -4.44089209850062616e-16, \ + 1.72652007459386422e-16, -1.72652007459386422e-16, 1.00000000000000000e+00, \ + 9.99999999999999556e-01, 1.67697146066824836e-16, 2.76392063783237780e-16, \ + 5.51287671788707190e-02, 9.39886358357719054e-01, 4.98487446341022711e-03, \ + 6.97876983249687277e-03, 5.43806683058353502e-02, 
9.38640561861667777e-01, \ + 9.37963548813877668e-01, 9.39400491638755185e-03, 5.26424462697347786e-02, \ + 3.66619396286766500e-02, 1.64345086362403456e-02, 9.46903551735083004e-01, \ + 1.67139052970596280e-02, 9.46948726986246103e-01, 3.63373677166942688e-02, \ + 9.42217145243293808e-01, 4.26604005767651506e-02, 1.51224541799410417e-02, \ + 1.18395699389696601e-01, 1.22269495438720680e-02, 8.69377351066431325e-01, \ + 1.21386193179034985e-02, 8.67369652104666988e-01, 1.20491728577429513e-01, \ + 1.38549201074093298e-01, 8.45674402138906656e-01, 1.57763967870000466e-02, \ + 1.56119497522677064e-02, 1.39575963210261389e-01, 8.44812087037470905e-01, \ + 8.54716865118515079e-01, 1.31782174323082840e-01, 1.35009605584020809e-02, \ + 8.38676993516376368e-01, 1.57955126300247592e-02, 1.45527493853598866e-01, \ + 2.47883957465546700e-01, 7.36546288443630570e-01, 1.55697540908227294e-02, \ + 2.48047467521941595e-01, 1.39688430330388181e-02, 7.37983689445019575e-01, \ + 1.54489124190416716e-02, 2.54789518603903087e-01, 7.29761568977055242e-01, \ + 1.40536794130045051e-02, 7.31638652255490185e-01, 2.54307668331505310e-01, \ + 7.14650647525855276e-01, 1.57253728950845356e-02, 2.69623979579060202e-01, \ + 7.19291320004516122e-01, 2.66230284364682601e-01, 1.44783956308012773e-02, \ + 7.34816524385439873e-02, 8.67350406521407824e-01, 5.91679410400481887e-02, \ + 6.23723757982518195e-02, 7.41493666956614256e-02, 8.63478257506086755e-01, \ + 5.64947509640178147e-01, 1.59285948360033090e-02, 4.19123895523818568e-01, \ + 4.03471605078646045e-01, 1.56061028067777056e-02, 5.80922292114576355e-01, \ + 3.93065372986517114e-01, 5.91009481748388743e-01, 1.59251452650941427e-02, \ + 1.58528135007360294e-02, 4.03477149688871994e-01, 5.80670036810391865e-01, \ + 1.55759225172019677e-02, 5.69474562852597677e-01, 4.14949514630200356e-01, \ + 8.56028762075832783e-01, 6.78493700650298209e-02, 7.61218678591373960e-02, \ + 5.57652171741686020e-01, 4.26596859027159547e-01, 1.57509692311544325e-02, 
\ + 1.58711917968908656e-01, 6.70982507889701790e-02, 7.74189831242121151e-01, \ + 1.65257027288124081e-01, 7.52831023147951472e-01, 8.19119495639244466e-02, \ + 6.69143759151381579e-02, 7.75372778355688519e-01, 1.57712845729173323e-01, \ + 8.06983742470389620e-02, 1.68907315778736744e-01, 7.50394309974224294e-01, \ + 7.60435265981276642e-01, 1.68733583291941547e-01, 7.08311507267818108e-02, \ + 7.41575866479260215e-01, 8.21244708436324466e-02, 1.76299662677107338e-01, \ + 2.90354968333863872e-01, 6.28870536334479868e-01, 8.07744953316562597e-02, \ + 6.13421339495847429e-01, 8.11413015265752130e-02, 3.05437358977577345e-01, \ + 8.03401946048588056e-02, 2.96911206508048198e-01, 6.22748598887093108e-01, \ + 2.98521053628375943e-01, 7.67542314170573392e-02, 6.24724714954566718e-01, \ + 7.65491844989589776e-02, 6.22302233384477099e-01, 3.01148582116563923e-01, \ + 6.11711534686959046e-01, 3.10378628805096313e-01, 7.79098365079446409e-02, \ + 4.57714874646253878e-01, 8.19218215186586080e-02, 4.60363303835087556e-01, \ + 4.46142332818981191e-01, 4.71702266501346945e-01, 8.21554006796718639e-02, \ + 8.15831550859882348e-02, 4.54660341525047307e-01, 4.63756503388964458e-01, \ + 1.87663085257486151e-01, 1.70109133923693812e-01, 6.42227780818820149e-01, \ + 1.69570213325764829e-01, 6.40600432948674525e-01, 1.89829353725560646e-01, \ + 6.34777673094082173e-01, 1.91226758371660088e-01, 1.73995568534257739e-01, \ + 3.31577016252400436e-01, 1.88531576707023696e-01, 4.79891407040575868e-01, \ + 1.87871344418995001e-01, 4.77292995769074468e-01, 3.34835659811930531e-01, \ + 1.91505318098148747e-01, 3.12697462175977048e-01, 4.95797219725874205e-01, \ + 3.11122038514993648e-01, 4.96122594594562871e-01, 1.92755366890443480e-01, \ + 4.91017887987217960e-01, 1.92880531286706181e-01, 3.16101580726075804e-01, \ + 4.74506574489367838e-01, 3.36004145381649799e-01, 1.89489280128982363e-01, \ + 3.31914842734057136e-01, 3.33728055084797526e-01, 3.34357102181145338e-01} + +#define 
SIQK_QUADRATURE_TRITAY_ORDER16_WEIGHT \ + {3.101299925557040e-04, 3.157587355864167e-04, 3.543300779435999e-04, \ + 2.758185808404191e-03, 3.134620382788961e-03, 3.926570441300832e-03, \ + 4.727574193224073e-03, 4.891225563554369e-03, 4.993082174472287e-03, \ + 6.877690940807241e-03, 7.048958902004150e-03, 7.482343216857858e-03, \ + 7.804875180599580e-03, 7.884184667408244e-03, 8.789727319135741e-03, \ + 1.020569201350139e-02, 1.047814393079899e-02, 1.053567064989013e-02, \ + 1.088233801010153e-02, 1.111442043493028e-02, 1.120933468410323e-02, \ + 1.150613084965787e-02, 1.184069512498871e-02, 1.287323216839533e-02, \ + 1.289784008040242e-02, 1.290361638049960e-02, 1.301716160293398e-02, \ + 1.328840708045580e-02, 1.328923809154386e-02, 1.337661646188983e-02, \ + 1.878939033204372e-02, 1.915329470976454e-02, 1.924248475126509e-02, \ + 1.948099129262171e-02, 1.973020557737488e-02, 2.061823890489025e-02, \ + 2.564362192416913e-02, 2.582028209673193e-02, 2.591150211345546e-02, \ + 2.642639940905077e-02, 2.692527865136344e-02, 2.709476646596388e-02, \ + 2.923685732222178e-02, 2.964315841816427e-02, 2.971791383743251e-02, \ + 3.159001279314883e-02, 3.164634225766622e-02, 3.203536808857846e-02, \ + 4.060202979591518e-02, 4.072187567651760e-02, 4.073396006206902e-02, \ + 4.075252740422450e-02, 4.075823324694786e-02, 4.084655298115641e-02, \ + 4.616091672652638e-02} + +#define SIQK_QUADRATURE_TRITAY_ORDER18_COORD \ + {7.07029890425770434e-03, 1.16731059668412299e-02, 9.81256595128901066e-01, \ + 1.18506636748826333e-02, 9.81003085838793698e-01, 7.14625048632366866e-03, \ + 9.77787974953233552e-01, 1.06966317091697870e-02, 1.15153933375966612e-02, \ + 1.21952425108865503e-02, 9.38247698355045179e-01, 4.95570591340682709e-02, \ + 5.03248860967756076e-02, 1.26627518417214337e-02, 9.37012362061502957e-01, \ + 9.28052601109434661e-01, 5.98109409983804755e-02, 1.21364578921848640e-02, \ + 9.24985307647630872e-01, 1.37363297926722354e-02, 6.12783625596968889e-02, \ + 
6.29343769992106727e-02, 9.22952795940546356e-01, 1.41128270602429717e-02, \ + 1.46695353279870377e-02, 6.33107354992695215e-02, 9.22019729172743441e-01, \ + 8.38221442443636167e-01, 1.17265100334603151e-02, 1.50052047522903520e-01, \ + 1.20132291087278187e-02, 1.55472058732347040e-01, 8.32514712158925141e-01, \ + 1.53147795225895278e-01, 8.34329388898221724e-01, 1.25228158758829977e-02, \ + 1.26364459307456434e-02, 8.50163803195673196e-01, 1.37199750873581161e-01, \ + 1.39355658599882609e-01, 1.28816350521976618e-02, 8.47762706347919726e-01, \ + 8.35267146700183760e-01, 1.51080160895878751e-01, 1.36526924039374886e-02, \ + 4.12764350243855882e-01, 1.01917879216578220e-02, 5.77043861834486305e-01, \ + 1.19773841073520515e-02, 2.81337239930327110e-01, 7.06685375962320839e-01, \ + 2.75105559050908943e-01, 7.12437462850100567e-01, 1.24569780989904899e-02, \ + 7.11523343775096961e-01, 2.76302525086338957e-01, 1.21741311385640816e-02, \ + 5.69603491897309744e-01, 1.09658368560618374e-02, 4.19430671246628417e-01, \ + 1.11273414647166669e-02, 4.28911051788389452e-01, 5.59961606746893992e-01, \ + 5.66810345010056338e-01, 4.21542055511477942e-01, 1.16475994784657200e-02, \ + 4.17052309556705914e-01, 5.71125859044442907e-01, 1.18218313988511792e-02, \ + 1.15242148311881509e-02, 5.82686827051090317e-01, 4.05788958117721532e-01, \ + 7.14440844241883699e-01, 1.30567806713246960e-02, 2.72502375086791593e-01, \ + 2.64452707580261070e-01, 1.30760400963919332e-02, 7.22471252323346969e-01, \ + 1.33578918342581732e-02, 7.26343706240674458e-01, 2.60298401925067369e-01, \ + 8.68135265415298840e-01, 6.87230068637382230e-02, 6.31417277209629368e-02, \ + 6.27086061132897665e-02, 8.65230210152941437e-01, 7.20611837337687966e-02, \ + 7.60967385052684769e-02, 6.48599071037368607e-02, 8.59043354390994662e-01, \ + 6.27716704398273706e-02, 1.48349494336207116e-01, 7.88878835223965513e-01, \ + 7.88170460224977831e-01, 6.24359898395942040e-02, 1.49393549935427972e-01, \ + 1.47224894550839758e-01, 
7.87136901173502213e-01, 6.56382042756580297e-02, \ + 4.22525938278520530e-01, 5.19104921609511785e-02, 5.25563569560528299e-01, \ + 7.74048614563915161e-01, 1.54312992744383953e-01, 7.16383926917008862e-02, \ + 6.76067776910891149e-01, 2.61784274560294683e-01, 6.21479485288141675e-02, \ + 6.74530572355868108e-02, 7.66725787281281046e-01, 1.65821155483132143e-01, \ + 6.17776557233678525e-02, 2.58210367662733586e-01, 6.80011976613898561e-01, \ + 1.74941863707076289e-01, 6.79065925147429861e-02, 7.57151543778180725e-01, \ + 5.84917884088599349e-02, 5.29357827480425258e-01, 4.12150384110714807e-01, \ + 6.72145076162932620e-01, 6.66036150484161232e-02, 2.61251308788651271e-01, \ + 5.51208842356557649e-01, 5.85675461899432051e-02, 3.90223611453499153e-01, \ + 2.98183807982819626e-01, 6.44535360410836422e-02, 6.37362655976096759e-01, \ + 2.61427822878740113e-01, 6.74813842915130246e-01, 6.37583342061296410e-02, \ + 5.82159599068178268e-02, 3.91460231036876105e-01, 5.50323809056306068e-01, \ + 6.75570147429912504e-02, 6.48770149230717630e-01, 2.83672836026291120e-01, \ + 5.44832625703827067e-01, 3.94649822040802345e-01, 6.05175522553705880e-02, \ + 3.99787267113028255e-01, 5.39013715193329634e-01, 6.11990176936421104e-02, \ + 1.51078277618042822e-01, 1.62789508278475825e-01, 6.86132214103481353e-01, \ + 1.61959533146025403e-01, 6.81243632264066146e-01, 1.56796834589908451e-01, \ + 6.78965449795995379e-01, 1.54283287802020219e-01, 1.66751262401984401e-01, \ + 4.97246831616064200e-01, 2.52272775044453668e-01, 2.50480393339482132e-01, \ + 2.45792781854977660e-01, 2.54798153240703207e-01, 4.99409064904319133e-01, \ + 2.75839635471827105e-01, 1.48558054919434857e-01, 5.75602309608738039e-01, \ + 1.41286303940196589e-01, 2.93023960643619241e-01, 5.65689735416184170e-01, \ + 5.75308715344231558e-01, 2.80899127230990808e-01, 1.43792157424777634e-01, \ + 2.66045287116412177e-01, 4.82098959297083796e-01, 2.51855753586504028e-01, \ + 2.89515501140379161e-01, 5.64187824544361005e-01, 
1.46296674315259834e-01, \ + 4.20272276953932211e-01, 1.30769964434388403e-01, 4.48957758611679414e-01, \ + 5.51913339122326096e-01, 1.47969222194756778e-01, 3.00117438682917126e-01, \ + 1.54754368775656848e-01, 5.63868422294592553e-01, 2.81377208929750600e-01, \ + 1.38678912478906013e-01, 4.36115742879047474e-01, 4.25205344642046457e-01, \ + 3.79754605982586757e-01, 3.60326393528548949e-01, 2.59919000488864349e-01, \ + 4.32257322202306393e-01, 4.22418833467425037e-01, 1.45323844330268570e-01, \ + 2.50087546338060018e-01, 3.71900183305238496e-01, 3.78012270356701430e-01, \ + 3.73879170813181227e-01, 2.41364500692846234e-01, 3.84756328493972566e-01} +#define SIQK_QUADRATURE_TRITAY_ORDER18_WEIGHT \ + {1.258287849322552e-03, 1.263672600361209e-03, 1.663464766659172e-03, \ + 4.075174606270012e-03, 4.306776287080819e-03, 4.389337308965301e-03, \ + 4.854979278083793e-03, 5.123310595743368e-03, 5.419884417037201e-03, \ + 6.469269508792310e-03, 6.816991179147562e-03, 6.923866407332497e-03, \ + 6.971077005242425e-03, 7.206069998379916e-03, 7.685172776701560e-03, \ + 8.124490112628030e-03, 8.485915214007324e-03, 8.504426621066338e-03, \ + 8.547676033732530e-03, 8.694442727954849e-03, 8.727198121935910e-03, \ + 8.920337864331938e-03, 8.922343193968446e-03, 8.952316877617482e-03, \ + 9.062987810035171e-03, 9.239241944101240e-03, 9.289678218556065e-03, \ + 1.016085758882769e-02, 1.068858309045880e-02, 1.159584270491392e-02, \ + 1.372133554295597e-02, 1.451509611701859e-02, 1.472613692527127e-02, \ + 1.497181258145377e-02, 1.535134740593910e-02, 1.626316829313562e-02, \ + 1.639421042530506e-02, 1.656173375959963e-02, 1.730837634372872e-02, \ + 1.735406869880698e-02, 1.736860247019273e-02, 1.742643812271074e-02, \ + 1.743007805929840e-02, 1.777357849874442e-02, 1.800914981913493e-02, \ + 1.814631429213930e-02, 1.909488510415974e-02, 1.961264000589436e-02, \ + 2.413550629437514e-02, 2.449560607831186e-02, 2.486104169360984e-02, \ + 2.535328684929062e-02, 2.548859970214835e-02, 
2.606800318335970e-02, \ + 2.617304374623586e-02, 2.622203417758513e-02, 2.637298224112941e-02, \ + 2.647245318638137e-02, 2.711977972504153e-02, 2.717351017096441e-02, \ + 2.735502743194343e-02, 2.786441729563326e-02, 2.888671321165472e-02, \ + 2.926968908113495e-02, 3.045196253398069e-02, 3.186369822247498e-02} + +class TriangleQuadrature { + const Real trisym_order4_coord_ [ 18] = SIQK_QUADRATURE_TRISYM_ORDER4_COORD; + const Real trisym_order4_weight_ [ 6] = SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT; + const Real tritay_order6_coord_ [ 33] = SIQK_QUADRATURE_TRITAY_ORDER6_COORD; + const Real tritay_order6_weight_ [ 11] = SIQK_QUADRATURE_TRITAY_ORDER6_WEIGHT; + const Real trisym_order8_coord_ [ 48] = SIQK_QUADRATURE_TRISYM_ORDER8_COORD; + const Real trisym_order8_weight_ [ 16] = SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT; +#ifdef SIQK_USE_TRITAY12 + const Real tritay_order12_coord_ [ 96] = SIQK_QUADRATURE_TRITAY_ORDER12_COORD; + const Real tritay_order12_weight_[ 32] = SIQK_QUADRATURE_TRITAY_ORDER12_WEIGHT; +#else + const Real trisym_order12_coord_ [ 99] = SIQK_QUADRATURE_TRISYM_ORDER12_COORD; + const Real trisym_order12_weight_[ 33] = SIQK_QUADRATURE_TRISYM_ORDER12_WEIGHT; +#endif + const Real trisym_order14_coord_ [138] = SIQK_QUADRATURE_TRISYM_ORDER14_COORD; + const Real trisym_order14_weight_[ 46] = SIQK_QUADRATURE_TRISYM_ORDER14_WEIGHT; + const Real tritay_order16_coord_ [165] = SIQK_QUADRATURE_TRITAY_ORDER16_COORD; + const Real tritay_order16_weight_[ 55] = SIQK_QUADRATURE_TRITAY_ORDER16_WEIGHT; + const Real tritay_order18_coord_ [198] = SIQK_QUADRATURE_TRITAY_ORDER18_COORD; + const Real tritay_order18_weight_[ 66] = SIQK_QUADRATURE_TRITAY_ORDER18_WEIGHT; + const Real trisym_order20_coord_ [264] = SIQK_QUADRATURE_TRISYM_ORDER20_COORD; + const Real trisym_order20_weight_[ 88] = SIQK_QUADRATURE_TRISYM_ORDER20_WEIGHT; + +public: + KOKKOS_INLINE_FUNCTION TriangleQuadrature () {} + + KOKKOS_INLINE_FUNCTION + void get_coef (const int order, RawConstVec3s& coord, + 
RawConstArray& weight) const { + switch (order) { + case 4: + coord = RawConstVec3s(trisym_order4_coord_, 6); + weight = RawConstArray(trisym_order4_weight_, 6); + break; + case 6: + coord = RawConstVec3s(tritay_order6_coord_, 11); + weight = RawConstArray(tritay_order6_weight_, 11); + break; + case 8: + coord = RawConstVec3s(trisym_order8_coord_, 16); + weight = RawConstArray(trisym_order8_weight_, 16); + break; + case 12: +#ifdef SIQK_USE_TRITAY12 + coord = RawConstVec3s(tritay_order12_coord_, 32); + weight = RawConstArray(tritay_order12_weight_, 32); +#else + coord = RawConstVec3s(trisym_order12_coord_, 33); + weight = RawConstArray(trisym_order12_weight_, 33); +#endif + break; + case 14: + coord = RawConstVec3s(trisym_order14_coord_, 46); + weight = RawConstArray(trisym_order14_weight_, 46); + break; + case 16: + coord = RawConstVec3s(tritay_order16_coord_, 55); + weight = RawConstArray(tritay_order16_weight_, 55); + break; + case 18: + coord = RawConstVec3s(tritay_order18_coord_, 66); + weight = RawConstArray(tritay_order18_weight_, 66); + break; + case 20: + coord = RawConstVec3s(trisym_order20_coord_, 88); + weight = RawConstArray(trisym_order20_weight_, 88); + break; + default: + ko::abort("TriangleQuadrature::get_coef: order not supported."); + } + } +}; + +} // namespace siqk + +#endif // INCLUDE_SIQK_QUADRATURE_HPP diff --git a/siqk/siqk_runtests.py b/siqk/siqk_runtests.py new file mode 100755 index 0000000..8a05d9a --- /dev/null +++ b/siqk/siqk_runtests.py @@ -0,0 +1,57 @@ +#!/usr/bin/python + +import os, sys + +quick = True +exe = sys.argv[1] +testno = int(sys.argv[2]) + +stride = 2 +biggest = 1111 + +xlates = [4.2*10**f for f in range(-17, 0, stride)] +xlates.append(0) + +ylates = [0] + +angles = xlates + +fails = [] +cnt = 0 + +if testno == 0: + for n in [4, 50, 511, biggest]: + if quick and n > 50: break + for angle in angles: + for xlate in xlates: + for ylate in ylates: + cmd = ('OMP_NUM_THREADS=8 {exe:s} --testno 0 --xlate {xlate:1.15e} --ylate 
{ylate:1.14e} --angle {angle:1.15e} -n {n:d}'. + format(exe=exe, xlate=xlate, ylate=ylate, angle=angle, n=n)) + stat = os.system(cmd + ' |& grep PASSED &> /dev/null') + if stat: + fails.append(cmd) + else: + cnt += 1 + print len(fails) + +elif testno == 1: + for n in [4, 20, 40, 79]: + if quick and n > 20: break + for angle in angles: + cmd = ('OMP_NUM_THREADS=8 {exe:s} --testno 1 --angle {angle:1.15e} -n {n:d}'. + format(exe=exe, angle=angle, n=n)) + stat = os.system(cmd + ' |& grep PASSED &> /dev/null') + if stat: + fails.append(cmd) + else: + cnt += 1 + print len(fails) + +if len(fails) > 0: + print 'FAILED' + for f in fails: + print f + sys.exit(-1) +else: + print 'PASSED ({0:d})'.format(cnt) + sys.exit(0) diff --git a/siqk/siqk_search.hpp b/siqk/siqk_search.hpp new file mode 100644 index 0000000..1e517e3 --- /dev/null +++ b/siqk/siqk_search.hpp @@ -0,0 +1,378 @@ +#ifndef INCLUDE_SIQK_SEARCH_HPP +#define INCLUDE_SIQK_SEARCH_HPP + +#include "siqk_defs.hpp" +#include "siqk_geometry.hpp" +#include + +namespace siqk { + +// Oct-tree. Might do something else better suited to the sphere later. +template +class Octree { +public: + enum { max_depth = max_depth_ }; + typedef Real BoundingBox[6]; + + struct Options { + // Do not go beyond max_depth_ depth, including the root and leaf. With this + // constraInt, try to go deep enough so that a leaf has no more than + // max_nelem elements. + Int max_nelem; + Options () : max_nelem(8) {} + }; + + // Bounding box for a cluster of points ps (possibly vertices). 
+ template + static void calc_bb (const CV3s& ps, const Int np, BB bb) { + if (np == 0) return; + for (Int j = 0; j < 3; ++j) + bb[j] = bb[j+3] = ps(0,j); + for (Int i = 1; i < np; ++i) { + for (Int j = 0; j < 3; ++j) { + bb[j] = min(bb[j], ps(i,j)); + bb[j+3] = max(bb[j+3], ps(i,j)); + } + } + pad_bb(bb); + } + + template + KOKKOS_INLINE_FUNCTION + static void calc_bb (const CV3s& p, const CIV e, const Int ne, BB ebb) { + for (Int j = 0; j < 3; ++j) + ebb[j] = ebb[j+3] = p(e[0], j); + for (Int i = 1; i < ne; ++i) { + if (e[i] == -1) break; + for (Int j = 0; j < 3; ++j) { + ebb[j] = min(ebb[j], p(e[i], j)); + ebb[j+3] = max(ebb[j+3], p(e[i], j)); + } + } + pad_bb(ebb); + } + + // If a bounding box was constructed from vertices of a spherical polygon, + // expand it to account for the possible protrusion of the sphere. + template + KOKKOS_INLINE_FUNCTION + static void pad_bb (BB bb) { + if (std::is_same::value) return; + Real hl = 0.5*std::sqrt(square(bb[3] - bb[0]) + square(bb[4] - bb[1]) + + square(bb[5] - bb[2])); + // Limit the half-length to the circle's radius. + hl = min(1.0, hl); + // Max distance from a chord of length 2 hl to the unit circle: + // hl = sin theta + // pad = 1 - cos theta = 1 - sqrt(1 - sin^2 theta) = 1 - sqrt(1 - hl^2). + const Real pad = 1 - std::sqrt(1 - square(hl)); + for (Int i = 0; i < 3; ++i) bb[ i] -= pad; + for (Int i = 0; i < 3; ++i) bb[3+i] += pad; + } + + template + static void calc_bb (const CV3s& ps, BoundingBox bb) { + calc_bb(ps, nslices(ps), bb); + } + + template + static void calc_bb (const CV3s& p, const CIs& e, V6s& ebbs) { + assert(nslices(ebbs) == nslices(e)); + for (Int k = 0, klim = nslices(e); k < klim; ++k) + calc_bb(p, slice(e, k), szslice(e), slice(ebbs, k)); + } + + // p is a 3xNp array of points. e is a KxNe array of elements. An entry <0 is + // ignored. All <0 entries must be at the end of an element's list. 
+ Octree (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, + const Options& o) { + init(p, e, o); + } + Octree (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) { + Options o; + init(p, e, o); + } + + Octree() {} + void init (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) { + Options o; + init(p, e, o); + } + + // Apply f to every element in leaf nodes with which bb overlaps. f must have + // function + // void operator(const Int element). + template + KOKKOS_INLINE_FUNCTION + void apply (const CV bb, Functor& f) const { + if (nslices(nodes_) == 0) { + for (Int i = 0; i < offsets_[1]; ++i) + f(elems_[i]); + return; + } +#ifdef SIQK_NONRECURSIVE + // Non-recursive impl. + { + // Stack. + Real snbb[8*max_depth_]; + Int sni[max_depth_], si[max_depth_]; + Int sp = 0; + // Args for top-level call. + copy(snbb, bb_, 8); + sni[sp] = 0; + si[sp] = 0; + while (sp >= 0) { + // Get stack frame's (nbb, ni, current i) values. + const Int i = si[sp]; + if (i == 8) { + --sp; + continue; + } + // Increment stored value of i for next iteration. Current value is + // stored in 'i' above. + ++si[sp]; + const Int ni = sni[sp]; + const Real* const nbb = snbb + 8*sp; + // Can use the next stack frame's bb space for a child bb. + Real* const child_bb = snbb + 8*(sp+1); + fill_child_bb(nbb, i, child_bb); + if ( ! do_bb_overlap(child_bb, bb)) continue; + Int e = nodes_(ni,i); + if (e < 0) { + // Leaf, so apply functor to each element. + e = std::abs(e + 1); + for (Int k = offsets_[e]; k < offsets_[e+1]; ++k) + f(elems_[k]); + } else if (e > 0) { + // Recurse. + ++sp; + sni[sp] = e; + si[sp] = 0; + } + } + } +#else + apply_r(0, bb_, bb, f); +#endif + } + +private: + /* Each node in the oct-tree contains 8 integers, stored in 'nodes'. + + >0 is an index Into 'nodes', pointing to a child node. + + A <=0 entry in 'nodes' indicates a leaf node. If 0, there are no elements + in the leaf. 
If <0, the negative of the entry minus 1 is the index of an + offset array indexing 'elems'. + + Each segment of 'elems' contains a list of element indices covered by a + leaf node. Element indices refer to the list of elements the caller + provides during oct-tree construction. + */ + + // Static data structures holding the completed octree. + // nodes(:,i) is a list. The list includes children of node i (>0) and leaf + // node data (<=0). + //todo Make these const once ready to do full GPU stuff. + Nodes nodes_; + // A leaf node corresponding to -k covers elements + // elems[offset[k] : offset[k]-1]. + ko::View offsets_, elems_; + // Root node's bounding box. + BoundingBox bb_; + + // Dynamic data structures for construction phase. + class IntList { + Int* const buf_; + Int i_; + public: + IntList (Int* const buf) : buf_(buf), i_(0) {} + void reset () { i_ = 0; } + void push (const Int& i) { buf_[i_++] = i; } + Int* data () { return buf_; } + Int n () const { return i_; } + const Int& operator[] (const Int& i) const { return buf_[i]; } + }; + + class DynIntList { + std::vector buf_; + public: + DynIntList () {} + void push (const Int& i) { buf_.push_back(i); } + Int& back () { return buf_.back(); } + Int& operator[] (const size_t i) { + if (i >= buf_.size()) + buf_.resize(i+1); + return buf_[i]; + } + const Int& operator[] (const size_t i) const { return buf_[i]; } + Int n () const { return static_cast(buf_.size()); } + const Int* data () const { return buf_.data(); } + }; + + // Opposite index slot convention. 
+ class DynNodes { + std::vector buf_; + public: + Int n () const { return static_cast(buf_.size()) >> 3; } + const Int* data () const { return buf_.data(); } + Int& operator() (const Int& r, const Int& c) { + const size_t ec = (c+1) << 3; + if (ec >= buf_.size()) + buf_.resize(ec); + return const_cast( + const_cast(this)->operator()(r, c)); + } + const Int& operator() (const Int& r, const Int& c) const { + assert(((c << 3) + r) >= 0); + assert(((c << 3) + r) < (Int) buf_.size()); + return buf_[(c << 3) + r]; + } + }; + + void init (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, + const Options& o) { + if (nslices(e) == 0) return; + // Get OT's bounding box. + calc_bb(p, bb_); + // Get elements' bounding boxes. + Vec6s::HostMirror ebbs("ebbs", nslices(e)); + calc_bb(p, e, ebbs); + // Static element lists for work. Each level has active work space. + std::vector buf(max_depth_*nslices(e)); + IntList es(buf.data()), wrk(buf.data() + nslices(e)); + for (Int i = 0, ilim = nslices(e); i < ilim; ++i) + es.push(i); + // Dynamic element lists. + DynIntList offsets, elems; + offsets[0] = 0; + // Dynamic node data structure. + DynNodes nodes; + // Recurse. We don't care about the return value. If it's 0 and nodes.n() == + // 0, we'll detect as much in 'apply'. + init_r(1, bb_, ebbs, o, es, wrk, offsets, elems, nodes); + // Build the static data structures. + if (elems.n() == 0) return; + init_static_ds(nodes, offsets, elems); + } + + Int init_r (const Int depth, // Tree's depth at this point, including root. + const BoundingBox& nbb, // My bounding box. + const ConstVec6s::HostMirror& ebbs, // All elements' bounding boxes. + const Options& o, // Options controlling construct of the tree. + IntList& es, // List of elements in my bounding box. + IntList& wrk, // Work space to store working element lists. + DynIntList& offsets, // Offsetss Into elems. + DynIntList& elems, // Elements belonging to leaf nodes. + DynNodes& nodes) // Dynamic nodes data structure. 
+ { + const Int my_idx = nodes.n(); // My node index. + // Decide what to do. + if (es.n() == 0) { + // I have no elements, so return 0 to indicate I'm a leaf node containing + // nothing. + return 0; + } else if (es.n() <= o.max_nelem || depth == max_depth_) { + // I'm a leaf node with elements. Store my list of elements and return the + // storage location. + const Int os = offsets.back(); + offsets.push(os + es.n()); + for (Int i = 0, n = es.n(); i < n; ++i) + elems[os + i] = es[i]; + return 1 - offsets.n(); + } else { + // I'm not a leaf node. + nodes(0, my_idx) = 0; // Insert myself Into the nodes array. + for (Int ic = 0; ic < 8; ++ic) { + BoundingBox child_bb; + fill_child_bb(nbb, ic, child_bb); + // Find the elements that are in this child's bb. + IntList ces(wrk.data()); + for (Int i = 0, n = es.n(); i < n; ++i) + if (do_bb_overlap(child_bb, slice(ebbs, es[i]))) + ces.push(es[i]); + // Create some work space. + IntList cwrk(wrk.data() + ces.n()); + // Recurse. + const Int child_idx = init_r(depth+1, child_bb, ebbs, o, ces, cwrk, + offsets, elems, nodes); + nodes(ic, my_idx) = child_idx; + } + return my_idx; + } + } + + void init_static_ds (const DynNodes nodes, const DynIntList& offsets, + const DynIntList& elems) { + { + ko::resize(nodes_, nodes.n()); + auto nodes_hm = ko::create_mirror_view(nodes_); + for (Int i = 0; i < nodes.n(); ++i) + for (Int j = 0; j < 8; ++j) + nodes_hm(i,j) = nodes(j,i); + ko::deep_copy(nodes_, nodes_hm); + } + hm_resize_and_copy(offsets_, offsets, offsets.n()); + hm_resize_and_copy(elems_, elems, elems.n()); + } + + // Using parent bb p, fill child bb c, with child_idx in 0:7. 
+ template + KOKKOS_INLINE_FUNCTION + static void fill_child_bb (const CBB& p, const Int& child_idx, BB& c) { + const Real m[] = { 0.5*(p[0] + p[3]), + 0.5*(p[1] + p[4]), + 0.5*(p[2] + p[5]) }; + switch (child_idx) { + case 0: c[0] = p[0]; c[1] = p[1]; c[2] = p[2]; c[3] = m[0]; c[4] = m[1]; c[5] = m[2]; break; + case 1: c[0] = m[0]; c[1] = p[1]; c[2] = p[2]; c[3] = p[3]; c[4] = m[1]; c[5] = m[2]; break; + case 2: c[0] = m[0]; c[1] = m[1]; c[2] = p[2]; c[3] = p[3]; c[4] = p[4]; c[5] = m[2]; break; + case 3: c[0] = p[0]; c[1] = m[1]; c[2] = p[2]; c[3] = m[0]; c[4] = p[4]; c[5] = m[2]; break; + case 4: c[0] = p[0]; c[1] = p[1]; c[2] = m[2]; c[3] = m[0]; c[4] = m[1]; c[5] = p[5]; break; + case 5: c[0] = m[0]; c[1] = p[1]; c[2] = m[2]; c[3] = p[3]; c[4] = m[1]; c[5] = p[5]; break; + case 6: c[0] = m[0]; c[1] = m[1]; c[2] = m[2]; c[3] = p[3]; c[4] = p[4]; c[5] = p[5]; break; + case 7: c[0] = p[0]; c[1] = m[1]; c[2] = m[2]; c[3] = m[0]; c[4] = p[4]; c[5] = p[5]; break; + default: + // impossible + error("fill_child_bb: The impossible has happened."); + } + } + + // Do bounding boxes a and b overlap? + template + KOKKOS_INLINE_FUNCTION + static bool do_bb_overlap (const BoundingBox a, const BB b) { + for (Int i = 0; i < 3; ++i) + if ( ! do_lines_overlap(a[i], a[i+3], b[i], b[i+3])) + return false; + return true; + } + + KOKKOS_INLINE_FUNCTION + static bool do_lines_overlap (const Real& a1, const Real& a2, + const Real& b1, const Real& b2) { + return ! (a2 < b1 || a1 > b2); + } + + template KOKKOS_INLINE_FUNCTION + void apply_r (const Int ni, const BoundingBox& nbb, const CV bb, + Functor& f) const { + for (Int i = 0; i < 8; ++i) { + BoundingBox child_bb; + fill_child_bb(nbb, i, child_bb); + if ( ! 
do_bb_overlap(child_bb, bb)) continue; + Int e = nodes_(ni,i); + if (e > 0) + apply_r(e, child_bb, bb, f); + else if (e < 0) { + e = std::abs(e + 1); + for (Int k = offsets_[e]; k < offsets_[e+1]; ++k) + f(elems_[k]); + } + } + } +}; + +} // namespace siqk + +#endif // INCLUDE_SIQK_SEARCH_HPP diff --git a/siqk/siqk_sqr.hpp b/siqk/siqk_sqr.hpp new file mode 100644 index 0000000..c50f710 --- /dev/null +++ b/siqk/siqk_sqr.hpp @@ -0,0 +1,267 @@ +#ifndef INCLUDE_SIQK_SQR_HPP +#define INCLUDE_SIQK_SQR_HPP + +#include "siqk_defs.hpp" +#include "siqk_intersect.hpp" + +namespace siqk { +namespace sqr { // spherical quadrilateral <-> reference square +/* Let p be a 3x4 matrix with p(:,i) the i'th vertex in a spherical quad in CCW + order. Let (a,b) be coordinates in the reference square [0,1]^2. (Here we + choose [0,1] instead of [-1,1].) (a,b) = (0,0) corresponds to p(:,1); (1,0) + is p(:,2); (1,1) is p(:,3); (0,1) is p(:,4). + The map from reference square to bilinear quad can be written + T = p*[ 1 -1 1 -1 + -1 1 0 0 + -1 0 0 1 + 1 0 0 0]'; + f(a,b) = T(:,1)*a*b + T(:,2)*a + T(:,3)*b + T(:,4); + The map to the sphere is then completed with + g(a,b) = norm(f(a,b)) + q = f(a,b) / g(a,b). + The Jacobian matrix for q is given by + q_a = f_a/g - (f g_a)/g^2 + g_a = g_f f_a + and similarly for q_b. +*/ + +namespace impl { +// Compute T(i,:). +template +KOKKOS_INLINE_FUNCTION +void calc_T_row (const ConstVec3sT& p, const Quad& e, const Int i, + Real& t1, Real& t2, Real& t3, Real& t4) { + t4 = p(e[0],i); + t3 = -t4 + p(e[3],i); + t2 = -t4 + p(e[1],i); + t1 = -t2 + p(e[2],i) - p(e[3],i); +} + +// Compute T(:,1)*a*b + T(:,2)*a + T(:,3)*b + T(:,4). 
+template +KOKKOS_INLINE_FUNCTION +void calc_ref_to_bilinear (const ConstVec3sT& p, const Quad& e, + Real a, Real b, Real q[3]) { + a = 0.5*(a + 1); + b = 0.5*(b + 1); + for (Int i = 0; i < 3; ++i) { + Real t1, t2, t3, t4; + impl::calc_T_row(p, e, i, t1, t2, t3, t4); + q[i] = t1*a*b + t2*a + t3*b + t4; + } +} + +// The residual function is r(a,b) = f(a,b)/g(a,b) - q. +template +KOKKOS_INLINE_FUNCTION +void calc_residual (const ConstVec3sT& p, const Quad& e, const Real a, + const Real b, const Real q[3], Real r[3]) { + calc_ref_to_bilinear(p, e, a, b, r); + const Real rnorm = std::sqrt(SphereGeometry::norm2(r)); + for (Int i = 0; i < 3; ++i) + r[i] = r[i]/rnorm - q[i]; +} + +// Compute the Jacobian matrix of the residual function: Jacobian(ref square -> +// sphere). +// TODO Consider rewriting this in terms of the p=1 basis isoparametric +// interpolation formulation. Better performance? See +// calc_isoparametric_jacobian in slmmir.cpp. +template +KOKKOS_INLINE_FUNCTION +void calc_Jacobian (const ConstVec3sT& p, const Quad& e, Real a, Real b, + Real J[6]) { + a = 0.5*(a + 1); + b = 0.5*(b + 1); + Real r[3]; + for (Int i = 0; i < 3; ++i) { + Real t1, t2, t3, t4; + calc_T_row(p, e, i, t1, t2, t3, t4); + r[ i] = t1*a*b + t2*a + t3*b + t4; + J[ i] = t1*b + t2; + J[3+i] = t1*a + t3; + } + Real rtJ[2] = {0}; + for (Int j = 0; j < 2; ++j) { + const Real* const Jj = J + 3*j; + for (Int i = 0; i < 3; ++i) + rtJ[j] += r[i]*Jj[i]; + } + const Real rnorm2 = SphereGeometry::norm2(r), rnorm = std::sqrt(rnorm2); + for (Int j = 0; j < 2; ++j) { + Real* const Jj = J + 3*j; + for (Int i = 0; i < 3; ++i) + Jj[i] = (Jj[i] - r[i]*rtJ[j]/rnorm2)/rnorm; + } +} + +// Solve J dx = r. +KOKKOS_INLINE_FUNCTION +void solve_Jxr (Real J[6], const Real r[3], Real dx[2]) { + // QR factorization: J -> J [n1 a; 0 n2]. 
+ const Real n1 = std::sqrt(SphereGeometry::norm2(J)); + SphereGeometry::scale(1/n1, J); + const Real a = SphereGeometry::dot(J, J+3); + SphereGeometry::axpy(-a, J, J+3); + const Real n2 = std::sqrt(SphereGeometry::norm2(J+3)); + SphereGeometry::scale(1/n2, J+3); + // r -> Q' r. + Real Qtr[2] = {0}; + for (Int j = 0; j < 2; ++j) { + const Real* const Jj = J + 3*j; + for (Int i = 0; i < 3; ++i) + Qtr[j] += Jj[i]*r[i]; + } + // dx = R \ (Q' r). + dx[1] = 2*(Qtr[1] / n2); + dx[0] = 2*((Qtr[0] - a*dx[1]) / n1); +} +} // namespace impl + +struct Info { + bool success; + Int n_iterations; +}; + +template +KOKKOS_INLINE_FUNCTION +void calc_ref_to_sphere ( + // The spherical quad containing the point. + const ConstVec3sT& p, const Quad& e, + // (a,b) in [-1,1] + const Real a, const Real b, + // The point on the sphere. + Real q[3]) +{ + impl::calc_ref_to_bilinear(p, e, a, b, q); + SphereGeometry::normalize(q); +} + +template +KOKKOS_INLINE_FUNCTION +void calc_sphere_to_ref ( + // The spherical quad containing the point. + const ConstVec3sT& p, const Quad& e, + // The point on the sphere. + const Real q[3], + // (a,b) in [-1,1] + Real& a, Real& b, + // Optional info output. + Info* const info = nullptr, + // Max number of iterations before returning with failure. + const Int max_its = 10, + // Tolerance for Newton iteration. + const Real tol = 1e2*std::numeric_limits::epsilon()) +{ + const Real tol2 = square(tol); + Real rnorm2 = 1; + a = b = 0; + Int it = 0; + for (it = 1; it <= max_its; ++it) { // Newton's method. + Real r[3], J[6]; + impl::calc_residual(p, e, a, b, q, r); + rnorm2 = SphereGeometry::norm2(r); + if (rnorm2 <= tol2) break; + impl::calc_Jacobian(p, e, a, b, J); + Real dx[2]; + impl::solve_Jxr(J, r, dx); + a -= dx[0]; + b -= dx[1]; + } + if (info) { + info->success = rnorm2 <= tol2; + info->n_iterations = it; + } +} + +// Ref coords, packed (x,y), CCW, starting from (-1,-1). 
+KOKKOS_INLINE_FUNCTION +const Real* get_ref_vertices () { + static const Real c[] = {-1, -1, 1, -1, 1, 1, -1, 1}; + return c; +} + +namespace test { +struct Info { + Int sum_nits, max_nits, nfails; +}; + +class TestSphereToRefKernel { + const Real a_test[9] = {-0.1, -1e-16, 0, 1e-15, 0.1, 0.7, 1, 1-1e-14, 1.1}; + const Int n_a_test = sizeof(a_test)/sizeof(*a_test); + + const Real tol_; + mutable ConstVec3s p_; + mutable ConstIdxs e_; + +public: + typedef Info value_type; + + TestSphereToRefKernel (const ConstVec3s::HostMirror& p_hm, + const ConstIdxs::HostMirror& e_hm, + const Real tol = 1e1*std::numeric_limits::epsilon()) + : tol_(tol) + { + { Vec3s p; resize_and_copy(p, p_hm); p_ = p; } + { Idxs e; resize_and_copy(e, e_hm); e_ = e; } + } + + Int n () const { return nslices(e_)*square(n_a_test); } + const Real& tol () const { return tol_; } + + KOKKOS_INLINE_FUNCTION + void operator() (const Int k, value_type& jinfo) const { + const Int + ei = k / square(n_a_test), + ij = k % square(n_a_test), + i = ij / n_a_test, + j = ij % n_a_test; + const Real a_t = a_test[i], b_t = a_test[j]; + Real q[3]; + sqr::calc_ref_to_sphere(p_, slice(e_, ei), a_t, b_t, q); + Real a, b; + sqr::Info info; + sqr::calc_sphere_to_ref(p_, slice(e_, ei), q, a, b, &info, 100, tol_); + const Real err = std::sqrt(square(a_t - a) + square(b_t - b)); + // tol is on dx, not (a,b), so adjust slightly. + if ( ! 
info.success || err > 1e4*tol_) { + jinfo.nfails++; + printf("calc_sphere_to_ref ei %d i %d j %d: nits %d re %1.1e\n", + ei, i, j, info.n_iterations, err); + } + jinfo.sum_nits += info.n_iterations; + jinfo.max_nits = max(jinfo.max_nits, info.n_iterations); + } + + KOKKOS_INLINE_FUNCTION + void init (value_type& info) { + info.sum_nits = 0; + info.max_nits = 0; + info.nfails = 0; + } + + KOKKOS_INLINE_FUNCTION + void join (volatile value_type& dst, volatile value_type const& src) const { + dst.max_nits = max(dst.max_nits, src.max_nits); + dst.sum_nits += src.sum_nits; + dst.nfails += src.nfails; + } +}; + +inline Int test_sphere_to_ref (const ConstVec3s::HostMirror& p, + const ConstIdxs::HostMirror& e) { + TestSphereToRefKernel k(p, e); + Info info; + auto t = tic(); + ko::parallel_reduce(k.n(), k, info); + const auto et = toc(t); + fprintf(stderr, "sqr: #fails %d #iterations mean %1.1f max %d\n", + info.nfails, (Real) info.sum_nits / k.n(), info.max_nits); + print_times("test_sphere_to_ref", et); + return info.nfails; +} +} // namespace test +} // namespace sqr +} // namespace siqk + +#endif // INCLUDE_SIQK_SQR_HPP diff --git a/siqk/siqk_test.cpp b/siqk/siqk_test.cpp new file mode 100644 index 0000000..e08ab2a --- /dev/null +++ b/siqk/siqk_test.cpp @@ -0,0 +1,519 @@ +// ko=/home/ambradl/lib/kokkos/cpu; mycpp -I$ko/include -L$ko/lib -fopenmp unit_test.cpp -lkokkos -ldl -Wall -pedantic -DSIQK_TIME +// ./a.out -m | grep "mat=1" > foo.m +// >> msik('draw_unit_test0', 'foo'); + +#include + +#include "siqk.hpp" +using namespace siqk; + +#define INSTANTIATE_PLANE + +//> Code that will likely be moved to library files. 
+ +template +void write_matlab (const std::string& name, const CV3s& p) { + printf("mat=1; %s = [", name.c_str()); + for (Int ip = 0; ip < nslices(p); ++ip) + printf(" %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); + printf("].';\n"); +} + +template +void write_matlab (const std::string& name, const CV3s& p, const CIs& e) { + printf("mat=1; %s.p = [", name.c_str()); + for (Int ip = 0; ip < nslices(p); ++ip) + printf(" %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); + printf("].';\n"); + printf("mat=1; %s.n = [", name.c_str()); + for (Int ie = 0; ie < nslices(e); ++ie) + printf(" %d %d %d %d;", e(ie,0)+1, e(ie,1)+1, e(ie,2)+1, e(ie,3)+1); + printf("].';\n"); +} + +static void make_planar_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, + const Int n) { + const Real d = std::sqrt(0.5); + ko::resize(e, n*n, 4); + ko::resize(p, (n+1)*(n+1)); + for (Int iy = 0; iy < n+1; ++iy) + for (Int ix = 0; ix < n+1; ++ix) { + const auto idx = (n+1)*iy + ix; + p(idx,0) = 2*(static_cast(ix)/n - 0.5)*d; + p(idx,1) = 2*(static_cast(iy)/n - 0.5)*d; + p(idx,2) = 0; + } + for (Int iy = 0; iy < n; ++iy) + for (Int ix = 0; ix < n; ++ix) { + const auto idx = n*iy + ix; + e(idx,0) = (n+1)*iy + ix; + e(idx,1) = (n+1)*iy + ix+1; + e(idx,2) = (n+1)*(iy+1) + ix+1; + e(idx,3) = (n+1)*(iy+1) + ix; + } +} + +// Row-major R. 
+inline void form_rotation (const Real axis[3], const Real angle, Real r[9]) { + const Real nrm = std::sqrt(SphereGeometry::norm2(axis)); + const Real& x = axis[0] / nrm, & y = axis[1] / nrm, & z = axis[2] / nrm, + & th = angle; + const Real cth = std::cos(th), sth = std::sin(th), omcth = 1 - cth; + r[0] = cth + x*x*omcth; + r[3] = y*x*omcth + z*sth; + r[6] = z*x*omcth - y*sth; + r[1] = x*y*omcth - z*sth; + r[4] = cth + y*y*omcth; + r[7] = z*y*omcth + x*sth; + r[2] = x*z*omcth + y*sth; + r[5] = y*z*omcth - x*sth; + r[8] = cth + z*z*omcth; +} + +template +static void rotate (const Real R[9], V p) { + const Real x = p[0], y = p[1], z = p[2]; + p[0] = R[0]*x + R[1]*y + R[2]*z; + p[1] = R[3]*x + R[4]*y + R[5]*z; + p[2] = R[6]*x + R[7]*y + R[8]*z; +} + +template +static void translate (const Real xlate[3], V p) { + for (Int i = 0; i < 3; ++i) p[i] += xlate[i]; +} + +static void transform_planar_mesh (const Real R[9], const Real xlate[3], + Vec3s::HostMirror& p) { + for (Int i = 0; i < nslices(p); ++i) { + rotate(R, slice(p, i)); + translate(xlate, slice(p, i)); + } +} + +// Remove vertices marked unused and adjust numbering. +static void remove_unused_vertices (Vec3s::HostMirror& p, Idxs::HostMirror& e, + const Real unused) { + // adjust[i] is the number to subtract from i. Hence if e(ei,0) was originally + // i, it is adjusted to i - adjust[i]. + std::vector adjust(nslices(p), 0); + Int rmcnt = 0; + for (Int i = 0; i < nslices(p); ++i) { + if (p(i,0) != unused) continue; + adjust[i] = 1; + ++rmcnt; + } + // Cumsum. + for (Int i = 1; i < nslices(p); ++i) + adjust[i] += adjust[i-1]; + // Adjust e. + for (Int ei = 0; ei < nslices(e); ++ei) + for (Int k = 0; k < szslice(e); ++k) + e(ei,k) -= adjust[e(ei,k)]; + // Remove unused from p. 
+ Vec3s::HostMirror pc("copy", nslices(p)); + ko::deep_copy(pc, p); + ko::resize(p, nslices(p) - rmcnt); + for (Int i = 0, j = 0; i < nslices(pc); ++i) { + if (pc(i,0) == unused) continue; + for (Int k = 0; k < szslice(pc); ++k) p(j,k) = pc(i,k); + ++j; + } +} + +// A very simple cube-sphere mesh with nxn elements per face. At least for now +// I'm not bothering with making the elements well proportioned. +void make_cubesphere_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, + const Int n) { + // Transformation of the reference mesh make_planar_mesh to make each of the + // six faces. + const Real d = std::sqrt(0.5); + static Real R[6][9] = {{ 1, 0, 0, 0, 0, 0, 0, 1, 0}, // face 0, -y + { 0, 0, 0, 1, 0, 0, 0, 1, 0}, // 1, +x + {-1, 0, 0, 0, 0, 0, 0, 1, 0}, // 2, +y + { 0, 0, 0,-1, 0, 0, 0, 1, 0}, // 3, -x + { 1, 0, 0, 0, 1, 0, 0, 0, 0}, // 4, +z + {-1, 0, 0, 0, 1, 0, 0, 0, 0}}; // 5, -z + static Real xlate[6][3] = {{ 0,-d, 0}, { d, 0, 0}, { 0, d, 0}, + {-d, 0, 0}, { 0, 0, d}, { 0, 0,-d}}; + // Construct 6 uncoupled faces. + Vec3s::HostMirror ps[6]; + Vec3s::HostMirror& p_ref = ps[0]; + Idxs::HostMirror es[6]; + Idxs::HostMirror& e_ref = es[0]; + make_planar_mesh(p_ref, e_ref, n); + ko::resize(e, 6*nslices(e_ref), 4); + ko::resize(p, 6*nslices(p_ref)); + for (Int i = 1; i < 6; ++i) { + ko::resize(es[i], nslices(e_ref), 4); + ko::deep_copy(es[i], e_ref); + ko::resize(ps[i], nslices(p_ref)); + ko::deep_copy(ps[i], p_ref); + transform_planar_mesh(R[i], xlate[i], ps[i]); + } + transform_planar_mesh(R[0], xlate[0], ps[0]); + // Pack (p,e), accounting for equivalent vertices. For the moment, keep the p + // slot for an equivalent vertex to make node numbering simpler, but make the + // value bogus so we know if there's a problem in the numbering. 
+ const Real unused = -2; + ko::deep_copy(p, unused); + Int p_base = 0, e_base = 0; + { // -y face + const Vec3s::HostMirror& fp = ps[0]; + Idxs::HostMirror& fe = es[0]; + for (Int j = 0; j < nslices(fp); ++j) + for (Int k = 0; k < 3; ++k) p(j,k) = fp(j,k); + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) e(j,k) = fe(j,k); + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + for (Int fi = 1; fi <= 2; ++fi) { // +x, +y faces + const Vec3s::HostMirror& fp = ps[fi]; + Idxs::HostMirror& fe = es[fi]; + for (Int j = 0; j < nslices(fp); ++j) { + if (j % (n+1) == 0) continue; // equiv vertex + for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) { + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + // Left 2 vertices of left elem on face fi equiv to right 2 vertices of + // right elem on face fi-1. Write to the face, then copy to e, so that + // other faces can use these updated data. + if (j % n == 0) { + fe(j,0) = es[fi-1](j+n-1,1); + fe(j,3) = es[fi-1](j+n-1,2); + } + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + } + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + { // -x face + const Vec3s::HostMirror& fp = ps[3]; + Idxs::HostMirror& fe = es[3]; + for (Int j = 0; j < nslices(fp); ++j) { + if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; + for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) { + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + if (j % n == 0) { + fe(j,0) = es[2](j+n-1,1); + fe(j,3) = es[2](j+n-1,2); + } else if ((j+1) % n == 0) { + fe(j,1) = es[0]((j+1)-n,0); + fe(j,2) = es[0]((j+1)-n,3); + } + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + } + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + { // +z face + const Vec3s::HostMirror& fp = ps[4]; + Idxs::HostMirror& fe = es[4]; + for (Int j = n+1; j < nslices(fp) - (n+1); ++j) { + if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; + for (Int k = 0; 
k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + for (Int j = 0; j < n; ++j) { // -y + fe(j,0) = es[0](n*(n-1)+j,3); + fe(j,1) = es[0](n*(n-1)+j,2); + } + for (Int j = 0; j < n; ++j) { // +y + fe(n*(n-1)+j,2) = es[2](n*n-1-j,3); + fe(n*(n-1)+j,3) = es[2](n*n-1-j,2); + } + for (Int j = 0, i3 = 0; j < nslices(fe); j += n, ++i3) { // -x + fe(j,0) = es[3](n*n-1-i3,2); + fe(j,3) = es[3](n*n-1-i3,3); + } + for (Int j = n-1, i1 = 0; j < nslices(fe); j += n, ++i1) { // +x + fe(j,1) = es[1](n*(n-1)+i1,3); + fe(j,2) = es[1](n*(n-1)+i1,2); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + { // -z face + const Vec3s::HostMirror& fp = ps[5]; + Idxs::HostMirror& fe = es[5]; + for (Int j = n+1; j < nslices(fp) - (n+1); ++j) { + if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; + for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + for (Int j = 0; j < n; ++j) { // -y + fe(j,0) = es[0](n-1-j,1); + fe(j,1) = es[0](n-1-j,0); + } + for (Int j = 0; j < n; ++j) { // +y + fe(n*(n-1)+j,2) = es[2](j,1); + fe(n*(n-1)+j,3) = es[2](j,0); + } + for (Int j = 0, i3 = 0; j < nslices(fe); j += n, ++i3) { // -x + fe(j,0) = es[1](i3,0); + fe(j,3) = es[1](i3,1); + } + for (Int j = n-1, i1 = 0; j < nslices(fe); j += n, ++i1) { // +x + fe(j,1) = es[3](n-1-i1,1); + fe(j,2) = es[3](n-1-i1,0); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + } + // Now go back and remove the unused vertices and adjust the numbering. + remove_unused_vertices(p, e, unused); + // Project to the unit sphere. 
+ for (Int i = 0; i < nslices(p); ++i) + SphereGeometry::normalize(slice(p, i)); +} + +void calc_elem_ctr (const Vec3s::HostMirror& p, const Idxs::HostMirror& e, + const Int ei, Real ctr[3]) { + for (Int j = 0; j < 3; ++j) ctr[j] = 0; + Int n = 0; + for (Int i = 0; i < szslice(e); ++i) { + if (e(ei,i) < 0) break; + for (Int j = 0; j < 3; ++j) ctr[j] += p(e(ei,i),j); + ++n; + } + for (Int j = 0; j < 3; ++j) ctr[j] /= n; +} + +// Return 0 if all elements' subtri normals point outward relative to the +// sphere. +Int check_elem_normal_against_sphere (const Vec3s::HostMirror& p, + const Idxs::HostMirror& e) { + Int nerr = 0; + for (Int ei = 0; ei < nslices(e); ++ei) { // for each element + Real sphere[3]; // ray through elem ctr + calc_elem_ctr(p, e, ei, sphere); + for (Int ti = 0; ti < szslice(e) - 2; ++ti) { // for each tri + if (e(ei,ti+2) < 0) break; + Real tri_normal[3]; { + Real v[2][3]; + for (Int j = 0; j < 2; ++j) { + SphereGeometry::copy(v[j], slice(p, e(ei,ti+j+1))); + SphereGeometry::axpy(-1, slice(p, e(ei,0)), v[j]); + } + SphereGeometry::cross(v[0], v[1], tri_normal); + } + if (SphereGeometry::dot(tri_normal, sphere) <= 0) + ++nerr; + } + } + return nerr; +} + +//> Unit test code. 
+ +struct Input { + Int testno; + Int n; + Real angle, xlate, ylate; + bool write_matlab, geo_sphere; + + Input(Int argc, char** argv); + void print(std::ostream& os) const; +}; + +static void project_onto_sphere (Vec3s::HostMirror& p) { + for (Int ip = 0; ip < nslices(p); ++ip) { + p(ip,2) = 1; + SphereGeometry::normalize(slice(p, ip)); + } +} + +static void +perturb_mesh (Vec3s::HostMirror& p, const Real angle, const Real xlate, + const Real ylate) { + const Real cr = std::cos(angle), sr = std::sin(angle); + for (Int ip = 0; ip < nslices(p); ++ip) { + const Real x = p(ip,0), y = p(ip,1); + p(ip,0) = cr*x - sr*y + xlate; + p(ip,1) = -sr*x + cr*y + ylate; + } +} + +static void +rotate_mesh (Vec3s::HostMirror& p, const Real axis[3], const Real angle) { + Real R[9]; + form_rotation(axis, angle, R); + for (Int i = 0; i < nslices(p); ++i) + rotate(R, slice(p,i)); +} + +static void fill_quad (const ConstVec3s::HostMirror& p, + Vec3s::HostMirror& poly) { + const Int n = static_cast(std::sqrt(nslices(p) - 1)); + copy(slice(poly, 0), slice(p, 0), 3); + copy(slice(poly, 1), slice(p, n), 3); + copy(slice(poly, 2), slice(p, nslices(p) - 1), 3); + copy(slice(poly, 3), slice(p, nslices(p) - 1 - n), 3); +} + +// Area of the outline of (p,e) clipped against the outline of (cp,ce). 
+template +static Real calc_true_area ( + const ConstVec3s::HostMirror& cp, const ConstIdxs::HostMirror& ce, + const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, + const bool wm) +{ + Vec3s::HostMirror clip_poly("clip_poly", 4), poly("poly", 4), + nml("nml", 4); + fill_quad(cp, clip_poly); + fill_quad(p, poly); + for (Int i = 0; i < 4; ++i) + Geo::edge_normal(slice(clip_poly, i), slice(clip_poly, (i+1) % 4), + slice(nml, i)); + Vec3s::HostMirror vo("vo", test::max_nvert); + Int no; + { + Vec3s::HostMirror wrk("wrk", test::max_nvert); + sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no, wrk); + } + if (wm) { + write_matlab("clip_poly", clip_poly); + write_matlab("poly", poly); + write_matlab("intersection", + ko::subview(vo, std::pair(0, no), ko::ALL())); + } + return Geo::calc_area_formula(vo, no); +} + +template void finalize_mesh (Vec3s::HostMirror& p) {} +template <> void finalize_mesh (Vec3s::HostMirror& p) { + project_onto_sphere(p); +} + +template +static Int +test_area (const Int n, const Real angle, const Real xlate, const Real ylate, + const bool wm) { + Vec3s::HostMirror cp; + Idxs::HostMirror ce; + make_planar_mesh(cp, ce, n); + + Vec3s::HostMirror p; resize_and_copy(p, cp); + Idxs::HostMirror e; resize_and_copy(e, ce); + perturb_mesh(p, angle, xlate, ylate); + + finalize_mesh(cp); + finalize_mesh(p); + + const Real ta = calc_true_area(cp, ce, p, e, wm); + const Real a = test::test_area_ot(cp, ce, p, e); + + const Real re = std::abs(a - ta)/ta; + fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n", ta, a, re); + if (wm) { + write_matlab("cm", cp, ce); + write_matlab("m", p, e); + } + return re < 1e-8 ? 
0 : 1; +} + +static Int test_cube (const Input& in) { + Vec3s::HostMirror cp; + Idxs::HostMirror ce; + make_cubesphere_mesh(cp, ce, in.n); + Vec3s::HostMirror p; resize_and_copy(p, cp); + Idxs::HostMirror e; resize_and_copy(e, ce); + Int nerr = 0; + { + const Int ne = check_elem_normal_against_sphere(cp, ce); + if (ne) std::cerr << "FAIL: check_elem_normal_against_sphere\n"; + nerr += ne; + } + { // Make a copy, perturb it, and compute the area of the sphere from the + // overlap mesh. + Real axis[] = {0.1, -0.3, 0.2}; + rotate_mesh(p, axis, in.angle); + const Real + a = test::test_area_ot(cp, ce, p, e), + ta = 4*M_PI, + re = std::abs(a - ta)/ta; + fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n", + ta, a, re); + nerr += re < 1e-8 ? 0 : 1; + } + // Test ref square <-> spherical quad transformations. + nerr += sqr::test::test_sphere_to_ref(p, e); + if (in.write_matlab) { + write_matlab("cm", cp, ce); + write_matlab("m", p, e); + } + return nerr; +} + +template +Int run (const Input& in) { + switch (in.testno) { + case 0: + return test_area(in.n, in.angle, in.xlate, in.ylate, in.write_matlab); + case 1: + return test_cube(in); + default: + return 1; + } +} + +inline bool +eq (const std::string& a, const char* const b1, const char* const b2 = 0) { + return (a == std::string(b1) || (b2 && a == std::string(b2)) || + a == std::string("-") + std::string(b1)); +} + +Input::Input (Int argc, char** argv) + : testno(0), n(25), angle(M_PI*1e-1), xlate(1e-1), ylate(1e-1), + write_matlab(false), geo_sphere(true) +{ + for (Int i = 1; i < argc; ++i) { + const std::string& token = argv[i]; + if (eq(token, "--testno")) testno = atoi(argv[++i]); + else if (eq(token, "-n")) n = atoi(argv[++i]); + else if (eq(token, "-m", "--write-matlab")) write_matlab = true; + else if (eq(token, "--plane")) geo_sphere = false; + else if (eq(token, "--xlate")) xlate = atof(argv[++i]); + else if (eq(token, "--ylate")) ylate = atof(argv[++i]); + else if (eq(token, "--angle")) angle = 
atof(argv[++i]); + } + + print(std::cout); +} + +void Input::print (std::ostream& os) const { + os << "testno " << testno << "\n" + << "n (-n): " << n << "\n" + << "write matlab (-m): " << write_matlab << "\n" + << "planar geometry (--plane): " << ! geo_sphere << "\n" + << "angle (--angle): " << angle << "\n" + << "xlate (--xlate): " << xlate << "\n" + << "ylate (--ylate): " << ylate << "\n"; +} + +int main (int argc, char** argv) { + Kokkos::initialize(argc, argv); + { + Input in(argc, argv); + Int nerr = 0; + if (in.geo_sphere) + nerr += run(in); + else { +#ifdef INSTANTIATE_PLANE + nerr += run(in); +#else + Kokkos::abort("PlaneGeometry not instantiated."); +#endif + } + std::cerr << (nerr ? "FAIL" : "PASS") << "ED\n"; + } + Kokkos::finalize_all(); +}