From ae50798d4ab0129d1d57675cf427d2477db7e1a1 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Tue, 10 May 2016 11:30:50 -0600 Subject: [PATCH 01/28] SIQK/SI: Small prototype sphere intersection code. * No-Kokkos version only. The Kokkos version will be one directory up. * Correctness and performance test. * Very simple build instructions. No dependencies other than a C++11-compliant compiler. --- siqk/README.md | 1 + siqk/si/Array_raw.hpp | 141 ++++++++ siqk/si/README.md | 25 ++ siqk/si/sik.hpp | 792 ++++++++++++++++++++++++++++++++++++++++++ siqk/si/test.cpp | 190 ++++++++++ 5 files changed, 1149 insertions(+) create mode 100644 siqk/README.md create mode 100644 siqk/si/Array_raw.hpp create mode 100644 siqk/si/README.md create mode 100644 siqk/si/sik.hpp create mode 100644 siqk/si/test.cpp diff --git a/siqk/README.md b/siqk/README.md new file mode 100644 index 0000000..da86fd7 --- /dev/null +++ b/siqk/README.md @@ -0,0 +1 @@ +Sphere Intersection and Quadrature with Kokkos. diff --git a/siqk/si/Array_raw.hpp b/siqk/si/Array_raw.hpp new file mode 100644 index 0000000..2394232 --- /dev/null +++ b/siqk/si/Array_raw.hpp @@ -0,0 +1,141 @@ +#ifndef INCLUDE_ARRAY_RAW_HPP +#define INCLUDE_ARRAY_RAW_HPP + +#include +#include +#include +#include + +static inline void error(const std::string& msg) +{ throw std::runtime_error(msg.c_str()); } + +template static inline void share_nodelete_delete (T* p) {} +template inline std::shared_ptr share_nodelete (T* o) +{ return std::shared_ptr(o, share_nodelete_delete); } + +template +class Array1D { + typedef typename std::remove_const::type T_nonconst; + friend class Array1D; + int n_; + std::shared_ptr a_p_; + T* a_; +public: + typedef int size_type; + Array1D () : n_(0) {} + Array1D (const int n) { reset(n); } + Array1D (const int n, T* const a) { reset(n, a); } + Array1D (const Array1D& v) + : n_(v.n_), a_p_(v.a_p_), a_(v.a_) + {} + void reset (const int n) { + n_ = n; + a_p_ = std::shared_ptr(new T[n], 
std::default_delete()); + a_ = a_p_.get(); + } + void reset (const int n, T* const a) { n_ = n; a_p_ = nullptr; a_ = a; } + const int& n () const { return n_; } + T* data () { return a_; } + const T* data () const { return a_; } + T& operator[] (const int i) { debug(i); return a_[i]; } + const T& operator[] (const int i) const { debug(i); return a_[i]; } + void set (const T& init) { for (int i = 0; i < n_; ++i) a_[i] = init; } + Array1D& device () { return *this; } + const Array1D& device () const { return *this; } + void sync () {} + void modify () {} +private: +#ifdef SIQK_DEBUG + void debug (const int& i) const { + if (i < 0 || i >= m_) { + std::stringstream ss; + ss << "Array1D: i is " << i << " but n_ is " << n_ << "\n"; + error(ss.str().c_str()); + } + } +#else + static void debug (const int& i) {} +#endif +}; + +template +class Array2D { + typedef typename std::remove_const::type T_nonconst; + friend class Array2D; + int m_, n_; + std::shared_ptr a_p_; + T* a_; +public: + typedef int size_type; + Array2D () : m_(0), n_(0) {} + Array2D (const int m, const int n) { reset(m, n); } + Array2D (const int m, const int n, T* const a) { reset(m, n, a); } + Array2D (const Array2D& v) + : m_(v.m_), n_(v.n_), a_p_(v.a_p_), a_(v.a_) + {} + void reset (const int m, const int n) { + m_ = m; n_ = n; + a_p_ = std::shared_ptr(new T[m*n], std::default_delete()); + a_ = a_p_.get(); + } + void reset (const int m, const int n, T* const a) { m_ = m; n_ = n; a_p_ = nullptr; a_ = a; } + const int& m () const { return m_; } + const int& n () const { return n_; } + T* data () { return a_; } + const T* data () const { return a_; } + T& operator() (const int r, const int c) { debug(r, c); return a_[c*m_ + r]; } + const T& operator() (const int r, const int c) const { debug(r, c); return a_[c*m_ + r]; } + T* operator() (const int c) { debug(0, c); return a_ + m_*c; } + const T* operator() (const int c) const { debug(0, c); return a_ + m_*c; } + void set (const T& init) { for (int i = 0; 
i < m_*n_; ++i) a_[i] = init; } + Array2D& device () { return *this; } + const Array2D& device () const { return *this; } + void sync () {} + void modify () {} +private: +#ifdef SIQK_DEBUG + void debug (const int& r, const int& c) const { + if (r < 0 || r >= m_) { + std::stringstream ss; + ss << "Array2D: r is " << r << " but m_ is " << m_ << "\n"; + error(ss.str().c_str()); + } + if (c < 0 || c >= n_) { + std::stringstream ss; + ss << "Array2D: c is " << c << " but n_ is " << n_ << "\n"; + error(ss.str().c_str()); + } + } +#else + static void debug (const int& r, const int& c) {} +#endif +}; + +// Define a few things to minimize KOKKOS guards. +# ifndef KOKKOS_FUNCTION +# define KOKKOS_FUNCTION +# endif +# ifndef KOKKOS_INLINE_FUNCTION +# define KOKKOS_INLINE_FUNCTION inline +# endif +# ifndef KOKKOS_FORCEINLINE_FUNCTION +# define KOKKOS_FORCEINLINE_FUNCTION inline +# endif + +namespace Kokkos { +typedef void DefaultExecutionSpace; +inline void fence() {} +}; + +namespace ko { +using std::min; +using std::max; + +template +void parallel_reduce (const int n, Functor f, Scalar& r) { + for (int i = 0; i < n; ++i) + f(i, r); +} +} + +#endif // INCLUDE_ARRAY_RAW_HPP diff --git a/siqk/si/README.md b/siqk/si/README.md new file mode 100644 index 0000000..191bf61 --- /dev/null +++ b/siqk/si/README.md @@ -0,0 +1,25 @@ +Simple sphere interesection prototype with optional no-Kokkos build. + +Basic build and test run: + +$ g++ -std=c++11 test.cpp +$ ./a.out -n 20 + +For performance profiling, + +$ g++ -O3 -DSIQK_TIME -std=c++11 test.cpp +$ ./a.out -n 20 + +You should see + +n (-n): 20 + test_area_ot 1.276e-02 s 1.4 MB 1.228e-03 s 1.153e-02 s +true area 1.0196e+00 mesh area 1.0196e+00 relerr 3.2447e-13 + +The first line is the input. The second line shows total test time, memory +highwater, octree construction time, and (search, clip, and area calculation) +time. The third line shows the true overlap area, the area based on the meshes, +and the relative error. 
As the mesh is refined, the relative error drops because +(a) the sphere polygon area calculation is naive and (b) the edge normals have +increasing cancellatione error. Each is part of the test setup and would not be +used in practice. diff --git a/siqk/si/sik.hpp b/siqk/si/sik.hpp new file mode 100644 index 0000000..ef3bce1 --- /dev/null +++ b/siqk/si/sik.hpp @@ -0,0 +1,792 @@ +#ifndef INCLUDE_SIK_HPP +#define INCLUDE_SIK_HPP + +#include +#include +#include +#include +#include +#include +#include + +#ifdef SIQK_TIME +# include +# include +# include +#endif + +namespace siqk { + +#ifdef SIQK_TIME +static timeval tic () { + timeval t; + gettimeofday(&t, 0); + return t; +} +static double calc_et (const timeval& t1, const timeval& t2) { + static const double us = 1.0e6; + return (t2.tv_sec * us + t2.tv_usec - t1.tv_sec * us - t1.tv_usec) / us; +} +static double toc (const timeval& t1) { +#ifdef SIQK_USE_KOKKOS + Kokkos::fence(); +#endif + timeval t; + gettimeofday(&t, 0); + return calc_et(t1, t); +} +static double get_memusage () { + static const double scale = 1.0 / (1 << 10); // Memory in MB. 
+ rusage ru; + getrusage(RUSAGE_SELF, &ru); + return ru.ru_maxrss*scale; +} +#else +static inline int tic () { return 0; } +static inline double toc (const int&) { return 0; } +#endif +static void print_times (const std::string& name, const double* const parts, + const int nparts) { +#ifdef SIQK_TIME + double total = 0; for (int i = 0; i < nparts; ++i) total += parts[i]; + printf("%20s %1.3e s %7.1f MB", name.c_str(), total, get_memusage()); + for (int i = 0; i < nparts; ++i) printf(" %1.3e s", parts[i]); + printf("\n"); +#endif +} +static void print_times (const std::string& name, const double total) { +#ifdef SIQK_TIME + printf("%20s %1.3e s %5.1f MB\n", name.c_str(), total, get_memusage()); +#endif +} + +template +static void copy (V dst, CV src, const int n) { + for (int i = 0; i < n; ++i) dst[i] = src[i]; +} + +// A decorator function so that a for loop's counter can be auto typed. +template KOKKOS_INLINE_FUNCTION +typename V::size_type zero(const V& v) { return 0; } + +// Planar geometry calculations. 
+struct PlaneGeometry { + template KOKKOS_INLINE_FUNCTION + static void scale (const double a, V v) { + v[0] *= a; v[1] *= a; + } + template KOKKOS_INLINE_FUNCTION + static double dot_c_amb (const CV c, const CV a, const CV b) { + return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]); + } + template KOKKOS_INLINE_FUNCTION + static void combine (const CV u, const CV v, const double a, V x) { + const double oma = 1 - a; + x[0] = oma*u[0] + a*v[0]; + x[1] = oma*u[1] + a*v[1]; + } + + template KOKKOS_INLINE_FUNCTION + static void edge_normal (const CV e1, const CV e2, V en) { + en[0] = e1[1] - e2[1]; + en[1] = e2[0] - e1[0]; + } + + template KOKKOS_INLINE_FUNCTION + static bool inside (const CV v, const CV e1, const CV en) { + return dot_c_amb(en, v, e1) >= 0; + } + + template KOKKOS_INLINE_FUNCTION + static void intersect (const CV v1, const CV v2, const CV e1, const CV en, + V intersection) { + const double a = dot_c_amb(en, e1, v1) / dot_c_amb(en, v2, v1); + combine(v1, v2, a, intersection); + } + + template KOKKOS_INLINE_FUNCTION + static void output (const CV v, int& no, Array2D& vo) { +#ifdef SIKQ_DEBUG + if (no >= vo.n()) { + std::stringstream ss; + ss << "output: No room in vo; vo.n() is " << vo.n() << " but no is " + << no << "\n"; + error(ss.str().c_str()); + } +#endif + vo(0,no) = v[0]; + vo(1,no) = v[1]; + ++no; + } + + //todo Handle non-convex case. + KOKKOS_INLINE_FUNCTION + static double calc_area (const Array2D& v) { + double area = 0; + for (int i = 1; i < v.n() - 1; ++i) { + double v1[2], v2[2]; + v1[0] = v(0,i) - v(0,0); + v1[1] = v(1,i) - v(1,0); + v2[0] = v(0,i+1) - v(0,0); + v2[1] = v(1,i+1) - v(1,0); + const double a = v1[0]*v2[1] - v1[1]*v2[0]; + area += a; + } + return 0.5*area; + } +}; + +// Geometry on the sphere. All inputs and outputs are relative to the +// unit-radius sphere. 
+struct SphereGeometry { + template KOKKOS_INLINE_FUNCTION + static void cross (const CV a, const CV b, V c) { + c[0] = a[1]*b[2] - a[2]*b[1]; + c[1] = a[2]*b[0] - a[0]*b[2]; + c[2] = a[0]*b[1] - a[1]*b[0]; + } + template KOKKOS_INLINE_FUNCTION + static double dot (const CV a, const CV b) { + return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; + } + template KOKKOS_INLINE_FUNCTION + static double norm2 (const CV v) { + return dot(v, v); + } + template KOKKOS_INLINE_FUNCTION + static void scale (const double a, V v) { + v[0] *= a; v[1] *= a; v[2] *= a; + } + template KOKKOS_INLINE_FUNCTION + static void normalize (V v) { + scale(1.0/std::sqrt(norm2(v)), v); + } + template KOKKOS_INLINE_FUNCTION + static double dot_c_amb (const CV c, const CV a, const CV b) { + return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]) + c[2]*(a[2] - b[2]); + } + template KOKKOS_INLINE_FUNCTION + static void combine (const CV u, const CV v, const double a, V x) { + const double oma = 1 - a; + x[0] = oma*u[0] + a*v[0]; + x[1] = oma*u[1] + a*v[1]; + x[2] = oma*u[2] + a*v[2]; + } + + template KOKKOS_INLINE_FUNCTION + static void edge_normal (const CV a, const CV b, V en) { + cross(a, b, en); + normalize(en); + } + + template KOKKOS_INLINE_FUNCTION + static bool inside (const CV v, const CV a, const CV n) { + return dot_c_amb(n, v, a) >= 0; + } + + /* Let + n = edge normal + c = edge point + d = n' c + v(a) = (1 - a) v1 + a v2. + Solve n' v = d for a: + a = (d - n' v1) / (n' (v2 - v1)). + Then uvec(v(a)) is the intersection point on the unit sphere. Assume + intersection exists. (Already filtered by 'inside'.) 
+ */ + template KOKKOS_INLINE_FUNCTION + static void intersect (const CV v1, const CV v2, const CV e1, const CV en, + V intersection) { + const double a = dot_c_amb(en, e1, v1) / dot_c_amb(en, v2, v1); + combine(v1, v2, a, intersection); + normalize(intersection); + } + + template KOKKOS_INLINE_FUNCTION + static void output (const CV v, int& no, Array2D& vo) { +#ifdef SIKQ_DEBUG + if (no >= vo.n()) { + std::stringstream ss; + ss << "output: No room in vo; vo.n() is " << vo.n() << " but no is " + << no << "\n"; + error(ss.str().c_str()); + } +#endif + vo(0,no) = v[0]; + vo(1,no) = v[1]; + vo(2,no) = v[2]; + ++no; + } + + //todo Handle non-convex case. + // This uses a terrible formula, but it's just for testing. + KOKKOS_INLINE_FUNCTION + static double calc_area (const Array2D& v) { + double area = 0; + for (int i = 1; i < v.n() - 1; ++i) { + const double a = calc_arc_length(v(0), v(i)); + const double b = calc_arc_length(v(i), v(i+1)); + const double c = calc_arc_length(v(i+1), v(0)); + const double s = 0.5*(a + b + c); + const double d = (std::tan(0.5*s)*std::tan(0.5*(s-a))* + std::tan(0.5*(s-b))*std::tan(0.5*(s-c))); + if (d <= 0) continue; + area += 4*std::atan(std::sqrt(d)); + } + return area; + } + template KOKKOS_INLINE_FUNCTION + static double calc_arc_length (const CV a, const CV b) { + const double d = dot(a, b); + if (d >= 1) return 0; + return acos(d); + } +}; + +// Sutherland-Hodgmann polygon clipping algorithm. Follow Foley, van Dam, +// Feiner, Hughes Fig 3.49. +namespace sh { +// Max number of vertices in a clipped polygon. We want to use a lot of small +// stack-allocated arrays; use this number in those declarations. +static constexpr int max_nvert = 20; + +/* A mesh is described by the following arrays: + p: 3 x #nodes, the array of vertices. + e: max(#verts) x #elems, the array of element base-0 indices. + nml: 3 x #edges, the array of edge normals. + en: max(#verts) x #elems, the array of edge-normal base-0 indices. + e. e indexes p. 
e(i,j) == -1 in column j indicates that j:end are not used. + nml. As a mesh is refined, cancellation error makes an edge normal based + off of an element's vertices increasingly inaccurate. Roughly, if an edge + subtends angle phi of the sphere, -log10(phi/(2 pi)) digits are lost in the + edge normal. Therefore, we compute edge normals offline, since in certain + meshes, they can be computed by an accurate means. E.g., in a cubed-sphere + mesh, the whole line of a square face can be used to compute the edge + normal. Furthermore, there are far fewer unique edge normals than edges. + */ +struct Mesh { + Array2D p, nml; + Array2D e, en; +}; + +// Generally not a user routine. +template KOKKOS_INLINE_FUNCTION +void clip_against_edge ( + // Input vertex list. + const Array2D& vi, const int ni, + // Output vertex list. + Array2D& vo, int& no, + // One point of the clip edge. + const CV ce1, + // Clip edge's inward-facing normal. + const CV cen) +{ + const double* s, * p; + double intersection[3]; + no = 0; + s = vi(ni-1); + for (int j = 0; j < ni; ++j) { + p = vi(j); + if (geo::inside(p, ce1, cen)) { + if (geo::inside(s, ce1, cen)) + geo::output(p, no, vo); + else { + geo::intersect(s, p, ce1, cen, intersection); + geo::output(intersection, no, vo); + geo::output(p, no, vo); + } + } else if (geo::inside(s, ce1, cen)) { + geo::intersect(s, p, ce1, cen, intersection); + geo::output(intersection, no, vo); + } + s = p; + } +} + +// Efficient user routine that uses the mesh data structure. +template KOKKOS_INLINE_FUNCTION +void clip_against_poly ( + // Clip mesh. m.e(:,cp_e) is the element, and m.en(:,cp_e) is the + // corresponding list of normal indices. + const Mesh& m, const int cp_e, + // A list of vertices describing the polygon to clip. The vertices must be in + // a convention-determined order, such as CCW. vi(:,1:ni-1) are valid entries. + const Array2D& vi, const int ni, + // On output, vo(:,0:no-1) are vertices of the clipped polygon. 
no is 0 if + // there is no intersection. + Array2D& vo, int& no) +{ + double buf[3*sh::max_nvert]; + Array2D vo1(3, sh::max_nvert, buf); + int nos[] = { no, 0 }; + Array2D* vs[] = { &vo, &vo1 }; + + const auto e = m.e(cp_e); + const auto en = m.en(cp_e); + + auto nv = m.e.m(); // Number of vertices in clip polygon. + while (e[nv-1] == -1) --nv; + + no = 0; + if (nv % 2 == 0) { + // Make sure the final vertex output list is in the caller's buffer. + std::swap(vs[0], vs[1]); + std::swap(nos[0], nos[1]); + } + + clip_against_edge(vi, ni, *vs[0], nos[0], m.p(e[0]), m.nml(en[0])); + if ( ! nos[0]) return; + + for (int ie = 1, ielim = nv - 1; ; ++ie) { + clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], m.p(e[ie]), + m.nml(en[ie])); + if ( ! nos[1]) return; + if (ie == ielim) break; + std::swap(vs[0], vs[1]); + std::swap(nos[0], nos[1]); + } + + no = nos[1]; +} + +// Not used for real stuff; just a convenient version for testing. In this +// version, clip_poly is a list of clip polygon vertices. This is instead of the +// mesh data structure. +template KOKKOS_INLINE_FUNCTION +void clip_against_poly ( + // Clip polygon's (p, e) pair. + const Array2D& clip_poly, + // Clip polygon edges' inward-facing normals. + const Array2D& clip_edge_normals, + const Array2D& vi, const int ni, + Array2D& vo, int& no) +{ + double buf[3*sh::max_nvert]; + Array2D vo1(3, sh::max_nvert, buf); + int nos[] = { no, 0 }; + Array2D* vs[] = { &vo, &vo1 }; + + no = 0; + if (clip_poly.n() % 2 == 0) { + // Make sure the final vertex output list is in the caller's buffer. + std::swap(vs[0], vs[1]); + std::swap(nos[0], nos[1]); + } + + clip_against_edge(vi, ni, *vs[0], nos[0], clip_poly(0), + clip_edge_normals(0)); + if ( ! nos[0]) return; + + for (int ie = 1, ielim = clip_poly.n() - 1; ; ++ie) { + clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], clip_poly(ie), + clip_edge_normals(ie)); + if ( ! 
nos[1]) return; + if (ie == ielim) break; + std::swap(vs[0], vs[1]); + std::swap(nos[0], nos[1]); + } + + no = nos[1]; +} +} // namespace sh + +// Octree for search. +class Octree { +public: + typedef double BoundingBox[6]; + + struct Options { + // Do not go beyond max_depth depth, including the root and leaf. With this + // constraint, try to go deep enough so that a leaf has no more than + // max_nelem elements. + int max_depth, max_nelem; + Options () : max_depth(10), max_nelem(8) {} + }; + + // Bounding box for a cluster of points ps (possibly vertices). + static void calc_bb (const Array2D& ps, const int np, + BoundingBox bb) { + if (np == 0) return; + for (int j = 0; j < 3; ++j) + bb[j] = bb[j+3] = ps(j,0); + for (int i = 1; i < np; ++i) + for (int j = 0; j < 3; ++j) { + bb[j] = std::min(bb[j], ps(j,i)); + bb[j+3] = std::max(bb[j+3], ps(j,i)); + } + } + + static void calc_bb (const Array2D& ps, BoundingBox bb) { + calc_bb(ps, ps.n(), bb); + } + + template + static void calc_bb (const Array2D& p, const CIV& e, + const int ne, V ebb) { + for (int j = 0; j < 3; ++j) + ebb[j] = ebb[j+3] = p(j, e[0]); + for (int i = 1; i < ne; ++i) { + if (e[i] == -1) break; + for (int j = 0; j < 3; ++j) { + ebb[j] = ko::min(ebb[j], p(j, e[i])); + ebb[j+3] = ko::max(ebb[j+3], p(j, e[i])); + } + } + } + + static void calc_bb (const Array2D& p, const Array2D& e, + Array2D& ebbs) { + assert(ebbs.n() == e.n()); + for (int k = 0; k < e.n(); ++k) + calc_bb(p, e(k), e.m(), ebbs(k)); + } + + // p is a 3xNp array of points. e is a KxNe array of elements. An entry <0 is + // ignored. All <0 entries must be at the end of an element's list. + Octree (const Array2D& p, const Array2D& e, + const Options& o) { + init(p, e, o); + } + Octree (const Array2D& p, const Array2D& e) { + Options o; + init(p, e, o); + } + + // Apply f to every element in leaf nodes with which bb overlaps. f must have + // function + // void operator(const int element_index). + // element_index indexes e. 
+ template KOKKOS_INLINE_FUNCTION + void apply (const CV bb, Functor& f) const { + if (nodes_.n() == 0) { + for (int i = 0; i < offset_[1]; ++i) + f(elems_[i]); + return; + } + apply_r(0, bb_, bb, f); + } + +private: + /* Each node in the oct-tree contains 8 integers, stored in 'nodes'. + + >0 is an index into 'nodes', pointing to a child node. + + A <=0 entry in 'nodes' indicates a leaf node. If 0, there are no elements + in the leaf. If <0, the negative of the entry minus 1 is the index of an + offset array indexing 'elems'. + + Each segment of 'elems' contains a list of element indices covered by a + leaf node. Element indices refer to the list of elements the caller + provides during oct-tree construction. + */ + + // nodes(:,i) is a list. The list includes children of node i (>0) and leaf + // node data (<=0). + Array2D nodes_; + // A leaf node corresponding to -k covers elements + // elems[offset[k] : offset[k]-1]. + Array1D offset_, elems_; + // Root node's bounding box. + BoundingBox bb_; + + class IntList { + int* const buf_; + int i_; + public: + IntList (int* const buf) : buf_(buf), i_(0) {} + void reset () { i_ = 0; } + void push (const int& i) { buf_[i_++] = i; } + int* data () { return buf_; } + int n () const { return i_; } + const int& operator[] (const int& i) const { return buf_[i]; } + }; + + class DynIntList { + std::vector buf_; + public: + DynIntList () {} + void push (const int& i) { buf_.push_back(i); } + int& back () { return buf_.back(); } + int& operator[] (const size_t i) { + if (i >= buf_.size()) + buf_.resize(i+1); + return buf_[i]; + } + int n () const { return static_cast(buf_.size()); } + const int* data () const { return buf_.data(); } + }; + + class Nodes { + std::vector buf_; + public: + int n () const { return static_cast(buf_.size()) >> 3; } + const int* data () const { return buf_.data(); } + int& operator() (const int& r, const int& c) { + const size_t ec = (c+1) << 3; + if (ec >= buf_.size()) + buf_.resize(ec); + assert(((c 
<< 3) + r) >= 0); + assert(((c << 3) + r) < (int) buf_.size()); + return buf_[(c << 3) + r]; + } + }; + + void init (const Array2D& p, const Array2D& e, + const Options& o) { + if (e.n() == 0) return; + // Get OT's bounding box. + calc_bb(p, bb_); + // Get elements' bounding boxes. + Array2D ebbs(6, e.n()); + calc_bb(p, e, ebbs); + // Static element lists for work. Each level has active work space. + std::vector buf((o.max_depth - 1)*e.n()); + IntList es(buf.data()), wrk(buf.data() + e.n()); + for (int i = 0; i < e.n(); ++i) + es.push(i); + // Dynamic element lists. + DynIntList offset, elems; + offset[0] = 0; + // Dynamic node data structure. + Nodes nodes; + // Recurse. We don't care about the return value. If it's 0 and nodes.n() == + // 0, we'll detect as much in 'apply'. + init_r(1, bb_, ebbs, o, es, wrk, offset, elems, nodes); + // Build the static data structures. + if (elems.n() == 0) return; + offset_.reset(offset.n()); + elems_.reset(elems.n()); + memcpy(offset_.data(), offset.data(), offset.n() * sizeof(*offset_.data())); + memcpy(elems_.data(), elems.data(), elems.n() * sizeof(*offset_.data())); + if (nodes.n() == 0) return; + nodes_.reset(8, nodes.n()); + memcpy(nodes_.data(), nodes.data(), (nodes.n() << 3) * sizeof(*offset_.data())); + // Move them to the device. + nodes_.modify(); nodes_.device().sync(); + offset_.modify(); offset_.device().sync(); + elems_.modify(); elems_.device().sync(); + } + + int init_r (const int depth, // Tree's depth at this point, including root. + const BoundingBox& nbb, // My bounding box. + const Array2D& ebbs, // All elements' bounding boxes. + const Options& o, // Options controlling construct of the tree. + IntList& es, // List of elements in my bounding box. + IntList& wrk, // Work space to store working element lists. + DynIntList& offset, // Offsets into elems. + DynIntList& elems, // Elements belonging to leaf nodes. + Nodes& nodes) // Dynamic nodes data structure. 
+ { + const int my_idx = nodes.n(); // My node index. + // Decide what to do. + if (es.n() == 0) { + // I have no elements, so return 0 to indicate I'm a leaf node containing + // nothing. + return 0; + } else if (es.n() <= o.max_nelem || depth == o.max_depth) { + // I'm a leaf node with elements. Store my list of elements and return the + // storage location. + const int os = offset.back(); + offset.push(os + es.n()); + for (int i = 0, n = es.n(); i < n; ++i) + elems[os + i] = es[i]; + return 1 - offset.n(); + } else { + // I'm not a leaf node. + nodes(0, my_idx) = 0; // Insert myself into the nodes array. + for (int ic = 0; ic < 8; ++ic) { + BoundingBox child_bb; + fill_child_bb(nbb, ic, child_bb); + // Find the elements that are in this child's bb. + IntList ces(wrk.data()); + for (int i = 0, n = es.n(); i < n; ++i) + if (do_bb_overlap(child_bb, ebbs(es[i]))) + ces.push(es[i]); + // Create some work space. + IntList cwrk(wrk.data() + ces.n()); + // Recurse. + const int child_idx = init_r(depth+1, child_bb, ebbs, o, ces, cwrk, + offset, elems, nodes); + nodes(ic, my_idx) = child_idx; + } + return my_idx; + } + } + + // Using parent bb p, fill child bb c, with child_idx in 0:7. 
+ KOKKOS_INLINE_FUNCTION + static void fill_child_bb (const BoundingBox& p, const int& child_idx, + BoundingBox& c) { + const double m[] = { 0.5*(p[0] + p[3]), + 0.5*(p[1] + p[4]), + 0.5*(p[2] + p[5]) }; + switch (child_idx) { + case 0: c[0] = p[0]; c[1] = p[1]; c[2] = p[2]; c[3] = m[0]; c[4] = m[1]; c[5] = m[2]; break; + case 1: c[0] = m[0]; c[1] = p[1]; c[2] = p[2]; c[3] = p[3]; c[4] = m[1]; c[5] = m[2]; break; + case 2: c[0] = m[0]; c[1] = m[1]; c[2] = p[2]; c[3] = p[3]; c[4] = p[4]; c[5] = m[2]; break; + case 3: c[0] = p[0]; c[1] = m[1]; c[2] = p[2]; c[3] = m[0]; c[4] = p[4]; c[5] = m[2]; break; + case 4: c[0] = p[0]; c[1] = p[1]; c[2] = m[2]; c[3] = m[0]; c[4] = m[1]; c[5] = p[5]; break; + case 5: c[0] = m[0]; c[1] = p[1]; c[2] = m[2]; c[3] = p[3]; c[4] = m[1]; c[5] = p[5]; break; + case 6: c[0] = m[0]; c[1] = m[1]; c[2] = m[2]; c[3] = p[3]; c[4] = p[4]; c[5] = p[5]; break; + case 7: c[0] = p[0]; c[1] = m[1]; c[2] = m[2]; c[3] = m[0]; c[4] = p[4]; c[5] = p[5]; break; + default: + // impossible + error("fill_child_bb: The impossible has happened."); + } + } + + // Do bounding boxes a and b overlap? + KOKKOS_INLINE_FUNCTION + static bool do_bb_overlap (const BoundingBox a, const BoundingBox b) { + for (int i = 0; i < 3; ++i) + if ( ! do_lines_overlap(a[i], a[i+3], b[i], b[i+3])) + return false; + return true; + } + + KOKKOS_INLINE_FUNCTION + static bool do_lines_overlap (const double& a1, const double& a2, + const double& b1, const double& b2) { + return ! (a2 < b1 || a1 > b2); + } + + template KOKKOS_INLINE_FUNCTION + void apply_r (const int ni, const BoundingBox& nbb, const CV bb, + Functor& f) const { + for (int i = 0; i < 8; ++i) { + BoundingBox child_bb; + fill_child_bb(nbb, i, child_bb); + if ( ! 
do_bb_overlap(child_bb, bb)) continue; + int e = nodes_(i,ni); + if (e > 0) + apply_r(e, child_bb, bb, f); + else if (e < 0) { + e = std::abs(e + 1); + for (int k = offset_[e]; k < offset_[e+1]; ++k) + f(elems_[k]); + } + } + } +}; + +namespace test { +// In practice, we want to form high-quality normals using information about the +// mesh, such as that it is a CS mesh. For testing, form the normals from edge +// vertices. (This leads to increasing cancellation error with mesh refinement.) +template +void fill_normals (sh::Mesh& m) { + // Count number of edges. + int ne = 0; + for (auto ip = zero(m.e); ip < m.e.n(); ++ip) + for (auto iv = zero(m.e); iv < m.e.m(); ++iv) + if (m.e(iv,ip) == -1) break; else ++ne; + // Fill. + Array2D en(m.e.m(), m.e.n()); + en.set(-1); + Array2D nml(3, ne); + int ie = 0; + for (auto ip = zero(m.e); ip < m.e.n(); ++ip) + for (auto iv = zero(m.e); iv < m.e.m(); ++iv) + if (m.e(iv,ip) == -1) + break; + else { + // Somewhat complicated next node index. + const int iv_next = (iv+1 == m.e.m() ? 0 : + (m.e(iv+1,ip) == -1 ? 0 : iv+1)); + geo::edge_normal(m.p(m.e(iv, ip)), m.p(m.e(iv_next, ip)), nml(ie)); + en(iv,ip) = ie; + ++ie; + } + m.en = en; + m.nml = nml; +} + +// Used in Octree::apply to gather a set of possibly intersecting polygons. +struct OTSearchFunctor { + std::set hits; + KOKKOS_INLINE_FUNCTION void operator() (const int i) { hits.insert(i); } +}; + +// Find the area of the overlapping part of two meshes by summing over the areas +// of the common refinement polygons. Obviously a silly thing to do, but a good +// test and demonstration problem. +template +class TestAreaOTFunctor { + // Mesh against which to clip. ("Eulerian mesh".) + sh::Mesh cm; + // Mesh of clipped polygons. ("Departure mesh".) + const Array2D p; // 3 x #verts array of polygon vertices. + const Array2D e; // Array of polygons. e(:,k) is the k'th polygon. + // Already initialized octree used to search for possibly intersecting + // polygons. 
+ Octree ot; + +public: + typedef double value_type; + + TestAreaOTFunctor (const sh::Mesh& cm, const Array2D& p, + const Array2D& e, const Octree& ot) + : cm(cm), p(p), e(e), ot(ot) + {} + + // k indexes (p,e). + KOKKOS_INLINE_FUNCTION void operator() (const int k, double& area) const { + // Clipped element bounding box. + double ebb[6]; + Octree::calc_bb(p, e(k), e.m(), ebb); + // Get list of possible overlaps. + OTSearchFunctor f; + ot.apply(ebb, f); + // In and out vertex lists. + double buf[6*sh::max_nvert]; + Array2D + vi(3, sh::max_nvert, buf), + vo(3, sh::max_nvert, buf + 3*sh::max_nvert); + int ni, no; + // Area of all overlapping regions. + double a = 0; + for (const auto icp : f.hits) { + // Create the polygon to be clipped. + ni = 0; + for (int i = 0; i < e.m(); ++i) { + if (e(i,k) == -1) break; + copy(vi(i), p(e(i,k)), 3); + ++ni; + } + sh::clip_against_poly(cm, icp, vi, ni, vo, no); + if (no) { + // A non-0 intersection was found. Accumulate the area. + a += geo::calc_area(Array2D(vo.m(), no, vo.data())); + } + } + // Add our area to the reduction. + area += a; + } +}; + +template +double test_area_ot (const Array2D& cp, const Array2D& ce, + const Array2D& p, const Array2D& e) { + // Clip mesh and edge normal calculation. (In practice, we'd like to use + // higher-quality edge normals.) + sh::Mesh cm; cm.p = cp; cm.e = ce; + fill_normals(cm); + + double et[2]; + auto t = tic(); + // Build an octree over the clip mesh. + Octree ot(cp, ce); + et[0] = toc(t); + + // Compute the area in a silly way to test search and interesection. + t = tic(); + double area = 0; + ko::parallel_reduce(e.n(), TestAreaOTFunctor(cm, p, e, ot), area); + et[1] = toc(t); + print_times("test_area_ot", et, 2); + return area; +} +} // namespace test +} // namespace siqk + +#endif // INCLUDE_SIK_HPP diff --git a/siqk/si/test.cpp b/siqk/si/test.cpp new file mode 100644 index 0000000..69f9659 --- /dev/null +++ b/siqk/si/test.cpp @@ -0,0 +1,190 @@ +// mycpp -g test.cpp; if [ $? 
== 0 ]; then ./a.out -m | grep "mat=1" > foo.m; fi +// >> clf;msik('draw_test_output','foo');ic + +// ko=/home/ambradl/lib/kokkos/cpu; mycpp -DSIQK_USE_KOKKOS -I$ko/include -L$ko/lib -fopenmp test.cpp -lkokkos -ldl + +#ifdef SIQK_USE_KOKKOS +# include "Array_Kokkos.hpp" +#else +# include "Array_raw.hpp" +#endif +#include "sik.hpp" +using namespace siqk; + +typedef SphereGeometry Geo; + +template +void copy (Array2D& d, const Array2D& s) { + for (auto i = zero(s); i < s.n(); ++i) + for (auto j = zero(s); j < s.m(); ++j) + d(j,i) = s(j,i); +} + +static void +write_matlab (const std::string& name, const Array2D& p) { + printf("mat=1; %s = [", name.c_str()); + for (int ip = zero(p); ip < p.n(); ++ip) + printf(" %1.15e %1.15e %1.15e;", p(0,ip), p(1,ip), p(2,ip)); + printf("].';\n"); +} + +static void +write_matlab (const std::string& name, const Array2D& p, + const Array2D& e) { + printf("mat=1; %s.p = [", name.c_str()); + for (int ip = zero(p); ip < p.n(); ++ip) + printf(" %1.15e %1.15e %1.15e;", p(0,ip), p(1,ip), p(2,ip)); + printf("].';\n"); + printf("mat=1; %s.n = [", name.c_str()); + for (int ie = zero(e); ie < e.n(); ++ie) + printf(" %d %d %d %d;", e(0,ie)+1, e(1,ie)+1, e(2,ie)+1, e(3,ie)+1); + printf("].';\n"); +} + +static void make_planar_mesh (Array2D& p, Array2D& e, + const int n) { + const double d = std::sqrt(0.5); + e.reset(4, n*n); + p.reset(3, (n+1)*(n+1)); + p.set(0); + for (int iy = 0; iy < n+1; ++iy) + for (int ix = 0; ix < n+1; ++ix) { + const auto idx = (n+1)*iy + ix; + p(0,idx) = 2*(static_cast(ix)/n - 0.5)*d; + p(1,idx) = 2*(static_cast(iy)/n - 0.5)*d; + } + for (int iy = 0; iy < n; ++iy) + for (int ix = 0; ix < n; ++ix) { + const auto idx = n*iy + ix; + e(0,idx) = (n+1)*iy + ix; + e(1,idx) = (n+1)*iy + ix+1; + e(2,idx) = (n+1)*(iy+1) + ix+1; + e(3,idx) = (n+1)*(iy+1) + ix; + } +} + +static void project_onto_sphere (Array2D& p) { + for (auto ip = zero(p); ip < p.n(); ++ip) { + p(2,ip) = 1; + SphereGeometry::normalize(p(ip)); + } +} + +static void 
+perturb_mesh (Array2D& p, Array2D& e, const double angle, + const double xlate, const double ylate) { + const double cr = std::cos(angle), sr = std::sin(angle); + for (auto ip = zero(p); ip < p.n(); ++ip) { + const double x = p(0,ip), y = p(1,ip); + p(0,ip) = cr*x - sr*y + xlate; + p(1,ip) = -sr*x + cr*y + ylate; + } +} + +static void fill_quad (const Array2D& p, Array2D& poly) { + const int n = static_cast(std::sqrt(p.n() - 1)); + copy(poly(0), p(0), 3); + copy(poly(1), p(n), 3); + copy(poly(2), p(p.n() - 1), 3); + copy(poly(3), p(p.n() - 1 - n), 3); +} + +// Area of the outline of (p,e) clipped against the outline of (cp,ce). +static double +calc_true_area (const Array2D& cp, const Array2D& ce, + const Array2D& p, const Array2D& e, + const bool wm) { + Array2D clip_poly(3, 4), poly(3, 4), nml(3, 4); + fill_quad(cp, clip_poly); + fill_quad(p, poly); + for (int i = 0; i < 4; ++i) + Geo::edge_normal(clip_poly(i), clip_poly((i+1) % 4), nml(i)); + Array2D vo(3, sh::max_nvert); + int no; + sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no); + Array2D intersection(3, no, vo.data()); + if (wm) { + write_matlab("clip_poly", clip_poly); + write_matlab("poly", poly); + write_matlab("intersection", intersection); + } + return Geo::calc_area(intersection); +} + +static int +run (const int n, const double angle, const double xlate, const double ylate, + const bool wm) { + // Make the clip mesh. + Array2D cp; + Array2D ce; + make_planar_mesh(cp, ce, n); + + // Make a perturbed mesh. + Array2D p(cp.m(), cp.n()); + Array2D e(ce.m(), ce.n()); + copy(p, cp); + copy(e, ce); + perturb_mesh(p, e, angle, xlate, ylate); + + // Project these meshes onto the sphere. + project_onto_sphere(cp); + project_onto_sphere(p); + + // True intersection area from quadrilateral boundary of the mesh. + const double ta = calc_true_area(cp, ce, p, e, wm); + // Area from the sum over the common refinement polygons. + const double a = test::test_area_ot(cp, ce, p, e); + + // Report information. 
+ const double re = std::abs(a - ta)/ta; + fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n", ta, a, re); + if (wm) { + write_matlab("cm", cp, ce); + write_matlab("m", p, e); + } + return re < 1e-10 ? 0 : 1; +} + +inline bool +eq (const std::string& a, const char* const b1, const char* const b2 = 0) { + return (a == std::string(b1) || (b2 && a == std::string(b2)) || + a == std::string("-") + std::string(b1)); +} + +struct Input { + int n; + double angle, xlate, ylate; + bool write_matlab; + + Input (int argc, char** argv) + : n(5), angle(M_PI*1e-1), xlate(1e-1), ylate(1e-1), write_matlab(false) + { + for (int i = 1; i < argc; ++i) { + const std::string& token = argv[i]; + if (eq(token, "-n")) + n = atoi(argv[++i]); + if (eq(token, "-m", "--write-matlab")) + write_matlab = true; + } + + print(std::cout); + } + + void print (std::ostream& os) { + os << "n (-n): " << n + << "\n"; + } +}; + +int main (int argc, char** argv) { +#ifdef SIQK_USE_KOKKOS + Kokkos::initialize(argc, argv); +#endif + { + Input in(argc, argv); + run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab); + } +#ifdef SIQK_USE_KOKKOS + Kokkos::finalize_all(); +#endif +} From 3c2a3916f333ea3fc25aa55dd954b1c06bcdd8ee Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Thu, 12 May 2016 13:49:59 -0600 Subject: [PATCH 02/28] SIQK/SI: Use caller work array. 
--- siqk/si/Array_raw.hpp | 2 +- siqk/si/README.md | 8 ++-- siqk/si/sik.hpp | 88 ++++++++++++++++++++++++------------------- siqk/si/test.cpp | 8 +++- 4 files changed, 61 insertions(+), 45 deletions(-) diff --git a/siqk/si/Array_raw.hpp b/siqk/si/Array_raw.hpp index 2394232..740a549 100644 --- a/siqk/si/Array_raw.hpp +++ b/siqk/si/Array_raw.hpp @@ -125,7 +125,7 @@ class Array2D { namespace Kokkos { typedef void DefaultExecutionSpace; inline void fence() {} -}; +} namespace ko { using std::min; diff --git a/siqk/si/README.md b/siqk/si/README.md index 191bf61..5134578 100644 --- a/siqk/si/README.md +++ b/siqk/si/README.md @@ -19,7 +19,7 @@ true area 1.0196e+00 mesh area 1.0196e+00 relerr 3.2447e-13 The first line is the input. The second line shows total test time, memory highwater, octree construction time, and (search, clip, and area calculation) time. The third line shows the true overlap area, the area based on the meshes, -and the relative error. As the mesh is refined, the relative error drops because -(a) the sphere polygon area calculation is naive and (b) the edge normals have -increasing cancellatione error. Each is part of the test setup and would not be -used in practice. +and the relative error. As the mesh is refined, the relative error increases +because (a) the sphere polygon area calculation is naive and (b) the edge +normals have increasing cancellation error. Each is part of the test setup and +would not be used in practice. 
diff --git a/siqk/si/sik.hpp b/siqk/si/sik.hpp index ef3bce1..1bb4aa3 100644 --- a/siqk/si/sik.hpp +++ b/siqk/si/sik.hpp @@ -105,7 +105,7 @@ struct PlaneGeometry { } template KOKKOS_INLINE_FUNCTION - static void output (const CV v, int& no, Array2D& vo) { + static bool output (const CV v, int& no, Array2D& vo) { #ifdef SIKQ_DEBUG if (no >= vo.n()) { std::stringstream ss; @@ -114,9 +114,11 @@ struct PlaneGeometry { error(ss.str().c_str()); } #endif + if (no >= vo.n()) return false; vo(0,no) = v[0]; vo(1,no) = v[1]; ++no; + return true; } //todo Handle non-convex case. @@ -203,7 +205,7 @@ struct SphereGeometry { } template KOKKOS_INLINE_FUNCTION - static void output (const CV v, int& no, Array2D& vo) { + static bool output (const CV v, int& no, Array2D& vo) { #ifdef SIKQ_DEBUG if (no >= vo.n()) { std::stringstream ss; @@ -212,10 +214,12 @@ struct SphereGeometry { error(ss.str().c_str()); } #endif + if (no >= vo.n()) return false; vo(0,no) = v[0]; vo(1,no) = v[1]; vo(2,no) = v[2]; ++no; + return true; } //todo Handle non-convex case. @@ -246,10 +250,6 @@ struct SphereGeometry { // Sutherland-Hodgmann polygon clipping algorithm. Follow Foley, van Dam, // Feiner, Hughes Fig 3.49. namespace sh { -// Max number of vertices in a clipped polygon. We want to use a lot of small -// stack-allocated arrays; use this number in those declarations. -static constexpr int max_nvert = 20; - /* A mesh is described by the following arrays: p: 3 x #nodes, the array of vertices. e: max(#verts) x #elems, the array of element base-0 indices. @@ -271,7 +271,7 @@ struct Mesh { // Generally not a user routine. template KOKKOS_INLINE_FUNCTION -void clip_against_edge ( +bool clip_against_edge ( // Input vertex list. const Array2D& vi, const int ni, // Output vertex list. 
@@ -288,24 +288,25 @@ void clip_against_edge ( for (int j = 0; j < ni; ++j) { p = vi(j); if (geo::inside(p, ce1, cen)) { - if (geo::inside(s, ce1, cen)) - geo::output(p, no, vo); - else { + if (geo::inside(s, ce1, cen)) { + if ( ! geo::output(p, no, vo)) return false; + } else { geo::intersect(s, p, ce1, cen, intersection); - geo::output(intersection, no, vo); - geo::output(p, no, vo); + if ( ! geo::output(intersection, no, vo)) return false; + if ( ! geo::output(p, no, vo)) return false; } } else if (geo::inside(s, ce1, cen)) { geo::intersect(s, p, ce1, cen, intersection); - geo::output(intersection, no, vo); + if ( ! geo::output(intersection, no, vo)) return false; } s = p; } + return true; } // Efficient user routine that uses the mesh data structure. template KOKKOS_INLINE_FUNCTION -void clip_against_poly ( +bool clip_against_poly ( // Clip mesh. m.e(:,cp_e) is the element, and m.en(:,cp_e) is the // corresponding list of normal indices. const Mesh& m, const int cp_e, @@ -314,11 +315,11 @@ void clip_against_poly ( const Array2D& vi, const int ni, // On output, vo(:,0:no-1) are vertices of the clipped polygon. no is 0 if // there is no intersection. - Array2D& vo, int& no) + Array2D& vo, int& no, + double* const wrk, const int nwrk) { - double buf[3*sh::max_nvert]; - Array2D vo1(3, sh::max_nvert, buf); - int nos[] = { no, 0 }; + Array2D vo1(3, nwrk/3, wrk); + int nos[] = { 0, 0 }; Array2D* vs[] = { &vo, &vo1 }; const auto e = m.e(cp_e); @@ -334,36 +335,39 @@ void clip_against_poly ( std::swap(nos[0], nos[1]); } - clip_against_edge(vi, ni, *vs[0], nos[0], m.p(e[0]), m.nml(en[0])); - if ( ! nos[0]) return; + if ( ! clip_against_edge(vi, ni, *vs[0], nos[0], m.p(e[0]), m.nml(en[0]))) + return false; + if ( ! nos[0]) return true; for (int ie = 1, ielim = nv - 1; ; ++ie) { - clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], m.p(e[ie]), - m.nml(en[ie])); - if ( ! nos[1]) return; + if ( ! 
clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], m.p(e[ie]), + m.nml(en[ie]))) + return false; + if ( ! nos[1]) return true; if (ie == ielim) break; std::swap(vs[0], vs[1]); std::swap(nos[0], nos[1]); } no = nos[1]; + return true; } // Not used for real stuff; just a convenient version for testing. In this // version, clip_poly is a list of clip polygon vertices. This is instead of the // mesh data structure. template KOKKOS_INLINE_FUNCTION -void clip_against_poly ( +bool clip_against_poly ( // Clip polygon's (p, e) pair. const Array2D& clip_poly, // Clip polygon edges' inward-facing normals. const Array2D& clip_edge_normals, const Array2D& vi, const int ni, - Array2D& vo, int& no) + Array2D& vo, int& no, + double* const wrk, const int nwrk) { - double buf[3*sh::max_nvert]; - Array2D vo1(3, sh::max_nvert, buf); - int nos[] = { no, 0 }; + Array2D vo1(3, nwrk/3, wrk); + int nos[] = { 0, 0 }; Array2D* vs[] = { &vo, &vo1 }; no = 0; @@ -373,20 +377,23 @@ void clip_against_poly ( std::swap(nos[0], nos[1]); } - clip_against_edge(vi, ni, *vs[0], nos[0], clip_poly(0), - clip_edge_normals(0)); - if ( ! nos[0]) return; + if ( ! clip_against_edge(vi, ni, *vs[0], nos[0], clip_poly(0), + clip_edge_normals(0))) + return false; + if ( ! nos[0]) return true; for (int ie = 1, ielim = clip_poly.n() - 1; ; ++ie) { - clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], clip_poly(ie), - clip_edge_normals(ie)); - if ( ! nos[1]) return; + if ( ! clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], clip_poly(ie), + clip_edge_normals(ie))) + return false; + if ( ! nos[1]) return true; if (ie == ielim) break; std::swap(vs[0], vs[1]); std::swap(nos[0], nos[1]); } no = nos[1]; + return true; } } // namespace sh @@ -670,6 +677,8 @@ class Octree { }; namespace test { +static constexpr int max_nvert = 20; + // In practice, we want to form high-quality normals using information about the // mesh, such as that it is a CS mesh. For testing, form the normals from edge // vertices. 
(This leads to increasing cancellation error with mesh refinement.) @@ -738,11 +747,14 @@ class TestAreaOTFunctor { OTSearchFunctor f; ot.apply(ebb, f); // In and out vertex lists. - double buf[6*sh::max_nvert]; + double buf[6*max_nvert]; Array2D - vi(3, sh::max_nvert, buf), - vo(3, sh::max_nvert, buf + 3*sh::max_nvert); + vi(3, max_nvert, buf), + vo(3, max_nvert, buf + 3*max_nvert); int ni, no; + // Workspace. + double wrk[3*max_nvert]; + const int nwrk = 3*max_nvert; // Area of all overlapping regions. double a = 0; for (const auto icp : f.hits) { @@ -753,7 +765,7 @@ class TestAreaOTFunctor { copy(vi(i), p(e(i,k)), 3); ++ni; } - sh::clip_against_poly(cm, icp, vi, ni, vo, no); + sh::clip_against_poly(cm, icp, vi, ni, vo, no, wrk, nwrk); if (no) { // A non-0 intersection was found. Accumulate the area. a += geo::calc_area(Array2D(vo.m(), no, vo.data())); diff --git a/siqk/si/test.cpp b/siqk/si/test.cpp index 69f9659..92b7454 100644 --- a/siqk/si/test.cpp +++ b/siqk/si/test.cpp @@ -99,9 +99,13 @@ calc_true_area (const Array2D& cp, const Array2D& ce, fill_quad(p, poly); for (int i = 0; i < 4; ++i) Geo::edge_normal(clip_poly(i), clip_poly((i+1) % 4), nml(i)); - Array2D vo(3, sh::max_nvert); + Array2D vo(3, test::max_nvert); int no; - sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no); + { + double wrk[3*test::max_nvert]; + const int nwrk = 3*test::max_nvert; + sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no, wrk, nwrk); + } Array2D intersection(3, no, vo.data()); if (wm) { write_matlab("clip_poly", clip_poly); From 7f659f5d3daa013ab54160875591c4358989af2c Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Thu, 12 May 2016 14:07:17 -0600 Subject: [PATCH 03/28] SIQK/SI: Add test for planar geometry. 
--- siqk/si/test.cpp | 51 +++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/siqk/si/test.cpp b/siqk/si/test.cpp index 92b7454..4afb3a3 100644 --- a/siqk/si/test.cpp +++ b/siqk/si/test.cpp @@ -1,8 +1,3 @@ -// mycpp -g test.cpp; if [ $? == 0 ]; then ./a.out -m | grep "mat=1" > foo.m; fi -// >> clf;msik('draw_test_output','foo');ic - -// ko=/home/ambradl/lib/kokkos/cpu; mycpp -DSIQK_USE_KOKKOS -I$ko/include -L$ko/lib -fopenmp test.cpp -lkokkos -ldl - #ifdef SIQK_USE_KOKKOS # include "Array_Kokkos.hpp" #else @@ -11,8 +6,6 @@ #include "sik.hpp" using namespace siqk; -typedef SphereGeometry Geo; - template void copy (Array2D& d, const Array2D& s) { for (auto i = zero(s); i < s.n(); ++i) @@ -90,6 +83,7 @@ static void fill_quad (const Array2D& p, Array2D& poly) { } // Area of the outline of (p,e) clipped against the outline of (cp,ce). +template static double calc_true_area (const Array2D& cp, const Array2D& ce, const Array2D& p, const Array2D& e, @@ -115,6 +109,12 @@ calc_true_area (const Array2D& cp, const Array2D& ce, return Geo::calc_area(intersection); } +template void finalize_mesh (Array2D& p) {} +template <> void finalize_mesh (Array2D& p) { + project_onto_sphere(p); +} + +template static int run (const int n, const double angle, const double xlate, const double ylate, const bool wm) { @@ -131,11 +131,11 @@ run (const int n, const double angle, const double xlate, const double ylate, perturb_mesh(p, e, angle, xlate, ylate); // Project these meshes onto the sphere. - project_onto_sphere(cp); - project_onto_sphere(p); + finalize_mesh(cp); + finalize_mesh(p); // True intersection area from quadrilateral boundary of the mesh. - const double ta = calc_true_area(cp, ce, p, e, wm); + const double ta = calc_true_area(cp, ce, p, e, wm); // Area from the sum over the common refinement polygons. 
const double a = test::test_area_ot(cp, ce, p, e); @@ -146,7 +146,7 @@ run (const int n, const double angle, const double xlate, const double ylate, write_matlab("cm", cp, ce); write_matlab("m", p, e); } - return re < 1e-10 ? 0 : 1; + return re < 1e-8 ? 0 : 1; } inline bool @@ -158,25 +158,32 @@ eq (const std::string& a, const char* const b1, const char* const b2 = 0) { struct Input { int n; double angle, xlate, ylate; - bool write_matlab; + bool write_matlab, geo_sphere; Input (int argc, char** argv) - : n(5), angle(M_PI*1e-1), xlate(1e-1), ylate(1e-1), write_matlab(false) + : n(5), angle(M_PI*1e-1), xlate(1e-1), ylate(1e-1), write_matlab(false), + geo_sphere(true) { for (int i = 1; i < argc; ++i) { const std::string& token = argv[i]; - if (eq(token, "-n")) - n = atoi(argv[++i]); - if (eq(token, "-m", "--write-matlab")) - write_matlab = true; + if (eq(token, "-n")) n = atoi(argv[++i]); + if (eq(token, "-m", "--write-matlab")) write_matlab = true; + if (eq(token, "--plane")) geo_sphere = false; + if (eq(token, "--xlate")) xlate = atof(argv[++i]); + if (eq(token, "--ylate")) ylate = atof(argv[++i]); + if (eq(token, "--angle")) angle = atof(argv[++i]); } print(std::cout); } void print (std::ostream& os) { - os << "n (-n): " << n - << "\n"; + os << "n (-n): " << n << "\n" + << "write matlab (-m): " << write_matlab << "\n" + << "planar geometry (--plane): " << ! geo_sphere << "\n" + << "angle (--angle): " << angle << "\n" + << "xlate (--xlate): " << xlate << "\n" + << "ylate (--ylate): " << ylate << "\n"; } }; @@ -186,7 +193,11 @@ int main (int argc, char** argv) { #endif { Input in(argc, argv); - run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab); + int nerr = 0; + nerr += (in.geo_sphere ? + run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab) : + run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab)); + std::cerr << (nerr ? 
"FAIL" : "PASS") << "ED\n"; } #ifdef SIQK_USE_KOKKOS Kokkos::finalize_all(); From 43e4eb5f288d118a71a6f482894c624a273ae1a4 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Thu, 12 May 2016 15:36:22 -0600 Subject: [PATCH 04/28] SIQK/SI: Add a Fortran interface and test it in C++. --- siqk/si/Makefile | 30 ++++++++++++++++++++++++++++++ siqk/si/fsi.cpp | 17 +++++++++++++++++ siqk/si/fsi.h | 15 +++++++++++++++ siqk/si/sik.hpp | 20 ++++++++------------ siqk/si/test.cpp | 34 ++++++++++++++++++++++++++++++++-- siqk/si/testf.f90 | 43 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 145 insertions(+), 14 deletions(-) create mode 100644 siqk/si/Makefile create mode 100644 siqk/si/fsi.cpp create mode 100644 siqk/si/fsi.h create mode 100644 siqk/si/testf.f90 diff --git a/siqk/si/Makefile b/siqk/si/Makefile new file mode 100644 index 0000000..fddabe1 --- /dev/null +++ b/siqk/si/Makefile @@ -0,0 +1,30 @@ +opt= +CXX=g++-4.7 +FC=gfortran-4.7 +CXXFLAGS=$(opt) -Wall -pedantic -std=c++11 -DSIQK_FORTRAN +FFLAGS=$(opt) -ffixed-line-length-none +LDFLAGS=-lgfortran + +CXXSOURCES=test.cpp fsi.cpp +F90SOURCES=testf.f90 + +CXXOBJECTS=$(CXXSOURCES:.cpp=.o) +F90OBJECTS=$(F90SOURCES:.f90=.o) + +.SUFFIXES: +.SUFFIXES: .cpp .f .f90 .o + +.cpp.o: + $(CXX) $(CFLAGS) $(CXXFLAGS) -c $< -o $@ + +.f90.o: + $(FC) $(FFLAGS) -c $< -o $@ + +all: test + +test: $(CXXOBJECTS) $(F90OBJECTS) + $(CXX) $(CXXOBJECTS) $(LDFLAGS) -o test.exe + $(CXX) $(F90OBJECTS) fsi.o $(LDFLAGS) -o testf.exe + +clean: + rm -f *.o test.exe testf.exe diff --git a/siqk/si/fsi.cpp b/siqk/si/fsi.cpp new file mode 100644 index 0000000..467c657 --- /dev/null +++ b/siqk/si/fsi.cpp @@ -0,0 +1,17 @@ +#include "Array_raw.hpp" +#include "sik.hpp" + +extern "C" void clipagainstpolysphere_ ( + double const* const clip_poly, int const* const clip_poly_n_vertices, + double const* const clip_edge_normals, double const* const vi, int const* const ni, + double* const vo, int* const no, double* const wrk, int const* const 
n_vertices, + int* const info) +{ + Array2D avo(3, *n_vertices, vo); + const bool success = siqk::sh::clip_against_poly( + Array2D(3, *clip_poly_n_vertices, clip_poly), + Array2D(3, *clip_poly_n_vertices, clip_edge_normals), + Array2D(3, *ni, vi), *ni, + avo, *no, wrk, *n_vertices); + *info = success ? 0 : 1; +} diff --git a/siqk/si/fsi.h b/siqk/si/fsi.h new file mode 100644 index 0000000..e12605f --- /dev/null +++ b/siqk/si/fsi.h @@ -0,0 +1,15 @@ +// Fortran interface to simple polygon clipping routine. + +extern "C" void clipagainstpolysphere_( + // 3 x clip_poly_n_vertices clip spherical polygon vertex list. + double const* const clip_poly, int const* const clip_poly_n_vertices, + // 3 x clip_poly_n_vertices clip polygon's inward-facing edge normals. + double const* const clip_edge_normals, + // 3 x ni polygon to clip. + double const* const to_clip_poly, int const* const ni, + // On output, a 3 x no clipped polygon. + double* const vo, int* const no, + // Workspace. Both vo and wrk must have n_vertices of space available. + double* const wrk, int const* const n_vertices, + // info = 0 on success. info = 1 if n_vertices is not large enough. + int* const info); diff --git a/siqk/si/sik.hpp b/siqk/si/sik.hpp index 1bb4aa3..7b78958 100644 --- a/siqk/si/sik.hpp +++ b/siqk/si/sik.hpp @@ -54,11 +54,6 @@ static void print_times (const std::string& name, const double* const parts, printf("\n"); #endif } -static void print_times (const std::string& name, const double total) { -#ifdef SIQK_TIME - printf("%20s %1.3e s %5.1f MB\n", name.c_str(), total, get_memusage()); -#endif -} template static void copy (V dst, CV src, const int n) { @@ -316,9 +311,11 @@ bool clip_against_poly ( // On output, vo(:,0:no-1) are vertices of the clipped polygon. no is 0 if // there is no intersection. Array2D& vo, int& no, - double* const wrk, const int nwrk) + // Workspace. nvertwrk applies to both wrk and vo.n(). If nvertwrk is not + // large enough, false is returned. 
+ double* const wrk, const int nvertwrk) { - Array2D vo1(3, nwrk/3, wrk); + Array2D vo1(3, nvertwrk, wrk); int nos[] = { 0, 0 }; Array2D* vs[] = { &vo, &vo1 }; @@ -358,15 +355,15 @@ bool clip_against_poly ( // mesh data structure. template KOKKOS_INLINE_FUNCTION bool clip_against_poly ( - // Clip polygon's (p, e) pair. + // Clip polygon. const Array2D& clip_poly, // Clip polygon edges' inward-facing normals. const Array2D& clip_edge_normals, const Array2D& vi, const int ni, Array2D& vo, int& no, - double* const wrk, const int nwrk) + double* const wrk, const int nvertwrk) { - Array2D vo1(3, nwrk/3, wrk); + Array2D vo1(3, nvertwrk, wrk); int nos[] = { 0, 0 }; Array2D* vs[] = { &vo, &vo1 }; @@ -754,7 +751,6 @@ class TestAreaOTFunctor { int ni, no; // Workspace. double wrk[3*max_nvert]; - const int nwrk = 3*max_nvert; // Area of all overlapping regions. double a = 0; for (const auto icp : f.hits) { @@ -765,7 +761,7 @@ class TestAreaOTFunctor { copy(vi(i), p(e(i,k)), 3); ++ni; } - sh::clip_against_poly(cm, icp, vi, ni, vo, no, wrk, nwrk); + sh::clip_against_poly(cm, icp, vi, ni, vo, no, wrk, max_nvert); if (no) { // A non-0 intersection was found. Accumulate the area. 
a += geo::calc_area(Array2D(vo.m(), no, vo.data())); diff --git a/siqk/si/test.cpp b/siqk/si/test.cpp index 4afb3a3..8a09042 100644 --- a/siqk/si/test.cpp +++ b/siqk/si/test.cpp @@ -5,6 +5,7 @@ #endif #include "sik.hpp" using namespace siqk; +#include "fsi.h" template void copy (Array2D& d, const Array2D& s) { @@ -34,6 +35,28 @@ write_matlab (const std::string& name, const Array2D& p, printf("].';\n"); } +#ifdef SIQK_FORTRAN +static int test_fortran (const Array2D& clip_poly, + const Array2D& nml, + const Array2D& poly) { + int nerr = 0, no, fno, info; + const int nvi = poly.n(); + Array2D vo(3, test::max_nvert), fvo(3, test::max_nvert); + double wrk[3*test::max_nvert]; + sh::clip_against_poly(clip_poly, nml, poly, nvi, vo, no, + wrk, test::max_nvert); + const int ncp = clip_poly.n(); + clipagainstpolysphere_(clip_poly.data(), &ncp, nml.data(), poly.data(), &nvi, + fvo.data(), &fno, wrk, &test::max_nvert, &info); + if (info != 0) ++nerr; + if (fno != no) ++nerr; + for (int i = 0; i < no; ++i) + for (int j = 0; j < 3; ++j) + if (fvo(j,i) != vo(j,i)) ++nerr; + return nerr; +} +#endif + static void make_planar_mesh (Array2D& p, Array2D& e, const int n) { const double d = std::sqrt(0.5); @@ -97,8 +120,8 @@ calc_true_area (const Array2D& cp, const Array2D& ce, int no; { double wrk[3*test::max_nvert]; - const int nwrk = 3*test::max_nvert; - sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no, wrk, nwrk); + sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no, + wrk, test::max_nvert); } Array2D intersection(3, no, vo.data()); if (wm) { @@ -106,6 +129,13 @@ calc_true_area (const Array2D& cp, const Array2D& ce, write_matlab("poly", poly); write_matlab("intersection", intersection); } +#ifdef SIQK_FORTRAN + { + // Sneak in a test of the Fortran interface. + const int nerr = test_fortran(clip_poly, nml, poly); + std::cerr << "Fortran test " << (nerr ? 
"FAIL" : "PASS") << "ED\n"; + } +#endif return Geo::calc_area(intersection); } diff --git a/siqk/si/testf.f90 b/siqk/si/testf.f90 new file mode 100644 index 0000000..097d5bc --- /dev/null +++ b/siqk/si/testf.f90 @@ -0,0 +1,43 @@ +program main + implicit none + real*8 :: clip(3,4) = reshape( & + (/ -5.000000000000000000d-01, -5.000000000000000000d-01, 7.071067811865474617d-01, & + 5.000000000000000000d-01, -5.000000000000000000d-01, 7.071067811865474617d-01, & + 5.000000000000000000d-01, 5.000000000000000000d-01, 7.071067811865474617d-01, & + -5.000000000000000000d-01, 5.000000000000000000d-01, 7.071067811865474617d-01 /), (/3,4/)) + real*8 :: nml(3,4) = reshape( & + (/ 0.000000000000000000d+00, 8.164965809277260345d-01, 5.773502691896258421d-01, & + -8.164965809277260345d-01, 0.000000000000000000d+00, 5.773502691896258421d-01, & + 0.000000000000000000d+00, -8.164965809277260345d-01, 5.773502691896258421d-01, & + 8.164965809277260345d-01, 0.000000000000000000d+00, 5.773502691896258421d-01 /), (/3,4/)) + real*8 :: poly(3,4) = reshape( & + (/ 5.644736133437637804d-01, 5.644736133437637804d-01, 6.022782410790465946d-01, & + 3.127479665047677160d-01, -3.127479665047677160d-01, 8.968709042522592378d-01, & + -5.644736133437637804d-01, -5.644736133437637804d-01, 6.022782410790465946d-01, & + -3.127479665047677160d-01, 3.127479665047677160d-01, 8.968709042522592378d-01 /), (/3,4/)) + real*8 :: intersection(3,8) = reshape( & + (/ 3.342826900143281987d-01, 5.441369567663348894d-01, 7.695258640473731093d-01, & + 4.999999999999998890d-01, 5.000000000000001110d-01, 7.071067811865476838d-01, & + 5.441369567663348894d-01, 3.342826900143283098d-01, 7.695258640473731093d-01, & + 3.127479665047677160d-01, -3.127479665047677160d-01, 8.968709042522592378d-01, & + -3.342826900143280877d-01, -5.441369567663347784d-01, 7.695258640473732203d-01, & + -5.000000000000000000d-01, -4.999999999999998890d-01, 7.071067811865474617d-01, & + -5.441369567663348894d-01, -3.342826900143283098d-01, 
7.695258640473731093d-01, & + -3.127479665047677160d-01, 3.127479665047677160d-01, 8.968709042522592378d-01 /), (/3,8/)) + real*8 :: vo(3,20), wrk(3,20) + integer :: ncp = 4, np = 4, nvert = 20, no, info, i, j + real*8 :: err + + call clipagainstpolysphere(clip, ncp, nml, poly, np, vo, no, wrk, nvert, info) + + err = 0 + do i = 1,8 + do j = 1,3 + err = err + (vo(j,i) - intersection(j,i))**2 + end do + end do + err = sqrt(err) + if (no /= 8) err = err + 1 + + print *, 'err', err +end program main From 7fd9d0337706f7329332d605c7f74afe08e145b2 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 16 May 2016 11:19:54 -0600 Subject: [PATCH 05/28] SIQK/SI: Use shared_ptr::reset instead of = nullptr. --- siqk/si/Array_raw.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/siqk/si/Array_raw.hpp b/siqk/si/Array_raw.hpp index 740a549..f27da33 100644 --- a/siqk/si/Array_raw.hpp +++ b/siqk/si/Array_raw.hpp @@ -33,7 +33,7 @@ class Array1D { a_p_ = std::shared_ptr(new T[n], std::default_delete()); a_ = a_p_.get(); } - void reset (const int n, T* const a) { n_ = n; a_p_ = nullptr; a_ = a; } + void reset (const int n, T* const a) { n_ = n; a_p_.reset(); a_ = a; } const int& n () const { return n_; } T* data () { return a_; } const T* data () const { return a_; } @@ -78,7 +78,7 @@ class Array2D { a_p_ = std::shared_ptr(new T[m*n], std::default_delete()); a_ = a_p_.get(); } - void reset (const int m, const int n, T* const a) { m_ = m; n_ = n; a_p_ = nullptr; a_ = a; } + void reset (const int m, const int n, T* const a) { m_ = m; n_ = n; a_p_.reset(); a_ = a; } const int& m () const { return m_; } const int& n () const { return n_; } T* data () { return a_; } From 0b2898c23120129bd8691161c13512dea2df44fc Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 16 May 2016 19:35:28 -0600 Subject: [PATCH 06/28] SIQK: Initial commit. Runs on CPU, Phi, GPU. 
--- siqk/siqk.hpp | 189 ++++++++ siqk/siqk_intersect.hpp | 922 +++++++++++++++++++++++++++++++++++++++ siqk/siqk_quadrature.hpp | 91 ++++ siqk/test.cpp | 191 ++++++++ 4 files changed, 1393 insertions(+) create mode 100644 siqk/siqk.hpp create mode 100644 siqk/siqk_intersect.hpp create mode 100644 siqk/siqk_quadrature.hpp create mode 100644 siqk/test.cpp diff --git a/siqk/siqk.hpp b/siqk/siqk.hpp new file mode 100644 index 0000000..52431e2 --- /dev/null +++ b/siqk/siqk.hpp @@ -0,0 +1,189 @@ +#ifndef INCLUDE_SIQK_HPP +#define INCLUDE_SIQK_HPP + +#include +#include +#include +#include +#include + +#include + +#ifdef SIQK_TIME +# include +# include +# include +#endif + +// Always want this for GPU. +#define SIQK_NONRECURSIVE + +#ifdef KOKKOS_HAVE_CUDA +# define KOKKOS_CONSTANT __constant__ __device__ +#else +# define KOKKOS_CONSTANT +#endif + +namespace siqk { +namespace ko = Kokkos; +#define pr(m) do { \ + std::stringstream _ss_; \ + _ss_ << m << std::endl; \ + std::cerr << _ss_.str(); \ + } while (0) +#define prc(m) pr(#m << " | " << (m)) +#define puf(m)"(" << #m << " " << (m) << ")" +#define pu(m) << " " << puf(m) +template +static void prarr (const std::string& name, const T* const v, const size_t n) { + std::cerr << name << ": "; + for (size_t i = 0; i < n; ++i) std::cerr << " " << v[i]; + std::cerr << "\n"; +} + +#ifdef SIQK_TIME +static timeval tic () { + timeval t; + gettimeofday(&t, 0); + return t; +} +static double calc_et (const timeval& t1, const timeval& t2) { + static const double us = 1.0e6; + return (t2.tv_sec * us + t2.tv_usec - t1.tv_sec * us - t1.tv_usec) / us; +} +static double toc (const timeval& t1) { + Kokkos::fence(); + timeval t; + gettimeofday(&t, 0); + return calc_et(t1, t); +} +static double get_memusage () { + static const double scale = 1.0 / (1 << 10); // Memory in MB. 
+ rusage ru; + getrusage(RUSAGE_SELF, &ru); + return ru.ru_maxrss*scale; +} +#else +static inline int tic () { return 0; } +static inline double toc (const int&) { return 0; } +#endif +static void print_times (const std::string& name, const double* const parts, + const int nparts) { +#ifdef SIQK_TIME + double total = 0; for (int i = 0; i < nparts; ++i) total += parts[i]; + printf("%20s %1.3e s %7.1f MB", name.c_str(), total, get_memusage()); + for (int i = 0; i < nparts; ++i) printf(" %1.3e s", parts[i]); + printf("\n"); +#endif +} +static void print_times (const std::string& name, const double total) { +#ifdef SIQK_TIME + printf("%20s %1.3e s %5.1f MB\n", name.c_str(), total, get_memusage()); +#endif +} + +KOKKOS_INLINE_FUNCTION static void error(const char* const msg) +{ ko::abort(msg); } + +typedef int Int; +typedef double Real; + +#ifdef KOKKOS_HAVE_CUDA +typedef ko::LayoutLeft Layout; +#else +typedef ko::LayoutRight Layout; +#endif + +// SIQK's array types. +typedef ko::View Vec3s; +typedef ko::View ConstVec3s; +typedef ko::View Vec6s; +typedef ko::View ConstVec6s; +typedef ko::View > RawVec3s; +typedef ko::View > RawConstVec3s; +typedef ko::View > RawArray; +typedef ko::View > RawConstArray; +typedef ko::View Idxs; +typedef ko::View ConstIdxs; +typedef ko::View Nodes; +typedef ko::View ConstNodes; + +// Get the host or device version of the array. +template struct InExeSpace { + typedef VT type; +}; +template struct InExeSpace { + typedef typename VT::HostMirror type; +}; + +#ifdef KOKKOS_HAVE_CUDA +// A 1D slice of an array. +template KOKKOS_FORCEINLINE_FUNCTION +ko::View > +slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); } +// An explicitly const 1D slice of an array. 
+template KOKKOS_FORCEINLINE_FUNCTION +ko::View > +const_slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); } +#else +template KOKKOS_FORCEINLINE_FUNCTION +typename VT::value_type* +slice (const VT& v, Int i) { return v.ptr_on_device() + v.dimension_1()*i; } + +template KOKKOS_FORCEINLINE_FUNCTION +typename VT::const_value_type* +const_slice (const VT& v, Int i) { return v.ptr_on_device() + v.dimension_1()*i; } +#endif + +// Number of slices in a 2D array, where each row is a slice. +template KOKKOS_FORCEINLINE_FUNCTION +Int nslices (const A2D& a) { return static_cast(a.dimension_0()); } + +// Number of entries in a 2D array's row. +template KOKKOS_FORCEINLINE_FUNCTION +Int szslice (const A2D& a) { return static_cast(a.dimension_1()); } + +template +KOKKOS_FORCEINLINE_FUNCTION +static void copy (V dst, CV src, const Int n) { + for (Int i = 0; i < n; ++i) dst[i] = src[i]; +} + +template +void resize_and_copy (DV& d, const SV& s, + typename std::enable_if::type* = 0) { + ko::resize(d, nslices(s)); + ko::deep_copy(d, s); +} + +template +void resize_and_copy (DV& d, const SV& s, + typename std::enable_if::type* = 0) { + ko::resize(d, nslices(s), szslice(s)); + ko::deep_copy(d, s); +} + +template +void hm_resize_and_copy (DV& d, const SA& s, const Int n) { + ko::resize(d, n); + auto d_hm = ko::create_mirror_view(d); + for (Int i = 0; i < n; ++i) d_hm[i] = s[i]; + ko::deep_copy(d, d_hm); +} + +// GPU-friendly replacements for std::min/max. +template KOKKOS_INLINE_FUNCTION +const T& min (const T& a, const T& b) { return a < b ? a : b; } +template KOKKOS_INLINE_FUNCTION +const T& max (const T& a, const T& b) { return a > b ? 
a : b; } +template KOKKOS_INLINE_FUNCTION +void swap (T& a, T&b) { + T tmp = a; + a = b; + b = tmp; +} +} + +#endif diff --git a/siqk/siqk_intersect.hpp b/siqk/siqk_intersect.hpp new file mode 100644 index 0000000..5d008bd --- /dev/null +++ b/siqk/siqk_intersect.hpp @@ -0,0 +1,922 @@ +#ifndef INCLUDE_SIQK_INTERSECT_HPP +#define INCLUDE_SIQK_INTERSECT_HPP + +#include "siqk.hpp" +#include "siqk_quadrature.hpp" + +namespace siqk { +struct PlaneGeometry { + template KOKKOS_INLINE_FUNCTION + static void scale (const Real& a, V v) { + v[0] *= a; v[1] *= a; + } + template KOKKOS_INLINE_FUNCTION + static Real dot_c_amb (const CV c, const CV a, const CV b) { + return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]); + } + template KOKKOS_INLINE_FUNCTION + static void combine (const CV u, const CV v, const Real& a, V x) { + const Real& oma = 1 - a; + x[0] = oma*u[0] + a*v[0]; + x[1] = oma*u[1] + a*v[1]; + } + + template KOKKOS_INLINE_FUNCTION + static void edge_normal (const CV e1, const CV e2, V en) { + en[0] = e1[1] - e2[1]; + en[1] = e2[0] - e1[0]; + } + + template KOKKOS_INLINE_FUNCTION + static bool inside (const CV v, const CV e1, const CV en) { + return dot_c_amb(en, v, e1) >= 0; + } + + template KOKKOS_INLINE_FUNCTION + static void intersect (const CV v1, const CV v2, const CV e1, const CV en, + V intersection) { + const Real& a = dot_c_amb(en, e1, v1) / dot_c_amb(en, v2, v1); + combine(v1, v2, a, intersection); + } + + template KOKKOS_INLINE_FUNCTION + static bool output (const CV v, Int& no, const V& vo) { +#ifdef SIQK_DEBUG + if (no >= nslices(vo)) { + std::stringstream ss; + ss << "output: No room in vo; vo.n() is " << vo.n() << " but no is " + << no << "\n"; + error(ss.str().c_str()); + } +#endif + if (no >= nslices(vo)) return false; + vo(no,0) = v[0]; + vo(no,1) = v[1]; + ++no; + return true; + } + + //todo Handle non-convex case. 
+ template + KOKKOS_INLINE_FUNCTION + static Real calc_area (const CV3s& v, const Int n) { + Real area = 0; + for (Int i = 1, ilim = n - 1; i < ilim; ++i) { + Real v1[2], v2[2]; + v1[0] = v(i,0) - v(0,0); + v1[1] = v(i,1) - v(0,1); + v2[0] = v(i+1,0) - v(0,0); + v2[1] = v(i+1,1) - v(0,1); + const Real a = v1[0]*v2[1] - v1[1]*v2[0]; + area += a; + } + return 0.5*area; + } + + template + KOKKOS_INLINE_FUNCTION + static Real calc_area_formula (const CV3s& v, const Int n) { + return calc_area(v, n); + } +}; + +// All inputs and outputs are relative to the unit-radius sphere. +struct SphereGeometry { + template KOKKOS_INLINE_FUNCTION + static void cross (const CV a, const CV b, V c) { + c[0] = a[1]*b[2] - a[2]*b[1]; + c[1] = a[2]*b[0] - a[0]*b[2]; + c[2] = a[0]*b[1] - a[1]*b[0]; + } + template KOKKOS_INLINE_FUNCTION + static Real dot (const CV a, const CV b) { + return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; + } + template KOKKOS_INLINE_FUNCTION + static Real norm2 (const CV v) { + return dot(v, v); + } + template KOKKOS_INLINE_FUNCTION + static void scale (const Real& a, V v) { + v[0] *= a; v[1] *= a; v[2] *= a; + } + template KOKKOS_INLINE_FUNCTION + static void normalize (V v) { + scale(1.0/std::sqrt(norm2(v)), v); + } + template KOKKOS_INLINE_FUNCTION + static Real dot_c_amb (const CV c, const CV a, const CV b) { + return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]) + c[2]*(a[2] - b[2]); + } + template KOKKOS_INLINE_FUNCTION + static void axpy (const Real& a, const CV& x, V& y) { + y[0] += a*x[0]; + y[1] += a*x[1]; + y[2] += a*x[2]; + } + template KOKKOS_INLINE_FUNCTION + static void axpbyz (const Real& a, const CV& x, const Real& b, const CV& y, + V& z) { + z[0] = a*x[0] + b*y[0]; + z[1] = a*x[1] + b*y[1]; + z[2] = a*x[2] + b*y[2]; + } + template KOKKOS_INLINE_FUNCTION + static void combine (const CV& u, const CV& v, const Real& a, V& x) { + const Real& oma = 1 - a; + x[0] = oma*u[0] + a*v[0]; + x[1] = oma*u[1] + a*v[1]; + x[2] = oma*u[2] + a*v[2]; + } + + template 
KOKKOS_INLINE_FUNCTION + static void edge_normal (const CV a, const CV b, V en) { + cross(a, b, en); + normalize(en); + } + + // Is v inside the line anchored at a with inward-facing normal n? + template KOKKOS_INLINE_FUNCTION + static bool inside (const CV& v, const CV& a, const CV& n) { + return dot_c_amb(n, v, a) >= 0; + } + + /* Let + en = edge normal + e1 = edge starting point + d = en' e1 + v(a) = (1 - a) v1 + a v2. + Solve n' v = d for a: + a = (en' (e1 - v1)) / (en' (v2 - v1)). + Then uvec(v(a)) is the intersection point on the unit sphere. Assume + intersection exists. (Already filtered by 'inside'.) + */ + template KOKKOS_INLINE_FUNCTION + static void intersect (const CV v1, const CV v2, const CV e1, const CV en, + V intersection) { + const Real a = dot_c_amb(en, e1, v1) / dot_c_amb(en, v2, v1); + combine(v1, v2, a, intersection); + normalize(intersection); + } + + template KOKKOS_INLINE_FUNCTION + static bool output (const CV v, Int& no, V& vo) { + if (no >= nslices(vo)) return false; + vo(no,0) = v[0]; + vo(no,1) = v[1]; + vo(no,2) = v[2]; + ++no; + return true; + } + + //todo Handle non-convex case. + // This uses a terrible formula, but it's just for testing. 
+ template + KOKKOS_INLINE_FUNCTION + static Real calc_area_formula (const CV3s& v, const Int n) { + Real area = 0; + for (Int i = 1, ilim = n - 1; i < ilim; ++i) { + const Real a = calc_arc_length(slice(v,0), slice(v,i)); + const Real b = calc_arc_length(slice(v,i), slice(v,i+1)); + const Real c = calc_arc_length(slice(v,i+1), slice(v,0)); + const Real s = 0.5*(a + b + c); + const Real d = (std::tan(0.5*s)*std::tan(0.5*(s-a))* + std::tan(0.5*(s-b))*std::tan(0.5*(s-c))); + if (d <= 0) continue; + area += 4*std::atan(std::sqrt(d)); + } + return area; + } + template KOKKOS_INLINE_FUNCTION + static Real calc_arc_length (const CV a, const CV b) { + const Real d = dot(a, b); + if (d >= 1) return 0; + return acos(d); + } + + template + KOKKOS_INLINE_FUNCTION + static Real calc_area (const CV3s& v, const Int n) { + Real area = 0; + for (Int i = 1, ilim = n - 1; i < ilim; ++i) { + Real a = 0; + RawConstVec3s coord; + RawConstArray weight; + quadrature::get_coef(4, coord, weight); + for (Int k = 0, klim = nslices(coord); k < klim; ++k) { + const Real jac = calc_tri_jacobian(slice(v,0), slice(v,i), slice(v,i+1), + slice(coord, k)); + a += weight[k]*jac; + } + area += 0.5*a; + } + return area; + } + template + KOKKOS_INLINE_FUNCTION + static Real calc_tri_jacobian (const CV& v1, const CV& v2, const CV& v3, + const CA& alpha) { + // V(:,i) is vertex i of the spherical triangle on the unit sphere. The + // coefs + // alpha = [a1, a2, 1 - a1 - a2]' + // = [1 0; 0 1; -1 -1] [a1, a2]' + // = alpha_a a + // (barycentric coords) give the location + // v = V alpha + // on the planar triangle, and u = uvec(v) is the point on the unit sphere. + // For a planar tri in 3D, the jacobian is + // v_a = v_alpha alpha_a + // = V [1 0; 0 1; -1 -1] + // J = norm(cross(v_a(:,1), v_a(:,2))). 
+ // For a spherical tri with the same vertices, + // u = v/(v' v) + // u_a = u_alpha alpha_a + // = (v'v)^{-1/2} (I - u u') V alpha_a + // = (v'v)^{-1/2} (I - u u') v_a + // J = norm(cross(u_a(:,1), u_a(:,2))). + Real u[3] = {0}; + axpy(alpha[0], v1, u); + axpy(alpha[1], v2, u); + axpy(alpha[2], v3, u); + const auto oovn = 1/std::sqrt(norm2(u)); + scale(oovn, u); + Real u_a[2][3]; + axpbyz(1, v1, -1, v3, u_a[0]); + axpbyz(1, v2, -1, v3, u_a[1]); + for (int i = 0; i < 2; ++i) { + axpy(-dot(u, u_a[i]), u, u_a[i]); + scale(oovn, u_a[i]); + } + cross(u_a[0], u_a[1], u); + return std::sqrt(norm2(u)); + } +}; + +// Sutherland-Hodgmann polygon clipping algorithm. Follow Foley, van Dam, +// Feiner, Hughes Fig 3.49. +namespace sh { +/* A mesh is described by the following arrays: + p: 3 x #nodes, the array of vertices. + e: max(#verts) x #elems, the array of element base-0 indices. + nml: 3 x #edges, the array of edge normals. + en: max(#verts) x #elems, the array of edge-normal base-0 indices. + e. e indexes p. e(i,j) == -1 in column j indicates that j:end are not used. + nml. As a mesh is refined, cancellation error makes an edge normal based + off of an element's vertices increasingly inaccurate. Roughly, if an edge + subtends angle phi of the sphere, -log10(phi/(2 pi)) digits are lost in the + edge normal. Therefore, we compute edge normals offline, since in certain + meshes, they can be computed by an accurate means. E.g., in a cubed-sphere + mesh, the whole line of a square face can be used to compute the edge + normal. Furthermore, there are far fewer unique edge normals than edges. 
+ */ +template +struct Mesh { + typename InExeSpace::type p, nml; + typename InExeSpace::type e, en; + + Mesh () {} + + Mesh (const Mesh& m) { + typename InExeSpace::type tp, tnml; + typename InExeSpace::type te, ten; + resize_and_copy(tp, m.p); p = tp; + resize_and_copy(tnml, m.nml); nml = tnml; + resize_and_copy(te, m.e); e = te; + resize_and_copy(ten, m.en); en = ten; + } +}; + +// Generally not a user routine. +template +KOKKOS_INLINE_FUNCTION +bool clip_against_edge ( + // Input vertex list. + const CV3s& vi, const Int ni, + // Output vertex list. + V3s& vo, Int& no, + // One point of the clip edge. + const CV ce1, + // Clip edge's inward-facing normal. + const CV cen) +{ + Real intersection[3]; + no = 0; + auto s = const_slice(vi, ni-1); + for (Int j = 0; j < ni; ++j) { + auto p = const_slice(vi,j); + if (geo::inside(p, ce1, cen)) { + if (geo::inside(s, ce1, cen)) { + if ( ! geo::output(p, no, vo)) return false; + } else { + geo::intersect(s, p, ce1, cen, intersection); + if ( ! geo::output(intersection, no, vo)) return false; + if ( ! geo::output(p, no, vo)) return false; + } + } else if (geo::inside(s, ce1, cen)) { + geo::intersect(s, p, ce1, cen, intersection); + if ( ! geo::output(intersection, no, vo)) return false; + } + s = p; + } + return true; +} + +// Efficient user routine that uses the mesh data structure. +//todo An optimization would be to have 2 clip_against_edge routines. One would +// handle the special case of the first vertex list being in (p,e) format. +template +KOKKOS_INLINE_FUNCTION +bool clip_against_poly ( + // Clip mesh. m.e(:,cp_e) is the element, and m.en(:,cp_e) is the + // corresponding list of normal indices. + const MeshT& m, const Int cp_e, + // A list of vertices describing the polygon to clip. The vertices must be in + // a convention-determined order, such as CCW. vi(:,1:ni-1) are valid entries. + const CV3s& vi, const Int ni, + // On output, vo(:,0:no-1) are vertices of the clipped polygon. 
no is 0 if + // there is no intersection. + V3s& vo, Int& no, + // Workspace. Both vo and wrk must be large enough to hold all generated + // vertices. If they are not, false is returned. + V3s& wrk) +{ + Int nos[] = { 0, 0 }; + V3s* vs[] = { &vo, &wrk }; + + const auto e = slice(m.e, cp_e); + const auto en = slice(m.en, cp_e); + + auto nv = szslice(m.e); // Number of vertices in clip polygon. + while (e[nv-1] == -1) --nv; + + no = 0; + if (nv % 2 == 0) { + // Make sure the final vertex output list is in the caller's buffer. + swap(vs[0], vs[1]); + swap(nos[0], nos[1]); + } + + if ( ! clip_against_edge(vi, ni, *vs[0], nos[0], const_slice(m.p, e[0]), + const_slice(m.nml, en[0]))) + return false; + if ( ! nos[0]) return true; + + for (Int ie = 1, ielim = nv - 1; ; ++ie) { + if ( ! clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], + const_slice(m.p, e[ie]), + const_slice(m.nml, en[ie]))) + return false; + if ( ! nos[1]) return true; + if (ie == ielim) break; + swap(vs[0], vs[1]); + swap(nos[0], nos[1]); + } + + no = nos[1]; + return true; +} + +// Not used for real stuff; just a convenient version for testing. In this +// version, clip_poly is a list of clip polygon vertices. This is instead of the +// mesh data structure. +template +KOKKOS_INLINE_FUNCTION +bool clip_against_poly ( + // Clip polygon. + const CV3s& clip_poly, + // Clip polygon edges' inward-facing normals. + const CV3s& clip_edge_normals, + const CV3s& vi, const Int ni, + V3s& vo, Int& no, + V3s& wrk) +{ + Int nos[] = { 0, 0 }; + V3s* vs[] = { &vo, &wrk }; + + no = 0; + if (nslices(clip_poly) % 2 == 0) { + // Make sure the final vertex output list is in the caller's buffer. + swap(vs[0], vs[1]); + swap(nos[0], nos[1]); + } + + if ( ! clip_against_edge(vi, ni, *vs[0], nos[0], + const_slice(clip_poly, 0), + const_slice(clip_edge_normals, 0))) + return false; + if ( ! nos[0]) return true; + + for (Int ie = 1, ielim = nslices(clip_poly) - 1; ; ++ie) { + if ( ! 
clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], + const_slice(clip_poly, ie), + const_slice(clip_edge_normals, ie))) + return false; + if ( ! nos[1]) return true; + if (ie == ielim) break; + swap(vs[0], vs[1]); + swap(nos[0], nos[1]); + } + + no = nos[1]; + return true; +} +} // namespace sh + +// Oct-tree. Might do something else better suited to the sphere later. +template +class Octree { +public: + enum { max_depth = max_depth_ }; + typedef Real BoundingBox[6]; + + struct Options { + // Do not go beyond max_depth_ depth, including the root and leaf. With this + // constraInt, try to go deep enough so that a leaf has no more than + // max_nelem elements. + Int max_nelem; + Options () : max_nelem(8) {} + }; + + // Bounding box for a cluster of points ps (possibly vertices). + //todo kernelize + template + static void calc_bb (const CV3s& ps, const Int np, BoundingBox bb) { + if (np == 0) return; + for (Int j = 0; j < 3; ++j) + bb[j] = bb[j+3] = ps(0,j); + for (Int i = 1; i < np; ++i) + for (Int j = 0; j < 3; ++j) { + bb[j] = min(bb[j], ps(i,j)); + bb[j+3] = max(bb[j+3], ps(i,j)); + } + } + + template + static void calc_bb (const CV3s& ps, BoundingBox bb) { + calc_bb(ps, nslices(ps), bb); + } + + template + KOKKOS_INLINE_FUNCTION + static void calc_bb (const CV3s& p, const CIV& e, const Int ne, V ebb) { + for (Int j = 0; j < 3; ++j) + ebb[j] = ebb[j+3] = p(e[0], j); + for (Int i = 1; i < ne; ++i) { + if (e[i] == -1) break; + for (Int j = 0; j < 3; ++j) { + ebb[j] = min(ebb[j], p(e[i], j)); + ebb[j+3] = max(ebb[j+3], p(e[i], j)); + } + } + } + + //todo kernelize + template + static void calc_bb (const CV3s& p, const CIs& e, V6s& ebbs) { + assert(nslices(ebbs) == nslices(e)); + for (Int k = 0, klim = nslices(e); k < klim; ++k) + calc_bb(p, slice(e, k), szslice(e), slice(ebbs, k)); + } + + // p is a 3xNp array of points. e is a KxNe array of elements. An entry <0 is + // ignored. All <0 entries must be at the end of an element's list. 
+  // Build the octree over mesh (p, e) using the caller's options.
+  Octree (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e,
+          const Options& o) {
+    init(p, e, o);
+  }
+  // Build the octree over mesh (p, e) using default-constructed Options.
+  Octree (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) {
+    Options o;
+    init(p, e, o);
+  }
+
+  // Apply f to every element in each leaf node whose box overlaps bb. f must
+  // provide
+  //     void operator() (const Int element).
+  // bb holds the lower corner in bb[0:2] and the upper corner in bb[3:5]. An
+  // element that was stored in several overlapping leaves is passed to f once
+  // per such leaf; de-duplication is the functor's job.
+  template <typename CV, typename Functor>
+  KOKKOS_INLINE_FUNCTION
+  void apply (const CV bb, Functor& f) const {
+    if (nslices(nodes_) == 0) {
+      // No interior nodes. Either nothing was built (init returns early when
+      // the element list is empty), or the root itself is the only leaf and
+      // owns elems_[0 : offsets_[1]-1].
+      if (nslices(offsets_) < 2) return;
+      for (Int i = 0; i < offsets_[1]; ++i)
+        f(elems_[i]);
+      return;
+    }
+#ifdef SIQK_NONRECURSIVE
+    // Non-recursive impl.
+    {
+      // Explicit stack. Each frame reserves 8 Reals for a bounding box, of
+      // which only the first 6 are used.
+      Real snbb[8*max_depth_];
+      Int sni[max_depth_], si[max_depth_];
+      Int sp = 0;
+      // Args for top-level call. bb_ has exactly 6 entries, so copy 6; copying
+      // a full 8-Real frame would read past the end of bb_.
+      copy(snbb, bb_, 6);
+      sni[sp] = 0;
+      si[sp] = 0;
+      while (sp >= 0) {
+        // Get stack frame's (nbb, ni, current i) values.
+        const Int i = si[sp];
+        if (i == 8) {
+          // All 8 children of this frame have been visited; pop it.
+          --sp;
+          continue;
+        }
+        // Increment stored value of i for next iteration. Current value is
+        // stored in 'i' above.
+        ++si[sp];
+        const Int ni = sni[sp];
+        const Real* const nbb = snbb + 8*sp;
+        // Can use the next stack frame's bb space for a child bb.
+        Real* const child_bb = snbb + 8*(sp+1);
+        fill_child_bb(nbb, i, child_bb);
+        if ( ! do_bb_overlap(child_bb, bb)) continue;
+        Int e = nodes_(ni,i);
+        if (e < 0) {
+          // Leaf, so apply functor to each element. Entry -k maps to
+          // offsets_ index k-1.
+          e = std::abs(e + 1);
+          for (Int k = offsets_[e]; k < offsets_[e+1]; ++k)
+            f(elems_[k]);
+        } else if (e > 0) {
+          // Recurse: push the child node; its box is already in place.
+          ++sp;
+          sni[sp] = e;
+          si[sp] = 0;
+        }
+        // e == 0 is an empty leaf: nothing to do.
+      }
+    }
+#else
+    apply_r(0, bb_, bb, f);
+#endif
+  }
+
+private:
+  /* Each node in the oct-tree contains 8 integers, stored in 'nodes'.
+
+       >0 is an index into 'nodes', pointing to a child node.
+
+     A <=0 entry in 'nodes' indicates a leaf node. If 0, there are no elements
+     in the leaf. If <0, the negative of the entry minus 1 is the index of an
+     offset array indexing 'elems'.
+
+     Each segment of 'elems' contains a list of element indices covered by a
+     leaf node. Element indices refer to the list of elements the caller
+     provides during oct-tree construction.
+  */
+
+  // Static data structures holding the completed octree.
+  //   nodes_(i,:) is a list. The list includes children of node i (>0) and
+  // leaf node data (<=0).
+  //todo Make these const once ready to do full GPU stuff.
+  Nodes nodes_;
+  // A leaf node with entry -k (k >= 1) covers elements
+  //     elems_[offsets_[k-1] : offsets_[k]-1].
+  // NOTE(review): the View's template arguments were lost in this copy of the
+  // patch; Int* is inferred from the 1-D Int usage -- confirm against the repo.
+  ko::View<Int*> offsets_, elems_;
+  // Root node's bounding box: [xmin ymin zmin xmax ymax zmax].
+  BoundingBox bb_;
+
+  // Dynamic data structures for construction phase.
+
+  // Push-only list of Int written into caller-owned storage. No bounds
+  // checking is done; the caller guarantees capacity.
+  class IntList {
+    Int* const buf_;
+    Int i_;
+  public:
+    IntList (Int* const buf) : buf_(buf), i_(0) {}
+    void reset () { i_ = 0; }
+    void push (const Int& i) { buf_[i_++] = i; }
+    Int* data () { return buf_; }
+    Int n () const { return i_; }
+    const Int& operator[] (const Int& i) const { return buf_[i]; }
+  };
+
+  // Growable list of Int. The non-const operator[] grows the list on demand so
+  // that index i is always valid; the const operator[] does not.
+  class DynIntList {
+    std::vector<Int> buf_;
+  public:
+    DynIntList () {}
+    void push (const Int& i) { buf_.push_back(i); }
+    Int& back () { return buf_.back(); }
+    Int& operator[] (const size_t i) {
+      if (i >= buf_.size())
+        buf_.resize(i+1);
+      return buf_[i];
+    }
+    const Int& operator[] (const size_t i) const { return buf_[i]; }
+    Int n () const { return static_cast<Int>(buf_.size()); }
+    const Int* data () const { return buf_.data(); }
+  };
+
+  // Opposite index slot convention.
+  // Growable node table used while the tree is being built. Entry (r, c) is
+  // child slot r (0:7) of node c, stored at buf_[8*c + r].
+  class DynNodes {
+    std::vector<Int> buf_;
+  public:
+    // Number of nodes currently allocated (8 slots each).
+    Int n () const { return static_cast<Int>(buf_.size()) >> 3; }
+    const Int* data () const { return buf_.data(); }
+    // Mutable access; grows the table so that node c exists. Slots created by
+    // the growth are value-initialized to 0.
+    Int& operator() (const Int& r, const Int& c) {
+      const size_t ec = static_cast<size_t>(c+1) << 3;
+      if (buf_.size() < ec)
+        buf_.resize(ec);
+      return const_cast<Int&>(
+        const_cast<const DynNodes*>(this)->operator()(r, c));
+    }
+    // Read-only access; (r, c) must already exist.
+    const Int& operator() (const Int& r, const Int& c) const {
+      assert(((c << 3) + r) >= 0);
+      assert(((c << 3) + r) < static_cast<Int>(buf_.size()));
+      return buf_[(c << 3) + r];
+    }
+  };
+
+  // Build the tree over mesh (p, e). Does nothing if e has no elements, in
+  // which case nodes_, offsets_, elems_, and bb_ are left untouched.
+  void init (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e,
+             const Options& o) {
+    if (nslices(e) == 0) return;
+    // Get OT's bounding box.
+    calc_bb(p, bb_);
+    // Get elements' bounding boxes.
+    Vec6s::HostMirror ebbs("ebbs", nslices(e), 6);
+    calc_bb(p, e, ebbs);
+    // Static element lists for work. Each level has active work space: the
+    // root's list occupies the first nslices(e) entries and every deeper level
+    // carves its child list out of what follows.
+    std::vector<Int> buf(max_depth_*nslices(e));
+    IntList es(buf.data()), wrk(buf.data() + nslices(e));
+    for (Int i = 0, ilim = nslices(e); i < ilim; ++i)
+      es.push(i);
+    // Dynamic element lists.
+    DynIntList offsets, elems;
+    offsets[0] = 0;
+    // Dynamic node data structure.
+    DynNodes nodes;
+    // Recurse. We don't care about the return value. If it's 0 and nodes.n() ==
+    // 0, we'll detect as much in 'apply'.
+    init_r(1, bb_, ebbs, o, es, wrk, offsets, elems, nodes);
+    // Build the static data structures.
+    if (elems.n() == 0) return;
+    init_static_ds(nodes, offsets, elems);
+  }
+
+  // Recursively build the subtree for the node whose box is nbb. Returns
+  //     0   if the node is an empty leaf,
+  //    <0   if the node is a leaf with elements; -k means its elements are
+  //         elems[offsets[k-1] : offsets[k]-1],
+  //   >=0   otherwise: the node's index in 'nodes' (0 only for the root).
+  Int init_r (const Int depth,     // Tree's depth at this point, including root.
+              const BoundingBox& nbb, // My bounding box.
+              const ConstVec6s::HostMirror& ebbs, // All elements' bounding boxes.
+              const Options& o,    // Options controlling construction of the tree.
+              IntList& es,         // List of elements in my bounding box.
+              IntList& wrk,        // Work space to store working element lists.
+              DynIntList& offsets, // Offsets into elems.
+              DynIntList& elems,   // Elements belonging to leaf nodes.
+              DynNodes& nodes)     // Dynamic nodes data structure.
+  {
+    const Int my_idx = nodes.n(); // My node index.
+    // Decide what to do.
+    if (es.n() == 0) {
+      // I have no elements, so return 0 to indicate I'm a leaf node containing
+      // nothing.
+      return 0;
+    } else if (es.n() <= o.max_nelem || depth == max_depth_) {
+      // I'm a leaf node with elements. Store my list of elements and return the
+      // storage location.
+      const Int os = offsets.back();
+      offsets.push(os + es.n());
+      for (Int i = 0, n = es.n(); i < n; ++i)
+        elems[os + i] = es[i];
+      return 1 - offsets.n();
+    } else {
+      // I'm not a leaf node.
+      nodes(0, my_idx) = 0; // Insert myself into the nodes array.
+      for (Int ic = 0; ic < 8; ++ic) {
+        BoundingBox child_bb;
+        fill_child_bb(nbb, ic, child_bb);
+        // Find the elements that are in this child's bb. An element whose box
+        // overlaps several children is listed in each of them.
+        IntList ces(wrk.data());
+        for (Int i = 0, n = es.n(); i < n; ++i)
+          if (do_bb_overlap(child_bb, slice(ebbs, es[i])))
+            ces.push(es[i]);
+        // Create some work space: whatever follows the child's own list.
+        IntList cwrk(wrk.data() + ces.n());
+        // Recurse.
+        const Int child_idx = init_r(depth+1, child_bb, ebbs, o, ces, cwrk,
+                                     offsets, elems, nodes);
+        nodes(ic, my_idx) = child_idx;
+      }
+      return my_idx;
+    }
+  }
+
+  // Copy the construction-phase structures into the Views used by apply().
+  // 'nodes' is taken by const reference; taking it by value would copy the
+  // whole node table for no benefit.
+  void init_static_ds (const DynNodes& nodes, const DynIntList& offsets,
+                       const DynIntList& elems) {
+    {
+      ko::resize(nodes_, nodes.n(), 8);
+      auto nodes_hm = ko::create_mirror_view(nodes_);
+      // Transpose: DynNodes is (slot, node) while nodes_ is (node, slot).
+      for (Int i = 0; i < nodes.n(); ++i)
+        for (Int j = 0; j < 8; ++j)
+          nodes_hm(i,j) = nodes(j,i);
+      ko::deep_copy(nodes_, nodes_hm);
+    }
+    hm_resize_and_copy(offsets_, offsets, offsets.n());
+    hm_resize_and_copy(elems_, elems, elems.n());
+  }
+
+  // Using parent bb p, fill child bb c, with child_idx in 0:7.
+ template + KOKKOS_INLINE_FUNCTION + static void fill_child_bb (const CBB& p, const Int& child_idx, BB& c) { + const Real m[] = { 0.5*(p[0] + p[3]), + 0.5*(p[1] + p[4]), + 0.5*(p[2] + p[5]) }; + switch (child_idx) { + case 0: c[0] = p[0]; c[1] = p[1]; c[2] = p[2]; c[3] = m[0]; c[4] = m[1]; c[5] = m[2]; break; + case 1: c[0] = m[0]; c[1] = p[1]; c[2] = p[2]; c[3] = p[3]; c[4] = m[1]; c[5] = m[2]; break; + case 2: c[0] = m[0]; c[1] = m[1]; c[2] = p[2]; c[3] = p[3]; c[4] = p[4]; c[5] = m[2]; break; + case 3: c[0] = p[0]; c[1] = m[1]; c[2] = p[2]; c[3] = m[0]; c[4] = p[4]; c[5] = m[2]; break; + case 4: c[0] = p[0]; c[1] = p[1]; c[2] = m[2]; c[3] = m[0]; c[4] = m[1]; c[5] = p[5]; break; + case 5: c[0] = m[0]; c[1] = p[1]; c[2] = m[2]; c[3] = p[3]; c[4] = m[1]; c[5] = p[5]; break; + case 6: c[0] = m[0]; c[1] = m[1]; c[2] = m[2]; c[3] = p[3]; c[4] = p[4]; c[5] = p[5]; break; + case 7: c[0] = p[0]; c[1] = m[1]; c[2] = m[2]; c[3] = m[0]; c[4] = p[4]; c[5] = p[5]; break; + default: + // impossible + error("fill_child_bb: The impossible has happened."); + } + } + + // Do bounding boxes a and b overlap? + template + KOKKOS_INLINE_FUNCTION + static bool do_bb_overlap (const BoundingBox a, const BB b) { + for (Int i = 0; i < 3; ++i) + if ( ! do_lines_overlap(a[i], a[i+3], b[i], b[i+3])) + return false; + return true; + } + + KOKKOS_INLINE_FUNCTION + static bool do_lines_overlap (const Real& a1, const Real& a2, + const Real& b1, const Real& b2) { + return ! (a2 < b1 || a1 > b2); + } + + template KOKKOS_INLINE_FUNCTION + void apply_r (const Int ni, const BoundingBox& nbb, const CV bb, + Functor& f) const { + for (Int i = 0; i < 8; ++i) { + BoundingBox child_bb; + fill_child_bb(nbb, i, child_bb); + if ( ! 
do_bb_overlap(child_bb, bb)) continue; + Int e = nodes_(ni,i); + if (e > 0) + apply_r(e, child_bb, bb, f); + else if (e < 0) { + e = std::abs(e + 1); + for (Int k = offsets_[e]; k < offsets_[e+1]; ++k) + f(elems_[k]); + } + } + } +}; + +namespace test { +static constexpr Int max_nvert = 20; +static constexpr Int max_hits = 25; // Covers at least a 2-halo. + +// In practice, we want to form high-quality normals using information about the +// mesh. +template +void fill_normals (sh::Mesh& m) { + // Count number of edges. + Int ne = 0; + for (Int ip = 0; ip < nslices(m.e); ++ip) + for (Int iv = 0; iv < szslice(m.e); ++iv) + if (m.e(ip,iv) == -1) break; else ++ne; + // Fill. + Idxs::HostMirror en("en", nslices(m.e), szslice(m.e)); + ko::deep_copy(en, -1); + Vec3s::HostMirror nml("nml", ne, 3); + Int ie = 0; + for (Int ip = 0; ip < nslices(m.e); ++ip) + for (Int iv = 0; iv < szslice(m.e); ++iv) + if (m.e(ip,iv) == -1) + break; + else { + // Somewhat complicated next node index. + const Int iv_next = (iv+1 == szslice(m.e) ? 0 : + (m.e(ip,iv+1) == -1 ? 0 : iv+1)); + geo::edge_normal(slice(m.p, m.e(ip, iv)), slice(m.p, m.e(ip, iv_next)), + slice(nml, ie)); + en(ip,iv) = ie; + ++ie; + } + m.en = en; + m.nml = nml; +} + +//todo The current approach is to do redundant clips so that the hits buffer can +// be small and static. Need to think about this. +template +class AreaOTFunctor { + const sh::Mesh<>& cm_; + const ConstVec3s& p_; + const ConstIdxs& e_; + const Int k_; // Index into (p,e). + //todo More efficient method that also works on GPU. + Int hits_[max_hits]; + Int nh_; + Real area_; + +public: + KOKKOS_INLINE_FUNCTION + AreaOTFunctor (const sh::Mesh<>& cm, const ConstVec3s& p, const ConstIdxs& e, + const Int& k) + : cm_(cm), p_(p), e_(e), k_(k), nh_(0), area_(0) + {} + + KOKKOS_INLINE_FUNCTION void operator() (const Int mesh_elem_idx) { + // Check whether we've clipped against this polygon before and there was a + // non-0 intersection. 
+ for (Int i = 0; i < nh_; ++i) + if (hits_[i] == mesh_elem_idx) + return; + // We have not, so do the intersection. + Int no = 0; + { + // Area of all overlapping regions. + // In and out vertex lists. + Real buf[9*max_nvert]; + RawVec3s + vi(buf, max_nvert, 3), + vo(buf + 3*max_nvert, max_nvert, 3), + wrk(buf + 6*max_nvert, max_nvert, 3); + Int ni; + ni = 0; + for (Int i = 0; i < szslice(e_); ++i) { + if (e_(k_,i) == -1) break; + copy(slice(vi, i), slice(p_, e_(k_,i)), 3); + ++ni; + } + sh::clip_against_poly(cm_, mesh_elem_idx, vi, ni, vo, no, wrk); + if (no) area_ += geo::calc_area(vo, no); + } + if (no) { + // Non-0 intersection, so record. + if (nh_ == max_hits) Kokkos::abort("max_hits is too small."); + hits_[nh_++] = mesh_elem_idx; + } + } + + KOKKOS_INLINE_FUNCTION const Real& area () const { return area_; } +}; + +template +class TestAreaOTFunctor { + typedef Octree OctreeT; + + const sh::Mesh<> cm_; + const OctreeT ot_; + mutable ConstVec3s p_; + mutable ConstIdxs e_; + +public: + typedef Real value_type; + + TestAreaOTFunctor (const sh::Mesh& cm, + const ConstVec3s::HostMirror& p_hm, + const ConstIdxs::HostMirror& e_hm, const OctreeT& ot) + : cm_(cm), ot_(ot) + { + { Vec3s p; resize_and_copy(p, p_hm); p_ = p; } + { Idxs e; resize_and_copy(e, e_hm); e_ = e; } + } + + // Clip the k'th polygon in (p,e) against mesh cm. + KOKKOS_INLINE_FUNCTION void operator() (const Int k, Real& area) const { + // Clipped element bounding box. + Real ebb[6]; + OctreeT::calc_bb(p_, slice(e_, k), szslice(e_), ebb); + // Get list of possible overlaps. + AreaOTFunctor f(cm_, p_, e_, k); + //todo Team threads. + ot_.apply(ebb, f); + area += f.area(); + } +}; + +template Real test_area_ot ( + const ConstVec3s::HostMirror& cp, const ConstIdxs::HostMirror& ce, + const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) +{ + typedef Octree<10> OctreeT; + + // Clip mesh and edge normal calculation. (In practice, we'd like to use + // higher-quality edge normals.) 
+ sh::Mesh cm; cm.p = cp; cm.e = ce; + fill_normals(cm); + + Real et[2]; + auto t = tic(); + // Oct-tree over the clip mesh. + OctreeT ot(cp, ce); + et[0] = toc(t); + + Real area = 0; + TestAreaOTFunctor f(cm, p, e, ot); + t = tic(); + ko::parallel_reduce(nslices(e), f, area); + et[1] = toc(t); +#ifdef SIQK_TIME + printf("%10d", nslices(ce)); + print_times("test_area_ot", et, 2); +#endif + return area; +} +} // namespace test +} // namespace siqk + +#endif // INCLUDE_SIQK_HPP diff --git a/siqk/siqk_quadrature.hpp b/siqk/siqk_quadrature.hpp new file mode 100644 index 0000000..2684fe9 --- /dev/null +++ b/siqk/siqk_quadrature.hpp @@ -0,0 +1,91 @@ +#ifndef INCLUDE_SIQK_QUADRATURE_HPP +#define INCLUDE_SIQK_QUADRATURE_HPP + +#include "siqk.hpp" + +namespace siqk { +namespace quadrature { +#define SIQK_QUADRATURE_TRISYM_ORDER4_COORD \ + {0.108103018168070, 0.445948490915965, 0.445948490915965, \ + 0.445948490915965, 0.108103018168070, 0.445948490915965, \ + 0.445948490915965, 0.445948490915965, 0.108103018168070, \ + 0.816847572980458, 0.091576213509771, 0.091576213509771, \ + 0.091576213509771, 0.816847572980458, 0.091576213509771, \ + 0.091576213509771, 0.091576213509771, 0.816847572980458} +#define SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT \ + {0.223381589678011, 0.223381589678011, 0.223381589678011, \ + 0.109951743655322, 0.109951743655322, 0.109951743655322} + +#define SIQK_QUADRATURE_TRISYM_ORDER8_COORD \ + {0.333333333333333, 0.333333333333333, 0.333333333333333, \ + 0.081414823414554, 0.459292588292723, 0.459292588292723, \ + 0.459292588292723, 0.081414823414554, 0.459292588292723, \ + 0.459292588292723, 0.459292588292723, 0.081414823414554, \ + 0.658861384496480, 0.170569307751760, 0.170569307751760, \ + 0.170569307751760, 0.658861384496480, 0.170569307751760, \ + 0.170569307751760, 0.170569307751760, 0.658861384496480, \ + 0.898905543365938, 0.050547228317031, 0.050547228317031, \ + 0.050547228317031, 0.898905543365938, 0.050547228317031, \ + 0.050547228317031, 
0.050547228317031, 0.898905543365938, \ + 0.008394777409958, 0.263112829634638, 0.728492392955404, \ + 0.008394777409958, 0.728492392955404, 0.263112829634638, \ + 0.263112829634638, 0.008394777409958, 0.728492392955404, \ + 0.263112829634638, 0.728492392955404, 0.008394777409958, \ + 0.728492392955404, 0.263112829634638, 0.008394777409958, \ + 0.728492392955404, 0.008394777409958, 0.263112829634638} +#define SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT \ + {0.144315607677787, 0.095091634267285, 0.095091634267285, \ + 0.095091634267285, 0.103217370534718, 0.103217370534718, \ + 0.103217370534718, 0.032458497623198, 0.032458497623198, \ + 0.032458497623198, 0.027230314174435, 0.027230314174435, \ + 0.027230314174435, 0.027230314174435, 0.027230314174435, \ + 0.027230314174435} + +namespace host { +static const Real trisym_order4_coord[] = SIQK_QUADRATURE_TRISYM_ORDER4_COORD; +static const Real trisym_order4_weight[] = SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT; +static const Real trisym_order8_coord[] = SIQK_QUADRATURE_TRISYM_ORDER8_COORD; +static const Real trisym_order8_weight[] = SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT; +} + +namespace device { +KOKKOS_CONSTANT Real trisym_order4_coord[] = SIQK_QUADRATURE_TRISYM_ORDER4_COORD; +KOKKOS_CONSTANT Real trisym_order4_weight[] = SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT; +KOKKOS_CONSTANT Real trisym_order8_coord[] = SIQK_QUADRATURE_TRISYM_ORDER8_COORD; +KOKKOS_CONSTANT Real trisym_order8_weight[] = SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT; +} + +template +KOKKOS_INLINE_FUNCTION +void get_coef (const int order, RawConstVec3s& coord, RawConstArray& weight) { + switch (order) { + case 4: + coord = RawConstVec3s(device::trisym_order4_coord, 6, 3); + weight = RawConstArray(device::trisym_order4_weight, 6); + break; + case 8: + coord = RawConstVec3s(device::trisym_order8_coord, 16, 3); + weight = RawConstArray(device::trisym_order8_weight, 16); + break; + } +} + +template <> +KOKKOS_INLINE_FUNCTION +void get_coef (const int order, RawConstVec3s& 
coord, + RawConstArray& weight) { + switch (order) { + case 4: + coord = RawConstVec3s(host::trisym_order4_coord, 6, 3); + weight = RawConstArray(host::trisym_order4_weight, 6); + break; + case 8: + coord = RawConstVec3s(host::trisym_order8_coord, 16, 3); + weight = RawConstArray(host::trisym_order8_weight, 16); + break; + } +} +} +} + +#endif diff --git a/siqk/test.cpp b/siqk/test.cpp new file mode 100644 index 0000000..dd10981 --- /dev/null +++ b/siqk/test.cpp @@ -0,0 +1,191 @@ +// ko=/home/ambradl/lib/kokkos/cpu; mycpp -I$ko/include -L$ko/lib -fopenmp test.cpp -lkokkos -ldl -Wall -pedantic +// ./a.out -m | grep "mat=1" > foo.m +// >> msik('draw_test_output', 'foo'); + +#include "siqk_intersect.hpp" +using namespace siqk; + +template +static void +write_matlab (const std::string& name, const CV3s& p) { + printf("mat=1; %s = [", name.c_str()); + for (Int ip = 0; ip < nslices(p); ++ip) + printf(" %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); + printf("].';\n"); +} + +template +static void +write_matlab (const std::string& name, const CV3s& p, const CIs& e) { + printf("mat=1; %s.p = [", name.c_str()); + for (Int ip = 0; ip < nslices(p); ++ip) + printf(" %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); + printf("].';\n"); + printf("mat=1; %s.n = [", name.c_str()); + for (Int ie = 0; ie < nslices(e); ++ie) + printf(" %d %d %d %d;", e(ie,0)+1, e(ie,1)+1, e(ie,2)+1, e(ie,3)+1); + printf("].';\n"); +} + +static void make_planar_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, + const Int n) { + const Real d = std::sqrt(0.5); + ko::resize(e, n*n, 4); + ko::resize(p, (n+1)*(n+1), 3); + for (Int iy = 0; iy < n+1; ++iy) + for (Int ix = 0; ix < n+1; ++ix) { + const auto idx = (n+1)*iy + ix; + p(idx,0) = 2*(static_cast(ix)/n - 0.5)*d; + p(idx,1) = 2*(static_cast(iy)/n - 0.5)*d; + p(idx,2) = 0; + } + for (Int iy = 0; iy < n; ++iy) + for (Int ix = 0; ix < n; ++ix) { + const auto idx = n*iy + ix; + e(idx,0) = (n+1)*iy + ix; + e(idx,1) = (n+1)*iy + ix+1; + e(idx,2) = 
(n+1)*(iy+1) + ix+1; + e(idx,3) = (n+1)*(iy+1) + ix; + } +} + +static void project_onto_sphere (Vec3s::HostMirror& p) { + for (Int ip = 0; ip < nslices(p); ++ip) { + p(ip,2) = 1; + SphereGeometry::normalize(slice(p, ip)); + } +} + +static void +perturb_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, + const Real angle, const Real xlate, const Real ylate) { + const Real cr = std::cos(angle), sr = std::sin(angle); + for (Int ip = 0; ip < nslices(p); ++ip) { + const Real x = p(ip,0), y = p(ip,1); + p(ip,0) = cr*x - sr*y + xlate; + p(ip,1) = -sr*x + cr*y + ylate; + } +} + +static void fill_quad (const ConstVec3s::HostMirror& p, + Vec3s::HostMirror& poly) { + const Int n = static_cast(std::sqrt(nslices(p) - 1)); + copy(slice(poly, 0), slice(p, 0), 3); + copy(slice(poly, 1), slice(p, n), 3); + copy(slice(poly, 2), slice(p, nslices(p) - 1), 3); + copy(slice(poly, 3), slice(p, nslices(p) - 1 - n), 3); +} + +// Area of the outline of (p,e) clipped against the outline of (cp,ce). +template +static Real calc_true_area ( + const ConstVec3s::HostMirror& cp, const ConstIdxs::HostMirror& ce, + const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, + const bool wm) +{ + Vec3s::HostMirror clip_poly("clip_poly", 4, 3), poly("poly", 4, 3), + nml("nml", 4, 3); + fill_quad(cp, clip_poly); + fill_quad(p, poly); + for (Int i = 0; i < 4; ++i) + Geo::edge_normal(slice(clip_poly, i), slice(clip_poly, (i+1) % 4), + slice(nml, i)); + Vec3s::HostMirror vo("vo", test::max_nvert, 3); + Int no; + { + Vec3s::HostMirror wrk("wrk", test::max_nvert, 3); + sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no, wrk); + } + if (wm) { + write_matlab("clip_poly", clip_poly); + write_matlab("poly", poly); + write_matlab("intersection", + ko::subview(vo, std::pair(0, no), ko::ALL())); + } + return Geo::calc_area_formula(vo, no); +} + +template void finalize_mesh (Vec3s::HostMirror& p) {} +template <> void finalize_mesh (Vec3s::HostMirror& p) { + project_onto_sphere(p); +} + +template +static int 
+run (const Int n, const Real angle, const Real xlate, const Real ylate, + const bool wm) { + Vec3s::HostMirror cp; + Idxs::HostMirror ce; + make_planar_mesh(cp, ce, n); + + Vec3s::HostMirror p("p", nslices(cp), szslice(cp)); + Idxs::HostMirror e("e", nslices(ce), szslice(ce)); + ko::deep_copy(p, cp); + ko::deep_copy(e, ce); + perturb_mesh(p, e, angle, xlate, ylate); + + finalize_mesh(cp); + finalize_mesh(p); + + const Real ta = calc_true_area(cp, ce, p, e, wm); + const Real a = test::test_area_ot(cp, ce, p, e); + + const Real re = std::abs(a - ta)/ta; + fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n", ta, a, re); + if (wm) { + write_matlab("cm", cp, ce); + write_matlab("m", p, e); + } + return re < 1e-10 ? 0 : 1; +} + +inline bool +eq (const std::string& a, const char* const b1, const char* const b2 = 0) { + return (a == std::string(b1) || (b2 && a == std::string(b2)) || + a == std::string("-") + std::string(b1)); +} + +struct Input { + Int n; + Real angle, xlate, ylate; + bool write_matlab, geo_sphere; + + Input (Int argc, char** argv) + : n(5), angle(M_PI*1e-1), xlate(1e-1), ylate(1e-1), write_matlab(false), + geo_sphere(true) + { + for (Int i = 1; i < argc; ++i) { + const std::string& token = argv[i]; + if (eq(token, "-n")) n = atoi(argv[++i]); + if (eq(token, "-m", "--write-matlab")) write_matlab = true; + if (eq(token, "--plane")) geo_sphere = false; + if (eq(token, "--xlate")) xlate = atof(argv[++i]); + if (eq(token, "--ylate")) ylate = atof(argv[++i]); + if (eq(token, "--angle")) angle = atof(argv[++i]); + } + + print(std::cout); + } + + void print (std::ostream& os) { + os << "n (-n): " << n << "\n" + << "write matlab (-m): " << write_matlab << "\n" + << "planar geometry (--plane): " << ! 
geo_sphere << "\n" + << "angle (--angle): " << angle << "\n" + << "xlate (--xlate): " << xlate << "\n" + << "ylate (--ylate): " << ylate << "\n"; + } +}; + +int main (int argc, char** argv) { + Kokkos::initialize(argc, argv); + { + Input in(argc, argv); + Int nerr = 0; + nerr += (in.geo_sphere ? + run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab) : + run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab)); + std::cerr << (nerr ? "FAIL" : "PASS") << "ED\n"; + } + Kokkos::finalize_all(); +} From fb2f67ca2a29fe88539e733425b79fc236544309 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Wed, 18 May 2016 13:19:23 -0600 Subject: [PATCH 07/28] SIQK/SI: Make 'intersect' more robust. Thanks to Dave for the bug report and testing. --- siqk/si/sik.hpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/siqk/si/sik.hpp b/siqk/si/sik.hpp index 7b78958..4f70137 100644 --- a/siqk/si/sik.hpp +++ b/siqk/si/sik.hpp @@ -95,7 +95,13 @@ struct PlaneGeometry { template KOKKOS_INLINE_FUNCTION static void intersect (const CV v1, const CV v2, const CV e1, const CV en, V intersection) { - const double a = dot_c_amb(en, e1, v1) / dot_c_amb(en, v2, v1); + double a; { + const double + num = dot_c_amb(en, e1, v1), + den = dot_c_amb(en, v2, v1); + a = num == 0 || den == 0 ? 0 : num/den; + a = a < 0 ? 0 : a > 1 ? 1 : a; + } combine(v1, v2, a, intersection); } @@ -194,7 +200,13 @@ struct SphereGeometry { template KOKKOS_INLINE_FUNCTION static void intersect (const CV v1, const CV v2, const CV e1, const CV en, V intersection) { - const double a = dot_c_amb(en, e1, v1) / dot_c_amb(en, v2, v1); + double a; { + const double + num = dot_c_amb(en, e1, v1), + den = dot_c_amb(en, v2, v1); + a = num == 0 || den == 0 ? 0 : num/den; + a = a < 0 ? 0 : a > 1 ? 1 : a; + } combine(v1, v2, a, intersection); normalize(intersection); } From 3370eedbb92454984a7b9a6063fc69cccaba6dad Mon Sep 17 00:00:00 2001 From: "Andrew M. 
Bradley" Date: Thu, 19 May 2016 12:23:32 -0600 Subject: [PATCH 08/28] SIQK/SI: Enforce fortuitous floating point identity when it arises. Might help Dave with filtering points. --- siqk/si/sik.hpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/siqk/si/sik.hpp b/siqk/si/sik.hpp index 4f70137..fda4b28 100644 --- a/siqk/si/sik.hpp +++ b/siqk/si/sik.hpp @@ -168,6 +168,12 @@ struct SphereGeometry { static double dot_c_amb (const CV c, const CV a, const CV b) { return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]) + c[2]*(a[2] - b[2]); } + template KOKKOS_INLINE_FUNCTION + static void copy (V& d, const CV& s) { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + } template KOKKOS_INLINE_FUNCTION static void combine (const CV u, const CV v, const double a, V x) { const double oma = 1 - a; @@ -207,8 +213,15 @@ struct SphereGeometry { a = num == 0 || den == 0 ? 0 : num/den; a = a < 0 ? 0 : a > 1 ? 1 : a; } - combine(v1, v2, a, intersection); - normalize(intersection); + // FP identity is sometimes useful, so let's enforce it. + if (a == 0) + copy(intersection, v1); + else if (a == 1) + copy(intersection, v2); + else { + combine(v1, v2, a, intersection); + normalize(intersection); + } } template KOKKOS_INLINE_FUNCTION From 1e4c98a848c91c9e47fbfe658042604bc601df57 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Thu, 19 May 2016 16:05:24 -0600 Subject: [PATCH 09/28] SIQK/SI: More experimentation for Dave's application. For Dave's use case, we're trying to get more than just a clipped polygon. Ideally, it will do a small bit of topology stuff, too, to simplify post-processing. This is another cut at trying to solve that problem. In this version, 'inside' is modified to accomplish two things: (1) Polygons are now open, not closed, sets. (2) A clipping polygon's vertex that is FP identical to a clip polygon's vertex will definitely be evaluated as outside. To be clear, none of these details is properly part of clipping. 
An algorithm based purely on integration shouldn't care if FP-different but logically duplicate vertices are in the output vertex list, as long as the output vertex list describes a polygon that is, up to approximation commensurate with FP, identical to the correct, infinite-arithmetic polygon. But an algorithm that has to do a lot of topology stuff will have an easier time if the output vertex list is free of these types of vertices. --- siqk/si/sik.hpp | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/siqk/si/sik.hpp b/siqk/si/sik.hpp index fda4b28..5a3a3a7 100644 --- a/siqk/si/sik.hpp +++ b/siqk/si/sik.hpp @@ -16,7 +16,6 @@ #endif namespace siqk { - #ifdef SIQK_TIME static timeval tic () { timeval t; @@ -88,8 +87,8 @@ struct PlaneGeometry { } template KOKKOS_INLINE_FUNCTION - static bool inside (const CV v, const CV e1, const CV en) { - return dot_c_amb(en, v, e1) >= 0; + static bool inside (const CV v, const CV e1, const CV e2, const CV en) { + return dot_c_amb(en, v, e1) > 0 && dot_c_amb(en, v, e2) > 0; } template KOKKOS_INLINE_FUNCTION @@ -189,8 +188,8 @@ struct SphereGeometry { } template KOKKOS_INLINE_FUNCTION - static bool inside (const CV v, const CV a, const CV n) { - return dot_c_amb(n, v, a) >= 0; + static bool inside (const CV v, const CV a1, const CV a2, const CV n) { + return dot_c_amb(n, v, a1) > 0 && dot_c_amb(n, v, a2) > 0; } /* Let @@ -214,14 +213,8 @@ struct SphereGeometry { a = a < 0 ? 0 : a > 1 ? 1 : a; } // FP identity is sometimes useful, so let's enforce it. - if (a == 0) - copy(intersection, v1); - else if (a == 1) - copy(intersection, v2); - else { - combine(v1, v2, a, intersection); - normalize(intersection); - } + combine(v1, v2, a, intersection); + normalize(intersection); } template KOKKOS_INLINE_FUNCTION @@ -296,8 +289,8 @@ bool clip_against_edge ( const Array2D& vi, const int ni, // Output vertex list. Array2D& vo, int& no, - // One point of the clip edge. 
- const CV ce1, + // The end points of the clip edge segment. + const CV ce1, const CV ce2, // Clip edge's inward-facing normal. const CV cen) { @@ -307,15 +300,15 @@ bool clip_against_edge ( s = vi(ni-1); for (int j = 0; j < ni; ++j) { p = vi(j); - if (geo::inside(p, ce1, cen)) { - if (geo::inside(s, ce1, cen)) { + if (geo::inside(p, ce1, ce2, cen)) { + if (geo::inside(s, ce1, ce2, cen)) { if ( ! geo::output(p, no, vo)) return false; } else { geo::intersect(s, p, ce1, cen, intersection); if ( ! geo::output(intersection, no, vo)) return false; if ( ! geo::output(p, no, vo)) return false; } - } else if (geo::inside(s, ce1, cen)) { + } else if (geo::inside(s, ce1, ce2, cen)) { geo::intersect(s, p, ce1, cen, intersection); if ( ! geo::output(intersection, no, vo)) return false; } @@ -357,13 +350,14 @@ bool clip_against_poly ( std::swap(nos[0], nos[1]); } - if ( ! clip_against_edge(vi, ni, *vs[0], nos[0], m.p(e[0]), m.nml(en[0]))) + if ( ! clip_against_edge(vi, ni, *vs[0], nos[0], m.p(e[0]), m.p(e[1]), + m.nml(en[0]))) return false; if ( ! nos[0]) return true; for (int ie = 1, ielim = nv - 1; ; ++ie) { if ( ! clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], m.p(e[ie]), - m.nml(en[ie]))) + m.p(e[(ie+1) % nv]), m.nml(en[ie]))) return false; if ( ! nos[1]) return true; if (ie == ielim) break; @@ -393,20 +387,21 @@ bool clip_against_poly ( Array2D* vs[] = { &vo, &vo1 }; no = 0; - if (clip_poly.n() % 2 == 0) { + const auto nv = clip_poly.n(); + if (nv % 2 == 0) { // Make sure the final vertex output list is in the caller's buffer. std::swap(vs[0], vs[1]); std::swap(nos[0], nos[1]); } - if ( ! clip_against_edge(vi, ni, *vs[0], nos[0], clip_poly(0), + if ( ! clip_against_edge(vi, ni, *vs[0], nos[0], clip_poly(0), clip_poly(1), clip_edge_normals(0))) return false; if ( ! nos[0]) return true; - for (int ie = 1, ielim = clip_poly.n() - 1; ; ++ie) { + for (int ie = 1, ielim = nv - 1; ; ++ie) { if ( ! 
clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], clip_poly(ie), - clip_edge_normals(ie))) + clip_poly((ie+1) % nv), clip_edge_normals(ie))) return false; if ( ! nos[1]) return true; if (ie == ielim) break; From 213fccc55cc944c50ad39d34c4985c32c9c44d3b Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Thu, 19 May 2016 16:29:04 -0600 Subject: [PATCH 10/28] SIQK/SI: More tweaks for FP identity use case. --- siqk/si/README.md | 6 +++--- siqk/si/sik.hpp | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/siqk/si/README.md b/siqk/si/README.md index 5134578..b465b36 100644 --- a/siqk/si/README.md +++ b/siqk/si/README.md @@ -1,4 +1,4 @@ -Simple sphere interesection prototype with optional no-Kokkos build. +Simple sphere intersection prototype with no-Kokkos build. Basic build and test run: @@ -10,7 +10,7 @@ For performance profiling, $ g++ -O3 -DSIQK_TIME -std=c++11 test.cpp $ ./a.out -n 20 -You should see +You should see something like n (-n): 20 test_area_ot 1.276e-02 s 1.4 MB 1.228e-03 s 1.153e-02 s @@ -22,4 +22,4 @@ time. The third line shows the true overlap area, the area based on the meshes, and the relative error. As the mesh is refined, the relative error increases because (a) the sphere polygon area calculation is naive and (b) the edge normals have increasing cancellation error. Each is part of the test setup and -would not be used in practice. +is not used in practice. 
diff --git a/siqk/si/sik.hpp b/siqk/si/sik.hpp index 5a3a3a7..54ac376 100644 --- a/siqk/si/sik.hpp +++ b/siqk/si/sik.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef SIQK_TIME # include @@ -15,6 +16,21 @@ # include #endif +#define pr(m) do { \ + std::stringstream _ss_; \ + _ss_ << m << std::endl; \ + std::cerr << _ss_.str(); \ + } while (0) +#define prc(m) pr(#m << " | " << (m)) +#define puf(m)"(" << #m << " " << (m) << ")" +#define pu(m) << " " << puf(m) +template +static void prarr (const std::string& name, const T* const v, const size_t n) { + std::cerr << name << ": "; + for (size_t i = 0; i < n; ++i) std::cerr << " " << v[i]; + std::cerr << "\n"; +} + namespace siqk { #ifdef SIQK_TIME static timeval tic () { @@ -205,6 +221,12 @@ struct SphereGeometry { template KOKKOS_INLINE_FUNCTION static void intersect (const CV v1, const CV v2, const CV e1, const CV en, V intersection) { + /* Consider the case where e1 == v1 or e1 == v2. All == are FP. + If e1 == v1, then num = 0, a = 0, and intersection is set to v1. + If e1 == v2, then num == den, a = 1, and intersection is set to v2. + These two cases I believe are the only ones that matter to the bow-tie + issue in Dave's use case. + */ double a; { const double num = dot_c_amb(en, e1, v1), @@ -212,9 +234,14 @@ struct SphereGeometry { a = num == 0 || den == 0 ? 0 : num/den; a = a < 0 ? 0 : a > 1 ? 1 : a; } - // FP identity is sometimes useful, so let's enforce it. - combine(v1, v2, a, intersection); - normalize(intersection); + if (a == 0) + copy(intersection, v1); + else if (a == 1) + copy(intersection, v2); + else { + combine(v1, v2, a, intersection); + normalize(intersection); + } } template KOKKOS_INLINE_FUNCTION From 51d46cbc5a1e474cb95fa23123f6e99a03fa37c6 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Tue, 5 Jul 2016 19:10:46 -0600 Subject: [PATCH 11/28] SLMM, SIQK, slmmir: Update SIQK, impl infrastructure for SLMM, impl IR driver. SLMM is a library for SLMM methods. 
It will eventually run on the GPU, but it does not yet. slmmir is a driver built on SLMM that implements the basic incremental remap method. --- siqk/siqk.cpp | 153 ++++ siqk/siqk.hpp | 189 +--- siqk/siqk_defs.hpp | 200 ++++ siqk/siqk_geometry.hpp | 287 ++++++ siqk/siqk_intersect.hpp | 628 +------------ siqk/siqk_quadrature.hpp | 323 ++++++- siqk/siqk_search.hpp | 377 ++++++++ siqk/siqk_sqr.hpp | 259 ++++++ siqk/siqk_test.cpp | 517 +++++++++++ siqk/slmm/Makefile | 45 + siqk/slmm/slmm_debug.hpp | 37 + siqk/slmm/slmm_defs.hpp | 36 + siqk/slmm/slmm_gallery.cpp | 14 + siqk/slmm/slmm_gallery.hpp | 312 +++++++ siqk/slmm/slmm_gll.hpp | 75 ++ siqk/slmm/slmm_io.cpp | 314 +++++++ siqk/slmm/slmm_io.hpp | 73 ++ siqk/slmm/slmm_mesh.cpp | 486 ++++++++++ siqk/slmm/slmm_mesh.hpp | 69 ++ siqk/slmm/slmm_runtests.py | 71 ++ siqk/slmm/slmm_test.cpp | 201 ++++ siqk/slmm/slmm_time_int.cpp | 156 ++++ siqk/slmm/slmm_time_int.hpp | 424 +++++++++ siqk/slmm/slmm_util.cpp | 30 + siqk/slmm/slmm_util.hpp | 153 ++++ siqk/slmm/slmmir.cpp | 1712 +++++++++++++++++++++++++++++++++++ siqk/test.cpp | 191 ---- 27 files changed, 6307 insertions(+), 1025 deletions(-) create mode 100644 siqk/siqk.cpp create mode 100644 siqk/siqk_defs.hpp create mode 100644 siqk/siqk_geometry.hpp create mode 100644 siqk/siqk_search.hpp create mode 100644 siqk/siqk_sqr.hpp create mode 100644 siqk/siqk_test.cpp create mode 100644 siqk/slmm/Makefile create mode 100644 siqk/slmm/slmm_debug.hpp create mode 100644 siqk/slmm/slmm_defs.hpp create mode 100644 siqk/slmm/slmm_gallery.cpp create mode 100644 siqk/slmm/slmm_gallery.hpp create mode 100644 siqk/slmm/slmm_gll.hpp create mode 100644 siqk/slmm/slmm_io.cpp create mode 100644 siqk/slmm/slmm_io.hpp create mode 100644 siqk/slmm/slmm_mesh.cpp create mode 100644 siqk/slmm/slmm_mesh.hpp create mode 100755 siqk/slmm/slmm_runtests.py create mode 100644 siqk/slmm/slmm_test.cpp create mode 100644 siqk/slmm/slmm_time_int.cpp create mode 100644 siqk/slmm/slmm_time_int.hpp create mode 
100644 siqk/slmm/slmm_util.cpp create mode 100644 siqk/slmm/slmm_util.hpp create mode 100644 siqk/slmm/slmmir.cpp delete mode 100644 siqk/test.cpp diff --git a/siqk/siqk.cpp b/siqk/siqk.cpp new file mode 100644 index 0000000..870bdbe --- /dev/null +++ b/siqk/siqk.cpp @@ -0,0 +1,153 @@ +#include +#include "siqk_intersect.hpp" +#include "mexutil.hpp" +using namespace siqk; + +static void make_elems (const mexutil::ConstDenseMexMat& me, Idxs& e) { + for (size_t i = 0; i < me.n; ++i) + for (size_t j = 0; j < me.m; ++j) + e(i,j) = static_cast(me.a[me.m*i + j]) - 1; +} + +static void merror (const std::string& msg) { + Kokkos::finalize(); + mexErrMsgTxt(msg.c_str()); +} + +void mexFunction (int nlhs, mxArray** plhs, int nrhs, const mxArray** prhs) { + omp_set_num_threads(4); + Kokkos::initialize(); + using namespace mexutil; + std::string cmd = init_mex(nrhs, prhs); + try { + typedef PlaneGeometry geo; + if (cmd == "inside") { + if (nlhs != 1 || nrhs != 2) merror("in = inside(edge, points)"); + ConstDenseMexMat edge(prhs[0]); + reqorexit(edge.m == 3 && edge.n == 2); + ConstDenseMexMat points(prhs[1]); + reqorexit(points.m == 3); + DenseMexMat in(1, points.n); + plhs[0] = in.ma; + for (size_t i = 0; i < points.n; ++i) { + double en[3]; + geo::edge_normal(edge.a, edge.a + 3, en); + in.a[i] = geo::inside(points.a + points.m*i, edge.a, + const_cast(en)); + } + } else if (cmd == "intersect") { + // Assumption: Intersection exists. 
+ if (nlhs != 1 || nrhs != 2) + merror("points = intersect(edge, edges)"); + ConstDenseMexMat edge(prhs[0]); + reqorexit(edge.m == 3 && edge.n == 2); + ConstDenseMexMat edges(prhs[1]); + DenseMexMat points(edge.m, edges.n), exists(1, edges.n); + plhs[0] = points.ma; + for (size_t i = 0; i < edges.n; ++i) { + double en[3]; + geo::edge_normal(edge.a, edge.a + 3, en); + geo::intersect(edges.a + 6*i, edges.a + 6*i + 3, edge.a, + const_cast(en), points.a + points.m*i); + } + } else if (cmd == "clip_against_edge") { + if (nlhs != 1 || nrhs != 2) + merror("vo = clip_against_edge(edge, vi)"); + ConstDenseMexMat edge(prhs[0]); + reqorexit(edge.m == 3 && edge.n == 2); + ConstDenseMexMat vi(prhs[1]); + reqorexit(vi.m == 3); + Vec3s vo("vo", test::max_nvert, 3); + int no; + double en[3]; + geo::edge_normal(edge.a, edge.a + 3, en); + sh::clip_against_edge(RawConstVec3s(vi.a, vi.n, vi.m), vi.n, + vo, no, edge.a, const_cast(en)); + DenseMexMat vom(vi.n, no); + memcpy(vom.a, vo.ptr_on_device(), vi.n*no*sizeof(double)); + plhs[0] = vom.ma; + } else if (cmd == "clip_against_poly") { + if (nlhs != 1 || nrhs != 2) + merror("vo = clip_against_poly(clip_polygon, vi)"); + ConstDenseMexMat mcp(prhs[0]); + reqorexit(mcp.m == 3); + ConstDenseMexMat vi(prhs[1]); + reqorexit(vi.m == 3); + RawConstVec3s cp(mcp.a, mcp.n, mcp.m); + Vec3s cens("cens", nslices(cp), 3); + for (int i = 0; i < nslices(cp); ++i) + geo::edge_normal(slice(cp,i), slice(cp, (i + 1) % nslices(cp)), slice(cens,i)); + Vec3s vo("vo", test::max_nvert, 3), wrk("wrk", test::max_nvert, 3); + int no; + sh::clip_against_poly(cp, cens, + RawConstVec3s(vi.a, vi.n, vi.m), vi.n, + vo, no, wrk); + DenseMexMat vom(vi.m, no); + memcpy(vom.a, vo.ptr_on_device(), vi.m*no*sizeof(double)); + plhs[0] = vom.ma; + } else if (cmd == "clip_against_poly_sphere") { + if (nlhs != 1 || nrhs != 2) + merror("vo = clip_against_poly_sphere(clip_polygon, vi)"); + ConstDenseMexMat mcp(prhs[0]); + reqorexit(mcp.m == 3); + ConstDenseMexMat vi(prhs[1]); + 
reqorexit(vi.m == 3); + RawConstVec3s cp(mcp.a, mcp.n, mcp.m); + Vec3s cens("cens", nslices(cp), 3); + for (int i = 0; i < nslices(cp); ++i) + SphereGeometry::edge_normal(slice(cp,i), slice(cp, (i + 1) % nslices(cp)), + slice(cens,i)); + Vec3s vo("vo", test::max_nvert, 3), wrk("wrk", test::max_nvert, 3); + int no; + sh::clip_against_poly(cp, cens, + RawConstVec3s(vi.a, vi.n, vi.m), vi.n, + vo, no, wrk); + DenseMexMat vom(vi.m, no); + memcpy(vom.a, vo.ptr_on_device(), vi.m*no*sizeof(double)); + plhs[0] = vom.ma; +#if 0 + } else if (cmd == "test_area_ot") { + // Test using oct-tree. + if (nlhs != 1 || nrhs != 4) + merror("area = test_area_ot(cp, ce, p, e)"); + ConstDenseMexMat mcp(prhs[0]); + reqorexit(mcp.m == 3); + ConstDenseMexMat mce(prhs[1]); + ConstDenseMexMat mp(prhs[2]); + reqorexit(mp.m == 3); + ConstDenseMexMat me(prhs[3]); + Array2D cp(3, mcp.n, mcp.a); + Array2D p(3, mp.n, mp.a); + Array2D ce(mce.m, mce.n), e(me.m, me.n); + make_elems(mce, ce); + make_elems(me, e); + DenseMexMat area(1, 1); + plhs[0] = area.ma; + area.a[0] = test::test_area_ot(cp, ce, p, e); + } else if (cmd == "test_area_ot_sphere") { + // Test using oct-tree. 
+ if (nlhs != 1 || nrhs != 4) + merror("area = test_area_ot(cp, ce, p, e)"); + ConstDenseMexMat mcp(prhs[0]); + reqorexit(mcp.m == 3); + ConstDenseMexMat mce(prhs[1]); + ConstDenseMexMat mp(prhs[2]); + reqorexit(mp.m == 3); + ConstDenseMexMat me(prhs[3]); + Array2D cp(3, mcp.n, mcp.a); + Array2D p(3, mp.n, mp.a); + Array2D ce(mce.m, mce.n), e(me.m, me.n); + make_elems(mce, ce); + make_elems(me, e); + DenseMexMat area(1, 1); + plhs[0] = area.ma; + area.a[0] = test::test_area_ot(cp, ce, p, e); +#endif + } else { + merror((string("Invalid function: ") + cmd).c_str()); + } + } catch (const std::exception& e) { + merror(e.what()); + } + Kokkos::finalize(); +} diff --git a/siqk/siqk.hpp b/siqk/siqk.hpp index 52431e2..f71b94b 100644 --- a/siqk/siqk.hpp +++ b/siqk/siqk.hpp @@ -1,189 +1,10 @@ #ifndef INCLUDE_SIQK_HPP #define INCLUDE_SIQK_HPP -#include -#include -#include -#include -#include - -#include - -#ifdef SIQK_TIME -# include -# include -# include -#endif - -// Always want this for GPU. -#define SIQK_NONRECURSIVE - -#ifdef KOKKOS_HAVE_CUDA -# define KOKKOS_CONSTANT __constant__ __device__ -#else -# define KOKKOS_CONSTANT -#endif - -namespace siqk { -namespace ko = Kokkos; -#define pr(m) do { \ - std::stringstream _ss_; \ - _ss_ << m << std::endl; \ - std::cerr << _ss_.str(); \ - } while (0) -#define prc(m) pr(#m << " | " << (m)) -#define puf(m)"(" << #m << " " << (m) << ")" -#define pu(m) << " " << puf(m) -template -static void prarr (const std::string& name, const T* const v, const size_t n) { - std::cerr << name << ": "; - for (size_t i = 0; i < n; ++i) std::cerr << " " << v[i]; - std::cerr << "\n"; -} - -#ifdef SIQK_TIME -static timeval tic () { - timeval t; - gettimeofday(&t, 0); - return t; -} -static double calc_et (const timeval& t1, const timeval& t2) { - static const double us = 1.0e6; - return (t2.tv_sec * us + t2.tv_usec - t1.tv_sec * us - t1.tv_usec) / us; -} -static double toc (const timeval& t1) { - Kokkos::fence(); - timeval t; - gettimeofday(&t, 0); - 
return calc_et(t1, t); -} -static double get_memusage () { - static const double scale = 1.0 / (1 << 10); // Memory in MB. - rusage ru; - getrusage(RUSAGE_SELF, &ru); - return ru.ru_maxrss*scale; -} -#else -static inline int tic () { return 0; } -static inline double toc (const int&) { return 0; } -#endif -static void print_times (const std::string& name, const double* const parts, - const int nparts) { -#ifdef SIQK_TIME - double total = 0; for (int i = 0; i < nparts; ++i) total += parts[i]; - printf("%20s %1.3e s %7.1f MB", name.c_str(), total, get_memusage()); - for (int i = 0; i < nparts; ++i) printf(" %1.3e s", parts[i]); - printf("\n"); -#endif -} -static void print_times (const std::string& name, const double total) { -#ifdef SIQK_TIME - printf("%20s %1.3e s %5.1f MB\n", name.c_str(), total, get_memusage()); -#endif -} - -KOKKOS_INLINE_FUNCTION static void error(const char* const msg) -{ ko::abort(msg); } - -typedef int Int; -typedef double Real; - -#ifdef KOKKOS_HAVE_CUDA -typedef ko::LayoutLeft Layout; -#else -typedef ko::LayoutRight Layout; -#endif - -// SIQK's array types. -typedef ko::View Vec3s; -typedef ko::View ConstVec3s; -typedef ko::View Vec6s; -typedef ko::View ConstVec6s; -typedef ko::View > RawVec3s; -typedef ko::View > RawConstVec3s; -typedef ko::View > RawArray; -typedef ko::View > RawConstArray; -typedef ko::View Idxs; -typedef ko::View ConstIdxs; -typedef ko::View Nodes; -typedef ko::View ConstNodes; - -// Get the host or device version of the array. -template struct InExeSpace { - typedef VT type; -}; -template struct InExeSpace { - typedef typename VT::HostMirror type; -}; - -#ifdef KOKKOS_HAVE_CUDA -// A 1D slice of an array. -template KOKKOS_FORCEINLINE_FUNCTION -ko::View > -slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); } -// An explicitly const 1D slice of an array. 
-template KOKKOS_FORCEINLINE_FUNCTION -ko::View > -const_slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); } -#else -template KOKKOS_FORCEINLINE_FUNCTION -typename VT::value_type* -slice (const VT& v, Int i) { return v.ptr_on_device() + v.dimension_1()*i; } - -template KOKKOS_FORCEINLINE_FUNCTION -typename VT::const_value_type* -const_slice (const VT& v, Int i) { return v.ptr_on_device() + v.dimension_1()*i; } -#endif - -// Number of slices in a 2D array, where each row is a slice. -template KOKKOS_FORCEINLINE_FUNCTION -Int nslices (const A2D& a) { return static_cast(a.dimension_0()); } - -// Number of entries in a 2D array's row. -template KOKKOS_FORCEINLINE_FUNCTION -Int szslice (const A2D& a) { return static_cast(a.dimension_1()); } - -template -KOKKOS_FORCEINLINE_FUNCTION -static void copy (V dst, CV src, const Int n) { - for (Int i = 0; i < n; ++i) dst[i] = src[i]; -} - -template -void resize_and_copy (DV& d, const SV& s, - typename std::enable_if::type* = 0) { - ko::resize(d, nslices(s)); - ko::deep_copy(d, s); -} - -template -void resize_and_copy (DV& d, const SV& s, - typename std::enable_if::type* = 0) { - ko::resize(d, nslices(s), szslice(s)); - ko::deep_copy(d, s); -} - -template -void hm_resize_and_copy (DV& d, const SA& s, const Int n) { - ko::resize(d, n); - auto d_hm = ko::create_mirror_view(d); - for (Int i = 0; i < n; ++i) d_hm[i] = s[i]; - ko::deep_copy(d, d_hm); -} - -// GPU-friendly replacements for std::min/max. -template KOKKOS_INLINE_FUNCTION -const T& min (const T& a, const T& b) { return a < b ? a : b; } -template KOKKOS_INLINE_FUNCTION -const T& max (const T& a, const T& b) { return a > b ? 
a : b; } -template KOKKOS_INLINE_FUNCTION -void swap (T& a, T&b) { - T tmp = a; - a = b; - b = tmp; -} -} +#include "siqk_geometry.hpp" +#include "siqk_search.hpp" +#include "siqk_intersect.hpp" +#include "siqk_quadrature.hpp" +#include "siqk_sqr.hpp" #endif diff --git a/siqk/siqk_defs.hpp b/siqk/siqk_defs.hpp new file mode 100644 index 0000000..0a0ae89 --- /dev/null +++ b/siqk/siqk_defs.hpp @@ -0,0 +1,200 @@ +#ifndef INCLUDE_SIQK_DEFS_HPP +#define INCLUDE_SIQK_DEFS_HPP + +#include +#include +#include +#include +#include + +#include + +#ifdef SIQK_TIME +# include +# include +# include +#endif + +// Always want this for GPU. +#define SIQK_NONRECURSIVE + +#ifdef KOKKOS_HAVE_CUDA +# define KOKKOS_CONSTANT __constant__ __device__ +#else +# define KOKKOS_CONSTANT +#endif + +namespace siqk { +namespace ko = Kokkos; +#define pr(m) do { \ + std::stringstream _ss_; \ + _ss_ << m << std::endl; \ + std::cerr << _ss_.str(); \ + } while (0) +#define prc(m) pr(#m << " | " << (m)) +#define puf(m)"(" << #m << " " << (m) << ")" +#define pu(m) << " " << puf(m) +template +static void prarr (const std::string& name, const T* const v, const size_t n) { + std::cerr << name << ": "; + for (size_t i = 0; i < n; ++i) std::cerr << " " << v[i]; + std::cerr << "\n"; +} + +#ifdef SIQK_TIME +static timeval tic () { + timeval t; + gettimeofday(&t, 0); + return t; +} +static double calc_et (const timeval& t1, const timeval& t2) { + static const double us = 1.0e6; + return (t2.tv_sec * us + t2.tv_usec - t1.tv_sec * us - t1.tv_usec) / us; +} +static double toc (const timeval& t1) { + Kokkos::fence(); + timeval t; + gettimeofday(&t, 0); + return calc_et(t1, t); +} +static double get_memusage () { + static const double scale = 1.0 / (1 << 10); // Memory in MB. 
+ rusage ru; + getrusage(RUSAGE_SELF, &ru); + return ru.ru_maxrss*scale; +} +#else +static inline int tic () { return 0; } +static inline double toc (const int&) { return 0; } +#endif +static void print_times (const std::string& name, const double* const parts, + const int nparts) { +#ifdef SIQK_TIME + double total = 0; for (int i = 0; i < nparts; ++i) total += parts[i]; + printf("%20s %1.3e s %7.1f MB", name.c_str(), total, get_memusage()); + for (int i = 0; i < nparts; ++i) printf(" %1.3e s", parts[i]); + printf("\n"); +#endif +} +static void print_times (const std::string& name, const double total) { +#ifdef SIQK_TIME + printf("%20s %1.3e s %5.1f MB\n", name.c_str(), total, get_memusage()); +#endif +} + +KOKKOS_INLINE_FUNCTION static void error (const char* const msg) +{ ko::abort(msg); } + +KOKKOS_INLINE_FUNCTION static void message (const char* const msg) +{ printf("%s\n", msg); } + +typedef int Int; +typedef double Real; + +#ifdef KOKKOS_HAVE_CUDA +typedef ko::LayoutLeft Layout; +#else +typedef ko::LayoutRight Layout; +#endif + +// SIQK's array types. +typedef ko::View Vec3s; +typedef ko::View ConstVec3s; +typedef ko::View Vec6s; +typedef ko::View ConstVec6s; +typedef ko::View > RawVec3s; +typedef ko::View > RawConstVec3s; +typedef ko::View > RawArray; +typedef ko::View > RawConstArray; +typedef ko::View Idxs; +typedef ko::View ConstIdxs; +typedef ko::View Nodes; +typedef ko::View ConstNodes; + +// Decorator for a View. UnmanagedView gives the same view as +// ViewType, except the memory is unmanaged. +template +using UnmanagedView = ko::View< + typename ViewT::data_type, typename ViewT::array_layout, + typename ViewT::device_type, ko::MemoryTraits >; + +// Get the host or device version of the array. +template struct InExeSpace { + typedef VT type; +}; +template struct InExeSpace { + typedef typename VT::HostMirror type; +}; + +#ifdef KOKKOS_HAVE_CUDA +// A 1D slice of an array. 
+template KOKKOS_FORCEINLINE_FUNCTION +ko::View > +slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); } +// An explicitly const 1D slice of an array. +template KOKKOS_FORCEINLINE_FUNCTION +ko::View > +const_slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); } +#else +template KOKKOS_FORCEINLINE_FUNCTION +typename VT::value_type* +slice (const VT& v, Int i) { return v.ptr_on_device() + v.dimension_1()*i; } + +template KOKKOS_FORCEINLINE_FUNCTION +typename VT::const_value_type* +const_slice (const VT& v, Int i) { return v.ptr_on_device() + v.dimension_1()*i; } +#endif + +// Number of slices in a 2D array, where each row is a slice. +template KOKKOS_FORCEINLINE_FUNCTION +Int nslices (const A2D& a) { return static_cast(a.dimension_0()); } + +// Number of entries in a 2D array's row. +template KOKKOS_FORCEINLINE_FUNCTION +Int szslice (const A2D& a) { return static_cast(a.dimension_1()); } + +template +KOKKOS_INLINE_FUNCTION +static void copy (V dst, CV src, const Int n) { + for (Int i = 0; i < n; ++i) dst[i] = src[i]; +} + +template +void resize_and_copy (DV& d, const SV& s, + typename std::enable_if::type* = 0) { + ko::resize(d, nslices(s)); + ko::deep_copy(d, s); +} + +template +void resize_and_copy (DV& d, const SV& s, + typename std::enable_if::type* = 0) { + ko::resize(d, nslices(s), szslice(s)); + ko::deep_copy(d, s); +} + +template +void hm_resize_and_copy (DV& d, const SA& s, const Int n) { + ko::resize(d, n); + auto d_hm = ko::create_mirror_view(d); + for (Int i = 0; i < n; ++i) d_hm[i] = s[i]; + ko::deep_copy(d, d_hm); +} + +// GPU-friendly replacements for std::min/max. +template KOKKOS_INLINE_FUNCTION +const T& min (const T& a, const T& b) { return a < b ? a : b; } +template KOKKOS_INLINE_FUNCTION +const T& max (const T& a, const T& b) { return a > b ? 
a : b; } +template KOKKOS_INLINE_FUNCTION +void swap (T& a, T&b) { + T tmp = a; + a = b; + b = tmp; +} +template KOKKOS_INLINE_FUNCTION constexpr T square (const T& x) { return x*x; } +} + +#endif diff --git a/siqk/siqk_geometry.hpp b/siqk/siqk_geometry.hpp new file mode 100644 index 0000000..0fc7fb6 --- /dev/null +++ b/siqk/siqk_geometry.hpp @@ -0,0 +1,287 @@ +#ifndef INCLUDE_SIQK_GEOMETRY_HPP +#define INCLUDE_SIQK_GEOMETRY_HPP + +#include "siqk_defs.hpp" +#include "siqk_quadrature.hpp" + +namespace siqk { + +struct PlaneGeometry { + template KOKKOS_INLINE_FUNCTION + static void scale (const Real& a, V v) { + v[0] *= a; v[1] *= a; + } + template KOKKOS_INLINE_FUNCTION + static Real dot_c_amb (const CV c, const CV a, const CV b) { + return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]); + } + template KOKKOS_INLINE_FUNCTION + static void combine (const CV u, const CV v, const Real& a, V x) { + const Real& oma = 1 - a; + x[0] = oma*u[0] + a*v[0]; + x[1] = oma*u[1] + a*v[1]; + } + + template KOKKOS_INLINE_FUNCTION + static void edge_normal (const CV e1, const CV e2, V en) { + en[0] = e1[1] - e2[1]; + en[1] = e2[0] - e1[0]; + } + + template KOKKOS_INLINE_FUNCTION + static bool inside (const CV v, const CV e1, const CV en) { + return dot_c_amb(en, v, e1) >= 0; + } + + template KOKKOS_INLINE_FUNCTION + static void intersect (const CV v1, const CV v2, const CV e1, const CV en, + V intersection) { + Real a; { + const Real + num = dot_c_amb(en, e1, v1), + den = dot_c_amb(en, v2, v1); + a = num == 0 || den == 0 ? 0 : num/den; + a = a < 0 ? 0 : a > 1 ? 
1 : a; + } + combine(v1, v2, a, intersection); + } + + template KOKKOS_INLINE_FUNCTION + static bool output (const CV v, Int& no, const V vo) { +#ifdef SIQK_DEBUG + if (no >= nslices(vo)) { + std::stringstream ss; + ss << "output: No room in vo; vo.n() is " << nslices(vo) << " but no is " + << no << "\n"; + message(ss.str().c_str()); + } +#endif + if (no >= nslices(vo)) return false; + vo(no,0) = v[0]; + vo(no,1) = v[1]; + ++no; + return true; + } + + //todo Handle non-convex case. + template + KOKKOS_INLINE_FUNCTION + static Real calc_area (const TriangleQuadrature& , const CV3s& v, + const Int n) { + return calc_area_formula(v, n); + } + + template + KOKKOS_INLINE_FUNCTION + static Real calc_area_formula (const CV3s& v, const Int n) { + Real area = 0; + for (Int i = 1, ilim = n - 1; i < ilim; ++i) { + Real v1[2], v2[2]; + v1[0] = v(i,0) - v(0,0); + v1[1] = v(i,1) - v(0,1); + v2[0] = v(i+1,0) - v(0,0); + v2[1] = v(i+1,1) - v(0,1); + const Real a = v1[0]*v2[1] - v1[1]*v2[0]; + area += a; + } + return 0.5*area; + } +}; + +// All inputs and outputs are relative to the unit-radius sphere. 
+struct SphereGeometry { + template KOKKOS_INLINE_FUNCTION + static void cross (const CV a, const CV b, V c) { + c[0] = a[1]*b[2] - a[2]*b[1]; + c[1] = a[2]*b[0] - a[0]*b[2]; + c[2] = a[0]*b[1] - a[1]*b[0]; + } + template KOKKOS_INLINE_FUNCTION + static Real dot (const CV a, const CV b) { + return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; + } + template KOKKOS_INLINE_FUNCTION + static Real norm2 (const CV v) { + return dot(v, v); + } + template KOKKOS_INLINE_FUNCTION + static void scale (const Real& a, V v) { + v[0] *= a; v[1] *= a; v[2] *= a; + } + template KOKKOS_INLINE_FUNCTION + static void normalize (V v) { + scale(1.0/std::sqrt(norm2(v)), v); + } + template KOKKOS_INLINE_FUNCTION + static Real dot_c_amb (const CV c, const CV a, const CV b) { + return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]) + c[2]*(a[2] - b[2]); + } + template KOKKOS_INLINE_FUNCTION + static void axpy (const Real& a, const CV x, V y) { + y[0] += a*x[0]; + y[1] += a*x[1]; + y[2] += a*x[2]; + } + template KOKKOS_INLINE_FUNCTION + static void axpbyz (const Real& a, const CV x, const Real& b, const CV y, + V z) { + z[0] = a*x[0] + b*y[0]; + z[1] = a*x[1] + b*y[1]; + z[2] = a*x[2] + b*y[2]; + } + template KOKKOS_INLINE_FUNCTION + static void copy (V d, const CV s) { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + } + template KOKKOS_INLINE_FUNCTION + static void combine (const CV u, const CV v, const Real& a, V x) { + const Real& oma = 1 - a; + x[0] = oma*u[0] + a*v[0]; + x[1] = oma*u[1] + a*v[1]; + x[2] = oma*u[2] + a*v[2]; + } + + template KOKKOS_INLINE_FUNCTION + static void edge_normal (const CV a, const CV b, V en) { + cross(a, b, en); + normalize(en); + } + + // Is v inside the line anchored at a with inward-facing normal n? + template KOKKOS_INLINE_FUNCTION + static bool inside (const CV v, const CV a, const CV n) { + return dot_c_amb(n, v, a) >= 0; + } + + /* Let + en = edge normal + e1 = edge starting point + d = en' e1 + v(a) = (1 - a) v1 + a v2. 
+ Solve n' v = d for a: + a = (en' (e1 - v1)) / (en' (v2 - v1)). + Then uvec(v(a)) is the intersection point on the unit sphere. Assume + intersection exists. (Already filtered by 'inside'.) + */ + template KOKKOS_INLINE_FUNCTION + static void intersect (const CV v1, const CV v2, const CV e1, const CV en, + V intersection) { + Real a; { + const Real + num = dot_c_amb(en, e1, v1), + den = dot_c_amb(en, v2, v1); + a = num == 0 || den == 0 ? 0 : num/den; + a = a < 0 ? 0 : a > 1 ? 1 : a; + } + combine(v1, v2, a, intersection); + normalize(intersection); + } + + template KOKKOS_INLINE_FUNCTION + static bool output (const CV v, Int& no, V vo) { +#ifdef SIQK_DEBUG + if (no >= nslices(vo)) { + std::stringstream ss; + ss << "output: No room in vo; vo.n() is " << nslices(vo) << " but no is " + << no << "\n"; + message(ss.str().c_str()); + } +#endif + if (no >= nslices(vo)) return false; + vo(no,0) = v[0]; + vo(no,1) = v[1]; + vo(no,2) = v[2]; + ++no; + return true; + } + + //todo Handle non-convex case. + // This uses a terrible formula, but it's just for testing. 
+ template + KOKKOS_INLINE_FUNCTION + static Real calc_area_formula (const CV3s& v, const Int n) { + Real area = 0; + for (Int i = 1, ilim = n - 1; i < ilim; ++i) { + const Real a = calc_arc_length(slice(v,0), slice(v,i)); + const Real b = calc_arc_length(slice(v,i), slice(v,i+1)); + const Real c = calc_arc_length(slice(v,i+1), slice(v,0)); + const Real s = 0.5*(a + b + c); + const Real d = (std::tan(0.5*s)*std::tan(0.5*(s-a))* + std::tan(0.5*(s-b))*std::tan(0.5*(s-c))); + if (d <= 0) continue; + area += 4*std::atan(std::sqrt(d)); + } + return area; + } + template KOKKOS_INLINE_FUNCTION + static Real calc_arc_length (const CV a, const CV b) { + const Real d = dot(a, b); + if (d >= 1) return 0; + return acos(d); + } + + template + KOKKOS_INLINE_FUNCTION + static Real calc_area (const TriangleQuadrature& q, const CV3s& v, + const Int n) { + Real area = 0, u[3]; + for (Int i = 1, ilim = n - 1; i < ilim; ++i) { + Real a = 0; + RawConstVec3s coord; + RawConstArray weight; + q.get_coef(8, coord, weight); + for (Int k = 0, klim = nslices(coord); k < klim; ++k) { + const Real jac = calc_tri_jacobian(slice(v,0), slice(v,i), slice(v,i+1), + slice(coord, k), u); + a += weight[k]*jac; + } + area += 0.5*a; + } + return area; + } + + template + KOKKOS_INLINE_FUNCTION + static Real calc_tri_jacobian (const CV v1, const CV v2, const CV v3, + const CA alpha, Real u[3]) { + // V(:,i) is vertex i of the spherical triangle on the unit sphere. The + // coefs + // alpha = [a1, a2, 1 - a1 - a2]' + // = [1 0; 0 1; -1 -1] [a1, a2]' + // = alpha_a a + // (barycentric coords) give the location + // v = V alpha + // on the planar triangle, and u = uvec(v) is the point on the unit sphere. + // For a planar tri in 3D, the jacobian is + // v_a = v_alpha alpha_a + // = V [1 0; 0 1; -1 -1] + // J = norm(cross(v_a(:,1), v_a(:,2))). 
+ // For a spherical tri with the same vertices, + // u = v/(v' v)^{1/2} + // u_a = u_alpha alpha_a + // = (v'v)^{-1/2} (I - u u') V alpha_a + // = (v'v)^{-1/2} (I - u u') v_a + // J = norm(cross(u_a(:,1), u_a(:,2))). + for (Int k = 0; k < 3; ++k) u[k] = 0; + axpy(alpha[0], v1, u); + axpy(alpha[1], v2, u); + axpy(alpha[2], v3, u); + const auto oovn = 1/std::sqrt(norm2(u)); + scale(oovn, u); + Real u_a[3][3]; + axpbyz(1, v1, -1, v3, u_a[0]); + axpbyz(1, v2, -1, v3, u_a[1]); + for (int i = 0; i < 2; ++i) { + axpy(-dot(u, u_a[i]), u, u_a[i]); + scale(oovn, u_a[i]); + } + cross(u_a[0], u_a[1], u_a[2]); + return std::sqrt(norm2(u_a[2])); + } +}; + +} // namespace siqk + +#endif diff --git a/siqk/siqk_intersect.hpp b/siqk/siqk_intersect.hpp index 5d008bd..fbc6dfd 100644 --- a/siqk/siqk_intersect.hpp +++ b/siqk/siqk_intersect.hpp @@ -1,256 +1,12 @@ #ifndef INCLUDE_SIQK_INTERSECT_HPP #define INCLUDE_SIQK_INTERSECT_HPP -#include "siqk.hpp" +#include "siqk_defs.hpp" +#include "siqk_geometry.hpp" +#include "siqk_search.hpp" #include "siqk_quadrature.hpp" namespace siqk { -struct PlaneGeometry { - template KOKKOS_INLINE_FUNCTION - static void scale (const Real& a, V v) { - v[0] *= a; v[1] *= a; - } - template KOKKOS_INLINE_FUNCTION - static Real dot_c_amb (const CV c, const CV a, const CV b) { - return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]); - } - template KOKKOS_INLINE_FUNCTION - static void combine (const CV u, const CV v, const Real& a, V x) { - const Real& oma = 1 - a; - x[0] = oma*u[0] + a*v[0]; - x[1] = oma*u[1] + a*v[1]; - } - - template KOKKOS_INLINE_FUNCTION - static void edge_normal (const CV e1, const CV e2, V en) { - en[0] = e1[1] - e2[1]; - en[1] = e2[0] - e1[0]; - } - - template KOKKOS_INLINE_FUNCTION - static bool inside (const CV v, const CV e1, const CV en) { - return dot_c_amb(en, v, e1) >= 0; - } - - template KOKKOS_INLINE_FUNCTION - static void intersect (const CV v1, const CV v2, const CV e1, const CV en, - V intersection) { - const Real& a = 
dot_c_amb(en, e1, v1) / dot_c_amb(en, v2, v1); - combine(v1, v2, a, intersection); - } - - template KOKKOS_INLINE_FUNCTION - static bool output (const CV v, Int& no, const V& vo) { -#ifdef SIQK_DEBUG - if (no >= nslices(vo)) { - std::stringstream ss; - ss << "output: No room in vo; vo.n() is " << vo.n() << " but no is " - << no << "\n"; - error(ss.str().c_str()); - } -#endif - if (no >= nslices(vo)) return false; - vo(no,0) = v[0]; - vo(no,1) = v[1]; - ++no; - return true; - } - - //todo Handle non-convex case. - template - KOKKOS_INLINE_FUNCTION - static Real calc_area (const CV3s& v, const Int n) { - Real area = 0; - for (Int i = 1, ilim = n - 1; i < ilim; ++i) { - Real v1[2], v2[2]; - v1[0] = v(i,0) - v(0,0); - v1[1] = v(i,1) - v(0,1); - v2[0] = v(i+1,0) - v(0,0); - v2[1] = v(i+1,1) - v(0,1); - const Real a = v1[0]*v2[1] - v1[1]*v2[0]; - area += a; - } - return 0.5*area; - } - - template - KOKKOS_INLINE_FUNCTION - static Real calc_area_formula (const CV3s& v, const Int n) { - return calc_area(v, n); - } -}; - -// All inputs and outputs are relative to the unit-radius sphere. 
-struct SphereGeometry { - template KOKKOS_INLINE_FUNCTION - static void cross (const CV a, const CV b, V c) { - c[0] = a[1]*b[2] - a[2]*b[1]; - c[1] = a[2]*b[0] - a[0]*b[2]; - c[2] = a[0]*b[1] - a[1]*b[0]; - } - template KOKKOS_INLINE_FUNCTION - static Real dot (const CV a, const CV b) { - return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; - } - template KOKKOS_INLINE_FUNCTION - static Real norm2 (const CV v) { - return dot(v, v); - } - template KOKKOS_INLINE_FUNCTION - static void scale (const Real& a, V v) { - v[0] *= a; v[1] *= a; v[2] *= a; - } - template KOKKOS_INLINE_FUNCTION - static void normalize (V v) { - scale(1.0/std::sqrt(norm2(v)), v); - } - template KOKKOS_INLINE_FUNCTION - static Real dot_c_amb (const CV c, const CV a, const CV b) { - return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]) + c[2]*(a[2] - b[2]); - } - template KOKKOS_INLINE_FUNCTION - static void axpy (const Real& a, const CV& x, V& y) { - y[0] += a*x[0]; - y[1] += a*x[1]; - y[2] += a*x[2]; - } - template KOKKOS_INLINE_FUNCTION - static void axpbyz (const Real& a, const CV& x, const Real& b, const CV& y, - V& z) { - z[0] = a*x[0] + b*y[0]; - z[1] = a*x[1] + b*y[1]; - z[2] = a*x[2] + b*y[2]; - } - template KOKKOS_INLINE_FUNCTION - static void combine (const CV& u, const CV& v, const Real& a, V& x) { - const Real& oma = 1 - a; - x[0] = oma*u[0] + a*v[0]; - x[1] = oma*u[1] + a*v[1]; - x[2] = oma*u[2] + a*v[2]; - } - - template KOKKOS_INLINE_FUNCTION - static void edge_normal (const CV a, const CV b, V en) { - cross(a, b, en); - normalize(en); - } - - // Is v inside the line anchored at a with inward-facing normal n? - template KOKKOS_INLINE_FUNCTION - static bool inside (const CV& v, const CV& a, const CV& n) { - return dot_c_amb(n, v, a) >= 0; - } - - /* Let - en = edge normal - e1 = edge starting point - d = en' e1 - v(a) = (1 - a) v1 + a v2. - Solve n' v = d for a: - a = (en' (e1 - v1)) / (en' (v2 - v1)). - Then uvec(v(a)) is the intersection point on the unit sphere. Assume - intersection exists. 
(Already filtered by 'inside'.) - */ - template KOKKOS_INLINE_FUNCTION - static void intersect (const CV v1, const CV v2, const CV e1, const CV en, - V intersection) { - const Real a = dot_c_amb(en, e1, v1) / dot_c_amb(en, v2, v1); - combine(v1, v2, a, intersection); - normalize(intersection); - } - - template KOKKOS_INLINE_FUNCTION - static bool output (const CV v, Int& no, V& vo) { - if (no >= nslices(vo)) return false; - vo(no,0) = v[0]; - vo(no,1) = v[1]; - vo(no,2) = v[2]; - ++no; - return true; - } - - //todo Handle non-convex case. - // This uses a terrible formula, but it's just for testing. - template - KOKKOS_INLINE_FUNCTION - static Real calc_area_formula (const CV3s& v, const Int n) { - Real area = 0; - for (Int i = 1, ilim = n - 1; i < ilim; ++i) { - const Real a = calc_arc_length(slice(v,0), slice(v,i)); - const Real b = calc_arc_length(slice(v,i), slice(v,i+1)); - const Real c = calc_arc_length(slice(v,i+1), slice(v,0)); - const Real s = 0.5*(a + b + c); - const Real d = (std::tan(0.5*s)*std::tan(0.5*(s-a))* - std::tan(0.5*(s-b))*std::tan(0.5*(s-c))); - if (d <= 0) continue; - area += 4*std::atan(std::sqrt(d)); - } - return area; - } - template KOKKOS_INLINE_FUNCTION - static Real calc_arc_length (const CV a, const CV b) { - const Real d = dot(a, b); - if (d >= 1) return 0; - return acos(d); - } - - template - KOKKOS_INLINE_FUNCTION - static Real calc_area (const CV3s& v, const Int n) { - Real area = 0; - for (Int i = 1, ilim = n - 1; i < ilim; ++i) { - Real a = 0; - RawConstVec3s coord; - RawConstArray weight; - quadrature::get_coef(4, coord, weight); - for (Int k = 0, klim = nslices(coord); k < klim; ++k) { - const Real jac = calc_tri_jacobian(slice(v,0), slice(v,i), slice(v,i+1), - slice(coord, k)); - a += weight[k]*jac; - } - area += 0.5*a; - } - return area; - } - template - KOKKOS_INLINE_FUNCTION - static Real calc_tri_jacobian (const CV& v1, const CV& v2, const CV& v3, - const CA& alpha) { - // V(:,i) is vertex i of the spherical triangle on 
the unit sphere. The - // coefs - // alpha = [a1, a2, 1 - a1 - a2]' - // = [1 0; 0 1; -1 -1] [a1, a2]' - // = alpha_a a - // (barycentric coords) give the location - // v = V alpha - // on the planar triangle, and u = uvec(v) is the point on the unit sphere. - // For a planar tri in 3D, the jacobian is - // v_a = v_alpha alpha_a - // = V [1 0; 0 1; -1 -1] - // J = norm(cross(v_a(:,1), v_a(:,2))). - // For a spherical tri with the same vertices, - // u = v/(v' v) - // u_a = u_alpha alpha_a - // = (v'v)^{-1/2} (I - u u') V alpha_a - // = (v'v)^{-1/2} (I - u u') v_a - // J = norm(cross(u_a(:,1), u_a(:,2))). - Real u[3] = {0}; - axpy(alpha[0], v1, u); - axpy(alpha[1], v2, u); - axpy(alpha[2], v3, u); - const auto oovn = 1/std::sqrt(norm2(u)); - scale(oovn, u); - Real u_a[2][3]; - axpbyz(1, v1, -1, v3, u_a[0]); - axpbyz(1, v2, -1, v3, u_a[1]); - for (int i = 0; i < 2; ++i) { - axpy(-dot(u, u_a[i]), u, u_a[i]); - scale(oovn, u_a[i]); - } - cross(u_a[0], u_a[1], u); - return std::sqrt(norm2(u)); - } -}; // Sutherland-Hodgmann polygon clipping algorithm. Follow Foley, van Dam, // Feiner, Hughes Fig 3.49. @@ -379,14 +135,15 @@ bool clip_against_poly ( // Not used for real stuff; just a convenient version for testing. In this // version, clip_poly is a list of clip polygon vertices. This is instead of the // mesh data structure. -template +template KOKKOS_INLINE_FUNCTION bool clip_against_poly ( // Clip polygon. - const CV3s& clip_poly, + const CV3s_CP& clip_poly, // Clip polygon edges' inward-facing normals. - const CV3s& clip_edge_normals, - const CV3s& vi, const Int ni, + const CV3s_CEN& clip_edge_normals, + const CV3s_VI& vi, const Int ni, V3s& vo, Int& no, V3s& wrk) { @@ -422,347 +179,6 @@ bool clip_against_poly ( } } // namespace sh -// Oct-tree. Might do something else better suited to the sphere later. 
-template -class Octree { -public: - enum { max_depth = max_depth_ }; - typedef Real BoundingBox[6]; - - struct Options { - // Do not go beyond max_depth_ depth, including the root and leaf. With this - // constraInt, try to go deep enough so that a leaf has no more than - // max_nelem elements. - Int max_nelem; - Options () : max_nelem(8) {} - }; - - // Bounding box for a cluster of points ps (possibly vertices). - //todo kernelize - template - static void calc_bb (const CV3s& ps, const Int np, BoundingBox bb) { - if (np == 0) return; - for (Int j = 0; j < 3; ++j) - bb[j] = bb[j+3] = ps(0,j); - for (Int i = 1; i < np; ++i) - for (Int j = 0; j < 3; ++j) { - bb[j] = min(bb[j], ps(i,j)); - bb[j+3] = max(bb[j+3], ps(i,j)); - } - } - - template - static void calc_bb (const CV3s& ps, BoundingBox bb) { - calc_bb(ps, nslices(ps), bb); - } - - template - KOKKOS_INLINE_FUNCTION - static void calc_bb (const CV3s& p, const CIV& e, const Int ne, V ebb) { - for (Int j = 0; j < 3; ++j) - ebb[j] = ebb[j+3] = p(e[0], j); - for (Int i = 1; i < ne; ++i) { - if (e[i] == -1) break; - for (Int j = 0; j < 3; ++j) { - ebb[j] = min(ebb[j], p(e[i], j)); - ebb[j+3] = max(ebb[j+3], p(e[i], j)); - } - } - } - - //todo kernelize - template - static void calc_bb (const CV3s& p, const CIs& e, V6s& ebbs) { - assert(nslices(ebbs) == nslices(e)); - for (Int k = 0, klim = nslices(e); k < klim; ++k) - calc_bb(p, slice(e, k), szslice(e), slice(ebbs, k)); - } - - // p is a 3xNp array of points. e is a KxNe array of elements. An entry <0 is - // ignored. All <0 entries must be at the end of an element's list. - Octree (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, - const Options& o) { - init(p, e, o); - } - Octree (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) { - Options o; - init(p, e, o); - } - - // Apply f to every element in leaf nodes with which bb overlaps. f must have - // function - // void operator(const Int element). 
- template - KOKKOS_INLINE_FUNCTION - void apply (const CV bb, Functor& f) const { - if (nslices(nodes_) == 0) { - for (Int i = 0; i < offsets_[1]; ++i) - f(elems_[i]); - return; - } -#ifdef SIQK_NONRECURSIVE - // Non-recursive impl. - { - // Stack. - Real snbb[8*max_depth_]; - Int sni[max_depth_], si[max_depth_]; - Int sp = 0; - // Args for top-level call. - copy(snbb, bb_, 8); - sni[sp] = 0; - si[sp] = 0; - while (sp >= 0) { - // Get stack frame's (nbb, ni, current i) values. - const Int i = si[sp]; - if (i == 8) { - --sp; - continue; - } - // Increment stored value of i for next iteration. Current value is - // stored in 'i' above. - ++si[sp]; - const Int ni = sni[sp]; - const Real* const nbb = snbb + 8*sp; - // Can use the next stack frame's bb space for a child bb. - Real* const child_bb = snbb + 8*(sp+1); - fill_child_bb(nbb, i, child_bb); - if ( ! do_bb_overlap(child_bb, bb)) continue; - Int e = nodes_(ni,i); - if (e < 0) { - // Leaf, so apply functor to each element. - e = std::abs(e + 1); - for (Int k = offsets_[e]; k < offsets_[e+1]; ++k) - f(elems_[k]); - } else if (e > 0) { - // Recurse. - ++sp; - sni[sp] = e; - si[sp] = 0; - } - } - } -#else - apply_r(0, bb_, bb, f); -#endif - } - -private: - /* Each node in the oct-tree contains 8 integers, stored in 'nodes'. - - >0 is an index Into 'nodes', pointing to a child node. - - A <=0 entry in 'nodes' indicates a leaf node. If 0, there are no elements - in the leaf. If <0, the negative of the entry minus 1 is the index of an - offset array indexing 'elems'. - - Each segment of 'elems' contains a list of element indices covered by a - leaf node. Element indices refer to the list of elements the caller - provides during oct-tree construction. - */ - - // Static data structures holding the completed octree. - // nodes(:,i) is a list. The list includes children of node i (>0) and leaf - // node data (<=0). - //todo Make these const once ready to do full GPU stuff. 
- Nodes nodes_; - // A leaf node corresponding to -k covers elements - // elems[offset[k] : offset[k]-1]. - ko::View offsets_, elems_; - // Root node's bounding box. - BoundingBox bb_; - - // Dynamic data structures for construction phase. - class IntList { - Int* const buf_; - Int i_; - public: - IntList (Int* const buf) : buf_(buf), i_(0) {} - void reset () { i_ = 0; } - void push (const Int& i) { buf_[i_++] = i; } - Int* data () { return buf_; } - Int n () const { return i_; } - const Int& operator[] (const Int& i) const { return buf_[i]; } - }; - - class DynIntList { - std::vector buf_; - public: - DynIntList () {} - void push (const Int& i) { buf_.push_back(i); } - Int& back () { return buf_.back(); } - Int& operator[] (const size_t i) { - if (i >= buf_.size()) - buf_.resize(i+1); - return buf_[i]; - } - const Int& operator[] (const size_t i) const { return buf_[i]; } - Int n () const { return static_cast(buf_.size()); } - const Int* data () const { return buf_.data(); } - }; - - // Opposite index slot convention. - class DynNodes { - std::vector buf_; - public: - Int n () const { return static_cast(buf_.size()) >> 3; } - const Int* data () const { return buf_.data(); } - Int& operator() (const Int& r, const Int& c) { - const size_t ec = (c+1) << 3; - if (ec >= buf_.size()) - buf_.resize(ec); - return const_cast( - const_cast(this)->operator()(r, c)); - } - const Int& operator() (const Int& r, const Int& c) const { - assert(((c << 3) + r) >= 0); - assert(((c << 3) + r) < (Int) buf_.size()); - return buf_[(c << 3) + r]; - } - }; - - void init (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, - const Options& o) { - if (nslices(e) == 0) return; - // Get OT's bounding box. - calc_bb(p, bb_); - // Get elements' bounding boxes. - Vec6s::HostMirror ebbs("ebbs", nslices(e), 6); - calc_bb(p, e, ebbs); - // Static element lists for work. Each level has active work space. 
- std::vector buf(max_depth_*nslices(e)); - IntList es(buf.data()), wrk(buf.data() + nslices(e)); - for (Int i = 0, ilim = nslices(e); i < ilim; ++i) - es.push(i); - // Dynamic element lists. - DynIntList offsets, elems; - offsets[0] = 0; - // Dynamic node data structure. - DynNodes nodes; - // Recurse. We don't care about the return value. If it's 0 and nodes.n() == - // 0, we'll detect as much in 'apply'. - init_r(1, bb_, ebbs, o, es, wrk, offsets, elems, nodes); - // Build the static data structures. - if (elems.n() == 0) return; - init_static_ds(nodes, offsets, elems); - } - - Int init_r (const Int depth, // Tree's depth at this point, including root. - const BoundingBox& nbb, // My bounding box. - const ConstVec6s::HostMirror& ebbs, // All elements' bounding boxes. - const Options& o, // Options controlling construct of the tree. - IntList& es, // List of elements in my bounding box. - IntList& wrk, // Work space to store working element lists. - DynIntList& offsets, // Offsetss Into elems. - DynIntList& elems, // Elements belonging to leaf nodes. - DynNodes& nodes) // Dynamic nodes data structure. - { - const Int my_idx = nodes.n(); // My node index. - // Decide what to do. - if (es.n() == 0) { - // I have no elements, so return 0 to indicate I'm a leaf node containing - // nothing. - return 0; - } else if (es.n() <= o.max_nelem || depth == max_depth_) { - // I'm a leaf node with elements. Store my list of elements and return the - // storage location. - const Int os = offsets.back(); - offsets.push(os + es.n()); - for (Int i = 0, n = es.n(); i < n; ++i) - elems[os + i] = es[i]; - return 1 - offsets.n(); - } else { - // I'm not a leaf node. - nodes(0, my_idx) = 0; // Insert myself Into the nodes array. - for (Int ic = 0; ic < 8; ++ic) { - BoundingBox child_bb; - fill_child_bb(nbb, ic, child_bb); - // Find the elements that are in this child's bb. 
- IntList ces(wrk.data()); - for (Int i = 0, n = es.n(); i < n; ++i) - if (do_bb_overlap(child_bb, slice(ebbs, es[i]))) - ces.push(es[i]); - // Create some work space. - IntList cwrk(wrk.data() + ces.n()); - // Recurse. - const Int child_idx = init_r(depth+1, child_bb, ebbs, o, ces, cwrk, - offsets, elems, nodes); - nodes(ic, my_idx) = child_idx; - } - return my_idx; - } - } - - void init_static_ds (const DynNodes nodes, const DynIntList& offsets, - const DynIntList& elems) { - { - ko::resize(nodes_, nodes.n(), 8); - auto nodes_hm = ko::create_mirror_view(nodes_); - for (Int i = 0; i < nodes.n(); ++i) - for (Int j = 0; j < 8; ++j) - nodes_hm(i,j) = nodes(j,i); - ko::deep_copy(nodes_, nodes_hm); - } - hm_resize_and_copy(offsets_, offsets, offsets.n()); - hm_resize_and_copy(elems_, elems, elems.n()); - } - - // Using parent bb p, fill child bb c, with child_idx in 0:7. - template - KOKKOS_INLINE_FUNCTION - static void fill_child_bb (const CBB& p, const Int& child_idx, BB& c) { - const Real m[] = { 0.5*(p[0] + p[3]), - 0.5*(p[1] + p[4]), - 0.5*(p[2] + p[5]) }; - switch (child_idx) { - case 0: c[0] = p[0]; c[1] = p[1]; c[2] = p[2]; c[3] = m[0]; c[4] = m[1]; c[5] = m[2]; break; - case 1: c[0] = m[0]; c[1] = p[1]; c[2] = p[2]; c[3] = p[3]; c[4] = m[1]; c[5] = m[2]; break; - case 2: c[0] = m[0]; c[1] = m[1]; c[2] = p[2]; c[3] = p[3]; c[4] = p[4]; c[5] = m[2]; break; - case 3: c[0] = p[0]; c[1] = m[1]; c[2] = p[2]; c[3] = m[0]; c[4] = p[4]; c[5] = m[2]; break; - case 4: c[0] = p[0]; c[1] = p[1]; c[2] = m[2]; c[3] = m[0]; c[4] = m[1]; c[5] = p[5]; break; - case 5: c[0] = m[0]; c[1] = p[1]; c[2] = m[2]; c[3] = p[3]; c[4] = m[1]; c[5] = p[5]; break; - case 6: c[0] = m[0]; c[1] = m[1]; c[2] = m[2]; c[3] = p[3]; c[4] = p[4]; c[5] = p[5]; break; - case 7: c[0] = p[0]; c[1] = m[1]; c[2] = m[2]; c[3] = m[0]; c[4] = p[4]; c[5] = p[5]; break; - default: - // impossible - error("fill_child_bb: The impossible has happened."); - } - } - - // Do bounding boxes a and b overlap? 
- template - KOKKOS_INLINE_FUNCTION - static bool do_bb_overlap (const BoundingBox a, const BB b) { - for (Int i = 0; i < 3; ++i) - if ( ! do_lines_overlap(a[i], a[i+3], b[i], b[i+3])) - return false; - return true; - } - - KOKKOS_INLINE_FUNCTION - static bool do_lines_overlap (const Real& a1, const Real& a2, - const Real& b1, const Real& b2) { - return ! (a2 < b1 || a1 > b2); - } - - template KOKKOS_INLINE_FUNCTION - void apply_r (const Int ni, const BoundingBox& nbb, const CV bb, - Functor& f) const { - for (Int i = 0; i < 8; ++i) { - BoundingBox child_bb; - fill_child_bb(nbb, i, child_bb); - if ( ! do_bb_overlap(child_bb, bb)) continue; - Int e = nodes_(ni,i); - if (e > 0) - apply_r(e, child_bb, bb, f); - else if (e < 0) { - e = std::abs(e + 1); - for (Int k = offsets_[e]; k < offsets_[e+1]; ++k) - f(elems_[k]); - } - } - } -}; - namespace test { static constexpr Int max_nvert = 20; static constexpr Int max_hits = 25; // Covers at least a 2-halo. @@ -802,6 +218,7 @@ void fill_normals (sh::Mesh& m) { // be small and static. Need to think about this. template class AreaOTFunctor { + const TriangleQuadrature quad_; const sh::Mesh<>& cm_; const ConstVec3s& p_; const ConstIdxs& e_; @@ -842,7 +259,7 @@ class AreaOTFunctor { ++ni; } sh::clip_against_poly(cm_, mesh_elem_idx, vi, ni, vo, no, wrk); - if (no) area_ += geo::calc_area(vo, no); + if (no) area_ += geo::calc_area(quad_, vo, no); } if (no) { // Non-0 intersection, so record. 
@@ -854,10 +271,8 @@ class AreaOTFunctor { KOKKOS_INLINE_FUNCTION const Real& area () const { return area_; } }; -template -class TestAreaOTFunctor { - typedef Octree OctreeT; - +template +class TestAreaOTKernel { const sh::Mesh<> cm_; const OctreeT ot_; mutable ConstVec3s p_; @@ -866,9 +281,9 @@ class TestAreaOTFunctor { public: typedef Real value_type; - TestAreaOTFunctor (const sh::Mesh& cm, - const ConstVec3s::HostMirror& p_hm, - const ConstIdxs::HostMirror& e_hm, const OctreeT& ot) + TestAreaOTKernel (const sh::Mesh& cm, + const ConstVec3s::HostMirror& p_hm, + const ConstIdxs::HostMirror& e_hm, const OctreeT& ot) : cm_(cm), ot_(ot) { { Vec3s p; resize_and_copy(p, p_hm); p_ = p; } @@ -886,34 +301,35 @@ class TestAreaOTFunctor { ot_.apply(ebb, f); area += f.area(); } + + KOKKOS_INLINE_FUNCTION + void join (volatile value_type& dst, volatile value_type const& src) const + { dst += src; } }; template Real test_area_ot ( const ConstVec3s::HostMirror& cp, const ConstIdxs::HostMirror& ce, const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) { - typedef Octree<10> OctreeT; + typedef Octree OctreeT; // Clip mesh and edge normal calculation. (In practice, we'd like to use // higher-quality edge normals.) sh::Mesh cm; cm.p = cp; cm.e = ce; fill_normals(cm); - Real et[2]; + Real et[2] = {0}; auto t = tic(); // Oct-tree over the clip mesh. 
OctreeT ot(cp, ce); et[0] = toc(t); Real area = 0; - TestAreaOTFunctor f(cm, p, e, ot); + TestAreaOTKernel f(cm, p, e, ot); t = tic(); ko::parallel_reduce(nslices(e), f, area); et[1] = toc(t); -#ifdef SIQK_TIME - printf("%10d", nslices(ce)); print_times("test_area_ot", et, 2); -#endif return area; } } // namespace test diff --git a/siqk/siqk_quadrature.hpp b/siqk/siqk_quadrature.hpp index 2684fe9..8f2a300 100644 --- a/siqk/siqk_quadrature.hpp +++ b/siqk/siqk_quadrature.hpp @@ -1,10 +1,14 @@ #ifndef INCLUDE_SIQK_QUADRATURE_HPP #define INCLUDE_SIQK_QUADRATURE_HPP -#include "siqk.hpp" +#include "siqk_defs.hpp" namespace siqk { -namespace quadrature { + +/* See, e.g., + Zhang, Linbo, Tao Cui, and Hui Liu. "A set of symmetric quadrature rules on + triangles and tetrahedra." J. of Computational Mathematics (2009): 89-96. +*/ #define SIQK_QUADRATURE_TRISYM_ORDER4_COORD \ {0.108103018168070, 0.445948490915965, 0.445948490915965, \ 0.445948490915965, 0.108103018168070, 0.445948490915965, \ @@ -41,51 +45,282 @@ namespace quadrature { 0.027230314174435, 0.027230314174435, 0.027230314174435, \ 0.027230314174435} -namespace host { -static const Real trisym_order4_coord[] = SIQK_QUADRATURE_TRISYM_ORDER4_COORD; -static const Real trisym_order4_weight[] = SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT; -static const Real trisym_order8_coord[] = SIQK_QUADRATURE_TRISYM_ORDER8_COORD; -static const Real trisym_order8_weight[] = SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT; -} +#define SIQK_QUADRATURE_TRISYM_ORDER12_COORD \ + {0.023565220452390, 0.488217389773805, 0.488217389773805, \ + 0.488217389773805, 0.023565220452390, 0.488217389773805, \ + 0.488217389773805, 0.488217389773805, 0.023565220452390, \ + 0.120551215411079, 0.439724392294460, 0.439724392294460, \ + 0.439724392294460, 0.120551215411079, 0.439724392294460, \ + 0.439724392294460, 0.439724392294460, 0.120551215411079, \ + 0.457579229975768, 0.271210385012116, 0.271210385012116, \ + 0.271210385012116, 0.457579229975768, 0.271210385012116, \ 
+ 0.271210385012116, 0.271210385012116, 0.457579229975768, \ + 0.744847708916828, 0.127576145541586, 0.127576145541586, \ + 0.127576145541586, 0.744847708916828, 0.127576145541586, \ + 0.127576145541586, 0.127576145541586, 0.744847708916828, \ + 0.957365299093576, 0.021317350453210, 0.021317350453210, \ + 0.021317350453210, 0.957365299093576, 0.021317350453210, \ + 0.021317350453210, 0.021317350453210, 0.957365299093576, \ + 0.115343494534698, 0.275713269685514, 0.608943235779788, \ + 0.115343494534698, 0.608943235779788, 0.275713269685514, \ + 0.275713269685514, 0.115343494534698, 0.608943235779788, \ + 0.275713269685514, 0.608943235779788, 0.115343494534698, \ + 0.608943235779788, 0.115343494534698, 0.275713269685514, \ + 0.608943235779788, 0.275713269685514, 0.115343494534698, \ + 0.022838332222257, 0.281325580989940, 0.695836086787803, \ + 0.022838332222257, 0.695836086787803, 0.281325580989940, \ + 0.281325580989940, 0.022838332222257, 0.695836086787803, \ + 0.281325580989940, 0.695836086787803, 0.022838332222257, \ + 0.695836086787803, 0.022838332222257, 0.281325580989940, \ + 0.695836086787803, 0.281325580989940, 0.022838332222257, \ + 0.025734050548330, 0.116251915907597, 0.858014033544073, \ + 0.025734050548330, 0.858014033544073, 0.116251915907597, \ + 0.116251915907597, 0.025734050548330, 0.858014033544073, \ + 0.116251915907597, 0.858014033544073, 0.025734050548330, \ + 0.858014033544073, 0.025734050548330, 0.116251915907597, \ + 0.858014033544073, 0.116251915907597, 0.025734050548330} +#define SIQK_QUADRATURE_TRISYM_ORDER12_WEIGHT \ + {0.025731066440455, 0.025731066440455, 0.025731066440455, \ + 0.043692544538038, 0.043692544538038, 0.043692544538038, \ + 0.062858224217885, 0.062858224217885, 0.062858224217885, \ + 0.034796112930709, 0.034796112930709, 0.034796112930709, \ + 0.006166261051559, 0.006166261051559, 0.006166261051559, \ + 0.040371557766381, 0.040371557766381, 0.040371557766381, \ + 0.040371557766381, 0.040371557766381, 0.040371557766381, \ 
+ 0.022356773202303, 0.022356773202303, 0.022356773202303, \ + 0.022356773202303, 0.022356773202303, 0.022356773202303, \ + 0.017316231108659, 0.017316231108659, 0.017316231108659, \ + 0.017316231108659, 0.017316231108659, 0.017316231108659} -namespace device { -KOKKOS_CONSTANT Real trisym_order4_coord[] = SIQK_QUADRATURE_TRISYM_ORDER4_COORD; -KOKKOS_CONSTANT Real trisym_order4_weight[] = SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT; -KOKKOS_CONSTANT Real trisym_order8_coord[] = SIQK_QUADRATURE_TRISYM_ORDER8_COORD; -KOKKOS_CONSTANT Real trisym_order8_weight[] = SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT; -} +#define SIQK_QUADRATURE_TRISYM_ORDER14_COORD \ + {0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, \ + 0.0099797608064584319986778382371995, 0.0099797608064584319986778382371995, 0.9800404783870830804914930922677740, \ + 0.0099797608064584319986778382371995, 0.9800404783870830804914930922677740, 0.0099797608064584319986778382371995, \ + 0.9800404783870830804914930922677740, 0.0099797608064584319986778382371995, 0.0099797608064584319986778382371995, \ + 0.4799778935211884145495275788562139, 0.4799778935211884145495275788562139, 0.0400442129576231709009448422875721, \ + 0.4799778935211884145495275788562139, 0.0400442129576231709009448422875721, 0.4799778935211884145495275788562139, \ + 0.0400442129576231709009448422875721, 0.4799778935211884145495275788562139, 0.4799778935211884145495275788562139, \ + 0.1538119591769669114444951674158801, 0.1538119591769669114444951674158801, 0.6923760816460662326221608964260668, \ + 0.1538119591769669114444951674158801, 0.6923760816460662326221608964260668, 0.1538119591769669114444951674158801, \ + 0.6923760816460662326221608964260668, 0.1538119591769669114444951674158801, 0.1538119591769669114444951674158801, \ + 0.0740234771169878125185448425327195, 0.0740234771169878125185448425327195, 0.8519530457660243749629103149345610, \ + 0.0740234771169878125185448425327195, 
0.8519530457660243749629103149345610, 0.0740234771169878125185448425327195, \ + 0.8519530457660243749629103149345610, 0.0740234771169878125185448425327195, 0.0740234771169878125185448425327195, \ + 0.1303546825033299882967696703417460, 0.1303546825033299882967696703417460, 0.7392906349933400234064606593165081, \ + 0.1303546825033299882967696703417460, 0.7392906349933400234064606593165081, 0.1303546825033299882967696703417460, \ + 0.7392906349933400234064606593165081, 0.1303546825033299882967696703417460, 0.1303546825033299882967696703417460, \ + 0.2306172260266531326422523306973744, 0.2306172260266531326422523306973744, 0.5387655479466937347154953386052512, \ + 0.2306172260266531326422523306973744, 0.5387655479466937347154953386052512, 0.2306172260266531326422523306973744, \ + 0.5387655479466937347154953386052512, 0.2306172260266531326422523306973744, 0.2306172260266531326422523306973744, \ + 0.4223320834191477968211358984262915, 0.4223320834191477968211358984262915, 0.1553358331617044063577282031474169, \ + 0.4223320834191477968211358984262915, 0.1553358331617044063577282031474169, 0.4223320834191477968211358984262915, \ + 0.1553358331617044063577282031474169, 0.4223320834191477968211358984262915, 0.4223320834191477968211358984262915, \ + 0.7862373859346609705767150444444269, 0.1906163600319009110428680742188590, 0.0231462540334381183804168813367141, \ + 0.7862373859346609705767150444444269, 0.0231462540334381183804168813367141, 0.1906163600319009110428680742188590, \ + 0.1906163600319009110428680742188590, 0.7862373859346609705767150444444269, 0.0231462540334381183804168813367141, \ + 0.1906163600319009110428680742188590, 0.0231462540334381183804168813367141, 0.7862373859346609705767150444444269, \ + 0.0231462540334381183804168813367141, 0.7862373859346609705767150444444269, 0.1906163600319009110428680742188590, \ + 0.0231462540334381183804168813367141, 0.1906163600319009110428680742188590, 0.7862373859346609705767150444444269, \ + 
0.6305521436606074114905595706659369, 0.3623231377435471300962888108188054, 0.0071247185958454584131516185152577, \ + 0.6305521436606074114905595706659369, 0.0071247185958454584131516185152577, 0.3623231377435471300962888108188054, \ + 0.3623231377435471300962888108188054, 0.6305521436606074114905595706659369, 0.0071247185958454584131516185152577, \ + 0.3623231377435471300962888108188054, 0.0071247185958454584131516185152577, 0.6305521436606074114905595706659369, \ + 0.0071247185958454584131516185152577, 0.6305521436606074114905595706659369, 0.3623231377435471300962888108188054, \ + 0.0071247185958454584131516185152577, 0.3623231377435471300962888108188054, 0.6305521436606074114905595706659369, \ + 0.6265773298563063198329814440512564, 0.2907712058836673940653838599246228, 0.0826514642600262861016346960241208, \ + 0.6265773298563063198329814440512564, 0.0826514642600262861016346960241208, 0.2907712058836673940653838599246228, \ + 0.2907712058836673940653838599246228, 0.6265773298563063198329814440512564, 0.0826514642600262861016346960241208, \ + 0.2907712058836673940653838599246228, 0.0826514642600262861016346960241208, 0.6265773298563063198329814440512564, \ + 0.0826514642600262861016346960241208, 0.6265773298563063198329814440512564, 0.2907712058836673940653838599246228, \ + 0.0826514642600262861016346960241208, 0.2907712058836673940653838599246228, 0.6265773298563063198329814440512564, \ + 0.9142099849296254632236014003865421, 0.0711657108777507679819862573822320, 0.0146243041926237687944123422312259, \ + 0.9142099849296254632236014003865421, 0.0146243041926237687944123422312259, 0.0711657108777507679819862573822320, \ + 0.0711657108777507679819862573822320, 0.9142099849296254632236014003865421, 0.0146243041926237687944123422312259, \ + 0.0711657108777507679819862573822320, 0.0146243041926237687944123422312259, 0.9142099849296254632236014003865421, \ + 0.0146243041926237687944123422312259, 0.9142099849296254632236014003865421, 
0.0711657108777507679819862573822320, \ + 0.0146243041926237687944123422312259, 0.0711657108777507679819862573822320, 0.9142099849296254632236014003865421} +#define SIQK_QUADRATURE_TRISYM_ORDER14_WEIGHT \ + {0.0585962852260285965710906452841300,0.0017351512297252675524200649093132,0.0017351512297252675524200649093132, \ + 0.0017351512297252675524200649093132,0.0261637825586145227052536910150593,0.0261637825586145227052536910150593, \ + 0.0261637825586145227052536910150593,0.0039197292424018289128118119890587,0.0039197292424018289128118119890587, \ + 0.0039197292424018289128118119890587,0.0122473597569408669538670864085361,0.0122473597569408669538670864085361, \ + 0.0122473597569408669538670864085361,0.0281996285032579604989955157634540,0.0281996285032579604989955157634540, \ + 0.0281996285032579604989955157634540,0.0508870871859594883779287499692146,0.0508870871859594883779287499692146, \ + 0.0508870871859594883779287499692146,0.0504534399016036000373830461285252,0.0504534399016036000373830461285252, \ + 0.0504534399016036000373830461285252,0.0170636442122334523741056244716674,0.0170636442122334523741056244716674, \ + 0.0170636442122334523741056244716674,0.0170636442122334523741056244716674,0.0170636442122334523741056244716674, \ + 0.0170636442122334523741056244716674,0.0096834664255066003890615178306689,0.0096834664255066003890615178306689, \ + 0.0096834664255066003890615178306689,0.0096834664255066003890615178306689,0.0096834664255066003890615178306689, \ + 0.0096834664255066003890615178306689,0.0363857559284850029523994408009457,0.0363857559284850029523994408009457, \ + 0.0363857559284850029523994408009457,0.0363857559284850029523994408009457,0.0363857559284850029523994408009457, \ + 0.0363857559284850029523994408009457,0.0069646633735184126576256424812073,0.0069646633735184126576256424812073, \ + 0.0069646633735184126576256424812073,0.0069646633735184126576256424812073,0.0069646633735184126576256424812073, \ + 0.0069646633735184126576256424812073} -template 
-KOKKOS_INLINE_FUNCTION -void get_coef (const int order, RawConstVec3s& coord, RawConstArray& weight) { - switch (order) { - case 4: - coord = RawConstVec3s(device::trisym_order4_coord, 6, 3); - weight = RawConstArray(device::trisym_order4_weight, 6); - break; - case 8: - coord = RawConstVec3s(device::trisym_order8_coord, 16, 3); - weight = RawConstArray(device::trisym_order8_weight, 16); - break; - } -} +#define SIQK_QUADRATURE_TRISYM_ORDER20_COORD \ + {0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, 0.3333333333333333148296162562473910, \ + 0.2158743059329919777855621987328050, 0.2158743059329919777855621987328050, 0.5682513881340160999400268337922171, \ + 0.2158743059329919777855621987328050, 0.5682513881340160999400268337922171, 0.2158743059329919777855621987328050, \ + 0.5682513881340160999400268337922171, 0.2158743059329919777855621987328050, 0.2158743059329919777855621987328050, \ + 0.0753767665297472716501303580116655, 0.0753767665297472716501303580116655, 0.8492464669405054289441636683477554, \ + 0.0753767665297472716501303580116655, 0.8492464669405054289441636683477554, 0.0753767665297472716501303580116655, \ + 0.8492464669405054289441636683477554, 0.0753767665297472716501303580116655, 0.0753767665297472716501303580116655, \ + 0.0103008281372217926769030427180951, 0.0103008281372217926769030427180951, 0.9793983437255564528101103860535659, \ + 0.0103008281372217926769030427180951, 0.9793983437255564528101103860535659, 0.0103008281372217926769030427180951, \ + 0.9793983437255564528101103860535659, 0.0103008281372217926769030427180951, 0.0103008281372217926769030427180951, \ + 0.4936022112987001886352800283930264, 0.4936022112987001886352800283930264, 0.0127955774025996227294399432139471, \ + 0.4936022112987001886352800283930264, 0.0127955774025996227294399432139471, 0.4936022112987001886352800283930264, \ + 0.0127955774025996227294399432139471, 0.4936022112987001886352800283930264, 0.4936022112987001886352800283930264, \ + 
0.4615509381069253236340443891094765, 0.4615509381069253236340443891094765, 0.0768981237861493527319112217810471, \ + 0.4615509381069253236340443891094765, 0.0768981237861493527319112217810471, 0.4615509381069253236340443891094765, \ + 0.0768981237861493527319112217810471, 0.4615509381069253236340443891094765, 0.4615509381069253236340443891094765, \ + 0.3286214064242369836676971317501739, 0.4293405702582103744546770940360148, 0.2420380233175526418776257742138114, \ + 0.3286214064242369836676971317501739, 0.2420380233175526418776257742138114, 0.4293405702582103744546770940360148, \ + 0.4293405702582103744546770940360148, 0.3286214064242369836676971317501739, 0.2420380233175526418776257742138114, \ + 0.4293405702582103744546770940360148, 0.2420380233175526418776257742138114, 0.3286214064242369836676971317501739, \ + 0.2420380233175526418776257742138114, 0.3286214064242369836676971317501739, 0.4293405702582103744546770940360148, \ + 0.2420380233175526418776257742138114, 0.4293405702582103744546770940360148, 0.3286214064242369836676971317501739, \ + 0.2604803617865687481724989993381314, 0.1015775342809694392620656344661256, 0.6379421039324617570542841349379160, \ + 0.2604803617865687481724989993381314, 0.6379421039324617570542841349379160, 0.1015775342809694392620656344661256, \ + 0.1015775342809694392620656344661256, 0.2604803617865687481724989993381314, 0.6379421039324617570542841349379160, \ + 0.1015775342809694392620656344661256, 0.6379421039324617570542841349379160, 0.2604803617865687481724989993381314, \ + 0.6379421039324617570542841349379160, 0.2604803617865687481724989993381314, 0.1015775342809694392620656344661256, \ + 0.6379421039324617570542841349379160, 0.1015775342809694392620656344661256, 0.2604803617865687481724989993381314, \ + 0.1370742358464553112273875967730419, 0.7100659730011301684626801034028176, 0.1528597911524145480655079154530540, \ + 0.1370742358464553112273875967730419, 0.1528597911524145480655079154530540, 
0.7100659730011301684626801034028176, \ + 0.7100659730011301684626801034028176, 0.1370742358464553112273875967730419, 0.1528597911524145480655079154530540, \ + 0.7100659730011301684626801034028176, 0.1528597911524145480655079154530540, 0.1370742358464553112273875967730419, \ + 0.1528597911524145480655079154530540, 0.1370742358464553112273875967730419, 0.7100659730011301684626801034028176, \ + 0.1528597911524145480655079154530540, 0.7100659730011301684626801034028176, 0.1370742358464553112273875967730419, \ + 0.1467269458722997854671632467216114, 0.4985454776784148389623396724346094, 0.3547275764492854310816483121016063, \ + 0.1467269458722997854671632467216114, 0.3547275764492854310816483121016063, 0.4985454776784148389623396724346094, \ + 0.4985454776784148389623396724346094, 0.1467269458722997854671632467216114, 0.3547275764492854310816483121016063, \ + 0.4985454776784148389623396724346094, 0.3547275764492854310816483121016063, 0.1467269458722997854671632467216114, \ + 0.3547275764492854310816483121016063, 0.1467269458722997854671632467216114, 0.4985454776784148389623396724346094, \ + 0.3547275764492854310816483121016063, 0.4985454776784148389623396724346094, 0.1467269458722997854671632467216114, \ + 0.0269989777425532900823057502748270, 0.0491867226725819992050325879517914, 0.9238142995848647176515555656806100, \ + 0.0269989777425532900823057502748270, 0.9238142995848647176515555656806100, 0.0491867226725819992050325879517914, \ + 0.0491867226725819992050325879517914, 0.0269989777425532900823057502748270, 0.9238142995848647176515555656806100, \ + 0.0491867226725819992050325879517914, 0.9238142995848647176515555656806100, 0.0269989777425532900823057502748270, \ + 0.9238142995848647176515555656806100, 0.0269989777425532900823057502748270, 0.0491867226725819992050325879517914, \ + 0.9238142995848647176515555656806100, 0.0491867226725819992050325879517914, 0.0269989777425532900823057502748270, \ + 0.0618717859336170294959345028473763, 
0.7796601465405693653920593533257488, 0.1584680675258135496008549125690479, \ + 0.0618717859336170294959345028473763, 0.1584680675258135496008549125690479, 0.7796601465405693653920593533257488, \ + 0.7796601465405693653920593533257488, 0.0618717859336170294959345028473763, 0.1584680675258135496008549125690479, \ + 0.7796601465405693653920593533257488, 0.1584680675258135496008549125690479, 0.0618717859336170294959345028473763, \ + 0.1584680675258135496008549125690479, 0.0618717859336170294959345028473763, 0.7796601465405693653920593533257488, \ + 0.1584680675258135496008549125690479, 0.7796601465405693653920593533257488, 0.0618717859336170294959345028473763, \ + 0.0477243674276219970176171614184568, 0.3704915391495476328920233299868414, 0.5817840934228304394792985476669855, \ + 0.0477243674276219970176171614184568, 0.5817840934228304394792985476669855, 0.3704915391495476328920233299868414, \ + 0.3704915391495476328920233299868414, 0.0477243674276219970176171614184568, 0.5817840934228304394792985476669855, \ + 0.3704915391495476328920233299868414, 0.5817840934228304394792985476669855, 0.0477243674276219970176171614184568, \ + 0.5817840934228304394792985476669855, 0.0477243674276219970176171614184568, 0.3704915391495476328920233299868414, \ + 0.5817840934228304394792985476669855, 0.3704915391495476328920233299868414, 0.0477243674276219970176171614184568, \ + 0.1206005151863643737319975457467081, 0.8633469487547525966775197048264090, 0.0160525360588830157126949416124262, \ + 0.1206005151863643737319975457467081, 0.0160525360588830157126949416124262, 0.8633469487547525966775197048264090, \ + 0.8633469487547525966775197048264090, 0.1206005151863643737319975457467081, 0.0160525360588830157126949416124262, \ + 0.8633469487547525966775197048264090, 0.0160525360588830157126949416124262, 0.1206005151863643737319975457467081, \ + 0.0160525360588830157126949416124262, 0.1206005151863643737319975457467081, 0.8633469487547525966775197048264090, \ + 
0.0160525360588830157126949416124262, 0.8633469487547525966775197048264090, 0.1206005151863643737319975457467081, \ + 0.0026971477967097875517998861738533, 0.0561949381877454995359855161041196, 0.9411079140155447220195128466002643, \ + 0.0026971477967097875517998861738533, 0.9411079140155447220195128466002643, 0.0561949381877454995359855161041196, \ + 0.0561949381877454995359855161041196, 0.0026971477967097875517998861738533, 0.9411079140155447220195128466002643, \ + 0.0561949381877454995359855161041196, 0.9411079140155447220195128466002643, 0.0026971477967097875517998861738533, \ + 0.9411079140155447220195128466002643, 0.0026971477967097875517998861738533, 0.0561949381877454995359855161041196, \ + 0.9411079140155447220195128466002643, 0.0561949381877454995359855161041196, 0.0026971477967097875517998861738533, \ + 0.0030156332779423624702863637736527, 0.2086750067484213488899769117779215, 0.7883093599736362699914593576977495, \ + 0.0030156332779423624702863637736527, 0.7883093599736362699914593576977495, 0.2086750067484213488899769117779215, \ + 0.2086750067484213488899769117779215, 0.0030156332779423624702863637736527, 0.7883093599736362699914593576977495, \ + 0.2086750067484213488899769117779215, 0.7883093599736362699914593576977495, 0.0030156332779423624702863637736527, \ + 0.7883093599736362699914593576977495, 0.0030156332779423624702863637736527, 0.2086750067484213488899769117779215, \ + 0.7883093599736362699914593576977495, 0.2086750067484213488899769117779215, 0.0030156332779423624702863637736527, \ + 0.0299053757884570198255502759820956, 0.7211512409120340860724240883428138, 0.2489433832995089357353890591184609, \ + 0.0299053757884570198255502759820956, 0.2489433832995089357353890591184609, 0.7211512409120340860724240883428138, \ + 0.7211512409120340860724240883428138, 0.0299053757884570198255502759820956, 0.2489433832995089357353890591184609, \ + 0.7211512409120340860724240883428138, 0.2489433832995089357353890591184609, 
0.0299053757884570198255502759820956, \ + 0.2489433832995089357353890591184609, 0.0299053757884570198255502759820956, 0.7211512409120340860724240883428138, \ + 0.2489433832995089357353890591184609, 0.7211512409120340860724240883428138, 0.0299053757884570198255502759820956, \ + 0.0067566542224609888248054723192126, 0.6400554419405418693500564586429391, 0.3531879038369971635091815187479369, \ + 0.0067566542224609888248054723192126, 0.3531879038369971635091815187479369, 0.6400554419405418693500564586429391, \ + 0.6400554419405418693500564586429391, 0.0067566542224609888248054723192126, 0.3531879038369971635091815187479369, \ + 0.6400554419405418693500564586429391, 0.3531879038369971635091815187479369, 0.0067566542224609888248054723192126, \ + 0.3531879038369971635091815187479369, 0.0067566542224609888248054723192126, 0.6400554419405418693500564586429391, \ + 0.3531879038369971635091815187479369, 0.6400554419405418693500564586429391, 0.0067566542224609888248054723192126} +#define SIQK_QUADRATURE_TRISYM_ORDER20_WEIGHT \ + {0.0125376079944966561247055025773989,0.0274718698764242139076507953632245,0.0274718698764242139076507953632245, \ + 0.0274718698764242139076507953632245,0.0097652722770514236577676925321612,0.0097652722770514236577676925321612, \ + 0.0097652722770514236577676925321612,0.0013984195353918234608348036829284,0.0013984195353918234608348036829284, \ + 0.0013984195353918234608348036829284,0.0092921026251851831373462786700657,0.0092921026251851831373462786700657, \ + 0.0092921026251851831373462786700657,0.0165778760323669269172164320025331,0.0165778760323669269172164320025331, \ + 0.0165778760323669269172164320025331,0.0206677623486650786921448030852844,0.0206677623486650786921448030852844, \ + 0.0206677623486650786921448030852844,0.0206677623486650786921448030852844,0.0206677623486650786921448030852844, \ + 0.0206677623486650786921448030852844,0.0208222355211545064046507746979842,0.0208222355211545064046507746979842, \ + 
0.0208222355211545064046507746979842,0.0208222355211545064046507746979842,0.0208222355211545064046507746979842, \ + 0.0208222355211545064046507746979842,0.0095686384198490608693488113090098,0.0095686384198490608693488113090098, \ + 0.0095686384198490608693488113090098,0.0095686384198490608693488113090098,0.0095686384198490608693488113090098, \ + 0.0095686384198490608693488113090098,0.0244527709689724634389840218773315,0.0244527709689724634389840218773315, \ + 0.0244527709689724634389840218773315,0.0244527709689724634389840218773315,0.0244527709689724634389840218773315, \ + 0.0244527709689724634389840218773315,0.0031557306306305341579709899946238,0.0031557306306305341579709899946238, \ + 0.0031557306306305341579709899946238,0.0031557306306305341579709899946238,0.0031557306306305341579709899946238, \ + 0.0031557306306305341579709899946238,0.0121367963653212975611017654387069,0.0121367963653212975611017654387069, \ + 0.0121367963653212975611017654387069,0.0121367963653212975611017654387069,0.0121367963653212975611017654387069, \ + 0.0121367963653212975611017654387069,0.0149664801438864486504698447788542,0.0149664801438864486504698447788542, \ + 0.0149664801438864486504698447788542,0.0149664801438864486504698447788542,0.0149664801438864486504698447788542, \ + 0.0149664801438864486504698447788542,0.0063275933217777392825187376956819,0.0063275933217777392825187376956819, \ + 0.0063275933217777392825187376956819,0.0063275933217777392825187376956819,0.0063275933217777392825187376956819, \ + 0.0063275933217777392825187376956819,0.0013425603120636958685146788994302,0.0013425603120636958685146788994302, \ + 0.0013425603120636958685146788994302,0.0013425603120636958685146788994302,0.0013425603120636958685146788994302, \ + 0.0013425603120636958685146788994302,0.0027760769163475539772489852907711,0.0027760769163475539772489852907711, \ + 0.0027760769163475539772489852907711,0.0027760769163475539772489852907711,0.0027760769163475539772489852907711, \ + 
0.0027760769163475539772489852907711,0.0107398444741849414391099415411190,0.0107398444741849414391099415411190, \ + 0.0107398444741849414391099415411190,0.0107398444741849414391099415411190,0.0107398444741849414391099415411190, \ + 0.0107398444741849414391099415411190,0.0053678057381874528034004789844857,0.0053678057381874528034004789844857, \ + 0.0053678057381874528034004789844857,0.0053678057381874528034004789844857,0.0053678057381874528034004789844857, \ + 0.0053678057381874528034004789844857} + +class TriangleQuadrature { + const Real trisym_order4_coord_ [ 18] = SIQK_QUADRATURE_TRISYM_ORDER4_COORD; + const Real trisym_order4_weight_ [ 6] = SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT; + const Real trisym_order8_coord_ [ 48] = SIQK_QUADRATURE_TRISYM_ORDER8_COORD; + const Real trisym_order8_weight_ [ 16] = SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT; + const Real trisym_order12_coord_ [ 99] = SIQK_QUADRATURE_TRISYM_ORDER12_COORD; + const Real trisym_order12_weight_[ 33] = SIQK_QUADRATURE_TRISYM_ORDER12_WEIGHT; + const Real trisym_order14_coord_ [138] = SIQK_QUADRATURE_TRISYM_ORDER14_COORD; + const Real trisym_order14_weight_[ 46] = SIQK_QUADRATURE_TRISYM_ORDER14_WEIGHT; + const Real trisym_order20_coord_ [264] = SIQK_QUADRATURE_TRISYM_ORDER20_COORD; + const Real trisym_order20_weight_[ 88] = SIQK_QUADRATURE_TRISYM_ORDER20_WEIGHT; -template <> -KOKKOS_INLINE_FUNCTION -void get_coef (const int order, RawConstVec3s& coord, - RawConstArray& weight) { - switch (order) { - case 4: - coord = RawConstVec3s(host::trisym_order4_coord, 6, 3); - weight = RawConstArray(host::trisym_order4_weight, 6); - break; - case 8: - coord = RawConstVec3s(host::trisym_order8_coord, 16, 3); - weight = RawConstArray(host::trisym_order8_weight, 16); - break; +public: + KOKKOS_INLINE_FUNCTION TriangleQuadrature () {} + + KOKKOS_INLINE_FUNCTION + void get_coef (const int order, RawConstVec3s& coord, + RawConstArray& weight) const { + switch (order) { + case 4: + coord = RawConstVec3s(trisym_order4_coord_, 
6, 3); + weight = RawConstArray(trisym_order4_weight_, 6); + break; + case 8: + coord = RawConstVec3s(trisym_order8_coord_, 16, 3); + weight = RawConstArray(trisym_order8_weight_, 16); + break; + case 12: + coord = RawConstVec3s(trisym_order12_coord_, 33, 3); + weight = RawConstArray(trisym_order12_weight_, 33); + break; + case 14: + coord = RawConstVec3s(trisym_order14_coord_, 46, 3); + weight = RawConstArray(trisym_order14_weight_, 46); + break; + case 20: + coord = RawConstVec3s(trisym_order20_coord_, 88, 3); + weight = RawConstArray(trisym_order20_weight_, 88); + break; + } } -} -} -} +}; + +} // namespace siqk #endif diff --git a/siqk/siqk_search.hpp b/siqk/siqk_search.hpp new file mode 100644 index 0000000..b9eab6a --- /dev/null +++ b/siqk/siqk_search.hpp @@ -0,0 +1,377 @@ +#ifndef INCLUDE_SIQK_SEARCH_HPP +#define INCLUDE_SIQK_SEARCH_HPP + +#include "siqk_defs.hpp" +#include "siqk_geometry.hpp" + +namespace siqk { + +// Oct-tree. Might do something else better suited to the sphere later. +template +class Octree { +public: + enum { max_depth = max_depth_ }; + typedef Real BoundingBox[6]; + + struct Options { + // Do not go beyond max_depth_ depth, including the root and leaf. With this + // constraint, try to go deep enough so that a leaf has no more than + // max_nelem elements. + Int max_nelem; + Options () : max_nelem(8) {} + }; + + // Bounding box for a cluster of points ps (possibly vertices).
+ template + static void calc_bb (const CV3s& ps, const Int np, BB bb) { + if (np == 0) return; + for (Int j = 0; j < 3; ++j) + bb[j] = bb[j+3] = ps(0,j); + for (Int i = 1; i < np; ++i) { + for (Int j = 0; j < 3; ++j) { + bb[j] = min(bb[j], ps(i,j)); + bb[j+3] = max(bb[j+3], ps(i,j)); + } + } + pad_bb(bb); + } + + template + KOKKOS_INLINE_FUNCTION + static void calc_bb (const CV3s& p, const CIV e, const Int ne, BB ebb) { + for (Int j = 0; j < 3; ++j) + ebb[j] = ebb[j+3] = p(e[0], j); + for (Int i = 1; i < ne; ++i) { + if (e[i] == -1) break; + for (Int j = 0; j < 3; ++j) { + ebb[j] = min(ebb[j], p(e[i], j)); + ebb[j+3] = max(ebb[j+3], p(e[i], j)); + } + } + pad_bb(ebb); + } + + // If a bounding box was constructed from vertices of a spherical polygon, + // expand it to account for the possible protrusion of the sphere. + template + KOKKOS_INLINE_FUNCTION + static void pad_bb (BB bb) { + if (std::is_same::value) return; + Real hl = 0.5*std::sqrt(square(bb[3] - bb[0]) + square(bb[4] - bb[1]) + + square(bb[5] - bb[2])); + // Limit the half-length to the circle's radius. + hl = min(1.0, hl); + // Max distance from a chord of length 2 hl to the unit circle: + // hl = sin theta + // pad = 1 - cos theta = 1 - sqrt(1 - sin^2 theta) = 1 - sqrt(1 - hl^2). + const Real pad = 1 - std::sqrt(1 - square(hl)); + for (Int i = 0; i < 3; ++i) bb[ i] -= pad; + for (Int i = 0; i < 3; ++i) bb[3+i] += pad; + } + + template + static void calc_bb (const CV3s& ps, BoundingBox bb) { + calc_bb(ps, nslices(ps), bb); + } + + template + static void calc_bb (const CV3s& p, const CIs& e, V6s& ebbs) { + assert(nslices(ebbs) == nslices(e)); + for (Int k = 0, klim = nslices(e); k < klim; ++k) + calc_bb(p, slice(e, k), szslice(e), slice(ebbs, k)); + } + + // p is a 3xNp array of points. e is a KxNe array of elements. An entry <0 is + // ignored. All <0 entries must be at the end of an element's list. 
+ Octree (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, + const Options& o) { + init(p, e, o); + } + Octree (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) { + Options o; + init(p, e, o); + } + + Octree() {} + void init (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) { + Options o; + init(p, e, o); + } + + // Apply f to every element in leaf nodes with which bb overlaps. f must have + // function + // void operator(const Int element). + template + KOKKOS_INLINE_FUNCTION + void apply (const CV bb, Functor& f) const { + if (nslices(nodes_) == 0) { + for (Int i = 0; i < offsets_[1]; ++i) + f(elems_[i]); + return; + } +#ifdef SIQK_NONRECURSIVE + // Non-recursive impl. + { + // Stack. + Real snbb[8*max_depth_]; + Int sni[max_depth_], si[max_depth_]; + Int sp = 0; + // Args for top-level call. + copy(snbb, bb_, 8); + sni[sp] = 0; + si[sp] = 0; + while (sp >= 0) { + // Get stack frame's (nbb, ni, current i) values. + const Int i = si[sp]; + if (i == 8) { + --sp; + continue; + } + // Increment stored value of i for next iteration. Current value is + // stored in 'i' above. + ++si[sp]; + const Int ni = sni[sp]; + const Real* const nbb = snbb + 8*sp; + // Can use the next stack frame's bb space for a child bb. + Real* const child_bb = snbb + 8*(sp+1); + fill_child_bb(nbb, i, child_bb); + if ( ! do_bb_overlap(child_bb, bb)) continue; + Int e = nodes_(ni,i); + if (e < 0) { + // Leaf, so apply functor to each element. + e = std::abs(e + 1); + for (Int k = offsets_[e]; k < offsets_[e+1]; ++k) + f(elems_[k]); + } else if (e > 0) { + // Recurse. + ++sp; + sni[sp] = e; + si[sp] = 0; + } + } + } +#else + apply_r(0, bb_, bb, f); +#endif + } + +private: + /* Each node in the oct-tree contains 8 integers, stored in 'nodes'. + + >0 is an index Into 'nodes', pointing to a child node. + + A <=0 entry in 'nodes' indicates a leaf node. If 0, there are no elements + in the leaf. 
If <0, the negative of the entry minus 1 is the index of an + offset array indexing 'elems'. + + Each segment of 'elems' contains a list of element indices covered by a + leaf node. Element indices refer to the list of elements the caller + provides during oct-tree construction. + */ + + // Static data structures holding the completed octree. + // nodes(:,i) is a list. The list includes children of node i (>0) and leaf + // node data (<=0). + //todo Make these const once ready to do full GPU stuff. + Nodes nodes_; + // A leaf node corresponding to -k covers elements + // elems[offset[k] : offset[k]-1]. + ko::View offsets_, elems_; + // Root node's bounding box. + BoundingBox bb_; + + // Dynamic data structures for construction phase. + class IntList { + Int* const buf_; + Int i_; + public: + IntList (Int* const buf) : buf_(buf), i_(0) {} + void reset () { i_ = 0; } + void push (const Int& i) { buf_[i_++] = i; } + Int* data () { return buf_; } + Int n () const { return i_; } + const Int& operator[] (const Int& i) const { return buf_[i]; } + }; + + class DynIntList { + std::vector buf_; + public: + DynIntList () {} + void push (const Int& i) { buf_.push_back(i); } + Int& back () { return buf_.back(); } + Int& operator[] (const size_t i) { + if (i >= buf_.size()) + buf_.resize(i+1); + return buf_[i]; + } + const Int& operator[] (const size_t i) const { return buf_[i]; } + Int n () const { return static_cast(buf_.size()); } + const Int* data () const { return buf_.data(); } + }; + + // Opposite index slot convention. 
+ class DynNodes { + std::vector buf_; + public: + Int n () const { return static_cast(buf_.size()) >> 3; } + const Int* data () const { return buf_.data(); } + Int& operator() (const Int& r, const Int& c) { + const size_t ec = (c+1) << 3; + if (ec >= buf_.size()) + buf_.resize(ec); + return const_cast( + const_cast(this)->operator()(r, c)); + } + const Int& operator() (const Int& r, const Int& c) const { + assert(((c << 3) + r) >= 0); + assert(((c << 3) + r) < (Int) buf_.size()); + return buf_[(c << 3) + r]; + } + }; + + void init (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, + const Options& o) { + if (nslices(e) == 0) return; + // Get OT's bounding box. + calc_bb(p, bb_); + // Get elements' bounding boxes. + Vec6s::HostMirror ebbs("ebbs", nslices(e), 6); + calc_bb(p, e, ebbs); + // Static element lists for work. Each level has active work space. + std::vector buf(max_depth_*nslices(e)); + IntList es(buf.data()), wrk(buf.data() + nslices(e)); + for (Int i = 0, ilim = nslices(e); i < ilim; ++i) + es.push(i); + // Dynamic element lists. + DynIntList offsets, elems; + offsets[0] = 0; + // Dynamic node data structure. + DynNodes nodes; + // Recurse. We don't care about the return value. If it's 0 and nodes.n() == + // 0, we'll detect as much in 'apply'. + init_r(1, bb_, ebbs, o, es, wrk, offsets, elems, nodes); + // Build the static data structures. + if (elems.n() == 0) return; + init_static_ds(nodes, offsets, elems); + } + + Int init_r (const Int depth, // Tree's depth at this point, including root. + const BoundingBox& nbb, // My bounding box. + const ConstVec6s::HostMirror& ebbs, // All elements' bounding boxes. + const Options& o, // Options controlling construct of the tree. + IntList& es, // List of elements in my bounding box. + IntList& wrk, // Work space to store working element lists. + DynIntList& offsets, // Offsetss Into elems. + DynIntList& elems, // Elements belonging to leaf nodes. 
+ DynNodes& nodes) // Dynamic nodes data structure. + { + const Int my_idx = nodes.n(); // My node index. + // Decide what to do. + if (es.n() == 0) { + // I have no elements, so return 0 to indicate I'm a leaf node containing + // nothing. + return 0; + } else if (es.n() <= o.max_nelem || depth == max_depth_) { + // I'm a leaf node with elements. Store my list of elements and return the + // storage location. + const Int os = offsets.back(); + offsets.push(os + es.n()); + for (Int i = 0, n = es.n(); i < n; ++i) + elems[os + i] = es[i]; + return 1 - offsets.n(); + } else { + // I'm not a leaf node. + nodes(0, my_idx) = 0; // Insert myself Into the nodes array. + for (Int ic = 0; ic < 8; ++ic) { + BoundingBox child_bb; + fill_child_bb(nbb, ic, child_bb); + // Find the elements that are in this child's bb. + IntList ces(wrk.data()); + for (Int i = 0, n = es.n(); i < n; ++i) + if (do_bb_overlap(child_bb, slice(ebbs, es[i]))) + ces.push(es[i]); + // Create some work space. + IntList cwrk(wrk.data() + ces.n()); + // Recurse. + const Int child_idx = init_r(depth+1, child_bb, ebbs, o, ces, cwrk, + offsets, elems, nodes); + nodes(ic, my_idx) = child_idx; + } + return my_idx; + } + } + + void init_static_ds (const DynNodes nodes, const DynIntList& offsets, + const DynIntList& elems) { + { + ko::resize(nodes_, nodes.n(), 8); + auto nodes_hm = ko::create_mirror_view(nodes_); + for (Int i = 0; i < nodes.n(); ++i) + for (Int j = 0; j < 8; ++j) + nodes_hm(i,j) = nodes(j,i); + ko::deep_copy(nodes_, nodes_hm); + } + hm_resize_and_copy(offsets_, offsets, offsets.n()); + hm_resize_and_copy(elems_, elems, elems.n()); + } + + // Using parent bb p, fill child bb c, with child_idx in 0:7. 
+ template + KOKKOS_INLINE_FUNCTION + static void fill_child_bb (const CBB& p, const Int& child_idx, BB& c) { + const Real m[] = { 0.5*(p[0] + p[3]), + 0.5*(p[1] + p[4]), + 0.5*(p[2] + p[5]) }; + switch (child_idx) { + case 0: c[0] = p[0]; c[1] = p[1]; c[2] = p[2]; c[3] = m[0]; c[4] = m[1]; c[5] = m[2]; break; + case 1: c[0] = m[0]; c[1] = p[1]; c[2] = p[2]; c[3] = p[3]; c[4] = m[1]; c[5] = m[2]; break; + case 2: c[0] = m[0]; c[1] = m[1]; c[2] = p[2]; c[3] = p[3]; c[4] = p[4]; c[5] = m[2]; break; + case 3: c[0] = p[0]; c[1] = m[1]; c[2] = p[2]; c[3] = m[0]; c[4] = p[4]; c[5] = m[2]; break; + case 4: c[0] = p[0]; c[1] = p[1]; c[2] = m[2]; c[3] = m[0]; c[4] = m[1]; c[5] = p[5]; break; + case 5: c[0] = m[0]; c[1] = p[1]; c[2] = m[2]; c[3] = p[3]; c[4] = m[1]; c[5] = p[5]; break; + case 6: c[0] = m[0]; c[1] = m[1]; c[2] = m[2]; c[3] = p[3]; c[4] = p[4]; c[5] = p[5]; break; + case 7: c[0] = p[0]; c[1] = m[1]; c[2] = m[2]; c[3] = m[0]; c[4] = p[4]; c[5] = p[5]; break; + default: + // impossible + error("fill_child_bb: The impossible has happened."); + } + } + + // Do bounding boxes a and b overlap? + template + KOKKOS_INLINE_FUNCTION + static bool do_bb_overlap (const BoundingBox a, const BB b) { + for (Int i = 0; i < 3; ++i) + if ( ! do_lines_overlap(a[i], a[i+3], b[i], b[i+3])) + return false; + return true; + } + + KOKKOS_INLINE_FUNCTION + static bool do_lines_overlap (const Real& a1, const Real& a2, + const Real& b1, const Real& b2) { + return ! (a2 < b1 || a1 > b2); + } + + template KOKKOS_INLINE_FUNCTION + void apply_r (const Int ni, const BoundingBox& nbb, const CV bb, + Functor& f) const { + for (Int i = 0; i < 8; ++i) { + BoundingBox child_bb; + fill_child_bb(nbb, i, child_bb); + if ( ! 
do_bb_overlap(child_bb, bb)) continue; + Int e = nodes_(ni,i); + if (e > 0) + apply_r(e, child_bb, bb, f); + else if (e < 0) { + e = std::abs(e + 1); + for (Int k = offsets_[e]; k < offsets_[e+1]; ++k) + f(elems_[k]); + } + } + } +}; + +} // namespace siqk + +#endif diff --git a/siqk/siqk_sqr.hpp b/siqk/siqk_sqr.hpp new file mode 100644 index 0000000..3d0f9fd --- /dev/null +++ b/siqk/siqk_sqr.hpp @@ -0,0 +1,259 @@ +#ifndef INCLUDE_SIQK_SQR_HPP +#define INCLUDE_SIQK_SQR_HPP + +#include "siqk_defs.hpp" +#include "siqk_intersect.hpp" + +namespace siqk { +namespace sqr { // spherical quadrilateral <-> reference square +/* Let p be a 3x4 matrix with p(:,i) the i'th vertex in a spherical quad in CCW + order. Let (a,b) be coordinates in the reference square [0,1]^2. (Here we + choose [0,1] instead of [-1,1].) (a,b) = (0,0) corresponds to p(:,1); (1,0) + is p(:,2); (1,1) is p(:,3); (0,1) is p(:,4). + The map from reference square to bilinear quad can be written + T = p*[ 1 -1 1 -1 + -1 1 0 0 + -1 0 0 1 + 1 0 0 0]'; + f(a,b) = T(:,1)*a*b + T(:,2)*a + T(:,3)*b + T(:,4); + The map to the sphere is then completed with + g(a,b) = norm(f(a,b)) + q = f(a,b) / g(a,b). + The Jacobian matrix for q is given by + q_a = f_a/g - (f g_a)/g^2 + g_a = g_f f_a + and similarly for q_b. +*/ + +namespace impl { +// In the implementation, (a,b) in [0,1] because convex combinations are used +// throughout; but in the user interface, (a,b) in [-1,1] to agree with the +// definition of the reference square. + +// Compute T(i,:). +template +KOKKOS_INLINE_FUNCTION +void calc_T_row (const ConstVec3sT& p, const Quad& e, const Int i, + Real& t1, Real& t2, Real& t3, Real& t4) { + t4 = p(e[0],i); + t3 = -t4 + p(e[3],i); + t2 = -t4 + p(e[1],i); + t1 = -t2 + p(e[2],i) - p(e[3],i); +} + +// Compute T(:,1)*a*b + T(:,2)*a + T(:,3)*b + T(:,4). 
+template +KOKKOS_INLINE_FUNCTION +void calc_ref_to_bilinear (const ConstVec3sT& p, const Quad& e, + const Real a, const Real b, Real q[3]) { + for (Int i = 0; i < 3; ++i) { + Real t1, t2, t3, t4; + impl::calc_T_row(p, e, i, t1, t2, t3, t4); + q[i] = t1*a*b + t2*a + t3*b + t4; + } +} + +// The residual function is r(a,b) = f(a,b)/g(a,b) - q. +template +KOKKOS_INLINE_FUNCTION +void calc_residual (const ConstVec3sT& p, const Quad& e, const Real a, + const Real b, const Real q[3], Real r[3]) { + calc_ref_to_bilinear(p, e, a, b, r); + const Real rnorm = std::sqrt(SphereGeometry::norm2(r)); + for (Int i = 0; i < 3; ++i) + r[i] = r[i]/rnorm - q[i]; +} + +// Compute the Jacobian matrix of the residual function: Jacobian(ref square -> +// sphere). +template +KOKKOS_INLINE_FUNCTION +void calc_Jacobian (const ConstVec3sT& p, const Quad& e, const Real a, + const Real b, Real J[6]) { + Real r[3]; + for (Int i = 0; i < 3; ++i) { + Real t1, t2, t3, t4; + calc_T_row(p, e, i, t1, t2, t3, t4); + r[ i] = t1*a*b + t2*a + t3*b + t4; + J[ i] = t1*b + t2; + J[3+i] = t1*a + t3; + } + Real rtJ[2] = {0}; + for (Int j = 0; j < 2; ++j) { + const Real* const Jj = J + 3*j; + for (Int i = 0; i < 3; ++i) + rtJ[j] += r[i]*Jj[i]; + } + const Real rnorm2 = SphereGeometry::norm2(r), rnorm = std::sqrt(rnorm2); + for (Int j = 0; j < 2; ++j) { + Real* const Jj = J + 3*j; + for (Int i = 0; i < 3; ++i) + Jj[i] = (Jj[i] - r[i]*rtJ[j]/rnorm2)/rnorm; + } +} + +// Solve J dx = r. +KOKKOS_INLINE_FUNCTION +void solve_Jxr (Real J[6], const Real r[3], Real dx[2]) { + // QR factorization: J -> J [n1 a; 0 n2]. + const Real n1 = std::sqrt(SphereGeometry::norm2(J)); + SphereGeometry::scale(1/n1, J); + const Real a = SphereGeometry::dot(J, J+3); + SphereGeometry::axpy(-a, J, J+3); + const Real n2 = std::sqrt(SphereGeometry::norm2(J+3)); + SphereGeometry::scale(1/n2, J+3); + // r -> Q' r. 
+ Real Qtr[2] = {0}; + for (Int j = 0; j < 2; ++j) { + const Real* const Jj = J + 3*j; + for (Int i = 0; i < 3; ++i) + Qtr[j] += Jj[i]*r[i]; + } + // dx = R \ (Q' r). + dx[1] = Qtr[1] / n2; + dx[0] = (Qtr[0] - a*dx[1]) / n1; +} +} // namespace impl + +struct Info { + bool success; + Int n_iterations; +}; + +template +KOKKOS_INLINE_FUNCTION +void calc_ref_to_sphere ( + // The spherical quad containing the point. + const ConstVec3sT& p, const Quad& e, + // (a,b) in [-1,1] + const Real a, const Real b, + // The point on the sphere. + Real q[3]) +{ + impl::calc_ref_to_bilinear(p, e, 0.5*(a+1), 0.5*(b+1), q); + SphereGeometry::normalize(q); +} + +template +KOKKOS_INLINE_FUNCTION +void calc_sphere_to_ref ( + // The spherical quad containing the point. + const ConstVec3sT& p, const Quad& e, + // The point on the sphere. + const Real q[3], + // (a,b) in [-1,1] + Real& a, Real& b, + // Optional info output. + Info* const info = nullptr, + // Max number of iterations before returning with failure. + const Int max_its = 10, + // Tolerance for Newton iteration. + const Real tol = 1e2*std::numeric_limits::epsilon()) +{ + const Real tol2 = square(tol); + Real rnorm2 = 1; + a = b = 0.5; + Int it = 0; + for (it = 1; it <= max_its; ++it) { // Newton's method. 
+ Real r[3], J[6]; + impl::calc_residual(p, e, a, b, q, r); + rnorm2 = SphereGeometry::norm2(r); + if (rnorm2 <= tol2) break; + impl::calc_Jacobian(p, e, a, b, J); + Real dx[2]; + impl::solve_Jxr(J, r, dx); + a -= dx[0]; + b -= dx[1]; + } + a = 2*a - 1; + b = 2*b - 1; + if (info) { + info->success = rnorm2 <= tol2; + info->n_iterations = it; + } +} + +namespace test { +struct Info { + Int sum_nits, max_nits, nfails; +}; + +class TestSphereToRefKernel { + const Real a_test[9] = {-0.1, -1e-16, 0, 1e-15, 0.1, 0.7, 1, 1-1e-14, 1.1}; + const Int n_a_test = sizeof(a_test)/sizeof(*a_test); + + const Real tol_; + mutable ConstVec3s p_; + mutable ConstIdxs e_; + +public: + typedef Info value_type; + + TestSphereToRefKernel (const ConstVec3s::HostMirror& p_hm, + const ConstIdxs::HostMirror& e_hm, + const Real tol = 1e1*std::numeric_limits::epsilon()) + : tol_(tol) + { + { Vec3s p; resize_and_copy(p, p_hm); p_ = p; } + { Idxs e; resize_and_copy(e, e_hm); e_ = e; } + } + + Int n () const { return nslices(e_)*square(n_a_test); } + const Real& tol () const { return tol_; } + + KOKKOS_INLINE_FUNCTION + void operator() (const Int k, value_type& jinfo) const { + const Int + ei = k / square(n_a_test), + ij = k % square(n_a_test), + i = ij / n_a_test, + j = ij % n_a_test; + const Real a_t = 2*a_test[i]-1, b_t = 2*a_test[j]-1; + Real q[3]; + sqr::calc_ref_to_sphere(p_, slice(e_, ei), a_t, b_t, q); + Real a, b; + sqr::Info info; + sqr::calc_sphere_to_ref(p_, slice(e_, ei), q, a, b, &info, 100, tol_); + const Real err = std::sqrt(square(a_t - a) + square(b_t - b)); + // tol is on dx, not (a,b), so adjust slightly. + if ( ! 
info.success || err > 1e4*tol_) { + jinfo.nfails++; + printf("calc_sphere_to_ref ei %d i %d j %d: nits %d re %1.1e\n", + ei, i, j, info.n_iterations, err); + } + jinfo.sum_nits += info.n_iterations; + jinfo.max_nits = max(jinfo.max_nits, info.n_iterations); + } + + KOKKOS_INLINE_FUNCTION + void init (value_type& info) { + info.sum_nits = 0; + info.max_nits = 0; + info.nfails = 0; + } + + KOKKOS_INLINE_FUNCTION + void join (volatile value_type& dst, volatile value_type const& src) const { + dst.max_nits = max(dst.max_nits, src.max_nits); + dst.sum_nits += src.sum_nits; + dst.nfails += src.nfails; + } +}; + +static Int test_sphere_to_ref (const ConstVec3s::HostMirror& p, + const ConstIdxs::HostMirror& e) { + TestSphereToRefKernel k(p, e); + Info info; + auto t = tic(); + ko::parallel_reduce(k.n(), k, info); + const auto et = toc(t); + fprintf(stderr, "sqr: #fails %d #iterations mean %1.1f max %d\n", + info.nfails, (Real) info.sum_nits / k.n(), info.max_nits); + print_times("test_sphere_to_ref", et); + return info.nfails; +} +} // namespace test +} // namespace sqr +} // namespace siqk + +#endif diff --git a/siqk/siqk_test.cpp b/siqk/siqk_test.cpp new file mode 100644 index 0000000..1b37a59 --- /dev/null +++ b/siqk/siqk_test.cpp @@ -0,0 +1,517 @@ +// ko=/home/ambradl/lib/kokkos/cpu; mycpp -I$ko/include -L$ko/lib -fopenmp unit_test.cpp -lkokkos -ldl -Wall -pedantic -DSIQK_TIME +// ./a.out -m | grep "mat=1" > foo.m +// >> msik('draw_unit_test0', 'foo'); + +#include + +#include "siqk.hpp" +using namespace siqk; + +//> Code that will likely be moved to library files. 
+ +template +void write_matlab (const std::string& name, const CV3s& p) { + printf("mat=1; %s = [", name.c_str()); + for (Int ip = 0; ip < nslices(p); ++ip) + printf(" %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); + printf("].';\n"); +} + +template +void write_matlab (const std::string& name, const CV3s& p, const CIs& e) { + printf("mat=1; %s.p = [", name.c_str()); + for (Int ip = 0; ip < nslices(p); ++ip) + printf(" %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); + printf("].';\n"); + printf("mat=1; %s.n = [", name.c_str()); + for (Int ie = 0; ie < nslices(e); ++ie) + printf(" %d %d %d %d;", e(ie,0)+1, e(ie,1)+1, e(ie,2)+1, e(ie,3)+1); + printf("].';\n"); +} + +static void make_planar_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, + const Int n) { + const Real d = std::sqrt(0.5); + ko::resize(e, n*n, 4); + ko::resize(p, (n+1)*(n+1), 3); + for (Int iy = 0; iy < n+1; ++iy) + for (Int ix = 0; ix < n+1; ++ix) { + const auto idx = (n+1)*iy + ix; + p(idx,0) = 2*(static_cast(ix)/n - 0.5)*d; + p(idx,1) = 2*(static_cast(iy)/n - 0.5)*d; + p(idx,2) = 0; + } + for (Int iy = 0; iy < n; ++iy) + for (Int ix = 0; ix < n; ++ix) { + const auto idx = n*iy + ix; + e(idx,0) = (n+1)*iy + ix; + e(idx,1) = (n+1)*iy + ix+1; + e(idx,2) = (n+1)*(iy+1) + ix+1; + e(idx,3) = (n+1)*(iy+1) + ix; + } +} + +// Row-major R. 
+inline void form_rotation (const Real axis[3], const Real angle, Real r[9]) { + const Real nrm = std::sqrt(SphereGeometry::norm2(axis)); + const Real& x = axis[0] / nrm, & y = axis[1] / nrm, & z = axis[2] / nrm, + & th = angle; + const Real cth = std::cos(th), sth = std::sin(th), omcth = 1 - cth; + r[0] = cth + x*x*omcth; + r[3] = y*x*omcth + z*sth; + r[6] = z*x*omcth - y*sth; + r[1] = x*y*omcth - z*sth; + r[4] = cth + y*y*omcth; + r[7] = z*y*omcth + x*sth; + r[2] = x*z*omcth + y*sth; + r[5] = y*z*omcth - x*sth; + r[8] = cth + z*z*omcth; +} + +template +static void rotate (const Real R[9], V p) { + const Real x = p[0], y = p[1], z = p[2]; + p[0] = R[0]*x + R[1]*y + R[2]*z; + p[1] = R[3]*x + R[4]*y + R[5]*z; + p[2] = R[6]*x + R[7]*y + R[8]*z; +} + +template +static void translate (const Real xlate[3], V p) { + for (Int i = 0; i < 3; ++i) p[i] += xlate[i]; +} + +static void transform_planar_mesh (const Real R[9], const Real xlate[3], + Vec3s::HostMirror& p) { + for (Int i = 0; i < nslices(p); ++i) { + rotate(R, slice(p, i)); + translate(xlate, slice(p, i)); + } +} + +// Remove vertices marked unused and adjust numbering. +static void remove_unused_vertices (Vec3s::HostMirror& p, Idxs::HostMirror& e, + const Real unused) { + // adjust[i] is the number to subtract from i. Hence if e(ei,0) was originally + // i, it is adjusted to i - adjust[i]. + std::vector adjust(nslices(p), 0); + Int rmcnt = 0; + for (Int i = 0; i < nslices(p); ++i) { + if (p(i,0) != unused) continue; + adjust[i] = 1; + ++rmcnt; + } + // Cumsum. + for (Int i = 1; i < nslices(p); ++i) + adjust[i] += adjust[i-1]; + // Adjust e. + for (Int ei = 0; ei < nslices(e); ++ei) + for (Int k = 0; k < szslice(e); ++k) + e(ei,k) -= adjust[e(ei,k)]; + // Remove unused from p. 
+ Vec3s::HostMirror pc("copy", nslices(p), szslice(p)); + ko::deep_copy(pc, p); + ko::resize(p, nslices(p) - rmcnt, szslice(p)); + for (Int i = 0, j = 0; i < nslices(pc); ++i) { + if (pc(i,0) == unused) continue; + for (Int k = 0; k < szslice(pc); ++k) p(j,k) = pc(i,k); + ++j; + } +} + +// A very simple cube-sphere mesh with nxn elements per face. At least for now +// I'm not bothering with making the elements well proportioned. +void make_cubesphere_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, + const Int n) { + // Transformation of the reference mesh make_planar_mesh to make each of the + // six faces. + const Real d = std::sqrt(0.5); + static Real R[6][9] = {{ 1, 0, 0, 0, 0, 0, 0, 1, 0}, // face 0, -y + { 0, 0, 0, 1, 0, 0, 0, 1, 0}, // 1, +x + {-1, 0, 0, 0, 0, 0, 0, 1, 0}, // 2, +y + { 0, 0, 0,-1, 0, 0, 0, 1, 0}, // 3, -x + { 1, 0, 0, 0, 1, 0, 0, 0, 0}, // 4, +z + {-1, 0, 0, 0, 1, 0, 0, 0, 0}}; // 5, -z + static Real xlate[6][3] = {{ 0,-d, 0}, { d, 0, 0}, { 0, d, 0}, + {-d, 0, 0}, { 0, 0, d}, { 0, 0,-d}}; + // Construct 6 uncoupled faces. + Vec3s::HostMirror ps[6]; + Vec3s::HostMirror& p_ref = ps[0]; + Idxs::HostMirror es[6]; + Idxs::HostMirror& e_ref = es[0]; + make_planar_mesh(p_ref, e_ref, n); + ko::resize(e, 6*nslices(e_ref), 4); + ko::resize(p, 6*nslices(p_ref), 3); + for (Int i = 1; i < 6; ++i) { + ko::resize(es[i], nslices(e_ref), 4); + ko::deep_copy(es[i], e_ref); + ko::resize(ps[i], nslices(p_ref), 3); + ko::deep_copy(ps[i], p_ref); + transform_planar_mesh(R[i], xlate[i], ps[i]); + } + transform_planar_mesh(R[0], xlate[0], ps[0]); + // Pack (p,e), accounting for equivalent vertices. For the moment, keep the p + // slot for an equivalent vertex to make node numbering simpler, but make the + // value bogus so we know if there's a problem in the numbering. 
+ const Real unused = -2; + ko::deep_copy(p, unused); + Int p_base = 0, e_base = 0; + { // -y face + const Vec3s::HostMirror& fp = ps[0]; + Idxs::HostMirror& fe = es[0]; + for (Int j = 0; j < nslices(fp); ++j) + for (Int k = 0; k < 3; ++k) p(j,k) = fp(j,k); + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) e(j,k) = fe(j,k); + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + for (Int fi = 1; fi <= 2; ++fi) { // +x, +y faces + const Vec3s::HostMirror& fp = ps[fi]; + Idxs::HostMirror& fe = es[fi]; + for (Int j = 0; j < nslices(fp); ++j) { + if (j % (n+1) == 0) continue; // equiv vertex + for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) { + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + // Left 2 vertices of left elem on face fi equiv to right 2 vertices of + // right elem on face fi-1. Write to the face, then copy to e, so that + // other faces can use these updated data. + if (j % n == 0) { + fe(j,0) = es[fi-1](j+n-1,1); + fe(j,3) = es[fi-1](j+n-1,2); + } + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + } + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + { // -x face + const Vec3s::HostMirror& fp = ps[3]; + Idxs::HostMirror& fe = es[3]; + for (Int j = 0; j < nslices(fp); ++j) { + if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; + for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) { + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + if (j % n == 0) { + fe(j,0) = es[2](j+n-1,1); + fe(j,3) = es[2](j+n-1,2); + } else if ((j+1) % n == 0) { + fe(j,1) = es[0]((j+1)-n,0); + fe(j,2) = es[0]((j+1)-n,3); + } + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + } + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + { // +z face + const Vec3s::HostMirror& fp = ps[4]; + Idxs::HostMirror& fe = es[4]; + for (Int j = n+1; j < nslices(fp) - (n+1); ++j) { + if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; + for (Int k = 0; 
k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + for (Int j = 0; j < n; ++j) { // -y + fe(j,0) = es[0](n*(n-1)+j,3); + fe(j,1) = es[0](n*(n-1)+j,2); + } + for (Int j = 0; j < n; ++j) { // +y + fe(n*(n-1)+j,2) = es[2](n*n-1-j,3); + fe(n*(n-1)+j,3) = es[2](n*n-1-j,2); + } + for (Int j = 0, i3 = 0; j < nslices(fe); j += n, ++i3) { // -x + fe(j,0) = es[3](n*n-1-i3,2); + fe(j,3) = es[3](n*n-1-i3,3); + } + for (Int j = n-1, i1 = 0; j < nslices(fe); j += n, ++i1) { // +x + fe(j,1) = es[1](n*(n-1)+i1,3); + fe(j,2) = es[1](n*(n-1)+i1,2); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + { // -z face + const Vec3s::HostMirror& fp = ps[5]; + Idxs::HostMirror& fe = es[5]; + for (Int j = n+1; j < nslices(fp) - (n+1); ++j) { + if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; + for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + for (Int j = 0; j < n; ++j) { // -y + fe(j,0) = es[0](n-1-j,1); + fe(j,1) = es[0](n-1-j,0); + } + for (Int j = 0; j < n; ++j) { // +y + fe(n*(n-1)+j,2) = es[2](j,1); + fe(n*(n-1)+j,3) = es[2](j,0); + } + for (Int j = 0, i3 = 0; j < nslices(fe); j += n, ++i3) { // -x + fe(j,0) = es[1](i3,0); + fe(j,3) = es[1](i3,1); + } + for (Int j = n-1, i1 = 0; j < nslices(fe); j += n, ++i1) { // +x + fe(j,1) = es[3](n-1-i1,1); + fe(j,2) = es[3](n-1-i1,0); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + } + // Now go back and remove the unused vertices and adjust the numbering. + remove_unused_vertices(p, e, unused); + // Project to the unit sphere. 
+ for (Int i = 0; i < nslices(p); ++i) + SphereGeometry::normalize(slice(p, i)); +} + +void calc_elem_ctr (const Vec3s::HostMirror& p, const Idxs::HostMirror& e, + const Int ei, Real ctr[3]) { + for (Int j = 0; j < 3; ++j) ctr[j] = 0; + Int n = 0; + for (Int i = 0; i < szslice(e); ++i) { + if (e(ei,i) < 0) break; + for (Int j = 0; j < 3; ++j) ctr[j] += p(e(ei,i),j); + ++n; + } + for (Int j = 0; j < 3; ++j) ctr[j] /= n; +} + +// Return 0 if all elements' subtri normals point outward relative to the +// sphere. +Int check_elem_normal_against_sphere (const Vec3s::HostMirror& p, + const Idxs::HostMirror& e) { + Int nerr = 0; + for (Int ei = 0; ei < nslices(e); ++ei) { // for each element + Real sphere[3]; // ray through elem ctr + calc_elem_ctr(p, e, ei, sphere); + for (Int ti = 0; ti < szslice(e) - 2; ++ti) { // for each tri + if (e(ei,ti+2) < 0) break; + Real tri_normal[3]; { + Real v[2][3]; + for (Int j = 0; j < 2; ++j) { + SphereGeometry::copy(v[j], slice(p, e(ei,ti+j+1))); + SphereGeometry::axpy(-1, slice(p, e(ei,0)), v[j]); + } + SphereGeometry::cross(v[0], v[1], tri_normal); + } + if (SphereGeometry::dot(tri_normal, sphere) <= 0) + ++nerr; + } + } + return nerr; +} + +//> Unit test code. 
+ +struct Input { + Int testno; + Int n; + Real angle, xlate, ylate; + bool write_matlab, geo_sphere; + + Input(Int argc, char** argv); + void print(std::ostream& os) const; +}; + +static void project_onto_sphere (Vec3s::HostMirror& p) { + for (Int ip = 0; ip < nslices(p); ++ip) { + p(ip,2) = 1; + SphereGeometry::normalize(slice(p, ip)); + } +} + +static void +perturb_mesh (Vec3s::HostMirror& p, const Real angle, const Real xlate, + const Real ylate) { + const Real cr = std::cos(angle), sr = std::sin(angle); + for (Int ip = 0; ip < nslices(p); ++ip) { + const Real x = p(ip,0), y = p(ip,1); + p(ip,0) = cr*x - sr*y + xlate; + p(ip,1) = -sr*x + cr*y + ylate; + } +} + +static void +rotate_mesh (Vec3s::HostMirror& p, const Real axis[3], const Real angle) { + Real R[9]; + form_rotation(axis, angle, R); + for (Int i = 0; i < nslices(p); ++i) + rotate(R, slice(p,i)); +} + +static void fill_quad (const ConstVec3s::HostMirror& p, + Vec3s::HostMirror& poly) { + const Int n = static_cast(std::sqrt(nslices(p) - 1)); + copy(slice(poly, 0), slice(p, 0), 3); + copy(slice(poly, 1), slice(p, n), 3); + copy(slice(poly, 2), slice(p, nslices(p) - 1), 3); + copy(slice(poly, 3), slice(p, nslices(p) - 1 - n), 3); +} + +// Area of the outline of (p,e) clipped against the outline of (cp,ce). 
+template +static Real calc_true_area ( + const ConstVec3s::HostMirror& cp, const ConstIdxs::HostMirror& ce, + const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, + const bool wm) +{ + Vec3s::HostMirror clip_poly("clip_poly", 4, 3), poly("poly", 4, 3), + nml("nml", 4, 3); + fill_quad(cp, clip_poly); + fill_quad(p, poly); + for (Int i = 0; i < 4; ++i) + Geo::edge_normal(slice(clip_poly, i), slice(clip_poly, (i+1) % 4), + slice(nml, i)); + Vec3s::HostMirror vo("vo", test::max_nvert, 3); + Int no; + { + Vec3s::HostMirror wrk("wrk", test::max_nvert, 3); + sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no, wrk); + } + if (wm) { + write_matlab("clip_poly", clip_poly); + write_matlab("poly", poly); + write_matlab("intersection", + ko::subview(vo, std::pair(0, no), ko::ALL())); + } + return Geo::calc_area_formula(vo, no); +} + +template void finalize_mesh (Vec3s::HostMirror& p) {} +template <> void finalize_mesh (Vec3s::HostMirror& p) { + project_onto_sphere(p); +} + +template +static Int +test_area (const Int n, const Real angle, const Real xlate, const Real ylate, + const bool wm) { + Vec3s::HostMirror cp; + Idxs::HostMirror ce; + make_planar_mesh(cp, ce, n); + + Vec3s::HostMirror p; resize_and_copy(p, cp); + Idxs::HostMirror e; resize_and_copy(e, ce); + perturb_mesh(p, angle, xlate, ylate); + + finalize_mesh(cp); + finalize_mesh(p); + + const Real ta = calc_true_area(cp, ce, p, e, wm); + const Real a = test::test_area_ot(cp, ce, p, e); + + const Real re = std::abs(a - ta)/ta; + fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n", ta, a, re); + if (wm) { + write_matlab("cm", cp, ce); + write_matlab("m", p, e); + } + return re < 1e-8 ? 
0 : 1; +} + +static Int test_cube (const Input& in) { + Vec3s::HostMirror cp; + Idxs::HostMirror ce; + make_cubesphere_mesh(cp, ce, in.n); + Vec3s::HostMirror p; resize_and_copy(p, cp); + Idxs::HostMirror e; resize_and_copy(e, ce); + Int nerr = 0; + { + const Int ne = check_elem_normal_against_sphere(cp, ce); + if (ne) std::cerr << "FAIL: check_elem_normal_against_sphere\n"; + nerr += ne; + } + { // Make a copy, perturb it, and compute the area of the sphere from the + // overlap mesh. + Real axis[] = {0.1, -0.3, 0.2}; + rotate_mesh(p, axis, in.angle); + const Real + a = test::test_area_ot(cp, ce, p, e), + ta = 4*M_PI, + re = std::abs(a - ta)/ta; + fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n", + ta, a, re); + nerr += re < 1e-8 ? 0 : 1; + } + // Test ref square <-> spherical quad transformations. + nerr += sqr::test::test_sphere_to_ref(p, e); + if (in.write_matlab) { + write_matlab("cm", cp, ce); + write_matlab("m", p, e); + } + return nerr; +} + +template +Int run (const Input& in) { + switch (in.testno) { + case 0: + return test_area(in.n, in.angle, in.xlate, in.ylate, in.write_matlab); + case 1: + return test_cube(in); + default: + return 1; + } +} + +inline bool +eq (const std::string& a, const char* const b1, const char* const b2 = 0) { + return (a == std::string(b1) || (b2 && a == std::string(b2)) || + a == std::string("-") + std::string(b1)); +} + +Input::Input (Int argc, char** argv) + : testno(0), n(25), angle(M_PI*1e-1), xlate(1e-1), ylate(1e-1), + write_matlab(false), geo_sphere(true) +{ + for (Int i = 1; i < argc; ++i) { + const std::string& token = argv[i]; + if (eq(token, "--testno")) testno = atoi(argv[++i]); + else if (eq(token, "-n")) n = atoi(argv[++i]); + else if (eq(token, "-m", "--write-matlab")) write_matlab = true; + else if (eq(token, "--plane")) geo_sphere = false; + else if (eq(token, "--xlate")) xlate = atof(argv[++i]); + else if (eq(token, "--ylate")) ylate = atof(argv[++i]); + else if (eq(token, "--angle")) angle = 
atof(argv[++i]); + } + + print(std::cout); +} + +void Input::print (std::ostream& os) const { + os << "testno " << testno << "\n" + << "n (-n): " << n << "\n" + << "write matlab (-m): " << write_matlab << "\n" + << "planar geometry (--plane): " << ! geo_sphere << "\n" + << "angle (--angle): " << angle << "\n" + << "xlate (--xlate): " << xlate << "\n" + << "ylate (--ylate): " << ylate << "\n"; +} + +int main (int argc, char** argv) { + Kokkos::initialize(argc, argv); + { + Input in(argc, argv); + Int nerr = 0; + if (in.geo_sphere) + nerr += run(in); + else { +#ifdef INSTANTIATE_PLANE + run(in); +#else + Kokkos::abort("PlaneGeometry not instantiated."); +#endif + } + std::cerr << (nerr ? "FAIL" : "PASS") << "ED\n"; + } + Kokkos::finalize_all(); +} diff --git a/siqk/slmm/Makefile b/siqk/slmm/Makefile new file mode 100644 index 0000000..7dd003b --- /dev/null +++ b/siqk/slmm/Makefile @@ -0,0 +1,45 @@ +opt= +CXX=g++-4.7 + +KOKKOS=/home/ambradl/lib/kokkos/cpu +SIQK=.. +LINK_LAPACK_BLAS=-llapack -lblas +# Optional. Comment out if no TPL available. +NETCDF=/home/ambradl/lib/netcdf + +# Should not have to change the rest. 
+ +CXXFLAGS=$(opt) -Wall -pedantic -fopenmp -std=c++11 -I$(SIQK) -I$(KOKKOS)/include -DSIQK_TIME +LDFLAGS=-fopenmp -L$(KOKKOS)/lib -lkokkos -ldl + +ifdef NETCDF + CXXFLAGS+=-I$(NETCDF)/include -DSLMM_HAVE_NETCDF + LDFLAGS+=-L$(NETCDF)/lib -lnetcdf_c++ -lnetcdf -Wl,-rpath=$(NETCDF)/lib +endif + +SOURCES=slmm_mesh.cpp slmm_io.cpp slmm_time_int.cpp slmm_gallery.cpp slmm_util.cpp + +OBJECTS=$(SOURCES:.cpp=.o) + +.cpp.o: + $(CXX) $(CFLAGS) $(CXXFLAGS) -c $< -o $@ + +all: slmm_test slmmir + +slmm_test: $(OBJECTS) slmm_test.o + $(CXX) $(OBJECTS) slmm_test.o $(LDFLAGS) -o slmm_test + +slmmir: $(OBJECTS) slmmir.o + $(CXX) $(OBJECTS) slmmir.o $(LDFLAGS) $(LINK_LAPACK_BLAS) -o slmmir + +clean: + rm -f *.o slmm_test slmmir + +$(SIQK)/siqk.hpp: $(SIQK)/siqk_intersect.hpp $(SIQK)/siqk_geometry.hpp $(SIQK)/siqk_sqr.hpp $(SIQK)/siqk_search.hpp $(SIQK)/siqk_quadrature.hpp +slmm_test.o: slmm_defs.hpp slmm_mesh.hpp slmm_gll.hpp slmm_io.hpp slmm_time_int.hpp slmm_gallery.hpp $(SIQK)/siqk.hpp +slmmir.o: slmm_defs.hpp slmm_util.hpp slmm_mesh.hpp slmm_gll.hpp slmm_io.hpp slmm_time_int.hpp slmm_gallery.hpp $(SIQK)/siqk.hpp +slmm_mesh.o: slmm_mesh.hpp $(SIQK)/siqk.hpp +slmm_io.o: slmm_io.hpp +slmm_time_int.o: slmm_time_int.hpp +slmm_gallery.o: slmm_gallery.hpp +slmm_util.o: slmm_util.hpp diff --git a/siqk/slmm/slmm_debug.hpp b/siqk/slmm/slmm_debug.hpp new file mode 100644 index 0000000..010e549 --- /dev/null +++ b/siqk/slmm/slmm_debug.hpp @@ -0,0 +1,37 @@ +#ifndef INCLUDE_SLMM_DEBUG_HPP +#define INCLUDE_SLMM_DEBUG_HPP + +#include +#include + +namespace slmm { + +template +void write_matlab (const std::string& name, const CV3s& p) { + std::cout << "mat=1; " << name << " = ["; + for (Int ip = 0; ip < nslices(p); ++ip) { + for (Int k = 0; k < szslice(p); ++k) + std::cout << " " << p(ip,k); + std::cout << ";"; + } + std::cout << "].';\n"; +} + +template +void write_matlab (const std::string& name, const CV3s& p, const CIs& e) { + printf("mat=1; %s.p = [", name.c_str()); + for (Int ip = 0; ip < 
nslices(p); ++ip) + printf(" %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); + printf("].';\n"); + printf("mat=1; %s.n = [", name.c_str()); + for (Int ie = 0; ie < nslices(e); ++ie) { + for (Int k = 0; k < szslice(e); ++k) + printf(" %d", e(ie,k)+1); + printf(";"); + } + printf("].';\n"); +} + +} // namespace slmm + +#endif diff --git a/siqk/slmm/slmm_defs.hpp b/siqk/slmm/slmm_defs.hpp new file mode 100644 index 0000000..e7409b2 --- /dev/null +++ b/siqk/slmm/slmm_defs.hpp @@ -0,0 +1,36 @@ +#ifndef INCLUDE_SLMM_DEFS_HPP +#define INCLUDE_SLMM_DEFS_HPP + +#include "siqk.hpp" + +namespace slmm { +using siqk::Int; +using siqk::Real; +typedef Int Size; + +namespace ko = Kokkos; +using geometry = siqk::SphereGeometry; + +using siqk::Vec3s; +using siqk::ConstVec3s; +using siqk::Idxs; +using siqk::ConstIdxs; +typedef ko::View IdxArray; +typedef ko::View ConstIdxArray; +typedef ko::View RealArray; +typedef ko::View ConstRealArray; +typedef ko::View RealArray2; +typedef ko::View ConstRealArray2; + +// A 2D array A can be thought of as having nslices(A) rows and szslice(A) +// columns. A slice can be obtained by +// auto ak = slice(A, k); +// We use this format for arrays of vertices and adjacency arrays, for +// example. In most or all cases, the intention is to parallelize over slices, +// so a Kokkos operator() will do work on a particular slice. 
+using siqk::nslices; +using siqk::szslice; +using siqk::slice; +} // namespace slmm + +#endif diff --git a/siqk/slmm/slmm_gallery.cpp b/siqk/slmm/slmm_gallery.cpp new file mode 100644 index 0000000..3cc4956 --- /dev/null +++ b/siqk/slmm/slmm_gallery.cpp @@ -0,0 +1,14 @@ +#include "slmm_gallery.hpp" + +namespace slmm { +namespace gallery { + +const char* InitialCondition::inputs[] = + {"xyztrig", "gaussianhills", "cosinebells", "slottedcylinders", + "correlatedcosinebells"}; + +const char* WindFieldType::inputs[] = + {"dcmip1d3ll", "nondivergent", "divergent", "rotate", "nondivergenthack"}; + +} // namespace gallery +} // namespace slmm diff --git a/siqk/slmm/slmm_gallery.hpp b/siqk/slmm/slmm_gallery.hpp new file mode 100644 index 0000000..c4214f5 --- /dev/null +++ b/siqk/slmm/slmm_gallery.hpp @@ -0,0 +1,312 @@ +#ifndef INCLUDE_SLMM_GALLERY_HPP +#define INCLUDE_SLMM_GALLERY_HPP + +#include "slmm_defs.hpp" +#include "slmm_time_int.hpp" + +namespace slmm { +namespace gallery { + +class OdeFnBasicRecorder { + mutable int ne_; + bool xyz_form_; +public: + OdeFnBasicRecorder () : ne_(0), xyz_form_(false) {} + void record (const Real t, const Real* const y) const { ++ne_; } + int ne () const { return ne_; } + void set_xyz_form (const bool use_xyz_form) { xyz_form_ = use_xyz_form; } + bool use_xyz_form () const { return xyz_form_; } +}; + +// From Lauritzen et al, A standard test case suite for two-dimensional linear +// transport on the sphere, Geosci. Model Dev., 2012. 
+class InitialCondition { + static const char* inputs[]; + + static inline Real GH (const Real x, const Real y, const Real z, + const Real xi, const Real yi, const Real zi) { + const Real h_max = 0.95, b = 5; + return h_max*std::exp(-b*slmm::square(x - xi) + slmm::square(y - yi) + + slmm::square(z - zi)); + } + + static inline Real CB (const Real ri, const Real r) { + const Real h_max = 1; + return 0.5*h_max*(1 + std::cos(M_PI*ri/r)); + } + +public: + enum Shape { + XYZTrig, GaussianHills, CosineBells, SlottedCylinders, + CorrelatedCosineBells + }; + + static Shape from_string (const std::string& si) { + std::string s(si); + slmm::tolower(s); + if (s == inputs[0]) return XYZTrig; + if (s == inputs[1]) return GaussianHills; + if (s == inputs[2]) return CosineBells; + if (s == inputs[3]) return SlottedCylinders; + if (s == inputs[4]) return CorrelatedCosineBells; + throw std::runtime_error(si + " is not an initial condition."); + } + + static void init (const Shape shape, const Size n, const Real* const lat, + const Real* const lon, Real* const u) { + const Real lon1 = 5*(M_PI/6), lat1 = 0, lon2 = 7*(M_PI/6), lat2 = 0; + Real x1, y1, z1, x2, y2, z2; + slmm::ll2xyz(lat1, lon1, x1, y1, z1); + slmm::ll2xyz(lat2, lon2, x2, y2, z2); + switch (shape) { + case XYZTrig: { + for (Size i = 0; i < n; ++i) { + Real x, y, z; + slmm::ll2xyz(lat[i], lon[i], x, y, z, 1); + u[i] = std::sin(3*x)*std::sin(3*y)*std::sin(4*z); + } + } break; + case GaussianHills: { + for (Size i = 0; i < n; ++i) { + Real x, y, z; + slmm::ll2xyz(lat[i], lon[i], x, y, z, 1); + u[i] = GH(x, y, z, x1, y1, z1) + GH(x, y, z, x2, y2, z2); + } + } break; + case CosineBells: { + const Real r = 0.5, b = 0.1, c = 0.9; + for (Size i = 0; i < n; ++i) { + const Real r1 = slmm::great_circle_dist(lat[i], lon[i], lat1, lon1); + Real h = 0; + if (r1 < r) + h = CB(r1, r); + else { + const Real r2 = slmm::great_circle_dist(lat[i], lon[i], lat2, lon2); + if (r2 < r) + h = CB(r2, r); + } + u[i] = b + c*h; + } + } break; + 
case SlottedCylinders: { + const Real b = 0.1, c = 1, R = 1, r = 0.5*R, lon_thr = r/(6*R), + lat_thr = 5*(r/(12*R)); + for (Size i = 0; i < n; ++i) { + const Real r1 = slmm::great_circle_dist(lat[i], lon[i], lat1, lon1); + if (r1 <= r) { + if (std::abs(lon[i] - lon1) >= lon_thr) { + u[i] = c; + continue; + } + if (std::abs(lon[i] - lon1) < lon_thr && lat[i] - lat1 < -lat_thr) { + u[i] = c; + continue; + } + } + const Real r2 = slmm::great_circle_dist(lat[i], lon[i], lat2, lon2); + if (r2 <= r) { + if (std::abs(lon[i] - lon2) >= lon_thr) { + u[i] = c; + continue; + } + if (std::abs(lon[i] - lon2) < lon_thr && lat[i] - lat2 > lat_thr) { + u[i] = c; + continue; + } + } + u[i] = b; + } + } break; + case CorrelatedCosineBells: { + const Real a = -0.8, b = 0.9; + init(CosineBells, n, lat, lon, u); + for (Size i = 0; i < n; ++i) + u[i] = a*slmm::square(u[i]) + b; + } break; + default: assert(0); + } + } + + static std::string get_inputs () + { return slmm::format_strings_as_list(inputs, 5); } +}; + +// Convert from (u,v), where u is velocity along latitude and v is velocity +// along longitude, to (x,y,z), which is velocity in the global cartesian +// coordinate system. Add a w (local vertical) component to push the position +// (X,Y,Z) back to the unit sphere. +inline void uv2xyz ( + const Real X, const Real Y, const Real Z, // position + const Real u, const Real v, // velocity in tangent coord system + Real& x, Real& y, Real& z) // velocity in global coord system +{ + // r should be 1 but will numerically drift, so measure it ... + const Real r = std::sqrt(X*X + Y*Y + Z*Z); + // ... and then add a local vertical velocity to project back to the sphere. + const Real w = (1 - r)/slmm::consts::earth_radius_m; + Real R[9]; // Row major. + // The local vertical is just the position vector. + R[2] = X/r; R[5] = Y/r; R[8] = Z/r; + // The local along-latitude vector. 
+ R[0] = -Y; R[3] = X; R[6] = 0; + const Real den = std::sqrt(R[0]*R[0] + R[3]*R[3]); + R[0] /= den; R[3] /= den; + // Local vertical x along-latitude. + R[1] = R[5]*R[6] - R[8]*R[3]; + R[4] = R[8]*R[0] - R[2]*R[6]; + R[7] = R[2]*R[3] - R[5]*R[0]; + // Transform. + x = R[0]*u + R[1]*v + R[2]*w; + y = R[3]*u + R[4]*v + R[5]*w; + z = R[6]*u + R[7]*v + R[8]*w; +} + +// Integrate the ODE in lat-lon space. Not good numerically in the lon direction +// because of the poles. +struct Dcmip1d3llOdeFn : public OdeFnBasicRecorder { + bool eval (const Real t, const Real* const d, Real* const f) const { + assert ( ! use_xyz_form()); + const Real + a = M_PI/6, + a_ref = slmm::consts::earth_radius_m, + tau = 1036800, + u0 = 2*M_PI*a_ref/tau, + sina = std::sin(a), + cosa = std::sqrt(1 - slmm::square(sina)), + lat = d[0], + lon = d[1], + sinp = std::sin(lat), + cosp = std::cos(lat), + sinl = std::sin(lon), + cosl = std::cos(lon); + // In what follows, + // u = u0*(cosp*cosa + sinp*cosl*sina) + // v = -u0*sinl*sina + // w = 0 + // lat_t = slmm::m2radlat(v) + // lon_t = slmm::m2radlon(lat, u). + // For numerical reasons, write this a little differently. + const Real v = -u0*sinl*sina; + f[0] = slmm::m2radlat(v); + // tan(phi) is singular at the pole. We could introduce a cutoff so the wind + // speed is not infinite, but for now it does not matter. + f[1] = slmm::m2radlat(u0*(slmm::sign(cosp)*cosa + + sinp*cosl*sina/std::abs(cosp))); + return true; + } +}; + +// Also from Lauritzen et al. 
+struct NonDivergentWindField : public OdeFnBasicRecorder { + bool eval (const Real t, const Real* const d, Real* const f) const { + Real theta, lambda; + if (use_xyz_form()) + xyz2ll(d[0], d[1], d[2], theta, lambda); + else { + theta = d[0]; // latitude + lambda = d[1]; // longitude + } + const Real + T = slmm::day2sec(12), + R = slmm::consts::earth_radius_m, + lambda_p = lambda - 2*M_PI*t/T, + costh = std::cos(theta), + cost = std::cos(M_PI*t/T); + // v + f[0] = 10*R/T*std::sin(2*lambda_p)*costh*cost; + // u + f[1] = R/T*(10*slmm::square(std::sin(lambda_p))*std::sin(2*theta)*cost + + 2*M_PI*costh); + if (use_xyz_form()) + uv2xyz(d[0], d[1], d[2], f[1]/R, f[0]/R, f[0], f[1], f[2]); + else { + f[0] = slmm::m2radlat(f[0]); + f[1] = slmm::m2radlon(theta, f[1]); + } + return true; + } +}; + +// Also from Lauritzen et al. +struct DivergentWindField : public OdeFnBasicRecorder { + bool eval (const Real t, const Real* const d, Real* const f) const { + Real theta, lambda; + if (use_xyz_form()) + xyz2ll(d[0], d[1], d[2], theta, lambda); + else { + theta = d[0]; // latitude + lambda = d[1]; // longitude + } + const Real + T = slmm::day2sec(12), + R = slmm::consts::earth_radius_m, + lambda_p = lambda - 2*M_PI*t/T, + costh = std::cos(theta), + cost = std::cos(M_PI*t/T); + // v + f[0] = 2.5*R/T*std::sin(lambda_p)*slmm::cube(costh)*cost; + // u + f[1] = R/T*(-5*slmm::square(std::sin(0.5*lambda_p))*std::sin(2*theta)* + slmm::square(costh)*cost + 2*M_PI*costh); + if (use_xyz_form()) + uv2xyz(d[0], d[1], d[2], f[1]/R, f[0]/R, f[0], f[1], f[2]); + else { + f[0] = slmm::m2radlat(f[0]); + f[1] = slmm::m2radlon(theta, f[1]); + } + return true; + } +}; + +struct NonDivergentWindFieldHack : public OdeFnBasicRecorder { + bool eval (const Real t, const Real* const d, Real* const f) const { + Real theta, lambda; + if (use_xyz_form()) + xyz2ll(d[0], d[1], d[2], theta, lambda); + else { + theta = d[0]; // latitude + lambda = d[1]; // longitude + } + const Real + T = slmm::day2sec(12), + R = 
slmm::consts::earth_radius_m, + lambda_p = lambda, + costh = std::cos(theta), + cost = std::cos(M_PI*t/T); + // v + f[0] = 10*R/T*std::sin(2*lambda_p)*costh*cost; + // u + f[1] = 10*R/T*slmm::square(std::sin(lambda_p))*std::sin(2*theta)*cost; + if (use_xyz_form()) + uv2xyz(d[0], d[1], d[2], f[1]/R, f[0]/R, f[0], f[1], f[2]); + else { + f[0] = slmm::m2radlat(f[0]); + f[1] = slmm::m2radlon(theta, f[1]); + } + return true; + } +}; + +struct WindFieldType { + static const char* inputs[]; +public: + enum Enum { Dcmip1d3ll, NonDivergentWindField, DivergentWindField, Rotate, + NonDivergentWindFieldHack }; + static Enum from_string (const std::string& si) { + std::string s(si); + slmm::tolower(s); + if (s == inputs[0]) return Dcmip1d3ll; + if (s == inputs[1]) return NonDivergentWindField; + if (s == inputs[2]) return DivergentWindField; + if (s == inputs[3]) return Rotate; + if (s == inputs[4]) return NonDivergentWindFieldHack; + throw std::runtime_error(si + " is not an ODE function."); + } + static std::string get_inputs () + { return slmm::format_strings_as_list(inputs, 4); } +}; + +} // namespace gallery +} // namespace slmm + +#endif diff --git a/siqk/slmm/slmm_gll.hpp b/siqk/slmm/slmm_gll.hpp new file mode 100644 index 0000000..b653686 --- /dev/null +++ b/siqk/slmm/slmm_gll.hpp @@ -0,0 +1,75 @@ +#ifndef INCLUDE_SLMM_GLL_HPP +#define INCLUDE_SLMM_GLL_HPP + +#include "slmm_defs.hpp" + +namespace slmm { + +class GLL { + const Real oo3 = 1.0/3.0; + const Real to3 = 2.0/3.0; + const Real sqrt5 = std::sqrt(5.0); + const Real oo6 = 1.0/6.0; + const Real np2_coord[2] = {-1.0, 1.0}; + const Real np2_wt[2] = {1.0, 1.0}; + const Real np3_coord[3] = {-1.0, 0.0, 1.0}; + const Real np3_wt[3] = {oo3, 2.0 - to3, oo3}; + const Real np4_coord[4] = {-1.0, -1.0/sqrt5, 1.0/sqrt5, 1.0}; + const Real np4_wt[4] = {oo6, 1.0 - oo6, 1.0 - oo6, oo6}; + +public: + enum { max_np = 4 }; + + KOKKOS_INLINE_FUNCTION GLL () {} + + KOKKOS_INLINE_FUNCTION + void get_coef (const int np, const Real*& 
coord, const Real*& wt) { + switch (np) { + case 2: + coord = np2_coord; + wt = np2_wt; + break; + case 3: + coord = np3_coord; + wt = np3_wt; + break; + case 4: + coord = np4_coord; + wt = np4_wt; + break; + default: + ko::abort("GLL::get_coef: order not supported."); + } + } + + // x in [-1, 1]. + KOKKOS_INLINE_FUNCTION + void eval (const int np, const Real& x, Real* const ge) const { + switch (np) { + case 2: { + ge[0] = 0.5*(1.0 - x); + ge[1] = 0.5*(1.0 + x); + } break; + case 3: { + const Real x2 = x*x; + ge[0] = 0.5*(x2 - x); + ge[1] = 1.0 - x2; + ge[2] = 0.5*(x2 + x); + } break; + case 4: { + const Real oo8 = 1.0/8.0; + const Real x2 = x*x; + ge[0] = (1.0 - x)*(5.0*x2 - 1.0)*oo8; + ge[1] = -sqrt5*oo8*(sqrt5 - 5.0*x)*(x2 - 1.0); + ge[2] = -sqrt5*oo8*(sqrt5 + 5.0*x)*(x2 - 1.0); + ge[3] = (1.0 + x)*(5.0*x2 - 1.0)*oo8; + } break; + default: + ko::abort("GLL::eval: order not supported."); + } + } +}; + +} // namespace slmm + +#endif diff --git a/siqk/slmm/slmm_io.cpp b/siqk/slmm/slmm_io.cpp new file mode 100644 index 0000000..ca6062e --- /dev/null +++ b/siqk/slmm/slmm_io.cpp @@ -0,0 +1,314 @@ +#include "slmm_io.hpp" + +#include + +#ifdef SLMM_HAVE_NETCDF +# include +#endif + +namespace slmm { +namespace io { + +NetcdfWriter::NetcdfWriter ( + const Vec3s::HostMirror& p, const Idxs::HostMirror& c2n, + const std::string& out_fn, const Int np, const Int monotone_type) +{ + init(p, c2n, out_fn, np, monotone_type); +} + +void NetcdfWriter::init ( + const Vec3s::HostMirror& p, const Idxs::HostMirror& c2n, + const std::string& out_fn, const Int np, const Int monotone_type) +{ +#ifdef SLMM_HAVE_NETCDF + nn_ = nslices(p); + nc_ = nslices(c2n); + + time_idx_ = 0; + time_ = 0; + define_done_ = false; + + //todo Do I need this? NcError error(NcError::silent_nonfatal); + ncf_ = std::make_shared(out_fn.c_str(), NcFile::Replace); + if ( ! 
ncf_->is_valid()) + throw std::runtime_error(std::string("Could not open file ") + out_fn + + " for writing."); + + // Thank you, TempestRemap, for figuring out the Exodus stuff. + static const int len_str = 33; + auto nodes_dim = ncf_->add_dim("num_nodes", nn_); + auto len_str_dim = ncf_->add_dim("len_string", len_str); + auto time_dim = ncf_->add_dim("time_step"); + auto cells_dim = ncf_->add_dim("num_elem", nc_); + auto num_el_blk_dim = ncf_->add_dim("num_el_blk", 1); + auto nodes_per_cell_dim = ncf_->add_dim("num_nod_per_el1", szslice(c2n)); + auto att_block1_dim = ncf_->add_dim("num_att_in_blk1", 1); + ncf_->add_dim("len_line", 81); + ncf_->add_dim("num_dim", 3); + ncf_->add_dim("num_el_in_blk1", nc_); + ncf_->add_att("api_version", 4.98f); + ncf_->add_att("version", 4.98f); + ncf_->add_att("floating_point_word_size", 8); + ncf_->add_att("file_size", 1); + ncf_->add_att("title", "slmm::io::NetcdfWriter::init"); + + ncf_->add_var("time_whole", ncDouble, time_dim); + ncf_->add_var("eb_names", ncChar, num_el_blk_dim, len_str_dim); + { // elem map + std::vector elem(nc_); + for (Int i = 0; i < nc_; ++i) elem[i] = i+1; + ncf_->add_var("elem_map", ncInt, cells_dim)->put(elem.data(), nc_); + } + { // c2n + auto v = ncf_->add_var("connect1", ncInt, cells_dim, nodes_per_cell_dim); + v->add_att("elem_type", "SHELL4"); + std::vector connect(nc_*szslice(c2n)); + for (Int i = 0, k = 0; i < nslices(c2n); ++i) + for (Int j = 0; j < szslice(c2n); ++j, ++k) + connect[k] = c2n(i,j) + 1; + v->set_cur(0, 0); + v->put(connect.data(), nc_, szslice(c2n)); + } + { // coords + std::vector buf(nn_); + double* const d = buf.data(); + for (Int i = 0; i < nn_; ++i) d[i] = p(i,0); + ncf_->add_var("coordx", ncDouble, nodes_dim)->put(d, nn_); + for (Int i = 0; i < nn_; ++i) d[i] = p(i,1); + ncf_->add_var("coordy", ncDouble, nodes_dim)->put(d, nn_); + for (Int i = 0; i < nn_; ++i) d[i] = p(i,2); + ncf_->add_var("coordz", ncDouble, nodes_dim)->put(d, nn_); + } + { // various other things + int 
one = 1; + ncf_->add_var("eb_status", ncInt, num_el_blk_dim)->put(&one, 1); + auto v = ncf_->add_var("eb_prop1", ncInt, num_el_blk_dim); + v->put(&one, 1); + v->add_att("name", "ID"); + std::vector buf(nc_, 1.0); + v = ncf_->add_var("attrib1", ncDouble, cells_dim, att_block1_dim); + v->put(buf.data(), nc_, 1); + } + + add_att("np", np); + add_att("monotone_type", monotone_type); +#else + std::cerr << "Warning: NetcdfWriter::init: Netcdf was not compiled in.\n"; +#endif +} + +template +void NetcdfWriter::add_att (const char* name, const T& val) { +#ifdef SLMM_HAVE_NETCDF + ncf_->add_att(name, val); +#endif +} + +void NetcdfWriter::add_nodal_field (const std::string& name, const Int dim) { +#ifdef SLMM_HAVE_NETCDF + if (define_done_) + throw std::runtime_error( + "Can't add a new field after end_definition() was called."); + const auto& it = name2field_.find(name); + if (it != name2field_.end()) + throw std::runtime_error("Field name was already added."); + name2field_[name] = FieldIdx(FieldType::node, node_fields_.size()); + node_fields_.push_back(Field(name, dim)); +#endif +} + +void NetcdfWriter::add_element_field (const std::string& name, const Int dim) { +#ifdef SLMM_HAVE_NETCDF + if (define_done_) + throw std::runtime_error( + "Can't add a new field after end_definition() was called."); + const auto& it = name2field_.find(name); + if (it != name2field_.end()) + throw std::runtime_error("Field name was already added."); + name2field_[name] = FieldIdx(FieldType::elem, elem_fields_.size()); + elem_fields_.push_back(Field(name, dim)); +#endif +} + +void NetcdfWriter::end_definition () { +#ifdef SLMM_HAVE_NETCDF + NcDim* const str_d = ncf_->get_dim("len_string"); + NcDim* const time_d = ncf_->get_dim("time_step"); + + do { + Int num_vars = 0; + for (auto f: node_fields_) + num_vars += static_cast(f.ncvars.size()); + if ( ! 
num_vars) break; + + NcDim* const nodes_d = ncf_->get_dim("num_nodes"); + NcDim* const nv_d = ncf_->add_dim("num_nod_var", num_vars); + NcVar* const name_v = ncf_->add_var("name_nod_var", ncChar, nv_d, str_d); + Int varno = 1; + for (std::size_t i = 0; i < node_fields_.size(); ++i) { + Field& f = node_fields_[i]; + if (f.ncvars.size() == 1) { + name_v->set_cur(i, 0); + name_v->put(f.name.c_str(), 1, f.name.size()); + + std::stringstream ss; + ss << "vals_nod_var" << varno++; + f.ncvars[0] = ncf_->add_var(ss.str().c_str(), ncDouble, time_d, nodes_d); + } else { + //todo dim > 1 + throw std::runtime_error("dim > 1 not impl'ed."); + } + } + } while (0); + + do { + Int num_vars = 0; + for (auto f: elem_fields_) + num_vars += static_cast(f.ncvars.size()); + if ( ! num_vars) break; + + NcDim* const elem_d = ncf_->get_dim("num_elem"); + NcDim* const ev_d = ncf_->add_dim("num_elem_var", num_vars); + NcVar* const name_v = ncf_->add_var("name_elem_var", ncChar, ev_d, str_d); + Int varno = 1; + for (std::size_t i = 0; i < elem_fields_.size(); ++i) { + Field& f = elem_fields_[i]; + if (f.ncvars.size() == 1) { + name_v->set_cur(i, 0); + name_v->put(f.name.c_str(), 1, f.name.size()); + + std::stringstream ss; + ss << "vals_elem_var" << varno++ << "eb1"; + f.ncvars[0] = ncf_->add_var(ss.str().c_str(), ncDouble, time_d, elem_d); + } else { + //todo dim > 1 + throw std::runtime_error("dim > 1 not impl'ed."); + } + } + } while (0); + + time_ = -1; + time_idx_ = -1; + time_v_ = ncf_->get_var("time_whole"); + + define_done_ = true; +#endif +} + +static void check_state (const Int time_idx, const bool define_done) { +#ifdef SLMM_HAVE_NETCDF + if (time_idx == -1) + throw std::runtime_error( + "Need to advance_time_to before writing fields."); + if ( ! 
define_done) + throw std::runtime_error( + "Can't write a field until end_definition() is called."); +#endif +} + +void NetcdfWriter::write_field (const std::string& name, const double* field) { +#ifdef SLMM_HAVE_NETCDF + check_state(time_idx_, define_done_); + const auto& it = name2field_.find(name); + if (it == name2field_.end()) + throw std::runtime_error("Invalid field."); + Field& f = it->second.first == FieldType::node ? + node_fields_[it->second.second] : elem_fields_[it->second.second]; + assert(f.ncvars.size() == 1); //todo dim > 1 + f.ncvars[0]->set_rec(time_idx_); + f.ncvars[0]->put_rec(field); + ncf_->sync(); +#endif +} + +void NetcdfWriter::advance_time_to (const double t) { +#ifdef SLMM_HAVE_NETCDF + ++time_idx_; + if (t <= time_) + throw std::runtime_error("t must be > current time."); + time_ = t; + time_v_->set_rec(time_idx_); + time_v_->put_rec(&time_); +#endif +} + +NetcdfWriter::Field::Field (const std::string& name, const Int dim) + : name(name), ncvars(dim, nullptr) +{} + +void get_field_vals (const NcFile& ncr, FieldType::Enum ft, const int field_idx, + const int time_idx, double* vals) { +#ifdef SLMM_HAVE_NETCDF + std::stringstream ss; + int nvals; + if (ft == FieldType::node) { + ss << "vals_nod_var" << field_idx + 1; + NcDim* const nodes_dim = ncr.get_dim("num_nodes"); + nvals = nodes_dim->size(); + } else { + ss << "vals_elem_var" << field_idx + 1 << "eb1"; + NcDim* const cell_dim = ncr.get_dim("num_elem"); + nvals = cell_dim->size(); + } + NcVar* const f_v = ncr.get_var(ss.str().c_str()); + f_v->set_cur(time_idx, 0); + f_v->get(vals, 1, nvals); +#endif +} + +void get_field_names ( + const NcFile& ncr, std::vector& node_names, + std::vector& elem_names) +{ +#ifdef SLMM_HAVE_NETCDF + NcDim* const str_d = ncr.get_dim("len_string"); + std::vector str(str_d->size()); + str.back() = '\0'; + do { + NcDim* const nv_d = ncr.get_dim("num_nod_var"); + if ( ! 
#ifdef SLMM_HAVE_NETCDF
// Fetch the values of the global attribute `name` from ncr. The caller owns
// the returned object and must delete it. Throws std::runtime_error if the
// attribute or its values cannot be obtained.
static NcValues* get_att_val (const NcFile& ncr, const char* name) {
  NcAtt* const att = ncr.get_att(name);
  NcValues* const vals = att ? att->values() : nullptr;
  // Release the attribute handle on every path, including the failure path
  // where values() returned null; deleting a null pointer is a no-op.
  delete att;
  if ( ! vals)
    throw std::runtime_error(std::string("No attribute ") + name);
  return vals;
}
#endif
template void add_att(const char* name, const T& val); + +public: + // Open a Netcdf file for writing. + NetcdfWriter(const Vec3s::HostMirror& p, const Idxs::HostMirror& c2n, + const std::string& out_fn, + const Int np = 4, const Int monotone_type = 0); + + // Add fields on the mesh to the file. + void add_nodal_field(const std::string& name, const Int dim = 1); + void add_element_field(const std::string& name, const Int dim = 1); + + // After adding all the fields, end the definition phase, providing the first + // time at which fields will be recorded. + void end_definition(); + + // Advance time forward before writing fields for a time step. + void advance_time_to(const double t); + + // If multidimensional, the fast index is the mesh dimension. + void write_field(const std::string& name, const double* field); +}; + +// vals must be preallocated. +void get_field_vals(const NcFile& ncr, FieldType::Enum ft, const int field_idx, + const int time_idx, double* vals); + +void get_field_names( + const NcFile& ncr, std::vector& nodal_field_names, + std::vector& element_field_names); + +Int get_np(const NcFile& ncr); + +} // namespace io +} // namespace slmm + +#endif diff --git a/siqk/slmm/slmm_mesh.cpp b/siqk/slmm/slmm_mesh.cpp new file mode 100644 index 0000000..93e18ca --- /dev/null +++ b/siqk/slmm/slmm_mesh.cpp @@ -0,0 +1,486 @@ +#include "slmm_mesh.hpp" +#include "slmm_gll.hpp" +#include "slmm_util.hpp" + +#include +#include + +namespace slmm { +namespace mesh { + +static void make_equiangular_nodes (const Int ne, std::vector& x) { + const Real d = 1.0 / std::sqrt(3.0); + const Real dtheta = 0.5*M_PI / ne; + x.resize(ne+1); + if (ne % 2 == 1) { + const Int n = (ne + 1) / 2; + for (Int i = 0; i < n; ++i) + x[n + i] = d*std::tan((i + 0.5)*dtheta); + for (Int i = 0; i < n; ++i) + x[n - 1 - i] = -x[n + i]; + } else { + const Int n = ne / 2; + x[n] = 0; + for (Int i = 1; i <= n; ++i) + x[n + i] = d*std::tan(i*dtheta); + for (Int i = 1; i <= n; ++i) + x[n - i] = -x[n + 
i]; + } +} + +static void make_planar_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, + const Int n) { + std::vector x; + make_equiangular_nodes(n, x); + ko::resize(e, n*n, 4); + ko::resize(p, (n+1)*(n+1), 3); + for (Int iy = 0; iy < n+1; ++iy) + for (Int ix = 0; ix < n+1; ++ix) { + const auto idx = (n+1)*iy + ix; + p(idx,0) = x[ix]; + p(idx,1) = x[iy]; + p(idx,2) = 0; + } + for (Int iy = 0; iy < n; ++iy) + for (Int ix = 0; ix < n; ++ix) { + const auto idx = n*iy + ix; + e(idx,0) = (n+1)*iy + ix; + e(idx,1) = (n+1)*iy + ix+1; + e(idx,2) = (n+1)*(iy+1) + ix+1; + e(idx,3) = (n+1)*(iy+1) + ix; + } +} + +template +static void rotate (const Real R[9], V p) { + const Real x = p[0], y = p[1], z = p[2]; + p[0] = R[0]*x + R[1]*y + R[2]*z; + p[1] = R[3]*x + R[4]*y + R[5]*z; + p[2] = R[6]*x + R[7]*y + R[8]*z; +} + +template +static void translate (const Real xlate[3], V p) { + for (Int i = 0; i < 3; ++i) p[i] += xlate[i]; +} + +static void transform_planar_mesh (const Real R[9], const Real xlate[3], + Vec3s::HostMirror& p) { + for (Int i = 0; i < nslices(p); ++i) { + rotate(R, slice(p, i)); + translate(xlate, slice(p, i)); + } +} + +// Remove vertices marked unused and adjust numbering. +static void remove_unused_vertices (Vec3s::HostMirror& p, Idxs::HostMirror& e, + const Real unused) { + // adjust[i] is the number to subtract from i. Hence if e(ei,0) was originally + // i, it is adjusted to i - adjust[i]. + std::vector adjust(nslices(p), 0); + Int rmcnt = 0; + for (Int i = 0; i < nslices(p); ++i) { + if (p(i,0) != unused) continue; + adjust[i] = 1; + ++rmcnt; + } + // Cumsum. + for (Int i = 1; i < nslices(p); ++i) + adjust[i] += adjust[i-1]; + // Adjust e. + for (Int ei = 0; ei < nslices(e); ++ei) + for (Int k = 0; k < szslice(e); ++k) + e(ei,k) -= adjust[e(ei,k)]; + // Remove unused from p. 
+ Vec3s::HostMirror pc("copy", nslices(p), szslice(p)); + ko::deep_copy(pc, p); + ko::resize(p, nslices(p) - rmcnt, szslice(p)); + for (Int i = 0, j = 0; i < nslices(pc); ++i) { + if (pc(i,0) == unused) continue; + for (Int k = 0; k < szslice(pc); ++k) p(j,k) = pc(i,k); + ++j; + } +} + +void make_cubedsphere (Vec3s::HostMirror& p, Idxs::HostMirror& e, const Int n) { + // Transformation of the reference mesh make_planar_mesh to make each of the + // six faces. + const Real d = 1.0 / std::sqrt(3.0); + static Real R[6][9] = {{ 1, 0, 0, 0, 0, 0, 0, 1, 0}, // face 0, -y + { 0, 0, 0, 1, 0, 0, 0, 1, 0}, // 1, +x + {-1, 0, 0, 0, 0, 0, 0, 1, 0}, // 2, +y + { 0, 0, 0,-1, 0, 0, 0, 1, 0}, // 3, -x + { 1, 0, 0, 0, 1, 0, 0, 0, 0}, // 4, +z + {-1, 0, 0, 0, 1, 0, 0, 0, 0}}; // 5, -z + static Real xlate[6][3] = {{ 0,-d, 0}, { d, 0, 0}, { 0, d, 0}, + {-d, 0, 0}, { 0, 0, d}, { 0, 0,-d}}; + // Construct 6 uncoupled faces. + Vec3s::HostMirror ps[6]; + Vec3s::HostMirror& p_ref = ps[0]; + Idxs::HostMirror es[6]; + Idxs::HostMirror& e_ref = es[0]; + make_planar_mesh(p_ref, e_ref, n); + ko::resize(e, 6*nslices(e_ref), 4); + ko::resize(p, 6*nslices(p_ref), 3); + for (Int i = 1; i < 6; ++i) { + ko::resize(es[i], nslices(e_ref), 4); + ko::deep_copy(es[i], e_ref); + ko::resize(ps[i], nslices(p_ref), 3); + ko::deep_copy(ps[i], p_ref); + transform_planar_mesh(R[i], xlate[i], ps[i]); + } + transform_planar_mesh(R[0], xlate[0], ps[0]); + // Pack (p,e), accounting for equivalent vertices. For the moment, keep the p + // slot for an equivalent vertex to make node numbering simpler, but make the + // value bogus so we know if there's a problem in the numbering. 
+ const Real unused = -2; + ko::deep_copy(p, unused); + Int p_base = 0, e_base = 0; + { // -y face + const Vec3s::HostMirror& fp = ps[0]; + Idxs::HostMirror& fe = es[0]; + for (Int j = 0; j < nslices(fp); ++j) + for (Int k = 0; k < 3; ++k) p(j,k) = fp(j,k); + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) e(j,k) = fe(j,k); + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + for (Int fi = 1; fi <= 2; ++fi) { // +x, +y faces + const Vec3s::HostMirror& fp = ps[fi]; + Idxs::HostMirror& fe = es[fi]; + for (Int j = 0; j < nslices(fp); ++j) { + if (j % (n+1) == 0) continue; // equiv vertex + for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) { + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + // Left 2 vertices of left elem on face fi equiv to right 2 vertices of + // right elem on face fi-1. Write to the face, then copy to e, so that + // other faces can use these updated data. + if (j % n == 0) { + fe(j,0) = es[fi-1](j+n-1,1); + fe(j,3) = es[fi-1](j+n-1,2); + } + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + } + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + { // -x face + const Vec3s::HostMirror& fp = ps[3]; + Idxs::HostMirror& fe = es[3]; + for (Int j = 0; j < nslices(fp); ++j) { + if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; + for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) { + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + if (j % n == 0) { + fe(j,0) = es[2](j+n-1,1); + fe(j,3) = es[2](j+n-1,2); + } else if ((j+1) % n == 0) { + fe(j,1) = es[0]((j+1)-n,0); + fe(j,2) = es[0]((j+1)-n,3); + } + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + } + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + { // +z face + const Vec3s::HostMirror& fp = ps[4]; + Idxs::HostMirror& fe = es[4]; + for (Int j = n+1; j < nslices(fp) - (n+1); ++j) { + if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; + for (Int k = 0; 
k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + for (Int j = 0; j < n; ++j) { // -y + fe(j,0) = es[0](n*(n-1)+j,3); + fe(j,1) = es[0](n*(n-1)+j,2); + } + for (Int j = 0; j < n; ++j) { // +y + fe(n*(n-1)+j,2) = es[2](n*n-1-j,3); + fe(n*(n-1)+j,3) = es[2](n*n-1-j,2); + } + for (Int j = 0, i3 = 0; j < nslices(fe); j += n, ++i3) { // -x + fe(j,0) = es[3](n*n-1-i3,2); + fe(j,3) = es[3](n*n-1-i3,3); + } + for (Int j = n-1, i1 = 0; j < nslices(fe); j += n, ++i1) { // +x + fe(j,1) = es[1](n*(n-1)+i1,3); + fe(j,2) = es[1](n*(n-1)+i1,2); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + p_base += nslices(p_ref); + e_base += nslices(e_ref); + } + { // -z face + const Vec3s::HostMirror& fp = ps[5]; + Idxs::HostMirror& fe = es[5]; + for (Int j = n+1; j < nslices(fp) - (n+1); ++j) { + if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; + for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; + for (Int j = 0; j < n; ++j) { // -y + fe(j,0) = es[0](n-1-j,1); + fe(j,1) = es[0](n-1-j,0); + } + for (Int j = 0; j < n; ++j) { // +y + fe(n*(n-1)+j,2) = es[2](j,1); + fe(n*(n-1)+j,3) = es[2](j,0); + } + for (Int j = 0, i3 = 0; j < nslices(fe); j += n, ++i3) { // -x + fe(j,0) = es[1](i3,0); + fe(j,3) = es[1](i3,1); + } + for (Int j = n-1, i1 = 0; j < nslices(fe); j += n, ++i1) { // +x + fe(j,1) = es[3](n-1-i1,1); + fe(j,2) = es[3](n-1-i1,0); + } + for (Int j = 0; j < nslices(fe); ++j) + for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); + } + // Now go back and remove the unused vertices and adjust the numbering. + remove_unused_vertices(p, e, unused); + // Project to the unit sphere. 
+ for (Int i = 0; i < nslices(p); ++i) + geometry::normalize(slice(p, i)); +} + +void make_cgll_from_geo ( + const Vec3s::HostMirror& geo_p, const Idxs::HostMirror& geo_c2n, const Int np, + Vec3s::HostMirror& cgll_p, Idxs::HostMirror& cgll_c2n) +{ + Idxs::HostMirror geo_c2e, geo_e2n; + impl::make_c2e_from_c2n(np, geo_c2n, geo_c2e, geo_e2n); + ko::resize(cgll_p, + nslices(geo_p) + // corner nodes + (np-2)*nslices(geo_e2n) + // np-2 per edge + siqk::square(np-2)*nslices(geo_c2n), // nodes inside a geo cell + 3); + ko::resize(cgll_c2n, nslices(geo_c2n), siqk::square(np)); + Int pi = 0; + // Geo cell vertices. + for ( ; pi < nslices(geo_p); ++pi) + for (Int k = 0; k < 3; ++k) cgll_p(pi,k) = geo_p(pi,k); + ko::View nodes("nodes", np, np); + const Real* gll_x = nullptr, * gll_wt = nullptr; + GLL gll; + gll.get_coef(np, gll_x, gll_wt); + // Add new edge nodes. + for (Int gci = 0; gci < nslices(geo_c2n); ++gci) { + const auto geo_nodes = slice(geo_c2n, gci); + for (Int ei = 0; ei < 4; ++ei) { + // If my edge is i -> j and j > i, then I'm responsible for adding these. + if (geo_nodes[ei] > geo_nodes[(ei+1) % 4]) continue; + // edge[0] -> edge[np-1] is the geo edge. + auto edge = slice(geo_e2n, geo_c2e(gci,ei)); + assert(edge[0] == geo_nodes[ei]); + assert(edge[np-1] == geo_nodes[(ei+1) % 4]); + // Add the new nodes. + const auto p0 = slice(cgll_p, edge[0]); + const auto p1 = slice(cgll_p, edge[np-1]); + for (Int i = 1; i < np-1; ++i) { + auto p = slice(cgll_p, pi); + const Real alpha = 0.5*(gll_x[i] + 1); + for (Int k = 0; k < 3; ++k) + p[k] = (1 - alpha)*p0[k] + alpha*p1[k]; + edge[i] = pi; + ++pi; + } + } + } + for (Int gci = 0; gci < nslices(geo_c2n); ++gci) { + const auto geo_nodes = slice(geo_c2n, gci); + // Record the newly created edge nodes. 
+ for (Int ei = 0; ei < 4; ++ei) { + const auto edge = slice(geo_e2n, geo_c2e(gci,ei)); + if (geo_nodes[ei] < geo_nodes[(ei+1) % 4]) { + assert(edge[0] == geo_nodes[ei]); + assert(edge[np-1] == geo_nodes[(ei+1) % 4]); + switch (ei) { + case 0: for (Int i = 0; i < np; ++i) nodes(i,0) = edge[i]; break; + case 1: for (Int i = 0; i < np; ++i) nodes(np-1,i) = edge[i]; break; + case 2: for (Int i = 0; i < np; ++i) nodes(i,np-1) = edge[np-1-i]; break; + case 3: for (Int i = 0; i < np; ++i) nodes(0,i) = edge[np-1-i]; break; + default: assert(0); + } + } else { + assert(edge[np-1] == geo_nodes[ei]); + assert(edge[0] == geo_nodes[(ei+1) % 4]); + switch (ei) { + case 0: for (Int i = 0; i < np; ++i) nodes(i,0) = edge[np-1-i]; break; + case 1: for (Int i = 0; i < np; ++i) nodes(np-1,i) = edge[np-1-i]; break; + case 2: for (Int i = 0; i < np; ++i) nodes(i,np-1) = edge[i]; break; + case 3: for (Int i = 0; i < np; ++i) nodes(0,i) = edge[i]; break; + default: assert(0); + } + } + } + // Add new internal nodes. + for (Int j = 1; j < np-1; ++j) { + const auto p0 = slice(cgll_p, nodes(0,j)); + const auto p1 = slice(cgll_p, nodes(np-1,j)); + for (Int i = 1; i < np-1; ++i) { + assert(pi < nslices(cgll_p)); + auto p = slice(cgll_p, pi); + const Real alpha = 0.5*(gll_x[i] + 1); + for (Int k = 0; k < 3; ++k) + p[k] = (1 - alpha)*p0[k] + alpha*p1[k]; + nodes(i,j) = pi; + ++pi; + } + } + // Fill CGLL cell with nodes. + { + auto cell = slice(cgll_c2n, gci); + for (Int j = 0, k = 0; j < np; ++j) + for (Int i = 0; i < np; ++i, ++k) + cell[k] = nodes(i,j); + } + } + // Project to the unit sphere. 
+ for (Int i = 0; i < nslices(cgll_p); ++i) + geometry::normalize(slice(cgll_p, i)); +} + +/* Expand the internal CGLL connectivity into explicit 4-node subcells. + * cgll_c2n holds np^2 nodes per cell, with node (i,j) at slot np*j+i; np is + * recovered as the square root of the slice size. Each cell yields + * nsc = (np-1)^2 subcells; subcell (sci,scj) of cell ci is written to row + * nsc*ci+(np-1)*scj+sci of cgll_io_c2n, with its nodes listed in the order + * (sci,scj), (sci+1,scj), (sci+1,scj+1), (sci,scj+1). cgll_p is not used. + */ +void make_io_cgll_from_internal_cgll ( + const Vec3s::HostMirror& cgll_p, const Idxs::HostMirror& cgll_c2n, + Idxs::HostMirror& cgll_io_c2n) +{ + const Int np2 = szslice(cgll_c2n), np = std::sqrt(np2), + nsc = siqk::square(np-1); + ko::resize(cgll_io_c2n, nslices(cgll_c2n)*nsc, 4); + for (Int ci = 0; ci < nslices(cgll_c2n); ++ci) { + const auto cell = slice(cgll_c2n, ci); + for (Int scj = 0; scj < np-1; ++scj) + for (Int sci = 0; sci < np-1; ++sci) { + auto subcell = slice(cgll_io_c2n, nsc*ci + (np-1)*scj + sci); + subcell[0] = cell[np* scj + sci ]; + subcell[1] = cell[np* scj + sci+1]; + subcell[2] = cell[np*(scj+1) + sci+1]; + subcell[3] = cell[np*(scj+1) + sci ]; + } + } +} + +/* Build the DGLL connectivity from the CGLL one. Cells are visited in order; + * the first reference to a CGLL node keeps that node's number in dgll_c2n, + * and each later reference gets a new number, counting up from + * nslices(cgll_p). dglln2cglln(ci*np2+ni) records the CGLL number of slot ni + * of cell ci. In debug builds, the final count and the use of every CGLL + * node are asserted. + * NOTE(review): dglln2cglln is filled by cell slot (ci*np2+ni), not by the + * node number stored in dgll_c2n(ci,ni); confirm that callers index it the + * same way. + */ +void make_dgll_from_cgll ( + const Vec3s::HostMirror& cgll_p, const Idxs::HostMirror& cgll_c2n, + IdxArray::HostMirror& dglln2cglln, Idxs::HostMirror& dgll_c2n) +{ + const Int np2 = szslice(cgll_c2n); + ko::resize(dglln2cglln, np2*nslices(cgll_c2n)); + ko::resize(dgll_c2n, nslices(cgll_c2n), szslice(cgll_c2n)); + IdxArray::HostMirror cgll_c2n_used("used", nslices(cgll_p)); + ko::deep_copy(cgll_c2n_used, 0); + Int pi = nslices(cgll_p); + for (Int ci = 0; ci < nslices(cgll_c2n); ++ci) { + const auto cgll_cell = slice(cgll_c2n, ci); + auto dgll_cell = slice(dgll_c2n, ci); + for (Int ni = 0; ni < np2; ++ni) { + const Int cgll_node_nmbr = cgll_cell[ni]; + dglln2cglln[ci*np2 + ni] = cgll_node_nmbr; + if (cgll_c2n_used[cgll_node_nmbr]) + dgll_cell[ni] = pi++; + else { + dgll_cell[ni] = cgll_node_nmbr; + cgll_c2n_used[cgll_node_nmbr] = 1; + } + } + } +#ifndef NDEBUG + assert(pi == nslices(cgll_c2n) * np2); + for (Int i = 0; i < nslices(cgll_c2n_used); ++i) + assert(cgll_c2n_used[i]); +#endif +} + +namespace impl { +void calc_elem_ctr (const Vec3s::HostMirror& p, const Idxs::HostMirror& e, + const Int ei, Real ctr[3]) { + for (Int j = 0; j < 3; ++j) ctr[j] = 0; + Int n = 0; + for (Int i = 0; i < szslice(e); 
++i) { + if (e(ei,i) < 0) break; + for (Int j = 0; j < 3; ++j) ctr[j] += p(e(ei,i),j); + ++n; + } + for (Int j = 0; j < 3; ++j) ctr[j] /= n; +} + +/* Undirected edge between two nodes, stored as (lo, hi) with lo <= hi so that + * both orientations of an edge give the same key. operator< orders edges by + * lo, then hi; make_c2e_from_c2n uses Edge as the key of edge2nmbr. + */ +struct Edge { + const Int lo, hi; + Edge (const Int& n0, const Int& n1) + : lo(n0 < n1 ? n0 : n1), + hi(n0 < n1 ? n1 : n0) + {} + bool operator< (const Edge& e) const { + if (lo < e.lo) return true; + if (lo == e.lo) return hi < e.hi; + return false; + } +}; + +/* Build the cell-to-edge array c2e and the edge-to-node array e2n from c2n. + * Consecutive slots of a cell, wrapping from the last to the first, form its + * edges; edges are numbered in order of first appearance, so an edge shared + * between cells has a single number. e2n gets np slots per edge, of which + * only slot 0 (smaller node number) and slot np-1 (larger) are set here. + * NOTE(review): every slot of c2n is treated as a node; -1 end-of-list + * markers, which calc_elem_ctr checks for, are not handled here. + */ +void make_c2e_from_c2n (const Int np, const Idxs::HostMirror& c2n, + Idxs::HostMirror& c2e, Idxs::HostMirror& e2n) { + const Int nnode = szslice(c2n); + // Number the edges. + std::map edge2nmbr; + Int nmbr = 0; + for (Int ci = 0; ci < nslices(c2n); ++ci) { + const auto cell = slice(c2n, ci); + for (Int ni = 0; ni < nnode; ++ni) { + Edge e(cell[ni], cell[(ni+1) % nnode]); + const auto it = edge2nmbr.find(e); + if (it == edge2nmbr.end()) + edge2nmbr[e] = nmbr++; + } + } + // Fill the adjacency arrays. + ko::resize(c2e, nslices(c2n), szslice(c2n)); + ko::resize(e2n, nmbr, np); + for (Int ci = 0; ci < nslices(c2n); ++ci) { + const auto cell = slice(c2n, ci); + for (Int ni = 0; ni < nnode; ++ni) { + Edge e(cell[ni], cell[(ni+1) % nnode]); + const auto it = edge2nmbr.find(e); + assert(it != edge2nmbr.end()); + const Int nmbr = it->second; + c2e(ci, ni) = nmbr; + e2n(nmbr, 0) = it->first.lo; + e2n(nmbr, np-1) = it->first.hi; + } + } +} + +Int check_elem_normal_against_sphere (const Vec3s::HostMirror& p, + const Idxs::HostMirror& e) { + Int nerr = 0; + for (Int ei = 0; ei < nslices(e); ++ei) { // for each element + Real sphere[3]; // ray through elem ctr + calc_elem_ctr(p, e, ei, sphere); + for (Int ti = 0; ti < szslice(e) - 2; ++ti) { // for each tri + if (e(ei,ti+2) < 0) break; + Real tri_normal[3]; { + Real v[2][3]; + for (Int j = 0; j < 2; ++j) { + geometry::copy(v[j], slice(p, e(ei,ti+j+1))); + geometry::axpy(-1, slice(p, e(ei,0)), v[j]); + } + geometry::cross(v[0], v[1], tri_normal); + } + if (geometry::dot(tri_normal, sphere) <= 0) + ++nerr; + } + } + return nerr; 
+} +} // namespace impl +} // namespace mesh +} // namespace slmm diff --git a/siqk/slmm/slmm_mesh.hpp b/siqk/slmm/slmm_mesh.hpp new file mode 100644 index 0000000..895b30c --- /dev/null +++ b/siqk/slmm/slmm_mesh.hpp @@ -0,0 +1,69 @@ +#ifndef INCLUDE_SLMM_MESH_HPP +#define INCLUDE_SLMM_MESH_HPP + +#include "slmm_defs.hpp" + +namespace slmm { +namespace mesh { + +// c is cell (aka element). n is node. e is edge. Hence c2e is the cell-to-edge +// adjacency array. +// geo is the basic geometric mesh. cgll is a continuous GLL mesh induced by +// the geometric mesh and the reference map. dgll is a discontinuous GLL map +// induced by the CGLL mesh. +// In a geo cell, the four nodes are ordered CCW. When a geometric mesh is +// converted to GLL, geometric cell i is divided into n = (np-1)^2 subcells. A +// GLL cell is 1-1 with a geo cell and contains GLL subcells. +// For netcdf I/O, we'll need to make GLL subcells explicitly, and they will +// be numbered n*i : n*(i+1)-1. make_io_cgll_from_internal_cgll does this. We'll +// use 'io' vs 'internal' decoration to distinguish these when necessary. +// For internal use, we don't need to form these cells explicitly. Instead, +// cgll_c2n has np^2 slots per slice. Nodes are ordered, e.g. with np=4, +// 12 13 14 15 +// 8 9 10 11 +// 4 5 6 7 +// 0 1 2 3. +// Hence cgll_c2n(i_cell, k) gives the k'th node of cell i_cell. +// With respect to the reference square (e.g., in siqk::sqr), in a quad the +// bottom-left node is (-1,-1), the bottom-right is (1,-1), the top-right is +// (1,1), and the top-left is (-1,1). +// DGLL topology looks the same except that edge nodes are not +// shared. dglln2cglln(k) maps the k'th DGLL node to the corresponding CGLL +// node. cglln2dglln(k,:) is the list of DGLL nodes associated with CGLL node k. +// In all topology arrays, -1 indicates the end of a list. E.g., if CGLL node +// k corresponds to 2 DGLL nodes, then cglln2dglln(k,{0,1}) have values, and the +// rest are -1. 
+ +void make_cubedsphere( + Vec3s::HostMirror& geo_p, Idxs::HostMirror& geo_c2n, const Int ne); + +void make_cgll_from_geo( + const Vec3s::HostMirror& geo_p, const Idxs::HostMirror& geo_c2n, + const Int np, Vec3s::HostMirror& cgll_p, Idxs::HostMirror& cgll_c2n); + +void make_io_cgll_from_internal_cgll( + const Vec3s::HostMirror& cgll_p, const Idxs::HostMirror& cgll_c2n, + Idxs::HostMirror& cgll_io_c2n); + +// dgll_c2n(cell_nmbr, :) contains node numbers for the DGLL mesh. However, a +// separate coordinate array (with redundant coordinates) is not +// created. Instead, use dglln2cglln as follows. The coordinates of the node +// dgll_c2n(cell_nmbr, k) are cgll_p(dglln2cglln(dgll_c2n(cell_nmbr, k)), :). +void make_dgll_from_cgll( + const Vec3s::HostMirror& cgll_p, const Idxs::HostMirror& cgll_c2n, + IdxArray::HostMirror& dglln2cglln, Idxs::HostMirror& dgll_c2n); + +namespace impl { +// slice(e2n,i) has np slots, and slots 0 and np-1 are filled. +void make_c2e_from_c2n(const Int np, const Idxs::HostMirror& c2n, + Idxs::HostMirror& c2e, Idxs::HostMirror& e2n); + +// Return 0 if all elements' subtri normals point outward relative to the +// sphere. +Int check_elem_normal_against_sphere( + const Vec3s::HostMirror& p, const Idxs::HostMirror& e); +} // namespace impl +} // namespace mesh +} // namespace slmm + +#endif diff --git a/siqk/slmm/slmm_runtests.py b/siqk/slmm/slmm_runtests.py new file mode 100755 index 0000000..d2d361a --- /dev/null +++ b/siqk/slmm/slmm_runtests.py @@ -0,0 +1,71 @@ +#!/usr/bin/python + +import os, sys, re + +def readall (fn): + # Shorthand for reading in all the text in a file. + try: + with open(fn, 'r') as f: + text = f.read() + except: + text = '' + return text + +def writeall (text, fn, for_real): + if for_real: + with open(fn, 'w') as f: + f.write(text) + +def parse_one_liner (text): + class struct: + pass + hits = re.findall('
    .*', text) + hits = re.findall('l2 (?P[^ ]*) .* cv re (?P[^ ]*) ', hits[0]) + o = struct + o.l2 = float(hits[0][0]) + o.cv = float(hits[0][1]) + return o + +def runtest (cmd): + outfn = 'runtests.tmp' + os.system(cmd + ' &> ' + outfn) + return readall(outfn) + +def print_test (cmd): + print '{0:.<70s}'.format(cmd + ' '), + +def print_result (passed): + if not passed: + print '***FAILED' + return 1 + else: + print ' PASSED' + return 0 + +def check_passed (cmd): + print_test(cmd) + out = runtest(cmd) + hits = re.findall('PASSED', out) + passed = len(hits) > 0 + return print_result(passed) + +def check_errs (cmd, l2_err, cv=10): + print_test(cmd) + out = runtest(cmd) + o = parse_one_liner(out) + passed = o.l2 <= l2_err and o.cv <= cv + return print_result(passed) + +nerr = 0 +nerr += check_passed('./slmm_test -q -c test_make_cubedsphere') +nerr += check_passed('./slmm_test -q -c test_gll') +nerr += check_passed('./slmm_test -q -c test_time_int') +nerr += check_passed('./slmm_test -q -c test_make_gll_mesh') + +base = './slmmir -nsteps 12 -ne 10 -we 0 -ode divergent -ic gaussianhills ' +nerr += check_errs(base + '-np 3', 6.3e-3, 1e-5) +nerr += check_errs(base + '-np 3 -xyz', 6.3e-3, 1e-5) +nerr += check_errs(base + '-np 3 -xyz -d2c', 8.8e-3, 1e-5) +nerr += check_errs(base + '-np 4 -xyz -d2c', 5e-3, 2e-7) + +print '{0:d} tests failed'.format(nerr) diff --git a/siqk/slmm/slmm_test.cpp b/siqk/slmm/slmm_test.cpp new file mode 100644 index 0000000..8a48375 --- /dev/null +++ b/siqk/slmm/slmm_test.cpp @@ -0,0 +1,201 @@ +#include "slmm_defs.hpp" +#include "slmm_mesh.hpp" +#include "slmm_gll.hpp" +#include "slmm_io.hpp" +#include "slmm_time_int.hpp" +#include "slmm_gallery.hpp" +#include "slmm_debug.hpp" +using namespace slmm; + +struct Command { + enum Enum { + test_make_cubedsphere, test_gll, test_make_gll_mesh, test_time_int + }; + static Enum from_string (const std::string& s) { + if (s == "test_make_cubedsphere") return test_make_cubedsphere; + if (s == "test_gll") 
return test_gll; + if (s == "test_make_gll_mesh") return test_make_gll_mesh; + if (s == "test_time_int") return test_time_int; + throw std::runtime_error(s + " is not a command."); + } +}; + +struct Input { + Command::Enum command; + Int n; + Real angle; + bool write_matlab, quiet; + std::string fn_pre_out; + + Input(Int argc, char** argv); + void print(std::ostream& os) const; +}; + +static Int test_make_cubedsphere (const Input& in) { + const Int np = 4; + Vec3s::HostMirror cp; + Idxs::HostMirror c2n; + mesh::make_cubedsphere(cp, c2n, in.n); + Int nerr = 0; + { + const Int ne = mesh::impl::check_elem_normal_against_sphere(cp, c2n); + if (ne) std::cerr << "FAIL: check_elem_normal_against_sphere\n"; + nerr += ne; + } + { + Idxs::HostMirror c2e, e2n; + mesh::impl::make_c2e_from_c2n(np, c2n, c2e, e2n); + Int ne = 0; + // Every edge has two cells, and each cell is a quad. + if (nslices(e2n) != 4/2*nslices(c2n)) { + ++ne; + std::cerr << "FAIL: make_c2e_from_c2n\n"; + } + nerr += ne; + } + if (in.write_matlab) + write_matlab("cm", cp, c2n); + return nerr; +} + +static Int test_gll (const Input& in) { + Int nerr = 0; + const Real tol = 1e2*std::numeric_limits::epsilon(); + GLL gll; + const Real* x, * wt; + const Int np = 4; + gll.get_coef(np, x, wt); + Real sum = 0; + for (Int i = 0; i < np; ++i) + sum += wt[i]; + if (std::abs(2 - sum) > tol) ++nerr; + for (Int j = 0; j < np; ++j) { + Real gj[GLL::max_np]; gll.eval(np, x[j], gj); + for (Int i = 0; i < np; ++i) { + if (j == i) continue; + if (std::abs(gj[i]) > tol) ++nerr; + } + } + return nerr; +} + +static Int test_make_gll_mesh (const Input& in) { + const Int np = 4; + Vec3s::HostMirror geo_p, cgll_p; + Idxs::HostMirror geo_c2n, cgll_c2n, cgll_io_c2n; + mesh::make_cubedsphere(geo_p, geo_c2n, in.n); + Int nerr = 0; + mesh::make_cgll_from_geo(geo_p, geo_c2n, np, cgll_p, cgll_c2n); + mesh::make_io_cgll_from_internal_cgll(cgll_p, cgll_c2n, cgll_io_c2n); + { // Clip the mesh against itself and get the total area. 
+ const Real + area = siqk::test::test_area_ot(cgll_p, cgll_io_c2n, + cgll_p, cgll_io_c2n), + true_area = 4*M_PI, + re = std::abs(area - true_area)/true_area; + if (re >= 1e-10) { + fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n", + true_area, area, re); + ++nerr; + } + } + { + const Int ne = mesh::impl::check_elem_normal_against_sphere( + cgll_p, cgll_io_c2n); + if (ne) std::cerr << "FAIL: check_elem_normal_against_sphere\n"; + nerr += ne; + } + { + IdxArray::HostMirror dglln2cglln; + Idxs::HostMirror dgll_c2n; + mesh::make_dgll_from_cgll(cgll_p, cgll_c2n, dglln2cglln, dgll_c2n); + const Int np2 = szslice(cgll_c2n); + Int ne = 0; + for (Int ci = 0; ci < nslices(cgll_c2n); ++ci) { + const auto cgll_cell = slice(cgll_c2n, ci); + for (Int ni = 0; ni < siqk::square(np); ++ni) + if (dglln2cglln[ci*np2 + ni] != cgll_cell[ni]) + ++ne; + } + if (ne) { + nerr += ne; + std::cerr << "FAIL: make_dgll_from_cgll\n"; + } + } + if ( ! in.fn_pre_out.empty()) { + io::NetcdfWriter ncw(cgll_p, cgll_io_c2n, in.fn_pre_out + ".g"); + ncw.add_nodal_field("x"); + ncw.end_definition(); + const Int n = nslices(cgll_p); + std::vector x(n), lat(n), lon(n); + for (Int i = 0; i < n; ++i) { + const auto p = slice(cgll_p, i); + xyz2ll(p[0], p[1], p[2], lat[i], lon[i]); + } + ncw.advance_time_to(1); + gallery::InitialCondition::init( + gallery::InitialCondition::CosineBells, + nslices(cgll_p), lat.data(), lon.data(), x.data()); + ncw.write_field("x", x.data()); + ncw.advance_time_to(1.5); + gallery::InitialCondition::init( + gallery::InitialCondition::SlottedCylinders, + nslices(cgll_p), lat.data(), lon.data(), x.data()); + ncw.write_field("x", x.data()); + ncw.advance_time_to(2.5); + gallery::InitialCondition::init( + gallery::InitialCondition::CorrelatedCosineBells, + nslices(cgll_p), lat.data(), lon.data(), x.data()); + ncw.write_field("x", x.data()); + } + if (in.write_matlab) { + write_matlab("cm", geo_p, geo_c2n); + write_matlab("m", cgll_p, cgll_io_c2n); + 
write_matlab("gll", cgll_p, cgll_c2n); + } + return nerr; +} + +static Int test_time_int (const Input& in) { + return timeint::test::test_ark( ! in.quiet); +} + +/* Parse the command line. Options that take a value (-c, -n, -o, --angle) + * consume the following argument; a std::runtime_error is thrown if that + * argument is missing. Unrecognized tokens are ignored. Unless -q is given, + * the parsed settings are printed to std::cout. + */ +Input::Input (Int argc, char** argv) + : command(Command::test_make_cubedsphere), n(10), angle(M_PI*1e-1), + write_matlab(false), quiet(false) +{ + /* Return the value following option argv[i], advancing i. argv[argc] is a + * null pointer, so it must never be handed to std::string, atoi, or atof. + */ + const auto value_of = [&] (Int& i) -> const char* { + if (i + 1 >= argc) + throw std::runtime_error(std::string(argv[i]) + " requires a value."); + return argv[++i]; + }; + for (Int i = 1; i < argc; ++i) { + const std::string& token = argv[i]; + if (eq(token, "-c", "--command")) command = Command::from_string(value_of(i)); + else if (eq(token, "-n")) n = atoi(value_of(i)); + else if (eq(token, "-q", "--quiet")) quiet = true; + else if (eq(token, "-m", "--write-matlab")) write_matlab = true; + else if (eq(token, "-o", "--output-prefix")) fn_pre_out = value_of(i); + else if (eq(token, "--angle")) angle = atof(value_of(i)); + } + + if ( ! quiet) print(std::cout); +} + +void Input::print (std::ostream& os) const { + os << "command " << command << "\n" + << "n (-n): " << n << "\n" + << "write matlab (-m): " << write_matlab << "\n" + << "angle (--angle): " << angle << "\n"; +} + +int main (int argc, char** argv) { + Kokkos::initialize(argc, argv); + { + Input in(argc, argv); + Int nerr = 0; + switch (in.command) { + case Command::test_make_cubedsphere: nerr = test_make_cubedsphere(in); break; + case Command::test_gll: nerr = test_gll(in); break; + case Command::test_make_gll_mesh: nerr = test_make_gll_mesh(in); break; + case Command::test_time_int: nerr = test_time_int(in); break; + } + std::cerr << (nerr ? 
"FAIL" : "PASS") << "ED\n"; + } + Kokkos::finalize_all(); +} diff --git a/siqk/slmm/slmm_time_int.cpp b/siqk/slmm/slmm_time_int.cpp new file mode 100644 index 0000000..96b2e05 --- /dev/null +++ b/siqk/slmm/slmm_time_int.cpp @@ -0,0 +1,156 @@ +#include "slmm_time_int.hpp" +#include "slmm_util.hpp" + +namespace slmm { +namespace timeint { +namespace test { +class TestFunctor { + mutable Size nsteps_; + +protected: + Real tspan_[2], ys_[2]; + +public: + TestFunctor (const Real tspan[2], const Real ys[2]) + : nsteps_(0) + { + copy(2, tspan, tspan_); + copy(2, ys, ys_); + } + Size nsteps () const { return nsteps_; } + void reset () { nsteps_ = 0; } + const Real* tspan () const { return tspan_; } + const Real* ys () const { return ys_; } + void record (const Real t, const Real* const y) const { ++nsteps_; } + virtual bool eval (const Real t, const Real* const y, Real* const f) const = 0; + virtual void eval_solution(const Real t, Real* const f) const = 0; +}; + +// ODE +// y'(t) = lambda y(t) +// with solution +// y(tf) = y(ts) e^(lambda (tf - ts)). 
+class LambdaFunctor : public TestFunctor { + Real lambda_[2]; +public: + LambdaFunctor (const Real lambda[2], const Real tspan[2], const Real ys[2]) + : TestFunctor(tspan, ys) + { + copy(2, lambda, lambda_); + } + virtual bool eval (const Real t, const Real* const y, Real* const f) const { + f[0] = lambda_[0]*y[0] - lambda_[1]*y[1]; + f[1] = lambda_[0]*y[1] + lambda_[1]*y[0]; + return true; + } + virtual void eval_solution (const Real t, Real* const y) const { + const Real dt = t - tspan_[0]; + const Real + c = std::cos(dt*lambda_[1]), + s = std::sin(dt*lambda_[1]), + mag = std::exp(dt*lambda_[0]); + y[0] = mag*(ys_[0]*c - ys_[1]*s); + y[1] = mag*(ys_[0]*s + ys_[1]*c); + } +}; + +class TimeDepFunctor : public TestFunctor { + const Real a_; +public: + TimeDepFunctor (const Real a, const Real tspan[2], const Real ys[2]) + : TestFunctor(tspan, ys), a_(a) + {} + virtual bool eval (const Real t, const Real* const y, Real* const f) const { + f[0] = a_*t; + f[1] = -0.5*a_*t; + return true; + } + virtual void eval_solution (const Real t, Real* const y) const { + const Real dst = square(t) - square(tspan_[0]); + y[0] = ys_[0] + 0.5*a_*dst; + y[1] = ys_[1] - 0.25*a_*dst; + } +}; + +enum ARKMethod { method_ark23, method_ark45 }; + +bool test_ark_y2 (TestFunctor& fun, const ARKMethod method, + const bool verbose = true) { + std::ostream& os = std::cout; + auto ios_state = save_ios(os); + + Options opts; + opts.set_initial_step(1e-3); + opts.set_abs_tol(1e-20); + Workspace w; + Real ya[2]; + fun.eval_solution(fun.tspan()[1], ya); + + Real rtol = 1e-1; + Real rds[6]; + const Size ntrial = static_cast(sizeof(rds)/sizeof(*rds)); + const Real rtol_increase = 100; + const Real den = std::sqrt(square(ya[0]) + square(ya[1])); + + for (Size trial = 0; trial < ntrial; ++trial) { + rtol *= 1/rtol_increase; + opts.set_rel_tol(rtol); + fun.reset(); + Real y[2]; + copy(2, fun.ys(), y); + if (method == method_ark23) + ark23(opts, fun, y, 2, fun.tspan()[0], fun.tspan()[1], w); + else + 
ark45(opts, fun, y, 2, fun.tspan()[0], fun.tspan()[1], w); + + rds[trial] = std::sqrt(square(y[0] - ya[0]) + square(y[1] - ya[1])) / den; + if (verbose) { + os.precision(2); + os << " trial " << std::setw(2) << trial + << " nsteps " << std::setw(6) << fun.nsteps() + << " rtol " << std::scientific << rtol + << " reldif " << rds[trial] << "\n"; + } + } + + const Real improvement = rds[ntrial-2] / rds[ntrial-1]; + const bool pass = + rds[ntrial-1] <= 1e2*rtol && + (improvement >= 0.9*rtol_increase || + rds[ntrial-1] <= 1e2*std::numeric_limits::epsilon()); + return pass; +} + +Int test_ark (const bool verbose) { + if (verbose) + std::cout << "> Adaptive Runge-Kutta 2-3 unit test\n"; + static const Real tspan[] = {0.5, 71.2}, ys[] = {3.6, -0.7}; + bool pass = true; + { + static const Real lambda[] = {-0.02, 0.25}; + { + if (verbose) std::cout << " Standard test function.\n"; + LambdaFunctor fun(lambda, tspan, ys); + pass = pass && test_ark_y2(fun, method_ark23, verbose); + pass = pass && test_ark_y2(fun, method_ark45, verbose); + } + { + if (verbose) std::cout << " Standard test function backwards in time.\n"; + const Real tspanb[] = {8, -3}; + LambdaFunctor fun(lambda, tspanb, ys); + pass = pass && test_ark_y2(fun, method_ark23, verbose); + pass = pass && test_ark_y2(fun, method_ark45, verbose); + } + } + { + if (verbose) std::cout << " Exact time-dependent function.\n"; + TimeDepFunctor fun(0.1, tspan, ys); + pass = pass && test_ark_y2(fun, method_ark23, verbose); + pass = pass && test_ark_y2(fun, method_ark45, verbose); + } + return pass ? 
0 : 1; +} + +} // namespace test +} // namespace timeint +} // namespace slmm diff --git a/siqk/slmm/slmm_time_int.hpp b/siqk/slmm/slmm_time_int.hpp new file mode 100644 index 0000000..1da226a --- /dev/null +++ b/siqk/slmm/slmm_time_int.hpp @@ -0,0 +1,424 @@ +#ifndef INCLUDE_SLMM_TIME_INT_HPP +#define INCLUDE_SLMM_TIME_INT_HPP + +#include "slmm_defs.hpp" +#include "slmm_util.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace slmm { +namespace timeint { +class Options { + Real initial_step_, rel_tol_, abs_tol_, max_step_size_; + +public: + Options () + : initial_step_(1e-3), rel_tol_(1e-3), abs_tol_(1e-6), max_step_size_(1e300) + {} + + void set_initial_step (const Real is) { initial_step_ = is; } + void set_rel_tol (const Real rt) { rel_tol_ = rt; } + void set_abs_tol (const Real at) { abs_tol_ = at; } + void set_max_step_size (const Real mss) { max_step_size_ = mss; } + + Real initial_step () const { return initial_step_; } + Real rel_tol () const { return rel_tol_; } + Real abs_tol () const { return abs_tol_; } + Real max_step_size () const { return max_step_size_; } +}; + +struct Workspace { + std::vector r; +}; + +struct Info { + Real good_initial_step; +}; + +struct ReturnState { + enum Enum { success, function_eval_failed, step_too_small }; +}; + +template +inline void copy (const Size n, const T* const s, T* const d) +{ for (Size i = 0; i < n; ++i) d[i] = s[i]; } + +inline void aixiy (const Size n, + const Real a0, const Real* const x0, + const Real a1, const Real* const x1, + Real* const y) { + for (Size i = 0; i < n; ++i) + y[i] = a0*x0[i] + a1*x1[i]; +} +inline void aixiy (const Size n, + const Real a0, const Real* const x0, + const Real a1, const Real* const x1, + const Real a2, const Real* const x2, + Real* const y) { + for (Size i = 0; i < n; ++i) + y[i] = a0*x0[i] + a1*x1[i] + a2*x2[i]; +} +inline void aixiy (const Size n, + const Real a0, const Real* const x0, + const Real a1, const Real* const x1, + const Real 
a2, const Real* const x2, + const Real a3, const Real* const x3, + Real* const y) { + for (Size i = 0; i < n; ++i) + y[i] = a0*x0[i] + a1*x1[i] + a2*x2[i] + a3*x3[i]; +} +inline void aixiy (const Size n, + const Real a0, const Real* const x0, + const Real a1, const Real* const x1, + const Real a2, const Real* const x2, + const Real a3, const Real* const x3, + const Real a4, const Real* const x4, + Real* const y) { + for (Size i = 0; i < n; ++i) + y[i] = a0*x0[i] + a1*x1[i] + a2*x2[i] + a3*x3[i] + a4*x4[i]; +} +inline void aixiy (const Size n, + const Real a0, const Real* const x0, + const Real a1, const Real* const x1, + const Real a2, const Real* const x2, + const Real a3, const Real* const x3, + const Real a4, const Real* const x4, + const Real a5, const Real* const x5, + Real* const y) { + for (Size i = 0; i < n; ++i) + y[i] = a0*x0[i] + a1*x1[i] + a2*x2[i] + a3*x3[i] + a4*x4[i] + a5*x5[i]; +} + +/*! \brief Implements the same RK3(2) pair as Matlab's ode23. + * + * A Functor f has + * - method + * bool eval(Real t, const Real* y, Real* f) const + * to evaluate f(t), the ODE at time t. Return false on failure. + * - method + * record(Real t, const Real* y) + * to optionally record y(t). + * + * \param opts [in] Options struct. + * \param fun [in] ODE Functor. + * \param y_caller [in/out] On input, y(t_s); on output, y(t_f). + * \param n [in] length(y). + * \param ts [in] t_s. + * \param tf [in] t_f. + * \param w [in/out] Workspace. Reuse between calls to minimize allocations. + * + * Cite: + * P. Bogacki, L.F. Shampine, "A 3(2) Pair of Runge-Kutta Formulas", + * Appl. Math Lett. 2(4), 321-325, 1989. + * and + * The MATLAB ODE Suite, L. F. Shampine and M. W. Reichelt, SIAM Journal on + * Scientific Computing, 18-1, 1997. 
+ */ +template +ReturnState::Enum +ark23 (const Options& opts, const Functor& fun, + Real* const y_caller, const Size n, + const Real ts, const Real tf, + Workspace& w, Info* info=0) { + static const Real pow = 1.0/3.0; + + const Real threshold = opts.abs_tol() / opts.rel_tol(); + const int tdir = tf >= ts ? 1 : -1; + + w.r.resize(5*n); + Real* f0 = w.r.data(); + Real* const f1 = f0 + n; + Real* const f2 = f1 + n; + Real* f3 = f2 + n; + Real* y0 = y_caller; + Real* y1 = f3 + n; + + Real t = ts; + const Real sgn = sign(tf - ts); + Real absh = std::abs(opts.initial_step()); + if (info) info->good_initial_step = absh; + bool fgood = fun.eval(t, y0, f0); + fun.record(t, y0); + if ( ! fgood) return ReturnState::function_eval_failed; + + while (sgn*t < sgn*tf) { + const double hmin = 16*std::numeric_limits::epsilon()*t; + bool no_failed = true; + Real err, tnew; + for (;;) { // Integrate one step; loop until success. + // Get tnew and sanitized h. + absh = std::min(absh, opts.max_step_size()); + Real h = tdir*absh; + if (sgn*(t + h) > sgn*tf) { + h = tf - t; + absh = std::abs(h); + } + tnew = t + h; + h = tnew - t; + + // Integration rule. + do { + aixiy(n, 1, y0, 0.5*h, f0, y1); + fgood = fun.eval(t + 0.5*h, y1, f1); + if ( ! fgood) break; + aixiy(n, 1, y0, 0.75*h, f1, y1); + fgood = fun.eval(t + 0.75*h, y1, f2); + if ( ! fgood) break; + aixiy(n, 1, y0, 2.0*h/9.0, f0, h/3.0, f1, 4.0*h/9.0, f2, y1); + fgood = fun.eval(tnew, y1, f3); + } while (0); + + // Determine error. + err = 0; + if ( ! 
fgood) { + err = opts.rel_tol() + 1; + no_failed = false; + } else { + // Coefficients from subtracting the order-2 prediction from the order-3 + // prediction: + static const Real E[] = {-5.0/72.0, 1.0/12.0, 1.0/9.0, -1.0/8.0}; + // Element-wise error control: + // err = absh * norm( (f*E) ./ max( max(abs(y), abs(yt)), + // threshold), + // inf ); + for (Size i = 0; i < n; ++i) { + const Real fE = + std::abs(E[0]*f0[i] + E[1]*f1[i] + E[2]*f2[i] + E[3]*f3[i]); + const Real den = + std::max(std::max(std::abs(y0[i]), std::abs(y1[i])), + threshold); + err = std::max(err, fE / den); + } + err *= absh; + } + + // Determine if the step succeeded. If it did not, compute a smaller step + // size and try again. + if (err > opts.rel_tol()) { + if (absh <= hmin) { + fun.record(t, y1); + return ReturnState::step_too_small; + } + if (no_failed) { + no_failed = false; + absh = std::max( + hmin, absh*std::max( + 0.5, 0.8*std::pow(opts.rel_tol()/err, pow))); + } else { + absh = std::max(hmin, 0.5*absh); + } + } else { + // Successful step. Break from the integration loop. + break; + } + } // One integration step. + if (info) info->good_initial_step = absh; + + if (no_failed) { + // Integration step succeeded on first try. Increase the step size. + const Real fac = 0.8*std::pow(opts.rel_tol()/err, pow); + // Don't increase the step size by more than 5x. + absh = std::min(5, fac)*absh; + } + + t = tnew; + // Swap pointers. + std::swap(y0, y1); + std::swap(f0, f3); + + fun.record(t, y0); + } + + // On output, y_caller contains y(tf). If the pointers don't agree (because of + // swapping above), copy. + if (y_caller != y0) + memcpy(y_caller, y0, n*sizeof(*y_caller)); + + return ReturnState::success; +} + +/*! \brief Implements the same RK5(4) pair as Matlab's ode45. + * + * Cite: + * Dormand, J. R.; Prince, P. J. (1980), "A family of embedded Runge-Kutta + * formulae", Journal of Computational and Applied Mathematics 6 (1): 19–26. + * and + * The MATLAB ODE Suite, L. F. 
Shampine and M. W. Reichelt, SIAM Journal on + * Scientific Computing, 18-1, 1997. + * + * The Butcher tableau is + * + * 0 | + * 1/5 | 1/5 + * 3/10 | 3/40 9/40 + * 4/5 | 44/45 -56/15 32/9 + * 8/9 | 19372/656 -25360/2187 64448/6561 -212/729 + * 1 | 9017/3168 -355/33 46732/5247 49/176 -5103/18656 + * 1 | 35/384 0 500/1113 125/192 -2187/6784 11/84 + * -------------------------------------------------------------------------------- + * | 35/384 0 500/1113 125/192 -2187/6784 11/84 0 + * | 5179/57600 0 7571/16695 393/640 -92097/339200 187/2100 1/40 + * + * and the corresponding E array, obtained from subtracting the first row of b + * from the second, is + * + * -71/57600 0 71/16695 -71/1920 17253/339200 -88/2100 1/40 + */ +template +ReturnState::Enum +ark45 (const Options& opts, const Functor& fun, + Real* const y_caller, const Size n, + const Real ts, const Real tf, + Workspace& w, Info* info=0) { + static const Real + c2 = 0.2, c3 = 0.3, c4 = 0.8, c5 = 8.0/9.0; + static const Real + a21 = c2, + a31 = 3.0/40.0, a32 = 9.0/40.0, + a41 = 44.0/45.0, a42 = -56.0/15.0, a43 = 32.0/9.0, + a51 = 19372.0/6561.0, a52 = -25360.0/2187.0, a53 = 64448.0/6561.0, + a54 = -212.0/729.0, + a61 = 9017.0/3168.0, a62 = -355.0/33.0, a63 = 46732.0/5247.0, + a64 = 49.0/176.0, a65 = -5103.0/18656.0, + a71 = 35.0/384.0, a73 = 500.0/1113.0, a74 = 125.0/192.0, + a75 = -2187.0/6784.0, a76 = 11.0/84.0; + static const Real pow = 1.0/5.0; + // Coefficients from subtracting the order-4 prediction from the order-5 + // prediction: + static const Real E[] = {-71.0/57600.0, 0.0, 71.0/16695.0, -71.0/1920.0, + 17253.0/339200.0, -88.0/2100.0, 1.0/40.0}; + + const Real threshold = opts.abs_tol() / opts.rel_tol(); + const int tdir = tf >= ts ? 
1 : -1; + + w.r.resize(9*n); + Real* f0 = w.r.data(); + Real* const f1 = f0 + n; + Real* const f2 = f1 + n; + Real* const f3 = f2 + n; + Real* const f4 = f3 + n; + Real* const f5 = f4 + n; + Real* f6 = f5 + n; + Real* y0 = y_caller; + Real* y1 = f6 + n; + + Real t = ts; + const Real sgn = sign(tf - ts); + Real absh = std::abs(opts.initial_step()); + if (info) info->good_initial_step = absh; + bool fgood = fun.eval(t, y0, f0); + fun.record(t, y0); + if ( ! fgood) return ReturnState::function_eval_failed; + + while (sgn*t < sgn*tf) { + const double hmin = 16*std::numeric_limits::epsilon()*t; + bool no_failed = true; + Real err, tnew; + for (;;) { // Integrate one step; loop until success. + // Get tnew and sanitized h. + absh = std::min(absh, opts.max_step_size()); + Real h = tdir*absh; + if (sgn*(t + h) > sgn*tf) { + h = tf - t; + absh = std::abs(h); + } + tnew = t + h; + h = tnew - t; + + // Integration rule. + do { + aixiy(n, 1, y0, a21*h, f0, y1); + fgood = fun.eval(t + c2*h, y1, f1); + if ( ! fgood) break; + aixiy(n, 1, y0, a31*h, f0, a32*h, f1, y1); + fgood = fun.eval(t + c3*h, y1, f2); + if ( ! fgood) break; + aixiy(n, 1, y0, a41*h, f0, a42*h, f1, a43*h, f2, y1); + fgood = fun.eval(t + c4*h, y1, f3); + if ( ! fgood) break; + aixiy(n, 1, y0, a51*h, f0, a52*h, f1, a53*h, f2, a54*h, f3, y1); + fgood = fun.eval(t + c5*h, y1, f4); + if ( ! fgood) break; + aixiy(n, 1, y0, a61*h, f0, a62*h, f1, a63*h, f2, a64*h, f3, a65*h, f4, y1); + fgood = fun.eval(tnew, y1, f5); + if ( ! fgood) break; + aixiy(n, 1, y0, a71*h, f0, a73*h, f2, a74*h, f3, a75*h, f4, a76*h, f5, y1); + fgood = fun.eval(tnew, y1, f6); + } while (0); + + // Determine error. + err = 0; + if ( ! 
fgood) { + err = opts.rel_tol() + 1; + no_failed = false; + } else { + for (Size i = 0; i < n; ++i) { + const Real fE = + std::abs(E[0]*f0[i] + E[1]*f1[i] + E[2]*f2[i] + E[3]*f3[i] + + E[4]*f4[i] + E[5]*f5[i] + E[6]*f6[i]); + const Real den = + std::max(std::max(std::abs(y0[i]), std::abs(y1[i])), + threshold); + err = std::max(err, fE / den); + } + err *= absh; + } + + // Determine if the step succeeded. If it did not, compute a smaller step + // size and try again. + if (err > opts.rel_tol()) { + if (absh <= hmin) { + fun.record(t, y1); + return ReturnState::step_too_small; + } + if (no_failed) { + no_failed = false; + absh = std::max( + hmin, absh*std::max( + 0.5, 0.8*std::pow(opts.rel_tol()/err, pow))); + } else { + absh = std::max(hmin, 0.5*absh); + } + } else { + // Successful step. Break from the integration loop. + break; + } + } // One integration step. + if (info) info->good_initial_step = absh; + + if (no_failed) { + // Integration step succeeded on first try. Increase the step size. + const Real fac = 0.8*std::pow(opts.rel_tol()/err, pow); + // Don't increase the step size by more than 5x. + absh = std::min(5, fac)*absh; + } + + t = tnew; + // Swap pointers. + std::swap(y0, y1); + std::swap(f0, f6); + + fun.record(t, y0); + } + + // On output, y_caller contains y(tf). If the pointers don't agree (because of + // swapping above), copy. 
+ if (y_caller != y0) + memcpy(y_caller, y0, n*sizeof(*y_caller)); + + return ReturnState::success; +} + +namespace test { +Int test_ark(const bool verbose); +} // namespace test +} // namespace timeint +} // namespace slmm + +#endif diff --git a/siqk/slmm/slmm_util.cpp b/siqk/slmm/slmm_util.cpp new file mode 100644 index 0000000..d1c4480 --- /dev/null +++ b/siqk/slmm/slmm_util.cpp @@ -0,0 +1,30 @@ +#include "slmm_util.hpp" + +#include +#include +#include + +namespace slmm { + +double wall_time () { + static const double us = 1.0e6; + timeval t; + gettimeofday(&t, 0); + return (t.tv_sec*us + t.tv_usec)/us; +} + +std::string& tolower (std::string& s) { + for (auto& c: s) + c = std::tolower(c); + return s; +} + +std::string format_strings_as_list (const char** strings, const Size n) { + std::stringstream ss; + ss << "{"; + for (Size i = 0; i < n-1; ++i) ss << strings[i] << ", "; + ss << strings[n-1] << "}"; + return ss.str(); +} + +} // namespace slmm diff --git a/siqk/slmm/slmm_util.hpp b/siqk/slmm/slmm_util.hpp new file mode 100644 index 0000000..a192d6c --- /dev/null +++ b/siqk/slmm/slmm_util.hpp @@ -0,0 +1,153 @@ +#ifndef INCLUDE_SLMM_UTIL_HPP +#define INCLUDE_SLMM_UTIL_HPP + +#include "slmm_defs.hpp" + +#include + +namespace slmm { +using siqk::square; +template inline constexpr T cube (const T& x) { return x*x*x; } + +struct consts { + static constexpr Real earth_radius_m = 6.37122e6; +}; + +template inline T sign (const T& a) { return a >= 0 ? 1 : -1; } + +inline Real sec2day (const Real sec) { return sec/(24*3600); } +inline Real day2sec (const Real day) { return day*(24*3600); } + +// Output is in radians. +//todo Make a version that lets you pass R = mag(x,y,z). +inline void xyz2ll (const Real x, const Real y, const Real z, + Real& lat, Real& lon) { + const Real r = std::sqrt(square(x) + square(y) + square(z)); + lat = std::asin(z/r); + lon = std::atan2(y, x); +} + +// Input is in radians. 
+inline void ll2xyz (const Real lat, const Real lon, Real& x, Real& y, Real& z, + const Real radius = 1) { + const Real sinl = std::sin(lat), cosl = std::cos(lat); + x = radius*std::cos(lon)*cosl; + y = radius*std::sin(lon)*cosl; + z = radius*sinl; +} + +// Eq after eq 10 in Lauritzen et al test cases paper. +inline Real great_circle_dist ( + const Real lat1, const Real lon1, const Real lat2, const Real lon2, + const Real R = 1) +{ + return R*std::acos(std::sin(lat1)*std::sin(lat2) + + std::cos(lat1)*std::cos(lat2)*std::cos(lon1 - lon2)); +} + +inline constexpr Real m2radlat (const Real m) +{ return m/consts::earth_radius_m; } + +inline Real m2radlon(const Real lat, const Real m) +{ return m2radlat(m)/std::abs(std::cos(lat)); } + +inline constexpr Real deg2rad (const Real v) { return v * (M_PI/180); } +inline constexpr Real rad2deg (const Real v) { return v * (180/M_PI); } + +inline Real reldif (const Real a, const Real b, const Real abstol = 0) +{ return std::abs(b - a)/(abstol + std::abs(a)); } + +// Row-major R. +inline void form_rotation (const Real axis[3], const Real angle, Real r[9]) { + const Real nrm = std::sqrt(square(axis[0]) + square(axis[1]) + + square(axis[2])); + const Real& x = axis[0] / nrm, & y = axis[1] / nrm, & z = axis[2] / nrm, + & th = angle; + const Real cth = std::cos(th), sth = std::sin(th), omcth = 1 - cth; + r[0] = cth + x*x*omcth; + r[3] = y*x*omcth + z*sth; + r[6] = z*x*omcth - y*sth; + r[1] = x*y*omcth - z*sth; + r[4] = cth + y*y*omcth; + r[7] = z*y*omcth + x*sth; + r[2] = x*z*omcth + y*sth; + r[5] = y*z*omcth - x*sth; + r[8] = cth + z*z*omcth; +} + +/*! \brief RAII std stream state saver. + * + * Example: Preserve std::cout's state so manipulations don't affect others' use + * of cout. 
/*! \brief RAII saver for a std stream's formatting state.
 *
 * On construction the stream's format (flags, precision, width, fill, ...) is
 * copied with copyfmt; on destruction it is copied back, so manipulators
 * applied in between do not leak to other users of the stream.
 */
template <typename Stream> class IosSaver {
  Stream& stream_;
  // Holds the saved format only; it is never attached to a buffer.
  std::ios saved_;
public:
  IosSaver (Stream& s)
    : stream_(s), saved_(nullptr)
  {
    saved_.copyfmt(s);
  }
  IosSaver (const IosSaver& ios)
    : stream_(ios.stream_), saved_(nullptr)
  {
    saved_.copyfmt(ios.saved_);
  }
  IosSaver operator= (const IosSaver&) = delete;
  ~IosSaver () { stream_.copyfmt(saved_); }
};

// Convenience factory: auto saver = save_ios(std::cout);
template <typename Stream> inline IosSaver<Stream> save_ios (Stream& s) {
  return IosSaver<Stream>(s);
}

// "Throw if null": return p unchanged, or throw std::runtime_error with
// message "Null pointer: <msg>" if p is null.
template <typename T>
inline T* tin (T* const p, const char* const msg="") {
  if (p) return p;
  throw std::runtime_error(std::string(std::string("Null pointer: ") + msg));
}

// True if a equals b1, equals b2 (when b2 is given), or equals "-" + b1.
inline bool
eq (const std::string& a, const char* const b1, const char* const b2 = 0) {
  const std::string first(b1);
  if (a == first) return true;
  if (b2 && a == std::string(b2)) return true;
  return a == std::string("-") + first;
}
"slmm_gll.hpp" +#include "slmm_io.hpp" +#include "slmm_time_int.hpp" +#include "slmm_gallery.hpp" +#include "slmm_debug.hpp" +using namespace slmm; + +// ----------------------------------------------------------------------------- +// NLA stuff taken from cflexp1 tr_gll. All of this needs to be rewritten for +// Kokkos. My plan is to get the program running end to end correctly, and then +// I'll go back and transition things to Kokkos and to running on the GPU. + +template class Array { + T* p_; + std::size_t n_, cap_; +public: + Array () { init(); } + Array(std::size_t n); + Array(std::size_t n, const T& init); + ~Array () { clear(); } + // Initialize the object with the assumption that all variables are uninit'ed + // prior to calling. + void init(); + void clear(); + // optclear means optionally clear. The function has the semantics of + // clearing, but it may not actually release the memory. + void optclear_and_resize(std::size_t n); + // _ft indicates first touch. + void optclear_and_resize_ft(std::size_t n); + void optclear_and_resize(std::size_t n, const T& i); + void optclear_and_reserve(std::size_t n); + void optclear_and_reserve_ft(std::size_t n); + T& operator[] (std::size_t i) { return p_[i]; } + const T& operator[] (std::size_t i) const { return p_[i]; } + T& back () { return p_[n_-1]; } + const T& back () const { return p_[n_-1]; } + std::size_t size () const { return n_; } + bool empty () const { return size() == 0; } + T* data () const { return p_; } + // This does not realloc; reserve must provide the necessary memory. It does + // not throw, either. It asserts. + void unsafe_push_back(const T& e); + T* begin () { return p_; } + T* end () { return p_ + n_; } + void set (const T& v) { for (std::size_t i = 0; i < n_; ++i) p_[i] = v; } +}; + +// All indices and sizes are relative to blocks except m() and n(). +// Whether each block is row- or col-major is up to the caller. +// Each row's cols must be sorted. 
// Block compressed-row sparse matrix. All indices and sizes are relative to
// blocks except m() and n(), which are the row and column counts of a single
// dense block. Whether each block is row- or col-major is up to the caller.
// Each row's cols must be sorted (block() uses a binary search).
//
// NOTE(review): the template parameter list was not visible in the source;
// the defaults below presume Real == double and Int == int -- confirm.
template <typename ScalarT = double, typename SizeT = int, typename IntT = int>
class BlockMatrix {
public:
  typedef ScalarT Scalar;
  typedef SizeT Size;
  typedef IntT Int;

  typedef BlockMatrix<Scalar, Size, Int> Me;

  // Don't need N, really, but it's handy for assertions/debugging.
  Int M_, N_, m_, n_;
  std::shared_ptr<Size> rowptr_p_;
  std::shared_ptr<Int> colidx_p_;
  std::shared_ptr<Scalar> d_p_;
  Size* rowptr_;
  Int* colidx_;
  Scalar* d_;

public:
  // Empty matrix. All dimensions, including N_, are initialized.
  BlockMatrix ()
    : M_(0), N_(0), m_(0), n_(0),
      rowptr_(nullptr), colidx_(nullptr), d_(nullptr)
  {}

  BlockMatrix (const Int M, const Int N, const Int m, const Int n,
               const Size* rowptr, const Int* colidx) {
    init(M, N, m, n, rowptr, colidx);
  }

  // Set up an M x N block matrix with m x n blocks. rowptr (length M+1) and
  // colidx (length rowptr[M]) are copied. Block values are allocated but not
  // initialized; call zero() if needed.
  void init (const Int M, const Int N, const Int m, const Int n,
             const Size* rowptr, const Int* colidx) {
    M_ = M; N_ = N; m_ = m; n_ = n;
    rowptr_p_ = std::shared_ptr<Size>(new Size[M_ + 1],
                                      std::default_delete<Size[]>());
    rowptr_ = rowptr_p_.get();
    std::copy(rowptr, rowptr + (M_ + 1), rowptr_);
    const Size nblocks = rowptr_[M_];
    colidx_p_ = std::shared_ptr<Int>(new Int[nblocks],
                                     std::default_delete<Int[]>());
    colidx_ = colidx_p_.get();
    std::copy(colidx, colidx + nblocks, colidx_);
    d_p_ = std::shared_ptr<Scalar>(new Scalar[nblocks*m_*n_],
                                   std::default_delete<Scalar[]>());
    d_ = d_p_.get();
  }

  const Size* rowptr () const { return rowptr_; }
  const Int* colidx () const { return colidx_; }

  const Int M () const { return M_; }
  const Int N () const { return N_; }
  const Int m () const { return m_; }
  const Int n () const { return n_; }

  // Pointer to the first block stored in block row br.
  const Scalar* blockrow (const Int br) const {
    assert(br < M_);
    return d_ + rowptr_[br]*m_*n_;
  }
  Scalar* blockrow (const Int br) {
    return const_cast<Scalar*>(const_cast<const Me*>(this)->blockrow(br));
  }

  // Pointer to block (br, bc), or nullptr if that block is not stored.
  const Scalar* block (const Int br, const Int bc) const {
    assert(br < M_);
    assert(bc < N_);
    const Int* const beg = colidx_ + rowptr_[br];
    const Int* const end = colidx_ + rowptr_[br+1];
    const Int* const idx = std::lower_bound(beg, end, bc);
    // lower_bound returns the first entry >= bc, so an exact match must be
    // checked; otherwise a neighboring block would be returned for an absent
    // column.
    if (idx == end || *idx != bc) return nullptr;
    const Int i = static_cast<Int>(idx - colidx_);
    return d_ + i*m_*n_;
  }
  Scalar* block (const Int br, const Int bc) {
    return const_cast<Scalar*>(const_cast<const Me*>(this)->block(br, bc));
  }

  // Set every stored value to 0. No-op on an uninitialized matrix.
  void zero () {
    if ( ! rowptr_) return;
    for (Size i = 0; i < rowptr_[M_]*m_*n_; ++i) d_[i] = 0;
  }

  static void test();
};
// Write `init` to p[0], then every `stride`-th element, then p[n-1], so that
// each memory page of the array is written once ("first touch").
template <typename T> inline void touch (T* const p, const size_t n,
                                         const T& init = T()) {
  // 1 KB should be a safe lower bound on page size. Touch enough to touch every
  // page; I don't think there's any need to touch more memory than that.
  // If sizeof(T) > 1024 the quotient is 0; step by at least one element so the
  // loop terminates.
  const size_t per_kb = 1024 / sizeof(T);
  const size_t stride = per_kb ? per_kb : 1;
  for (size_t i = 0; i < n; i += stride)
    p[i] = init;
  // Make sure the last part is touched.
  if (n) p[n-1] = init;
}

// Allocate an array of n T's with new[]; returns null for n == 0. If
// first_touch, the array is touched with T().
template <typename T> inline T*
allocn (const size_t n, const bool first_touch = false) {
  if ( ! n) return 0;
  T* p = new T[n];
  if (first_touch) touch(p, n);
  return p;
}

// delete[] p and null the caller's pointer.
template <typename T> inline void deln (T*& p) {
  if (p) delete[] p;
  p = 0;
}

// delete[] p; the caller's pointer is left unchanged.
template <typename T> inline void deln_const (const T* p) {
  if (p) delete[] p;
}

// delete p (single object) and null the caller's pointer.
template <typename T> inline void del (T*& p) {
  if (p) delete p;
  p = 0;
}
a.colidx(); + for (Int r = 0, ctr = 1; r < a.M(); ++r) { + Scalar* d = a.blockrow(r); + for (Int j = 0; j < rowptr[r+1] - rowptr[r]; ++j, ++ctr) { + for (Int i = 0; i < a.m()*a.n(); ++i) + d[i] = ctr; + d += a.m()*a.n(); + } + } + + for (Int r = 0, ctr = 1; r < M; ++r) + for (Int j = rowptr[r]; j < rowptr[r+1]; ++j, ++ctr) { + Scalar const* const d = a.block(r, colidx[j]); + assert(d); + for (Int i = 0; i < m*n; ++i) + assert(d[i] == ctr); + } + } +} + +extern "C" { + void dgemm_(const char* transa, const char* transb, const int* m, + const int* n, const int* k, const double* alpha, const double* a, + const int* lda, const double* b, const int* ldb, + const double* beta, double* c, const int* ldc); + void dpotrf_(const char* uplo, const int* n, double* a, const int* lda, + int* info); + void dpotrs_(const char* uplo, const int* n, const int* nrhs, const double* a, + const int* lda, double* b, const int* ldb, int* info); +} + +inline void dgemm ( + char transa, char transb, int m, int nrhs, int n, double alpha, + const double* a, int lda, const double* b, int ldb, double beta, + const double* c, int ldc) +{ + dgemm_(&transa, &transb, &m, &nrhs, &n, &alpha, const_cast(a), &lda, + const_cast(b), &ldb, &beta, const_cast(c), &ldc); +} + +void FullMassMatrix::init (const int nelem, const int np) { + np_ = np; + Array rowptr(nelem + 1), colidx(nelem); + for (int i = 0; i < nelem; ++i) { + rowptr[i] = i; + colidx[i] = i; + } + rowptr[nelem] = nelem; + m_.init(nelem, nelem, np2(), np2(), rowptr.data(), colidx.data()); + m_.zero(); + assert(m_.m() == np2() && m_.n() == np2()); + assert(m_.M() == m_.N() && m_.M() == nelem); + assert(m_.blockrow(0) + np4() == m_.blockrow(1)); +} + +const double* FullMassMatrix::block (const int i) const { + assert(m_.blockrow(i) - m_.blockrow(0) == i*np4()); + return m_.blockrow(i); +} +double* FullMassMatrix::block (const int i) { + return const_cast(const_cast(m_).blockrow(i)); +} + +void FullMassMatrix::factor () { + const int n = np2(); +# 
pragma omp parallel for + for (int i = 0; i < m_.M(); ++i) { + double* const d = block(i); + const char uplo = 'L'; + int info; + dpotrf_(&uplo, &n, d, &n, &info); + if (info != 0) { + fprintf(stderr, "M() %d i %d info %d\n", m_.M(), i, info); + fprintf(stderr, "a = ["); + for (int c = 0; c < n; ++c) { + for (int r = 0; r < n; ++r) + fprintf(stderr, " %1.15e", d[n*c + r]); + fprintf(stderr, ";"); + } + fprintf(stderr, "];\n"); + } + assert(info == 0); + } +} + +void FullMassMatrix:: +solve (const int elem, double* const bx, const int nrhs, const int ldbx) const { + const int n = np2(); + const double* const d = block(elem); + const char uplo = 'L'; + int info; + dpotrs_(&uplo, &n, &nrhs, const_cast(d), &n, bx, &ldbx, &info); + assert(info == 0); +} + +void RemapData::apply_T (const double* x, const int ldx, double* y, + const int ldy, const int nrhs) const { + const MT::Scalar* const d = T_.blockrow(0); + const MT::Size* const rowptr = T_.rowptr(); + const MT::Int* const colidx = T_.colidx(); +# pragma omp parallel + { + const MT::Int n = T_.N()*T_.n(); +# pragma omp for + for (MT::Int i = 0; i < n; ++i) + y[i] = 0; +# pragma omp for + for (MT::Size br = 0; br < T_.M(); ++br) + for (MT::Int j = rowptr[br]; j < rowptr[br+1]; ++j) { + const MT::Int bc = colidx[j]; + const MT::Scalar* const b = d + j*T_.m()*T_.n(); + dgemm('t', 'n', T_.m(), nrhs, T_.n(), 1, b, T_.m(), x + bc*T_.n(), ldx, + 1, y + br*T_.m(), ldy); + } + } +} + +void RemapData::apply_T_transp (const double* x, const int ldx, double* y, + const int ldy, const int nrhs) const { + const MT::Scalar* const d = T_.blockrow(0); + const MT::Size* const rowptr = T_.rowptr(); + const MT::Int* const colidx = T_.colidx(); + for (MT::Int i = 0, n = T_.M()*T_.m(); i < n; ++i) + y[i] = 0; + for (MT::Size br = 0; br < T_.M(); ++br) + for (MT::Int j = rowptr[br]; j < rowptr[br+1]; ++j) { + const MT::Int bc = colidx[j]; + const MT::Scalar* const b = d + j*T_.m()*T_.n(); + dgemm('n', 'n', T_.m(), nrhs, T_.n(), 1, b, 
T_.m(), x + br*T_.m(), ldx, 1, + y + bc*T_.n(), ldy); + } +} + +void RemapData::solve_M_full (double* bx, const int nrhs, + const int ldxb) const { +# pragma omp parallel for + for (MT::Int br = 0; br < T_.M(); ++br) + fmm_.solve(br, bx + br*fmm_.np2(), nrhs, ldxb); +} + +void RemapData::apply_R_full (const double* x, const int ldx, double* y, + const int ldy, const int nrhs) const { + const MT::Int n = T_nrows(); + apply_T(x, n, y, n, 1); + solve_M_full(y, 1, n); +} + +static void report (const std::string label, const Real* const x_t, + const Real* const x, const Int n) { + Real me = 0, den = 0; + for (Int i = 0; i < n; ++i) { + me = std::max(me, std::abs(x[i] - x_t[i])); + den = std::max(den, std::abs(x_t[i])); + } + printf("> RemapData %21s: %1.3e\n", label.c_str(), me/den); +} + +void RemapData::check (const Real* Js, const Real* Jt) const { + const int n = T_nrows(); + // This routine assumes T is nxn. + Array e(n), x(n), y(n); + e.set(1); + + memcpy(x.data(), Jt, n*sizeof(Real)); + solve_M_full(x.data(), 1, n); + report("M_full \\ Jt = e", e.data(), x.data(), n); + + apply_T_transp(e.data(), n, x.data(), n, 1); + report("e' T = Js'", Js, x.data(), n); + + apply_T(e.data(), n, x.data(), n, 1); + report("T e = Jt", Jt, x.data(), n); + + apply_R_full(e.data(), n, x.data(), n, 1); + report("[ct] R_full e = e", e.data(), x.data(), n); + + memcpy(x.data(), Jt, n*sizeof(Real)); + solve_M_full(x.data(), 1, n); + apply_T_transp(x.data(), n, y.data(), n, 1); + report("[cv] Jt' R_full = Js'", Js, y.data(), n); +} + +void RemapData::compare_MT () const { + Real diag_num = 0, diag_den = 0; + const auto& M = fmm_.get_M(); + const auto& T = T_; + assert(M.M() == T.M()); + assert(M.m() == T.m()); + for (Int br = 0; br < T.M(); ++br) { + const auto Mb = M.block(br, br); + const auto Tb = T.block(br, br); + for (Int k = 0; k < square(M.m()); ++k) { + diag_num += square(Tb[k] - Mb[k]); + diag_den += square(Mb[k]); + } + } + printf("> rd(M,T) %1.3e\n", 
std::sqrt(diag_num/diag_den)); +} + +// ----------------------------------------------------------------------------- +// fwd = forward: The mesh at t_{n-1} is the departure mesh and is integrated +// forward in time. It is the source mesh. +// bwd = backward: The mesh at t_n is the departure mesh and is integrated +// backward in time. It is the target mesh. +// R = M \ T. M is the mass matrix. T is the mixed mass matrix mapping source +// to target. + +// Some debug and code stuff. +namespace { +class Debug { + int index_; + std::string filename_; + bool on_; + +public: + Debug () + : index_(1), filename_("dbgout.m"), on_(true) + { +#ifdef SLMM_DEBUG + FILE* fid = fopen(filename_.c_str(), "w"); + fclose(fid); +#endif + } + + void advance () { ++index_; } + + void set_on (const bool set) { on_ = set; } + + template + void write_p (const std::string& name, const CV3s& p) { +#ifdef SLMM_DEBUG + if ( ! on_) return; + FILE* fid = fopen(filename_.c_str(), "a"); + fprintf(fid, "%s{%d} = [", name.c_str(), index_); + for (Int ip = 0; ip < nslices(p); ++ip) + fprintf(fid, " %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); + fprintf(fid, "].';\n"); + fclose(fid); +#endif + } + + template + void write_c2n (const std::string& name, const CIs& e) { +#ifdef SLMM_DEBUG + if ( ! on_) return; + FILE* fid = fopen(filename_.c_str(), "a"); + fprintf(fid, "%s{%d} = [", name.c_str(), index_); + for (Int ie = 0; ie < nslices(e); ++ie) { + for (Int k = 0; k < szslice(e); ++k) + fprintf(fid, " %d", e(ie,k)+1); + fprintf(fid, ";"); + } + fprintf(fid, "].';\n"); + fclose(fid); +#endif + } + + void write (const std::string& name, const BlockMatrix& m) { +#ifdef SLMM_DEBUG + if ( ! 
on_) return; + FILE* fid = fopen(filename_.c_str(), "a"); + fprintf(fid, "tmp = ["); + const Size* rowptr = m.rowptr(); + const Int* colidx = m.colidx(); + for (Int R = 0; R < m.M(); ++R) + for (Int J = rowptr[R]; J < rowptr[R+1]; ++J) { + const Int C = colidx[J]; + const Real* const block = m.block(R, C); + for (Int r = 0, k = 0; r < m.m(); ++r) + for (Int c = 0; c < m.n(); ++c, ++k) + fprintf(fid, "%d %d %1.15e\n", m.m()*R + r + 1, + m.n()*C + c + 1, block[k]); + } + fprintf(fid, "];\n"); + fprintf(fid, "%s{%d} = sparse(tmp(:,1),tmp(:,2),tmp(:,3),%d,%d);\n", + name.c_str(), index_, m.M()*m.m(), m.N()*m.n()); + fclose(fid); +#endif + } + + void write (const std::string& name, const Real* const a, const Int n) { +#ifdef SLMM_DEBUG + if ( ! on_) return; + FILE* fid = fopen(filename_.c_str(), "a"); + fprintf(fid, "%s{%d} = [", name.c_str(), index_); + for (Int i = 0; i < n; ++i) + fprintf(fid, " %1.15e", a[i]); + fprintf(fid, "].';\n"); + fclose(fid); +#endif + } +}; +static Debug gdbg; + +class Timer { +public: + enum Op { ts_setup, ts, ts_integrate, ts_remap, ts_rest, ts_error, + ts_remap_T, ts_remap_node_jac, + ts_remap_T_geometry, ts_remap_T_crs, ts_remap_T_fill, + total, NTIMERS }; + static inline void init () { +#ifdef SLMM_TIME + for (int i = 0; i < NTIMERS; ++i) et_[i] = 0; +#endif + } + static inline void start (const Op op) { +#ifdef SLMM_TIME + gettimeofday(&t_start_[op], 0); +#endif + } + static inline void stop (const Op op) { +#ifdef SLMM_TIME + timeval t2; + gettimeofday(&t2, 0); + const timeval& t1 = t_start_[op]; + static const double us = 1.0e6; + et_[op] += (t2.tv_sec*us + t2.tv_usec - t1.tv_sec*us - t1.tv_usec)/us; +#endif + } +# define tpr(op) do { \ + printf("%-20s %10.3e %10.1f\n", #op, et_[op], 100*et_[op]/tot); \ + } while (0) + static void print () { +#ifdef SLMM_TIME + const double tot = et_[total]; + tpr(ts_setup); tpr(ts); tpr(ts_integrate); tpr(ts_remap); + tpr(ts_remap_T); tpr(ts_remap_T_geometry); tpr(ts_remap_T_crs); + 
tpr(ts_remap_T_fill); tpr(ts_remap_node_jac); tpr(ts_rest); + tpr(ts_error); + printf("%-20s %10.3e %10.1f\n", "total", et_[total], 100.0); +#endif + } +#undef tpr +private: +#ifdef SLMM_TIME + static timeval t_start_[NTIMERS]; + static double et_[NTIMERS]; +#endif +}; +#ifdef SLMM_TIME +timeval Timer::t_start_[Timer::NTIMERS]; +double Timer::et_[Timer::NTIMERS]; +#endif +} // anon namespace + +static constexpr Int max_nvert = 8; +static constexpr Int max_hits = 25; // Covers at least a 2-halo. + +class MeshIntegrator { +protected: + std::vector ll_; +public: + MeshIntegrator (const Int nnodes) + : ll_(2*nnodes) + {} + virtual ~MeshIntegrator () {} + std::vector& get_ll () { return ll_; } + // Must be called from inside ||{}. + virtual void integrate(const Real ts, const Real tf, Vec3s::HostMirror& p) =0; +}; + +template +class MeshIntegratorWithOdeFn : public MeshIntegrator { + std::vector ws_; + std::vector initial_step_; + bool use_xyz_form_; + +public: + MeshIntegratorWithOdeFn (const Int nnodes, const bool use_xyz_form = false) + : MeshIntegrator(nnodes), initial_step_(nnodes, 1e-3), + use_xyz_form_(use_xyz_form) + {} + + virtual void integrate (const Real ts, const Real tf, Vec3s::HostMirror& p) { + const Int nn = nslices(p); + assert(2*nn == static_cast(ll_.size())); + ws_.resize(omp_get_max_threads()); +# pragma omp parallel for schedule(static, 4) + for (Int i = 0; i < nn; ++i) { + const int tid = omp_get_thread_num(); + + // Our primary interest in these numerical experiments is order of + // accuracy when the flow field is exact. Hence here we use extremely + // tight error tolerances. + timeint::Options opts; + opts.set_abs_tol(std::numeric_limits::epsilon()); + opts.set_rel_tol(1e2*std::numeric_limits::epsilon()); + opts.set_initial_step(initial_step_[i]); + + timeint::Info info; + OdeFn fun; + fun.set_xyz_form(use_xyz_form_); + if ( ! 
use_xyz_form_) { + Real lli[] = {ll_[2*i], ll_[2*i+1]}; + timeint::ark45(opts, fun, lli, 2, ts, tf, ws_[tid], &info); + auto n = slice(p, i); + ll2xyz(lli[0], lli[1], n[0], n[1], n[2]); + } else { + Real u[3]; + ll2xyz(ll_[2*i], ll_[2*i+1], u[0], u[1], u[2]); + timeint::ark45(opts, fun, u, 3, ts, tf, ws_[tid], &info); + geometry::normalize(u); + auto n = slice(p, i); + for (Int j = 0; j < 3; ++j) n[j] = u[j]; + } + initial_step_[i] = info.good_initial_step; + } + } +}; + +class MeshRotator : public MeshIntegrator { + Vec3s::HostMirror p_; + Real axis_[3]; + +public: + MeshRotator (const ConstVec3s::HostMirror& p) + : MeshIntegrator(nslices(p)) + { + axis_[0] = 0.2; axis_[1] = 0.7; axis_[2] = 1; + geometry::normalize(axis_); + ko::resize(p_, nslices(p), szslice(p)); + ko::deep_copy(p_, p); + } + + virtual void integrate (const Real ts, const Real tf, Vec3s::HostMirror& p) { + const Int nn = nslices(p); + assert(2*nn == static_cast(ll_.size())); + const Real + period = day2sec(12), + a = 2*M_PI*(tf - ts)/period; + Real r[9]; + form_rotation(axis_, a, r); +# pragma omp parallel for + for (Int i = 0; i < nn; ++i) { + auto n = slice(p_, i); + const Real x = n[0], y = n[1], z = n[2]; + n = slice(p, i); + n[0] = r[0]*x + r[1]*y + r[2]*z; + n[1] = r[3]*x + r[4]*y + r[5]*z; + n[2] = r[6]*x + r[7]*y + r[8]*z; + } + } +}; + +struct MeshIntegratorFactory : public gallery::WindFieldType { + static std::shared_ptr + create (const std::string& ode, const bool use_xyz_form, + const ConstVec3s::HostMirror& p) + { return create(from_string(ode), use_xyz_form, p); } + + static std::shared_ptr + create (const Enum& ode, const bool use_xyz_form, + const ConstVec3s::HostMirror& p) { + const Int nnodes = nslices(p); + switch (ode) { + case Dcmip1d3ll: + return std::make_shared >(nnodes, use_xyz_form); + case NonDivergentWindField: + return std::make_shared >(nnodes, use_xyz_form); + case DivergentWindField: + return std::make_shared >(nnodes, use_xyz_form); + case 
NonDivergentWindFieldHack: + return std::make_shared >(nnodes, use_xyz_form); + case Rotate: + return std::make_shared(p); + default: + assert(0); + } + } +}; + +struct IntegrateOptions { + enum Enum { fwd, bwd, test_looa }; + Enum stepping; + bool d2c; // Each step, and in error, convert dgll <-> cgll. +}; + +struct Input { + std::string output_fn, ode, initial_condition, program_name; + Real T; + Int ne, nsteps, write_every, monotone_type, np, tq_order; + bool debug, write_matlab; + bool xyz_form; // Integrate in (x,y,z) space instead of (lat,lon). + IntegrateOptions integrate_options; + + Input(Int argc, char** argv); + void print(std::ostream& os) const; +}; + +// _s is start and _e is end. +struct Output { + Real + l2_err, max_err, mass_s, mass_e, min_s, max_s, min_e, max_e, + et_timestep, + mass_gll_s, mass_gll_e; +}; + +struct RemapOptions { + Int np, monotone_type; + + RemapOptions () + : np(4), monotone_type(0) + {} +}; + +struct Mesh { + Int np, tq_order; + Vec3s::HostMirror geo_p, geo_nml, cgll_p; + Idxs::HostMirror geo_c2n, geo_c2nml, cgll_c2n, dgll_c2n, cgll_io_c2n; + IdxArray::HostMirror dglln2cglln; +}; + +static void copy_vertices ( + const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& c2n, + const Int ci, Real* ps) +{ + const auto cell = slice(c2n, ci); + for (Int i = 0; i < szslice(c2n); ++i) { + const auto n = slice(p, cell[i]); + for (Int k = 0; k < 3; ++k) ps[k] = n[k]; + ps += 3; + } +} + +static void calc_node_jacobians ( + const Mesh& m, const ConstVec3s::HostMirror& p, RealArray::HostMirror& J_dg) +{ + const Int np2 = square(m.np); + ko::resize(J_dg, nslices(m.geo_c2n)*np2); + GLL gll; + const Real* gll_x, * gll_wt; + gll.get_coef(m.np, gll_x, gll_wt); +# pragma omp parallel for + for (Int ci = 0; ci < nslices(m.geo_c2n); ++ci) { + const auto cell = slice(m.geo_c2n, ci); + for (Int j = 0, basis_idx = 0; j < m.np; ++j) { + const Real b = 0.5*(gll_x[j] + 1); + for (Int i = 0; i < m.np; ++i, ++basis_idx) { + const Real a = 
0.5*(gll_x[i] + 1); + Real J[9]; + siqk::sqr::impl::calc_Jacobian(p, cell, a, b, J); + geometry::cross(J, J+3, J+6); + const Real jac = std::sqrt(geometry::norm2(J+6)); + J_dg(ci*np2 + basis_idx) = jac; + } + } + } +} + +static void calc_basis_function_integrals ( + const Int np, const Int tq_order, const ConstVec3s::HostMirror& p, + const ConstIdxs::HostMirror& c2n, RealArray::HostMirror& dgbfi) +{ + const Int np2 = square(np); + ko::resize(dgbfi, nslices(c2n)*np2); + ko::deep_copy(dgbfi, 0); + siqk::TriangleQuadrature tq; + siqk::RawConstVec3s tq_bary; + siqk::RawConstArray tq_w; + tq.get_coef(tq_order, tq_bary, tq_w); + const Int nq = len(tq_w); + GLL gll; +# pragma omp parallel for + for (Int ci = 0; ci < nslices(c2n); ++ci) { // cell + Real ps[12]; + copy_vertices(p, c2n, ci, ps); + const auto cell = slice(c2n, ci); + for (Int k = 1; k <= 2; ++k) // 2 triangles per quad cell + for (Int q = 0; q < nq; ++q) { // quad point + Real sphere_coord[3]; + const Real jac = geometry::calc_tri_jacobian( + ps, ps+3*k, ps+3*(k+1), slice(tq_bary, q), sphere_coord); + Real gj[GLL::max_np], gi[GLL::max_np]; { + Real a, b; + siqk::sqr::calc_sphere_to_ref(p, cell, sphere_coord, a, b); + gll.eval(np, b, gj); + gll.eval(np, a, gi); + } + const Real d0 = 0.5 * tq_w[q] * jac; + for (Int j = 0, basis_idx = 0; j < np; ++j) { // along ref y dir + const Real d1 = d0 * gj[j]; + for (Int i = 0; i < np; ++i, ++basis_idx) // along ref x dir + dgbfi(ci*np2 + basis_idx) += d1 * gi[i]; + } + } + } +} + +static void calc_basis_function_integrals ( + const Mesh& m, const ConstVec3s::HostMirror& p, RealArray::HostMirror& dgbfi, + RealArray::HostMirror& cgbfi) +{ + calc_basis_function_integrals(m.np, m.tq_order, p, m.geo_c2n, dgbfi); + ko::resize(cgbfi, nslices(m.cgll_p)); + ko::deep_copy(cgbfi, 0); + for (Int i = 0; i < len(m.dglln2cglln); ++i) + cgbfi(m.dglln2cglln(i)) += dgbfi(i); +} + +static void calc_gll_basis_function_integrals ( + const Mesh& m, const ConstVec3s::HostMirror& p, 
RealArray::HostMirror& J_dg) +{ + const Int np2 = square(m.np); + ko::resize(J_dg, nslices(m.geo_c2n)*np2); + GLL gll; + const Real* gll_x, * gll_wt; + gll.get_coef(m.np, gll_x, gll_wt); +# pragma omp parallel for + for (Int ci = 0; ci < nslices(m.geo_c2n); ++ci) { + const auto cell = slice(m.geo_c2n, ci); + for (Int j = 0, basis_idx = 0; j < m.np; ++j) { + const Real b = 0.5*(gll_x[j] + 1); + for (Int i = 0; i < m.np; ++i, ++basis_idx) { + const Real a = 0.5*(gll_x[i] + 1); + Real J[9]; + siqk::sqr::impl::calc_Jacobian(p, cell, a, b, J); + geometry::cross(J, J+3, J+6); + const Real jac = std::sqrt(geometry::norm2(J+6)); + // Product of weights is the integral of the 2D basis function on the + // ref square. Multiply by Jacobian of the map bilinear quad -> + // sphere. Since this is GLL quadrature, there's exactly one quadrature + // point. + J_dg(ci*np2 + basis_idx) = 0.25 * jac * gll_wt[i] * gll_wt[j]; + } + } + } +} + +static void map_cgll2dgll ( + const IdxArray::HostMirror& dglln2cglln, const Real* const cg_data, + Real* const dg_data) +{ +# pragma omp parallel for + for (Int i = 0; i < len(dglln2cglln); ++i) + dg_data[i] = cg_data[dglln2cglln[i]]; +} + +static void map_dgll2cgll ( + const IdxArray::HostMirror& dglln2cglln, + const ConstRealArray::HostMirror& dgbfi, + const ConstRealArray::HostMirror& cgbfi, + const Real* const dg_data, Real* const cg_data, const Int cnn) +{ + for (Int i = 0; i < cnn; ++i) cg_data[i] = 0; + for (Int i = 0; i < len(dglln2cglln); ++i) { + const Int i_cgll = dglln2cglln(i); + cg_data[i_cgll] += (dgbfi(i) / cgbfi(i_cgll)) * dg_data[i]; + } +} + +static void calc_M_fwd (const Mesh& m, RemapData& rd) { + const auto& p = m.geo_p; + const auto& c2n = m.geo_c2n; + auto& fmm = rd.fmm(); + fmm.init(nslices(c2n), m.np); + const Int np = m.np, np2 = square(np); + siqk::TriangleQuadrature tq; + siqk::RawConstVec3s tq_bary; + siqk::RawConstArray tq_w; + tq.get_coef(m.tq_order, tq_bary, tq_w); + const Int nq = len(tq_w); + GLL gll; +# pragma 
omp parallel for + for (Int ci = 0; ci < nslices(c2n); ++ci) { + Real ps[12]; + copy_vertices(p, c2n, ci, ps); + const auto cell = slice(c2n, ci); + Real* block = fmm.block(ci); + for (Int k = 1; k <= 2; ++k) + for (Int q = 0; q < nq; ++q) { + Real sphere_coord[3]; + const Real jac = geometry::calc_tri_jacobian( + ps, ps+3*k, ps+3*(k+1), slice(tq_bary, q), sphere_coord); + Real gj[GLL::max_np], gi[GLL::max_np]; { + Real a, b; + siqk::sqr::calc_sphere_to_ref(p, cell, sphere_coord, a, b); + gll.eval(np, b, gj); + gll.eval(np, a, gi); + } + const Real d0 = 0.5 * tq_w[q] * jac; + for (Int aj = 0, a_basis_idx = 0; aj < np; ++aj) { + const Real d1 = d0 * gj[aj]; + for (Int ai = 0; ai < np; ++ai, ++a_basis_idx) { + const Real d2 = d1 * gi[ai]; + for (Int bj = 0, b_basis_idx = 0; bj < np; ++bj) { + const Real d3 = d2 * gj[bj]; + for (Int bi = 0; bi < np; ++bi, ++b_basis_idx) { + if (b_basis_idx < a_basis_idx) continue; + const Real d = d3 * gi[bi]; + block[np2*a_basis_idx + b_basis_idx] += d; + if (a_basis_idx != b_basis_idx) + block[np2*b_basis_idx + a_basis_idx] += d; + } + } + } + } + } + } + gdbg.write("M", rd.fmm().get_M()); + //fmm.factor(); +} + +class CountIntersectionsFunctor { +protected: + const siqk::sh::Mesh& cm_; + const ConstVec3s::HostMirror p_; + const ConstIdxs::HostMirror e_; + Int hits_[max_hits]; + Int k_, nh_; + +public: + CountIntersectionsFunctor ( + const siqk::sh::Mesh& cm, const ConstVec3s::HostMirror& p, + const ConstIdxs::HostMirror& c2n) + : cm_(cm), p_(p), e_(c2n), nh_(0) + {} + + void reset (const Int clipped_ci) { + k_ = clipped_ci; + nh_ = 0; + } + + void operator() (const Int clip_ci) { + // Check whether we've clipped against this polygon before and there was a + // non-0 intersection. + for (Int i = 0; i < nh_; ++i) + if (hits_[i] == clip_ci) + return; + // We have not, so do the intersection. + Int no = 0; + { + // Area of all overlapping regions. + // In and out vertex lists. 
+ Real buf[9*max_nvert]; + siqk::RawVec3s + vi(buf, max_nvert, 3), + vo(buf + 3*max_nvert, max_nvert, 3), + wrk(buf + 6*max_nvert, max_nvert, 3); + Int ni; + ni = 0; + for (Int i = 0; i < szslice(e_); ++i) { + if (e_(k_,i) == -1) break; + geometry::copy(slice(vi, i), slice(p_, e_(k_,i))); + ++ni; + } + siqk::sh::clip_against_poly(cm_, clip_ci, vi, ni, vo, no, wrk); + } + if (no) { + // Non-0 intersection, so record. + if (nh_ == max_hits) Kokkos::abort("max_hits is too small."); + hits_[nh_++] = clip_ci; + } + } + + Int get_nhits () const { return nh_; } + const Int* get_hits () const { return hits_; } +}; + +static void calc_T_pattern_fwd ( + const Mesh& m, const ConstVec3s::HostMirror& depart_p, + const RemapData::Octree& ot, std::vector& rowptr, + std::vector& colidx) +{ + Timer::start(Timer::ts_remap_T_geometry); + const Int ncell = nslices(m.geo_c2n); + Idxs::HostMirror hits("hits", ncell, max_hits); + { + siqk::sh::Mesh cm; + cm.p = m.geo_p; cm.e = m.geo_c2n; cm.nml = m.geo_nml; cm.en = m.geo_c2nml; +# pragma omp parallel for schedule(static, 20) + for (Int ci = 0; ci < ncell; ++ci) { + Real bb[6]; + RemapData::Octree::calc_bb(depart_p, slice(m.geo_c2n, ci), + szslice(m.geo_c2n), bb); + CountIntersectionsFunctor cif(cm, depart_p, m.geo_c2n); + cif.reset(ci); + ot.apply(bb, cif); + const Int* ci_hits = cif.get_hits(); + const Int hin = cif.get_nhits(); + for (Int hi = 0; hi < hin; ++hi) + hits(ci, hi) = ci_hits[hi]; + if (hin < max_hits) + hits(ci, hin) = -1; + } + } + Timer::stop(Timer::ts_remap_T_geometry); Timer::start(Timer::ts_remap_T_crs); + // Need to form transpose of the matrix that is most naturally created by the + // above if using CRS format. + rowptr.resize(ncell + 1, 0); + for (Int ci = 0; ci < ncell; ++ci) + for (Int hi = 0; hi < max_hits; ++hi) { + if (hits(ci, hi) == -1) break; + ++rowptr[hits(ci, hi) + 1]; + } + // Cumsum. + for (Int ci = 1; ci <= ncell; ++ci) + rowptr[ci] += rowptr[ci-1]; + colidx.resize(rowptr[ncell]); + // Shift up 1. 
+ for (Int ci = ncell; ci > 0; --ci) + rowptr[ci] = rowptr[ci-1]; + for (Int ci = 0; ci < ncell; ++ci) + for (Int hi = 0; hi < max_hits; ++hi) { + const Int row = hits(ci, hi); + if (row == -1) break; + colidx[rowptr[row+1]] = ci; + ++rowptr[row+1]; + } +# pragma omp parallel for + for (Int ci = 0; ci < ncell; ++ci) + std::sort(colidx.data() + rowptr[ci], colidx.data() + rowptr[ci+1]); + Timer::stop(Timer::ts_remap_T_crs); +} + +static void fill_T_fwd (const Mesh& m, const ConstVec3s::HostMirror& depart_p, + RemapData::MT& T) { + const Int ncell = nslices(m.geo_c2n); + const Int np = m.np, np2 = square(np), np4 = square(np2); + siqk::TriangleQuadrature tq; + siqk::RawConstVec3s tq_bary; + siqk::RawConstArray tq_w; + tq.get_coef(m.tq_order, tq_bary, tq_w); + const Int nq = len(tq_w); + GLL gll; + const Size* rowptr = T.rowptr(); + const Int* colidx = T.colidx(); + siqk::sh::Mesh cm; + cm.p = m.geo_p; cm.e = m.geo_c2n; cm.nml = m.geo_nml; cm.en = m.geo_c2nml; +# pragma omp parallel for schedule(static, 1) + for (Int tci = 0; tci < ncell; ++tci) { + Real* block = T.blockrow(tci); + const auto tcell = slice(m.geo_c2n, tci); + for (Int cj = rowptr[tci]; cj < rowptr[tci+1]; ++cj) { + const Int sci = colidx[cj]; + const auto scell = slice(m.geo_c2n, sci); + Real buf[9*max_nvert]; + siqk::RawVec3s + vi(buf, max_nvert, 3), + vo(buf + 3*max_nvert, max_nvert, 3), + wrk(buf + 6*max_nvert, max_nvert, 3); + Int ni = 0, no; + for (Int i = 0; i < szslice(m.geo_c2n); ++i) { + if (scell[i] == -1) break; + geometry::copy(slice(vi, i), slice(depart_p, scell[i])); + ++ni; + } + siqk::sh::clip_against_poly(cm, tci, vi, ni, vo, no, wrk); + assert(no); + { + for (Int i = 0; i < np4; ++i) block[i] = 0; + for (Int ktri = 1; ktri < no-1; ++ktri) // triangles in vo + for (Int q = 0; q < nq; ++q) { // quad point + Real sphere_coord[3]; + const Real jac = geometry::calc_tri_jacobian( + slice(vo,0), slice(vo,ktri), slice(vo,ktri+1), slice(tq_bary, q), + sphere_coord); + Real tgj[GLL::max_np], 
tgi[GLL::max_np], + sgj[GLL::max_np], sgi[GLL::max_np]; + { + Real ta, tb, sa, sb; + siqk::sqr::calc_sphere_to_ref(m.geo_p, tcell, sphere_coord, + ta, tb); + siqk::sqr::calc_sphere_to_ref(depart_p, scell, sphere_coord, + sa, sb); + gll.eval(np, tb, tgj); + gll.eval(np, ta, tgi); + gll.eval(np, sb, sgj); + gll.eval(np, sa, sgi); + } + const Real d0 = 0.5 * tq_w[q] * jac; + for (Int tj = 0, t_basis_idx = 0; tj < np; ++tj) { + const Real d1 = d0 * tgj[tj]; + for (Int ti = 0; ti < np; ++ti, ++t_basis_idx) { + const Real d2 = d1 * tgi[ti]; + for (Int sj = 0, s_basis_idx = 0; sj < np; ++sj) { + const Real d3 = d2 * sgj[sj]; + for (Int si = 0; si < np; ++si, ++s_basis_idx) { + const Real d = d3 * sgi[si]; + block[np2*t_basis_idx + s_basis_idx] += d; + } + } + } + } + } + } + block += np4; + } + } +} + +static void calc_T_fwd (const Mesh& m, const Vec3s::HostMirror& depart_p, + RemapData& rd) +{ + { // Build T's sparse matrix nonzero pattern. + std::vector rowptr, colidx; + calc_T_pattern_fwd(m, depart_p, rd.octree(), rowptr, colidx); + const Int N = len(rowptr)-1, n = square(m.np); + rd.T().init(N, N, n, n, rowptr.data(), colidx.data()); + } + Timer::start(Timer::ts_remap_T_fill); + fill_T_fwd(m, depart_p, rd.T()); + Timer::stop(Timer::ts_remap_T_fill); +} + +// On input, src_tracer is rho*tracer. On output, it is just the updated +// tracer. Density is removed for output and error checking. +static void remap ( + RemapData& rd, const Mesh& m, const Vec3s::HostMirror& depart_p, + Real* const src_tracer, Real* const tgt_tracer, const Int ntracers, + Real* const src_density, Real* const tgt_density, + // If in_dgll, we're working in DGLL space the whole time; otherwise, we're + // doing CGLL -> DGLL -> remap -> CGLL. If in_dgll, wrk can be null; + // otherwise, it must have length >= 2 dnn. + const bool in_dgll, Real* const wrk) +{ + // For debugging and analysis, factor here. 
+ static bool first = true; if (first) { + //rd.compare_MT(); + rd.fmm().factor(); + first = false; + } + + Timer::start(Timer::ts_remap_T); + const Int dnn = len(m.dglln2cglln), cnn = nslices(m.cgll_p), + len = in_dgll ? dnn : cnn; + calc_T_fwd(m, depart_p, rd); + Timer::stop(Timer::ts_remap_T); Timer::start(Timer::ts_remap_node_jac); + RealArray::HostMirror Js; + calc_node_jacobians(m, depart_p, Js); + Timer::stop(Timer::ts_remap_node_jac); + + for (Int ti = 0; ti < ntracers; ++ti) { + Real* src, * tgt; + if (in_dgll) { + src = src_tracer + ti*len; + tgt = tgt_tracer + ti*len; + } else { + src = wrk; + tgt = wrk + dnn; + map_cgll2dgll(m.dglln2cglln, src_tracer + ti*len, src); + } + // Adjust density according to the flow. At this point, the tracer field has + // density in it. +# pragma omp parallel for + for (Int i = 0; i < dnn; ++i) { + const Real q = rd.Jt()[i]/Js[i]; + src[i] *= q; + } + // L2 project. + rd.apply_R_full(src, dnn, tgt, dnn, 1); + if ( ! in_dgll) + map_dgll2cgll(m.dglln2cglln, rd.dgbfi(), rd.cgbfi(), tgt, + tgt_tracer + ti*len, cnn); + } + + { + Real* src, * tgt; + if (in_dgll) { + src = src_density; + tgt = tgt_density; + } else { + src = wrk; + tgt = wrk + dnn; + map_cgll2dgll(m.dglln2cglln, src_density, src); + } +# pragma omp parallel for + for (Int i = 0; i < dnn; ++i) { + const Real q = rd.Jt()[i]/Js[i]; + src[i] *= q; + } + rd.apply_R_full(src, dnn, tgt, dnn, 1); + if ( ! in_dgll) + map_dgll2cgll(m.dglln2cglln, rd.dgbfi(), rd.cgbfi(), tgt, + tgt_density, cnn); + } + + // For output, remove density from tracer field. 
+ for (Int ti = 0; ti < ntracers; ++ti) { +# pragma omp parallel for + for (Int i = 0; i < len; ++i) + tgt_tracer[ti*len + i] /= tgt_density[i]; + } +} + +static void print_error ( + const Mesh& m, const ConstRealArray::HostMirror& J_gll, const bool in_dgll, + const Real* const fs, const Real* const ds, + const Real* const fe, const Real* const de, Output& out) +{ + Real l2_num = 0, l2_den = 0, max_num = 0, max_den = 0; + out.max_s = -1e300; out.min_s = 1e300; + out.max_e = -1e300; out.min_e = 1e300; + out.mass_s = 0; out.mass_e = 0; + out.mass_gll_s = 0; out.mass_gll_e = 0; + siqk::TriangleQuadrature tq; + siqk::RawConstVec3s tq_bary; + siqk::RawConstArray tq_w; + tq.get_coef(m.tq_order, tq_bary, tq_w); + const Int nq = len(tq_w); + GLL gll; + const auto& c2n = m.geo_c2n; + const auto& p = m.geo_p; + const Int np = m.np, np2 = square(np); + // GLL mass conservation. + for (Int ci = 0; ci < nslices(c2n); ++ci) + for (Int j = 0, basis_idx = 0; j < m.np; ++j) + for (Int i = 0; i < m.np; ++i, ++basis_idx) { + const Int k = ci*np2 + basis_idx; + const Real w = J_gll[k]; + const Int idx = in_dgll ? k : m.cgll_c2n(ci, basis_idx); + out.mass_gll_s += w * ds[idx]; + out.mass_gll_e += w * de[idx]; + } + // Mass conservation wrt quadrature approximation of exact integrals. 
+ for (Int ci = 0; ci < nslices(c2n); ++ci) { + const auto cell = slice(c2n, ci); + Real ps[12]; + copy_vertices(p, c2n, ci, ps); + for (Int k = 1; k <= 2; ++k) + for (Int q = 0; q < nq; ++q) { + Real sphere_coord[3]; + const Real jac = geometry::calc_tri_jacobian( + ps, ps+3*k, ps+3*(k+1), slice(tq_bary, q), sphere_coord); + Real gj[GLL::max_np], gi[GLL::max_np]; { + Real a, b; + siqk::sqr::calc_sphere_to_ref(p, cell, sphere_coord, a, b); + gll.eval(np, b, gj); + gll.eval(np, a, gi); + } + const Real d0 = 0.5 * tq_w[q] * jac; + for (Int j = 0, basis_idx = 0; j < np; ++j) { + const Real d1 = d0 * gj[j]; + for (Int i = 0; i < np; ++i, ++basis_idx) { + const Int k = ci*np2 + basis_idx; + const Int idx = in_dgll ? k : m.cgll_c2n(ci, basis_idx); + const Real w = d1 * gi[i]; + const Real e = fe[idx] - fs[idx]; + out.mass_s += w * ds[idx]; + out.mass_e += w * de[idx]; + l2_num += w * square(e); + l2_den += w * square(fs[idx]); + max_num = std::max(max_num, std::abs(e)); + max_den = std::max(max_den, std::abs(fs[idx])); + out.min_s = std::min(out.min_s, fs[idx]); + out.max_s = std::max(out.max_s, fs[idx]); + out.min_e = std::min(out.min_e, fe[idx]); + out.max_e = std::max(out.max_e, fe[idx]); + } + } + } + } + out.l2_err = std::sqrt(l2_num/l2_den); + out.max_err = max_num/max_den; + printf("> re l2 %9.3e max %9.3e\n", out.l2_err, out.max_err); + printf("> [cv] re %10.3e\n", reldif(out.mass_s, out.mass_e)); + printf("> [cv gll] re %10.3e\n", reldif(out.mass_gll_s, out.mass_gll_e)); + printf("> [mo] min %10.3e %10.3e [%10.3e] max %10.3e %10.3e [%10.3e]\n", + out.min_s, out.min_e, out.min_e - out.min_s, + out.max_s, out.max_e, out.max_e - out.max_s); +} + +static void print_one_liner (const Input& in, const Output& out) { + std::cout << "
      method " << in.integrate_options.stepping + << " ode " << in.ode << " ic " << in.initial_condition + << " T " << in.T << " np " << in.np << " ne " << in.ne + << " tq " << in.tq_order << " nsteps " << in.nsteps + << " mono " << in.monotone_type; + printf(" re l2 %9.3e max %9.3e", out.l2_err, out.max_err); + printf(" cv re %9.3e", reldif(out.mass_s, out.mass_e)); + printf(" cvgll re %9.3e", reldif(out.mass_gll_s, out.mass_gll_e)); + printf(" mo min %9.3e %9.3e %9.3e max %9.3e %9.3e %9.3e", + out.min_s, out.min_e, out.min_e - out.min_s, + out.max_s, out.max_e, out.max_e - out.max_s); + printf(" et ts %9.3e nthr %d", out.et_timestep, omp_get_max_threads()); + std::cout << " prog " << in.program_name; + std::cout << " xyz " << in.xyz_form; + std::cout << " d2c " << in.integrate_options.d2c; + std::cout << "\n"; +} + +static void init_mesh (const Int np, const Int tq_order, const Int ne, + Mesh& m) { + m.np = np; + m.tq_order = tq_order; + mesh::make_cubedsphere(m.geo_p, m.geo_c2n, ne); + mesh::make_cgll_from_geo(m.geo_p, m.geo_c2n, np, m.cgll_p, m.cgll_c2n); + mesh::make_dgll_from_cgll(m.cgll_p, m.cgll_c2n, m.dglln2cglln, m.dgll_c2n); + mesh::make_io_cgll_from_internal_cgll(m.cgll_p, m.cgll_c2n, m.cgll_io_c2n); + { + siqk::sh::Mesh sm; sm.p = m.geo_p; sm.e = m.geo_c2n; + siqk::test::fill_normals(sm); + m.geo_nml = sm.nml; m.geo_c2nml = sm.en; + } +} + +// A bit of complication in this routine is opts.d2c. The natural thing to do +// is to work in DGLL space the whole time, except possibly when writing to the +// netcdf file. However, we need to mimic the intended application: at each +// step, we get CGLL fields, convert to DGLL, L2 project, then convert back. If +// opts.d2c, mimic this behavior; if ! opts.d2c, stay in DGLL space the whole +// time except when writing to the file. We support both behaviors so we can +// analyze the impact of going back and forth on accuracy. 
+static void integrate ( + const Mesh& m, const std::shared_ptr& mi, + const RemapOptions& ro, const Real T, const Int nsteps, + gallery::InitialCondition::Shape ic, const std::string& out_fn, + const Int write_every, const IntegrateOptions opts, Output& out) +{ + Timer::start(Timer::ts_setup); + const Int dnn = len(m.dglln2cglln), cnn = nslices(m.cgll_p), + len = opts.d2c ? cnn : dnn; + + // Initialize I/O. + std::shared_ptr ncw; + if (write_every > 0) { + ncw = std::make_shared( + m.cgll_p, m.cgll_io_c2n, out_fn + ".g", ro.np, ro.monotone_type); + ncw->add_nodal_field("tracer"); + ncw->add_nodal_field("density"); + ncw->end_definition(); + } + + // Eulerian mesh remap data. + RealArray::HostMirror Jt_gll; + calc_gll_basis_function_integrals(m, m.geo_p, Jt_gll); + RemapData rd; + calc_M_fwd(m, rd); + rd.octree().init(m.geo_p, m.geo_c2n); + calc_node_jacobians(m, m.cgll_p, rd.Jt()); + calc_basis_function_integrals(m, m.geo_p, rd.dgbfi(), rd.cgbfi()); + + // Initialize data and workspace. + std::vector tracer[2], density[2]; + std::vector* tracer_p[2], * density_p[2]; + for (Int i = 0; i < 2; ++i) { + tracer[i].resize(len); + tracer_p[i] = &tracer[i]; + density[i].resize(len); + density_p[i] = &density[i]; + } + for (Int k = 0; k < 2; ++k) + for (Int i = 0; i < len; ++i) + (*density_p[k])[i] = 1; + std::vector wrk(opts.d2c ? 2*dnn : cnn); + // Record the initial and final states. + std::vector error_data(4*len); + + { + // Get the initial conditions. + std::vector lat(cnn), lon(cnn); + for (Int i = 0; i < cnn; ++i) { + const auto n = slice(m.cgll_p, i); + xyz2ll(n[0], n[1], n[2], lat[i], lon[i]); + } + Real* data = opts.d2c ? tracer_p[0]->data() : wrk.data(); + gallery::InitialCondition::init( + ic, nslices(m.cgll_p), lat.data(), lon.data(), data); + // Record the ICs. + if ( ! 
opts.d2c) + map_cgll2dgll(m.dglln2cglln, data, tracer_p[0]->data()); + memcpy(error_data.data(), tracer_p[0]->data(), len*sizeof(Real)); + if (ncw) { + ncw->advance_time_to(0); + ncw->write_field("tracer", data); + } + memcpy(error_data.data() + len, density_p[0]->data(), len*sizeof(Real)); + if (ncw) { + data = opts.d2c ? density_p[0]->data() : wrk.data(); + if ( ! opts.d2c) + map_dgll2cgll(m.dglln2cglln, rd.dgbfi(), rd.cgbfi(), + density_p[0]->data(), data, cnn); + ncw->write_field("density", data); + } + } + // Remap is done on density*tracer, but sometimes the tracer field doesn't + // have the density rho in it. + for (Int i = 0; i < len; ++i) + (*tracer_p[0])[i] *= (*density_p[0])[i]; + + // Time step. + Vec3s::HostMirror departure_p; + ko::resize(departure_p, nslices(m.geo_p), szslice(m.geo_p)); + const Real dt = T/nsteps; + const Int last_step = + opts.stepping == IntegrateOptions::test_looa ? 1 : nsteps - 1; + ProgressBar progress_bar("integrate", last_step+1, 10); + const auto step_t = siqk::tic(); + Timer::stop(Timer::ts_setup); Timer::start(Timer::ts); + for (Int step = 0; step <= last_step; ++step) { + Timer::start(Timer::ts_integrate); + const Real tf = step == last_step ? T : dt*(step + 1); + switch (opts.stepping) { + case IntegrateOptions::fwd: + // Integrate mesh forward in time. + mi->integrate(dt*step, tf, departure_p); + break; + case IntegrateOptions::bwd: + throw std::runtime_error("IntegrateOptions::bwd is not impl'ed."); + break; + case IntegrateOptions::test_looa: + switch (step) { + case 0: mi->integrate(dt*step, tf, departure_p); break; + case 1: mi->integrate(dt*step, dt*(step - 1), departure_p); break; + default: assert(0); break; + } + break; + } + Timer::stop(Timer::ts_integrate); Timer::start(Timer::ts_remap); + remap(rd, m, departure_p, tracer_p[0]->data(), tracer_p[1]->data(), 1, + density_p[0]->data(), density_p[1]->data(), ! 
opts.d2c, wrk.data()); + Timer::stop(Timer::ts_remap); Timer::start(Timer::ts_rest); + if (step == 0) { + // Analyze the remap operator R = M \ T. + RealArray::HostMirror dgbfi_s; + calc_basis_function_integrals(m.np, m.tq_order, departure_p, m.geo_c2n, + dgbfi_s); + printf("\n> triangle quadrature jacobians\n"); + rd.check(dgbfi_s.ptr_on_device(), rd.dgbfi().ptr_on_device()); + RealArray::HostMirror Js_gll; + calc_gll_basis_function_integrals(m, departure_p, Js_gll); + printf("> GLL jacobians\n"); + rd.check(Js_gll.ptr_on_device(), Jt_gll.ptr_on_device()); + } + gdbg.write("T", rd.T()); + gdbg.write_p("geo_p", m.geo_p); gdbg.write_c2n("geo_c2n", m.geo_c2n); + gdbg.write_p("departure_p", departure_p); + + // Netcdf I/O. + if (ncw && (step % write_every == 0 || step == last_step)) { + ncw->advance_time_to(tf); + if (opts.d2c) { + ncw->write_field("tracer", tracer_p[1]->data()); + ncw->write_field("density", density_p[1]->data()); + } else { + Real* const data = wrk.data(); + map_dgll2cgll(m.dglln2cglln, rd.dgbfi(), rd.cgbfi(), + tracer_p[1]->data(), data, cnn); + ncw->write_field("tracer", data); + map_dgll2cgll(m.dglln2cglln, rd.dgbfi(), rd.cgbfi(), + density_p[1]->data(), data, cnn); + ncw->write_field("density", data); + } + } + // Record data for error analysis. + if (step == last_step) { + memcpy(error_data.data() + 2*len, tracer_p[1]->data(), len*sizeof(Real)); + memcpy(error_data.data() + 3*len, density_p[1]->data(), len*sizeof(Real)); + } + +# pragma omp parallel for + for (Int i = 0; i < len; ++i) + (*tracer_p[1])[i] *= (*density_p[1])[i]; + + std::swap(tracer_p[0], tracer_p[1]); + std::swap(density_p[0], density_p[1]); + progress_bar.update(); + gdbg.advance(); + gdbg.set_on(false); + } + const Real step_et = siqk::toc(step_t); + Timer::stop(Timer::ts); + siqk::print_times("timestep", step_et); + out.et_timestep = step_et; + + { Timer::start(Timer::ts_error); + const Real* const d = error_data.data(); + print_error(m, Jt_gll, ! 
opts.d2c, d, d + len, d + 2*len, + d + 3*len, out); + Timer::stop(Timer::ts_error); + } +} + +static void run (const Input& in) { + const Real T = day2sec(in.T); + const auto ic = gallery::InitialCondition::from_string( + in.initial_condition); + RemapOptions ro; + ro.np = in.np; + + Mesh m; + init_mesh(in.np, in.tq_order, in.ne, m); + + auto mi = MeshIntegratorFactory::create(in.ode, in.xyz_form, m.geo_p); + // Get lat-lon of geo mesh nodes. + const Int nn = nslices(m.geo_p); +# pragma omp parallel for + for (Int i = 0; i < nn; ++i) { + const auto n = slice(m.geo_p, i); + Real* const lli = mi->get_ll().data() + 2*i; + xyz2ll(n[0], n[1], n[2], lli[0], lli[1]); + } + + Output out; + integrate(m, mi, ro, T, in.nsteps, ic, in.output_fn, in.write_every, + in.integrate_options, out); + print_one_liner(in, out); +} + +Input::Input (int argc, char** argv) + : output_fn("tmp/out"), ode("divergent"), + initial_condition("xyztrig"), T(12), ne(5), nsteps(120), write_every(1), + monotone_type(0), np(4), tq_order(12), debug(false), xyz_form(false) +{ + program_name = argv[0]; + integrate_options.stepping = IntegrateOptions::fwd; + integrate_options.d2c = false; + for (int i = 1; i < argc; ++i) { + const std::string& token = argv[i]; + if (eq(token, "-o", "--output")) + output_fn = argv[++i]; + else if (eq(token, "-T")) + T = atof(argv[++i]); + else if (eq(token, "-nsteps")) + nsteps = atoi(argv[++i]); + else if (eq(token, "-ode")) + ode = argv[++i]; + else if (eq(token, "-ic")) + initial_condition = argv[++i]; + else if (eq(token, "-mono", "--monotone")) + monotone_type = atoi(argv[++i]); + else if (eq(token, "-np")) + np = atoi(argv[++i]); + else if (eq(token, "-tq")) + tq_order = atoi(argv[++i]); + else if (eq(token, "-ne")) + ne = atoi(argv[++i]); + else if (eq(token, "-we", "--write-every")) + write_every = atoi(argv[++i]); + else if (eq(token, "-looa", "--looa")) + integrate_options.stepping = IntegrateOptions::test_looa; + else if (eq(token, "-xyz", "--xyz")) + xyz_form = 
true; + else if (eq(token, "-d2c", "--d2c")) + integrate_options.d2c = true; + else if (eq(token, "-d", "--debug")) + debug = true; + } + + if (np == 4) tq_order = 20; + + print(std::cout); +} + +void Input::print (std::ostream& os) const { + os << "output filename (-o): " << output_fn << "\n" + << "ode (-ode, " << MeshIntegratorFactory::get_inputs() << "): " + << ode << "\n" + << "xyz_form (-xyz): " << xyz_form << "\n" + << "initial condition (-ic, " + << gallery::InitialCondition::get_inputs() << "): " + << initial_condition << "\n" + << "T (-T): " << T << " [day]\n" + << "nsteps (-nsteps): " << nsteps << "\n" + << "np (-np): " << np << "\n" + << "tq (-tq): " << tq_order << "\n" + << "ne (-ne): " << ne << "\n" + << "monotone_type (-mono, {0,1,2,3}): " << monotone_type << "\n" + << "write every (-we): " << write_every << "\n" + << "test_looa (--looa): " + << (integrate_options.stepping == IntegrateOptions::test_looa) << "\n" + << "d2c (-d2c): " << integrate_options.d2c << "\n" + << "debug (-d): " << debug << "\n"; +} + +int main (int argc, char** argv) { + Kokkos::initialize(argc, argv); { + Timer::init(); + Timer::start(Timer::total); + BlockMatrix<>::test(); + Input in(argc, argv); + run(in); + Timer::stop(Timer::total); + Timer::print(); + } Kokkos::finalize_all(); +} diff --git a/siqk/test.cpp b/siqk/test.cpp deleted file mode 100644 index dd10981..0000000 --- a/siqk/test.cpp +++ /dev/null @@ -1,191 +0,0 @@ -// ko=/home/ambradl/lib/kokkos/cpu; mycpp -I$ko/include -L$ko/lib -fopenmp test.cpp -lkokkos -ldl -Wall -pedantic -// ./a.out -m | grep "mat=1" > foo.m -// >> msik('draw_test_output', 'foo'); - -#include "siqk_intersect.hpp" -using namespace siqk; - -template -static void -write_matlab (const std::string& name, const CV3s& p) { - printf("mat=1; %s = [", name.c_str()); - for (Int ip = 0; ip < nslices(p); ++ip) - printf(" %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); - printf("].';\n"); -} - -template -static void -write_matlab (const std::string& 
name, const CV3s& p, const CIs& e) { - printf("mat=1; %s.p = [", name.c_str()); - for (Int ip = 0; ip < nslices(p); ++ip) - printf(" %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); - printf("].';\n"); - printf("mat=1; %s.n = [", name.c_str()); - for (Int ie = 0; ie < nslices(e); ++ie) - printf(" %d %d %d %d;", e(ie,0)+1, e(ie,1)+1, e(ie,2)+1, e(ie,3)+1); - printf("].';\n"); -} - -static void make_planar_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, - const Int n) { - const Real d = std::sqrt(0.5); - ko::resize(e, n*n, 4); - ko::resize(p, (n+1)*(n+1), 3); - for (Int iy = 0; iy < n+1; ++iy) - for (Int ix = 0; ix < n+1; ++ix) { - const auto idx = (n+1)*iy + ix; - p(idx,0) = 2*(static_cast(ix)/n - 0.5)*d; - p(idx,1) = 2*(static_cast(iy)/n - 0.5)*d; - p(idx,2) = 0; - } - for (Int iy = 0; iy < n; ++iy) - for (Int ix = 0; ix < n; ++ix) { - const auto idx = n*iy + ix; - e(idx,0) = (n+1)*iy + ix; - e(idx,1) = (n+1)*iy + ix+1; - e(idx,2) = (n+1)*(iy+1) + ix+1; - e(idx,3) = (n+1)*(iy+1) + ix; - } -} - -static void project_onto_sphere (Vec3s::HostMirror& p) { - for (Int ip = 0; ip < nslices(p); ++ip) { - p(ip,2) = 1; - SphereGeometry::normalize(slice(p, ip)); - } -} - -static void -perturb_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, - const Real angle, const Real xlate, const Real ylate) { - const Real cr = std::cos(angle), sr = std::sin(angle); - for (Int ip = 0; ip < nslices(p); ++ip) { - const Real x = p(ip,0), y = p(ip,1); - p(ip,0) = cr*x - sr*y + xlate; - p(ip,1) = -sr*x + cr*y + ylate; - } -} - -static void fill_quad (const ConstVec3s::HostMirror& p, - Vec3s::HostMirror& poly) { - const Int n = static_cast(std::sqrt(nslices(p) - 1)); - copy(slice(poly, 0), slice(p, 0), 3); - copy(slice(poly, 1), slice(p, n), 3); - copy(slice(poly, 2), slice(p, nslices(p) - 1), 3); - copy(slice(poly, 3), slice(p, nslices(p) - 1 - n), 3); -} - -// Area of the outline of (p,e) clipped against the outline of (cp,ce). 
-template -static Real calc_true_area ( - const ConstVec3s::HostMirror& cp, const ConstIdxs::HostMirror& ce, - const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, - const bool wm) -{ - Vec3s::HostMirror clip_poly("clip_poly", 4, 3), poly("poly", 4, 3), - nml("nml", 4, 3); - fill_quad(cp, clip_poly); - fill_quad(p, poly); - for (Int i = 0; i < 4; ++i) - Geo::edge_normal(slice(clip_poly, i), slice(clip_poly, (i+1) % 4), - slice(nml, i)); - Vec3s::HostMirror vo("vo", test::max_nvert, 3); - Int no; - { - Vec3s::HostMirror wrk("wrk", test::max_nvert, 3); - sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no, wrk); - } - if (wm) { - write_matlab("clip_poly", clip_poly); - write_matlab("poly", poly); - write_matlab("intersection", - ko::subview(vo, std::pair(0, no), ko::ALL())); - } - return Geo::calc_area_formula(vo, no); -} - -template void finalize_mesh (Vec3s::HostMirror& p) {} -template <> void finalize_mesh (Vec3s::HostMirror& p) { - project_onto_sphere(p); -} - -template -static int -run (const Int n, const Real angle, const Real xlate, const Real ylate, - const bool wm) { - Vec3s::HostMirror cp; - Idxs::HostMirror ce; - make_planar_mesh(cp, ce, n); - - Vec3s::HostMirror p("p", nslices(cp), szslice(cp)); - Idxs::HostMirror e("e", nslices(ce), szslice(ce)); - ko::deep_copy(p, cp); - ko::deep_copy(e, ce); - perturb_mesh(p, e, angle, xlate, ylate); - - finalize_mesh(cp); - finalize_mesh(p); - - const Real ta = calc_true_area(cp, ce, p, e, wm); - const Real a = test::test_area_ot(cp, ce, p, e); - - const Real re = std::abs(a - ta)/ta; - fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n", ta, a, re); - if (wm) { - write_matlab("cm", cp, ce); - write_matlab("m", p, e); - } - return re < 1e-10 ? 
0 : 1; -} - -inline bool -eq (const std::string& a, const char* const b1, const char* const b2 = 0) { - return (a == std::string(b1) || (b2 && a == std::string(b2)) || - a == std::string("-") + std::string(b1)); -} - -struct Input { - Int n; - Real angle, xlate, ylate; - bool write_matlab, geo_sphere; - - Input (Int argc, char** argv) - : n(5), angle(M_PI*1e-1), xlate(1e-1), ylate(1e-1), write_matlab(false), - geo_sphere(true) - { - for (Int i = 1; i < argc; ++i) { - const std::string& token = argv[i]; - if (eq(token, "-n")) n = atoi(argv[++i]); - if (eq(token, "-m", "--write-matlab")) write_matlab = true; - if (eq(token, "--plane")) geo_sphere = false; - if (eq(token, "--xlate")) xlate = atof(argv[++i]); - if (eq(token, "--ylate")) ylate = atof(argv[++i]); - if (eq(token, "--angle")) angle = atof(argv[++i]); - } - - print(std::cout); - } - - void print (std::ostream& os) { - os << "n (-n): " << n << "\n" - << "write matlab (-m): " << write_matlab << "\n" - << "planar geometry (--plane): " << ! geo_sphere << "\n" - << "angle (--angle): " << angle << "\n" - << "xlate (--xlate): " << xlate << "\n" - << "ylate (--ylate): " << ylate << "\n"; - } -}; - -int main (int argc, char** argv) { - Kokkos::initialize(argc, argv); - { - Input in(argc, argv); - Int nerr = 0; - nerr += (in.geo_sphere ? - run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab) : - run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab)); - std::cerr << (nerr ? "FAIL" : "PASS") << "ED\n"; - } - Kokkos::finalize_all(); -} From 401215773053c53909094fe0616498b8f2e2f13f Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Sun, 10 Jul 2016 20:52:48 -0600 Subject: [PATCH 12/28] ICE: C++ prototype impl. Implement clipping of a polygon made of a mix of quadratic and linear edges against a convex straight-edged polygon. This is implemented in the non-Kokkos prototype code that Dave uses, and it's implemented for the plane only at the moment. 
--- siqk/si/Array_raw.hpp | 16 +- siqk/si/fsi.cpp | 23 +- siqk/si/fsi.h | 57 ++++ siqk/si/{sik.hpp => siqp.hpp} | 476 ++++++++++++++++++++++++++++++---- siqk/si/test.cpp | 30 +-- siqk/si/testf.f90 | 31 ++- 6 files changed, 553 insertions(+), 80 deletions(-) rename siqk/si/{sik.hpp => siqp.hpp} (62%) diff --git a/siqk/si/Array_raw.hpp b/siqk/si/Array_raw.hpp index f27da33..7926398 100644 --- a/siqk/si/Array_raw.hpp +++ b/siqk/si/Array_raw.hpp @@ -33,7 +33,7 @@ class Array1D { a_p_ = std::shared_ptr(new T[n], std::default_delete()); a_ = a_p_.get(); } - void reset (const int n, T* const a) { n_ = n; a_p_.reset(); a_ = a; } + void reset (const int n, T* const a) { n_ = n; a_p_ = nullptr; a_ = a; } const int& n () const { return n_; } T* data () { return a_; } const T* data () const { return a_; } @@ -47,7 +47,7 @@ class Array1D { private: #ifdef SIQK_DEBUG void debug (const int& i) const { - if (i < 0 || i >= m_) { + if (i < 0 || i >= n_) { std::stringstream ss; ss << "Array1D: i is " << i << " but n_ is " << n_ << "\n"; error(ss.str().c_str()); @@ -78,7 +78,7 @@ class Array2D { a_p_ = std::shared_ptr(new T[m*n], std::default_delete()); a_ = a_p_.get(); } - void reset (const int m, const int n, T* const a) { m_ = m; n_ = n; a_p_.reset(); a_ = a; } + void reset (const int m, const int n, T* const a) { m_ = m; n_ = n; a_p_ = nullptr; a_ = a; } const int& m () const { return m_; } const int& n () const { return n_; } T* data () { return a_; } @@ -111,6 +111,16 @@ class Array2D { #endif }; +template inline T* slice (Array2D& a, const int c) { return a(c); } +template inline const T* slice (const Array2D& a, const int c) { return a(c); } +template inline int nslices (const Array2D& a) { return a.n(); } +template inline int szslice (const Array2D& a) { return a.m(); } + +template inline Array1D offset (Array1D& a, const int os) +{ return Array1D(a.n() - os, a.data() + os); } +template inline Array2D offset (Array2D& a, const int os) +{ return Array2D(a.m(), a.n() - os, 
a.data() + a.m()*os); } + // Define a few things to minimize KOKKOS guards. # ifndef KOKKOS_FUNCTION # define KOKKOS_FUNCTION diff --git a/siqk/si/fsi.cpp b/siqk/si/fsi.cpp index 467c657..c56a610 100644 --- a/siqk/si/fsi.cpp +++ b/siqk/si/fsi.cpp @@ -1,5 +1,5 @@ #include "Array_raw.hpp" -#include "sik.hpp" +#include "siqp.hpp" extern "C" void clipagainstpolysphere_ ( double const* const clip_poly, int const* const clip_poly_n_vertices, @@ -8,10 +8,29 @@ extern "C" void clipagainstpolysphere_ ( int* const info) { Array2D avo(3, *n_vertices, vo); - const bool success = siqk::sh::clip_against_poly( + const bool success = siqp::sh::clip_against_poly( Array2D(3, *clip_poly_n_vertices, clip_poly), Array2D(3, *clip_poly_n_vertices, clip_edge_normals), Array2D(3, *ni, vi), *ni, avo, *no, wrk, *n_vertices); *info = success ? 0 : 1; } + +extern "C" void iceclipagainstpolyplane_( + double const* const clip_poly, int const* const clip_poly_n_vertices, + double const* const clip_edge_normals, + double const* const vi, int const* const vti, int const* const ni, + double* const vo, int* const vto, int* const no, + double* const rwrk, int* const iwrk, int const* const nwrk, + int* const info) +{ + Array2D avo(3, *nwrk, vo); + Array1D avto(*nwrk, vto); + const bool success = + siqp::ice::clip_cpoly_against_convex_poly( + Array2D(3, *clip_poly_n_vertices, clip_poly), + Array2D(3, *clip_poly_n_vertices, clip_edge_normals), + Array2D(3, *ni, vi), Array1D(*ni, vti), *ni, + avo, avto, *no, rwrk, iwrk, *nwrk); + *info = success ? 0 : 1; +} diff --git a/siqk/si/fsi.h b/siqk/si/fsi.h index e12605f..a82c9d0 100644 --- a/siqk/si/fsi.h +++ b/siqk/si/fsi.h @@ -13,3 +13,60 @@ extern "C" void clipagainstpolysphere_( double* const wrk, int const* const n_vertices, // info = 0 on success. info = 1 if n_vertices is not large enough. int* const info); + +// ICE: Intersection with curved edges. +// +// Some terminology: +// s, m, p: start, middle, end points of a curved edge. 
m is not really the +// middle or midpoint; indeed, it is unlikely to be on the curve. Rather, +// it's a point that defines the curve. See below for more. +// straight: straight on a plane, or a great arc on the sphere. +// curved: quadratic on a plane, projected quadratic on the sphere. +// cedge: a curved edge. +// sedge: a straight edge, including a great arc. +// cpoly, spoly: similar terminology; but note that a cpoly can contain a mix +// of cedges and sedges. +// (vs, vts, n): Vertex list. vs is an array of vertices. There are n +// vertices. vts is a list of vertex types. s, p are endpoint vertices (0); +// m is a midpoint node (1). If an edge is straight, then vts(k:k+1) = [0 +// 0]; if an edge is curved, then vts(k:k+2) = [0 1 0]. Keep in mind that +// there is an edge that wraps around the end of the list. The wrap can +// occur like 0|1 0 or like 0 1|0. As an example, [1 0 0 1 0] is a vertex +// type list for a triangle containing two cedges and one sedge. +// +// Some math. +// a in [0,1] is the parameter in the curve +// x(a) = (1-a)^2 s + a (1-a) m + a^2 p. (1) +// s and p sit on the curve, but m in general does not. We can define m by +// x(1/2) = M => m = 4 M - s - p, +// where M is a point that is intended to be on the curve and serves as a useful +// midpoint reference. This construction has the essential property that the +// curve is invariant to the swapping of s and p. However, the clip routines are +// independent of this definition; (s,m,p) are used as in equation (1), and that +// is all that is needed. +// When segments are extracted from x(a) in a clip, we use c in the segment +// [c,1], d in the segment [0,d], and both in [c,d]. Similarly, x(c) = r is the +// new start point, and x(d) = q is the new end point. A segment requires a +// midpoint so that the resulting parameterized curve sits on the original; this +// is n. Hence a segment of (s,m,p) defined by [c,d] subset [0,1] is (r,n,q).
n +// is given by +// n = 2 (c d - c - d + 1) s + (c + d - 2 c d) m + 2 c d p, +// which satisfies +// x(a) = (1-a)^2 s + a(1-a) m + a^2 p = (1-b)^2 r + b(1-b) n + b^2 q +// for all b in [0,1], where b = (a-c)/(d-c), r = x(c), q = x(d). +// +// Even though this is a 2D routine, the vertices and vectors must still be +// 3D. The third value is ignored. +extern "C" void iceclipagainstpolyplane_( + // 3 x clip_poly_n_vertices clip spherical polygon vertex list. + double const* const clip_poly, int const* const clip_poly_n_vertices, + // 3 x clip_poly_n_vertices clip polygon's inward-facing edge normals. + double const* const clip_edge_normals, + // 3 x ni curved polygon to clip. + double const* const vi, int const* const vti, int const* const ni, + // On output, a 3 x no clipped polygon. + double* const vo, int* const vto, int* const no, + // Workspace. rwrk is 3*nwrk; iwrk is 1*nwrk. + double* const rwrk, int* const iwrk, int const* const nwrk, + // info = 0 on success. info = 1 if workspace is not large enough. 
+ int* const info); diff --git a/siqk/si/sik.hpp b/siqk/si/siqp.hpp similarity index 62% rename from siqk/si/sik.hpp rename to siqk/si/siqp.hpp index 54ac376..444cdfd 100644 --- a/siqk/si/sik.hpp +++ b/siqk/si/siqp.hpp @@ -1,5 +1,5 @@ -#ifndef INCLUDE_SIK_HPP -#define INCLUDE_SIK_HPP +#ifndef INCLUDE_SIQP_HPP +#define INCLUDE_SIQP_HPP #include #include @@ -10,7 +10,7 @@ #include #include -#ifdef SIQK_TIME +#ifdef SIQP_TIME # include # include # include @@ -31,8 +31,8 @@ static void prarr (const std::string& name, const T* const v, const size_t n) { std::cerr << "\n"; } -namespace siqk { -#ifdef SIQK_TIME +namespace siqp { +#ifdef SIQP_TIME static timeval tic () { timeval t; gettimeofday(&t, 0); @@ -43,9 +43,6 @@ static double calc_et (const timeval& t1, const timeval& t2) { return (t2.tv_sec * us + t2.tv_usec - t1.tv_sec * us - t1.tv_usec) / us; } static double toc (const timeval& t1) { -#ifdef SIQK_USE_KOKKOS - Kokkos::fence(); -#endif timeval t; gettimeofday(&t, 0); return calc_et(t1, t); @@ -62,7 +59,7 @@ static inline double toc (const int&) { return 0; } #endif static void print_times (const std::string& name, const double* const parts, const int nparts) { -#ifdef SIQK_TIME +#ifdef SIQP_TIME double total = 0; for (int i = 0; i < nparts; ++i) total += parts[i]; printf("%20s %1.3e s %7.1f MB", name.c_str(), total, get_memusage()); for (int i = 0; i < nparts; ++i) printf(" %1.3e s", parts[i]); @@ -76,38 +73,69 @@ static void copy (V dst, CV src, const int n) { } // A decorator function so that a for loop's counter can be auto typed. 
-template KOKKOS_INLINE_FUNCTION +template typename V::size_type zero(const V& v) { return 0; } +template +int solve_quadratic_equation (const CV a, const CV b, const CV c, V xs) { + const double disc = b*b - 4*a*c; + if (disc < 0) return 0; + if (disc == 0) { + xs[0] = -0.5*b/a; + return 1; + } + if (b >= 0) { + const double t = b + std::sqrt(disc); + xs[0] = -0.5*t/a; + xs[1] = -2*c/t; + } else { + const double t = std::sqrt(disc) - b; + xs[0] = 2*c/t; + xs[1] = 0.5*t/a; + } + return 2; +} + // Planar geometry calculations. struct PlaneGeometry { - template KOKKOS_INLINE_FUNCTION + enum { dim = 3 }; + + template static void scale (const double a, V v) { v[0] *= a; v[1] *= a; } - template KOKKOS_INLINE_FUNCTION + template + static double dot (const CA a, const CB b) { + return a[0]*b[0] + a[1]*b[1]; + } + template static double dot_c_amb (const CV c, const CV a, const CV b) { return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]); } - template KOKKOS_INLINE_FUNCTION + template + static void copy (V d, const CV s) { + d[0] = s[0]; + d[1] = s[1]; + } + template static void combine (const CV u, const CV v, const double a, V x) { const double oma = 1 - a; x[0] = oma*u[0] + a*v[0]; x[1] = oma*u[1] + a*v[1]; } - template KOKKOS_INLINE_FUNCTION + template static void edge_normal (const CV e1, const CV e2, V en) { en[0] = e1[1] - e2[1]; en[1] = e2[0] - e1[0]; } - template KOKKOS_INLINE_FUNCTION + template static bool inside (const CV v, const CV e1, const CV e2, const CV en) { return dot_c_amb(en, v, e1) > 0 && dot_c_amb(en, v, e2) > 0; } - template KOKKOS_INLINE_FUNCTION + template static void intersect (const CV v1, const CV v2, const CV e1, const CV en, V intersection) { double a; { @@ -120,7 +148,7 @@ struct PlaneGeometry { combine(v1, v2, a, intersection); } - template KOKKOS_INLINE_FUNCTION + template static bool output (const CV v, int& no, Array2D& vo) { #ifdef SIKQ_DEBUG if (no >= vo.n()) { @@ -138,7 +166,7 @@ struct PlaneGeometry { } //todo Handle non-convex case. 
- KOKKOS_INLINE_FUNCTION + static double calc_area (const Array2D& v) { double area = 0; for (int i = 1; i < v.n() - 1; ++i) { @@ -152,44 +180,60 @@ struct PlaneGeometry { } return 0.5*area; } + + // For quadratic edges. + template + static int intersect (const CV s, const CV m, const CV p, const CV e1, + const CV nml, V as) { + double w[3]; + for (int i = 0; i < 3; ++i) w[i] = s[i] - m[i] + p[i]; + const double a = dot(nml, w); + for (int i = 0; i < 3; ++i) w[i] = m[i] - 2*s[i]; + const double b = dot(nml, w); + for (int i = 0; i < 3; ++i) w[i] = s[i] - e1[i]; + const double c = dot(nml, w); + return solve_quadratic_equation(a, b, c, as); + } }; // Geometry on the sphere. All inputs and outputs are relative to the // unit-radius sphere. struct SphereGeometry { - template KOKKOS_INLINE_FUNCTION + enum { dim = 3 }; + + template static void cross (const CV a, const CV b, V c) { c[0] = a[1]*b[2] - a[2]*b[1]; c[1] = a[2]*b[0] - a[0]*b[2]; c[2] = a[0]*b[1] - a[1]*b[0]; } - template KOKKOS_INLINE_FUNCTION + template static double dot (const CV a, const CV b) { return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; } - template KOKKOS_INLINE_FUNCTION + template static double norm2 (const CV v) { return dot(v, v); } - template KOKKOS_INLINE_FUNCTION + template static void scale (const double a, V v) { v[0] *= a; v[1] *= a; v[2] *= a; } - template KOKKOS_INLINE_FUNCTION + template static void normalize (V v) { scale(1.0/std::sqrt(norm2(v)), v); } - template KOKKOS_INLINE_FUNCTION + template static double dot_c_amb (const CV c, const CV a, const CV b) { return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]) + c[2]*(a[2] - b[2]); } - template KOKKOS_INLINE_FUNCTION - static void copy (V& d, const CV& s) { + template + static void copy (V d, const CV s) { d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; } - template KOKKOS_INLINE_FUNCTION + template static void combine (const CV u, const CV v, const double a, V x) { const double oma = 1 - a; x[0] = oma*u[0] + a*v[0]; @@ -197,13 +241,14 @@ struct 
SphereGeometry { x[2] = oma*u[2] + a*v[2]; } - template KOKKOS_INLINE_FUNCTION + template static void edge_normal (const CV a, const CV b, V en) { cross(a, b, en); normalize(en); } - template KOKKOS_INLINE_FUNCTION + // Is v inside the lined (a1,a2) having normal n? + template static bool inside (const CV v, const CV a1, const CV a2, const CV n) { return dot_c_amb(n, v, a1) > 0 && dot_c_amb(n, v, a2) > 0; } @@ -218,7 +263,7 @@ struct SphereGeometry { Then uvec(v(a)) is the intersection point on the unit sphere. Assume intersection exists. (Already filtered by 'inside'.) */ - template KOKKOS_INLINE_FUNCTION + template static void intersect (const CV v1, const CV v2, const CV e1, const CV en, V intersection) { /* Consider the case where e1 == v1 or e1 == v2. All == are FP. @@ -244,7 +289,7 @@ struct SphereGeometry { } } - template KOKKOS_INLINE_FUNCTION + template static bool output (const CV v, int& no, Array2D& vo) { #ifdef SIKQ_DEBUG if (no >= vo.n()) { @@ -264,7 +309,7 @@ struct SphereGeometry { //todo Handle non-convex case. // This uses a terrible formula, but it's just for testing. - KOKKOS_INLINE_FUNCTION + static double calc_area (const Array2D& v) { double area = 0; for (int i = 1; i < v.n() - 1; ++i) { @@ -279,7 +324,7 @@ struct SphereGeometry { } return area; } - template KOKKOS_INLINE_FUNCTION + template static double calc_arc_length (const CV a, const CV b) { const double d = dot(a, b); if (d >= 1) return 0; @@ -310,7 +355,7 @@ struct Mesh { }; // Generally not a user routine. -template KOKKOS_INLINE_FUNCTION +template bool clip_against_edge ( // Input vertex list. const Array2D& vi, const int ni, @@ -345,7 +390,7 @@ bool clip_against_edge ( } // Efficient user routine that uses the mesh data structure. -template KOKKOS_INLINE_FUNCTION +template bool clip_against_poly ( // Clip mesh. m.e(:,cp_e) is the element, and m.en(:,cp_e) is the // corresponding list of normal indices. 
@@ -399,7 +444,7 @@ bool clip_against_poly ( // Not used for real stuff; just a convenient version for testing. In this // version, clip_poly is a list of clip polygon vertices. This is instead of the // mesh data structure. -template KOKKOS_INLINE_FUNCTION +template bool clip_against_poly ( // Clip polygon. const Array2D& clip_poly, @@ -441,7 +486,324 @@ bool clip_against_poly ( } } // namespace sh +// ICE: Intersection with curved edges. +// +// Some terminology: +// s, m, p: start, middle, end points of a curved edge. m is not really the +// middle or midpoint; indeed, it is unlikely to be on the curve. Rather, +// it's a point that defines the curve. +// smp-plane: plane defined by s, m, p. +// straight: straight on a plane, or a great arc on the sphere. +// curved: quadratic on a plane, projected quadratic on the sphere. +// cedge: a curved edge. +// sedge: a straight edge, including a great arc. +// cpoly, spoly: similar terminology; but note that a cpoly can contain a mix +// of cedges and sedges. +// ed: short for edge +// vt: node type. s, p are endpoint vertices (0); m is a midpoint node (1). +// ced is [s m p]. +// (vs, vts, n): Vertex list. vs is an array of vertices. There are n +// vertices. vts is a list of vertex types. s, p are endpoint vertices (0); +// m is a midpoint node (1). If an edge is straight, then vts(k:k+1) = [0 +// 0]; if an edge is curved, then vts(k:k+2) = [0 1 0]. Keep in mind that +// there is an edge that wraps around the end of the list. The wrap can +// occur like 0|1 0 or like 0 1|0. As an example, [1 0 0 1 0] is a vertex +// type list for a triangle containing two cedges and one sedge. +// +// Some math. +// a in [0,1] is the parameter in the curve +// x(a) = (1-a)^2 s + a (1-a) m + a^2 p. (1) +// s and p sit on the curve, but m in general does not. We can define m by +// x(1/2) = M => m = 4 M - s - p, +// where M is a point that is intended to be on the curve and serves as a useful +// midpoint reference.
This construction has the essential property that the +// curve is invariant to the swapping of s and p. However, the clip routines are +// independent of this definition; (s,m,p) are used as in equation (1), and that +// is all that is needed. +// When segments are extracted from x(a) in a clip, we use c in the segment +// [c,1], d in the segment [0,d], and both in [c,d]. Similarly, x(c) = r is the +// new start point, and x(d) = q is the new end point. A segment requires a +// midpoint so that the resulting parameterized curve sits on the original; this +// is n. Hence a segment of (s,m,p) defined by [c,d] subset [0,1] is (r,n,q). n +// is given by +// n = 2 (c d - c - d + 1) s + (c + d - 2 c d) m + 2 c d p, +// which satisfies +// x(a) = (1-a)^2 s + a(1-a) m + a^2 p = (1-b)^2 r + b(1-b) n + b^2 q +// for all b in [0,1], where b = (a-c)/(d-c), r = x(c), q = x(d). +template +struct ice { + typedef double Real; + typedef int Int; + typedef unsigned int UInt; + enum { dim = geo::dim }; + + // Follow the outer loop of the Sutherland-Hodgmann algorithm. + static bool clip_cpoly_against_convex_poly ( + // Clip polygon. + const Array2D& clip_poly, + // Clip polygon edges' inward-facing normals. + const Array2D& clip_edge_normals, + // Input vertex and vertex type lists. + const Array2D& vi, const Array1D& vti, const Int ni, + // Outputs. + Array2D& vo, Array1D& vto, Int& no, + // Workspace. n*wrk applies to both *wrk and v*o. If workspace is not large + // enough, false is returned. + Real* const rwrk, Int* const iwrk, const Int nwrk) + { + Array2D vo1(dim, nwrk, rwrk); + Array2D* vs[] = { &vo, &vo1 }; + Array1D vto1(nwrk, iwrk); + Array1D* vts[] = { &vto, &vto1 }; + int nos[] = { 0, 0 }; + + no = 0; + const auto nv = nslices(clip_poly); + if (nv % 2 == 0) { + // Make sure the final vertex output list is in the caller's buffer. + std::swap(vs[0], vs[1]); + std::swap(vts[0], vts[1]); + std::swap(nos[0], nos[1]); + } + + if ( ! 
clip_cpoly_against_sed(vi, vti, ni, *vs[0], *vts[0], nos[0], + slice(clip_poly, 0), slice(clip_poly, 1), + slice(clip_edge_normals, 0))) + return false; + if ( ! nos[0]) return true; + + for (Int ie = 1, ielim = nv - 1; ; ++ie) { + if ( ! clip_cpoly_against_sed(*vs[0], *vts[0], nos[0], *vs[1], *vts[1], nos[1], + slice(clip_poly, ie), slice(clip_poly, (ie+1) % nv), + slice(clip_edge_normals, ie))) + return false; + if ( ! nos[1]) return true; + if (ie == ielim) break; + std::swap(vs[0], vs[1]); + std::swap(vts[0], vts[1]); + std::swap(nos[0], nos[1]); + } + + no = nos[1]; + return true; + } + + template + static bool clip_cpoly_against_sed ( + // Input vertex and vertex type lists. + const Array2D& vi, const Array1D& vti, const Int ni, + // Outputs. + Array2D& vo, Array1D& vto, Int& no, + // The end points of the clip edge segment. + const CV se1, const CV se2, + // Clip edge's inward-facing normal. + const CV sen) + { + bool ends_connected = false; + Int k = 0; + no = 0; + for (;;) { + if (no + 5 > nslices(vo)) return false; + if (vti[k] == 1) { + // 1. Start of vi list, with [1 0] as the two first vts. + assert(k == 0); + assert(vti[ni-1] == 0); + no += clip_ced_against_sed(se1, se2, sen, + slice(vi, ni-1), slice(vi, 0), slice(vi, 1), + vo, vto); + k++; + ends_connected = true; + } else if (vti[k] == 0 && k+1 == ni) { + // 2. Last vertex in the list. + if (ends_connected) break; + if (vti[0] == 0) { + // 2a. A sedge connects end of list to start. + const UInt nv = clip_sed_against_sed(se1, se2, sen, + slice(vi, ni-1), slice(vi, 0), + offset(vo, no)); + for (UInt i = 0; i < nv; ++i) vto[no+i] = 0; + no += nv; + } else { + // 2b. Block 1 took care of this cedge. + assert(false); + } + break; + } else if (vti[k+1] == 1) { + assert(vti[k] == 0); + if (k+2 == ni) { + // 3. [0 1] at the end of the list connects to 0 at the start. 
+ assert(vti[0] == 0); + no += clip_ced_against_sed(se1, se2, sen, + slice(vi, k), slice(vi, k+1), slice(vi, 0), + offset(vo, no), offset(vto, no)); + break; + } else { + assert(k+2 < ni); + // 4. General case: [0 1 0] in the middle of the list. + no += clip_ced_against_sed(se1, se2, sen, + slice(vi, k), slice(vi, k+1), slice(vi, k+2), + offset(vo, no), offset(vto, no)); + k += 2; + } + } else { + // 5. General case: [0 0] in the middle of the list. + assert(k+1 < ni); + assert(vti[k+1] == 0); + const UInt nv = clip_sed_against_sed(se1, se2, sen, + slice(vi, k), slice(vi, k+1), + offset(vo, no)); + for (UInt i = 0; i < nv; ++i) vto[no+i] = 0; + no += nv; + k++; + } + } + return true; + } + + // sed must have >= 2 slices alocated. + template + static UInt clip_sed_against_sed (const CV se1, const CV se2, const CV sen, + const CV s, const CV p, Array sed) { + const bool s_inside = geo::inside(s, se1, se2, sen); + const bool p_inside = geo::inside(p, se1, se2, sen); + if (p_inside) { + if (s_inside) { + copy(slice(sed, 0), p); + return 1; + } else { + geo::intersect(s, p, se1, sen, slice(sed, 0)); + copy(slice(sed, 1), p); + return 2; + } + } else { + if (s_inside) { + geo::intersect(s, p, se1, sen, slice(sed, 0)); + return 1; + } else { + return 0; + } + } + } + + // eds and evts must have >= 5 slices alocated. 
+ template + static UInt clip_ced_against_sed (const CV se1, const CV se2, const CV sen, + const CV s, const CV m, const CV p, + Array eds, IV edvts) { + Real as[2]; + const UInt nas = intersect(s, m, p, se1, sen, as); + const bool s_inside = geo::inside(s, se1, se2, sen); + const bool p_inside = geo::inside(p, se1, se2, sen); + if (p_inside) { + if (s_inside) { + if (nas < 2) { + copy(slice(eds, 0), m); edvts[0] = 1; + copy(slice(eds, 1), p); edvts[1] = 0; + return 2; + } else { + middle_for_segment_0d(s, m, as[0], slice(eds, 0)); + eval(s, m, p, as[0], slice(eds, 1)); + eval(s, m, p, as[1], slice(eds, 2)); + middle_for_segment_c1(m, p, as[1], slice(eds, 3)); + copy(slice(eds, 4), p); + edvts[0] = 1; edvts[1] = 0; edvts[2] = 0; edvts[3] = 1; edvts[4] = 0; + return 5; + } + } else { + eval(s, m, p, as[0], slice(eds, 0)); + middle_for_segment_c1(m, p, as[0], slice(eds, 1)); + copy(slice(eds, 2), p); + edvts[0] = 0; edvts[1] = 1; edvts[2] = 0; + return 3; + } + } else { + if (s_inside) { + middle_for_segment_0d(s, m, as[0], slice(eds, 0)); + eval(s, m, p, as[0], slice(eds, 1)); + edvts[0] = 1; edvts[1] = 0; + return 2; + } else { + if (nas < 2) { + return 0; + } else { + eval(s, m, p, as[0], slice(eds, 0)); + middle_for_segment(s, m, p, as[0], as[1], slice(eds, 1)); + eval(s, m, p, as[1], slice(eds, 2)); + edvts[0] = 0; edvts[1] = 1; edvts[2] = 0; + return 3; + } + } + } + assert(0); + return 0; + } + + template + static UInt intersect (const CV s, const CV m, const CV p, + const CV e1, const CV en, Real as[2]) { + Int nas = geo::intersect(s, m, p, e1, en, as); + if (nas == 2 && (as[1] < 0 || as[1] > 1)) --nas; + if (nas >= 1 && (as[0] < 0 || as[0] > 1)) { as[0] = as[1]; --nas; } + if (nas == 2 && as[0] > as[1]) std::swap(as[0], as[1]); + assert(nas >= 0 && nas <= 2); + return static_cast(nas); + } + + // Create m in (s,m,p) for the segment a in [c,d]. 
+ template + static void middle_for_segment (const CV s, const CV m, const CV p, + const Real c, const Real d, V n) { + if (c == 0) middle_for_segment_0d(s, m, d, n); + else if (d == 1) middle_for_segment_c1(m, p, c, n); + else middle_for_segment_cd(s, m, p, c, d, n); + } + template + static void middle_for_segment_0d (const CV s, const CV m, const Real d, + V n) { + const Real tomd = 2*(1 - d); + for (UInt i = 0; i < dim; ++i) + n[i] = tomd*s[i] + d*m[i]; + } + template + static void middle_for_segment_c1 (const CV m, const CV p, const Real c, + V n) { + const Real omc = 1 - c, tc = 2*c; + for (UInt i = 0; i < dim; ++i) + n[i] = omc*m[i] + tc*p[i]; + } + template + static void middle_for_segment_cd (const CV s, const CV m, const CV p, + const Real c, const Real d, V n) { + const Real cd = c*d, tcd = 2*cd, c0 = 2*(cd - c - d + 1), c1 = c + d - tcd; + for (UInt i = 0; i < dim; ++i) + n[i] = c0*s[i] + c1*m[i] + tcd*p[i]; + } + + // Create m in (s,m,p) so that the curve hits n. + template + static void middle_matches (const CV s, const CV p, const CV n, V m) { + const Real fn = 4*n; + for (UInt i = 0; i < dim; ++i) + m[i] = fn*n[i] - s[i] - p[i]; + } + + template + static void eval (const CV s, const CV m, const CV p, const Real a, V v) { + const Real oma = 1 - a, oma2 = oma*oma, omaa = oma*a, a2 = a*a; + for (UInt i = 0; i < dim; ++i) + v[i] = oma2*s[i] + omaa*m[i] + a2*p[i]; + } + + template + static void copy (V d, const CV s) { + for (UInt i = 0; i < dim; ++i) d[i] = s[i]; + } +}; + +template constexpr T square (const T& x) { return x*x; } + // Octree for search. 
+template class Octree { public: typedef double BoundingBox[6]; @@ -465,6 +827,7 @@ class Octree { bb[j] = std::min(bb[j], ps(j,i)); bb[j+3] = std::max(bb[j+3], ps(j,i)); } + pad_bb(bb); } static void calc_bb (const Array2D& ps, BoundingBox bb) { @@ -472,7 +835,7 @@ class Octree { } template - static void calc_bb (const Array2D& p, const CIV& e, + static void calc_bb (const Array2D& p, const CIV e, const int ne, V ebb) { for (int j = 0; j < 3; ++j) ebb[j] = ebb[j+3] = p(j, e[0]); @@ -483,6 +846,7 @@ class Octree { ebb[j+3] = ko::max(ebb[j+3], p(j, e[i])); } } + pad_bb(ebb); } static void calc_bb (const Array2D& p, const Array2D& e, @@ -492,6 +856,23 @@ class Octree { calc_bb(p, e(k), e.m(), ebbs(k)); } + // If a bounding box was constructed from vertices of a spherical polygon, + // expand it to account for the possible protrusion of the sphere. + template + static void pad_bb (BB bb) { + if (std::is_same::value) return; + double hl = 0.5*std::sqrt(square(bb[3] - bb[0]) + square(bb[4] - bb[1]) + + square(bb[5] - bb[2])); + // Limit the half-length to the circle's radius. + hl = std::min(1.0, hl); + // Max distance from a chord of length 2 hl to the unit circle: + // hl = sin theta + // pad = 1 - cos theta = 1 - sqrt(1 - sin^2 theta) = 1 - sqrt(1 - hl^2). + const double pad = 1 - std::sqrt(1 - square(hl)); + for (int i = 0; i < 3; ++i) bb[ i] -= pad; + for (int i = 0; i < 3; ++i) bb[3+i] += pad; + } + // p is a 3xNp array of points. e is a KxNe array of elements. An entry <0 is // ignored. All <0 entries must be at the end of an element's list. Octree (const Array2D& p, const Array2D& e, @@ -507,7 +888,7 @@ class Octree { // function // void operator(const int element_index). // element_index indexes e. - template KOKKOS_INLINE_FUNCTION + template void apply (const CV bb, Functor& f) const { if (nodes_.n() == 0) { for (int i = 0; i < offset_[1]; ++i) @@ -665,7 +1046,6 @@ class Octree { } // Using parent bb p, fill child bb c, with child_idx in 0:7. 
- KOKKOS_INLINE_FUNCTION static void fill_child_bb (const BoundingBox& p, const int& child_idx, BoundingBox& c) { const double m[] = { 0.5*(p[0] + p[3]), @@ -687,7 +1067,6 @@ class Octree { } // Do bounding boxes a and b overlap? - KOKKOS_INLINE_FUNCTION static bool do_bb_overlap (const BoundingBox a, const BoundingBox b) { for (int i = 0; i < 3; ++i) if ( ! do_lines_overlap(a[i], a[i+3], b[i], b[i+3])) @@ -695,13 +1074,12 @@ class Octree { return true; } - KOKKOS_INLINE_FUNCTION static bool do_lines_overlap (const double& a1, const double& a2, const double& b1, const double& b2) { return ! (a2 < b1 || a1 > b2); } - template KOKKOS_INLINE_FUNCTION + template void apply_r (const int ni, const BoundingBox& nbb, const CV bb, Functor& f) const { for (int i = 0; i < 8; ++i) { @@ -757,7 +1135,7 @@ void fill_normals (sh::Mesh& m) { // Used in Octree::apply to gather a set of possibly intersecting polygons. struct OTSearchFunctor { std::set hits; - KOKKOS_INLINE_FUNCTION void operator() (const int i) { hits.insert(i); } + void operator() (const int i) { hits.insert(i); } }; // Find the area of the overlapping part of two meshes by summing over the areas @@ -772,21 +1150,21 @@ class TestAreaOTFunctor { const Array2D e; // Array of polygons. e(:,k) is the k'th polygon. // Already initialized octree used to search for possibly intersecting // polygons. - Octree ot; + Octree ot; public: typedef double value_type; TestAreaOTFunctor (const sh::Mesh& cm, const Array2D& p, - const Array2D& e, const Octree& ot) + const Array2D& e, const Octree& ot) : cm(cm), p(p), e(e), ot(ot) {} // k indexes (p,e). - KOKKOS_INLINE_FUNCTION void operator() (const int k, double& area) const { + void operator() (const int k, double& area) const { // Clipped element bounding box. double ebb[6]; - Octree::calc_bb(p, e(k), e.m(), ebb); + Octree::calc_bb(p, e(k), e.m(), ebb); // Get list of possible overlaps. 
OTSearchFunctor f; ot.apply(ebb, f); @@ -830,7 +1208,7 @@ double test_area_ot (const Array2D& cp, const Array2D& double et[2]; auto t = tic(); // Build an octree over the clip mesh. - Octree ot(cp, ce); + Octree ot(cp, ce); et[0] = toc(t); // Compute the area in a silly way to test search and interesection. @@ -842,6 +1220,6 @@ double test_area_ot (const Array2D& cp, const Array2D& return area; } } // namespace test -} // namespace siqk +} // namespace siqp #endif // INCLUDE_SIK_HPP diff --git a/siqk/si/test.cpp b/siqk/si/test.cpp index 8a09042..439e1c0 100644 --- a/siqk/si/test.cpp +++ b/siqk/si/test.cpp @@ -1,10 +1,6 @@ -#ifdef SIQK_USE_KOKKOS -# include "Array_Kokkos.hpp" -#else -# include "Array_raw.hpp" -#endif -#include "sik.hpp" -using namespace siqk; +#include "Array_raw.hpp" +#include "siqp.hpp" +using namespace siqp; #include "fsi.h" template @@ -218,18 +214,10 @@ struct Input { }; int main (int argc, char** argv) { -#ifdef SIQK_USE_KOKKOS - Kokkos::initialize(argc, argv); -#endif - { - Input in(argc, argv); - int nerr = 0; - nerr += (in.geo_sphere ? - run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab) : - run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab)); - std::cerr << (nerr ? "FAIL" : "PASS") << "ED\n"; - } -#ifdef SIQK_USE_KOKKOS - Kokkos::finalize_all(); -#endif + Input in(argc, argv); + int nerr = 0; + nerr += (in.geo_sphere ? + run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab) : + run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab)); + std::cerr << (nerr ? 
"FAIL" : "PASS") << "ED\n"; } diff --git a/siqk/si/testf.f90 b/siqk/si/testf.f90 index 097d5bc..cf2ce78 100644 --- a/siqk/si/testf.f90 +++ b/siqk/si/testf.f90 @@ -24,12 +24,22 @@ program main -5.000000000000000000d-01, -4.999999999999998890d-01, 7.071067811865474617d-01, & -5.441369567663348894d-01, -3.342826900143283098d-01, 7.695258640473731093d-01, & -3.127479665047677160d-01, 3.127479665047677160d-01, 8.968709042522592378d-01 /), (/3,8/)) - real*8 :: vo(3,20), wrk(3,20) - integer :: ncp = 4, np = 4, nvert = 20, no, info, i, j + real*8 :: ice_intersection(2,8) = reshape( & + (/ -5.000000000000000d-01, -3.397939048350230d-01, & + -3.127479665047677d-01, 3.127479665047677d-01, & + 3.397939048350230d-01, 5.000000000000000d-01, & + 5.000000000000000d-01, 5.000000000000000d-01, & + 5.000000000000000d-01, 3.397939048350231d-01, & + 3.127479665047677d-01, -3.127479665047677d-01, & + -3.397939048350231d-01, -5.000000000000000d-01, & + -5.000000000000000d-01, -5.000000000000000d-01 /), (/2,8/)) + integer :: polyt(4) = (/ 0, 0, 0, 0 /) + real*8 :: vo(3,20), rwrk(3,20) + integer :: vto(20), iwrk(20) + integer :: ncp = 4, np = 4, nvert = 20, no, info, i, j, cnt real*8 :: err - call clipagainstpolysphere(clip, ncp, nml, poly, np, vo, no, wrk, nvert, info) - + call clipagainstpolysphere(clip, ncp, nml, poly, np, vo, no, rwrk, nvert, info) err = 0 do i = 1,8 do j = 1,3 @@ -38,6 +48,17 @@ program main end do err = sqrt(err) if (no /= 8) err = err + 1 - + print *, 'err', err + + call iceclipagainstpolyplane(clip, ncp, nml, poly, polyt, np, vo, vto, no, & + rwrk, iwrk, nvert, info) + err = 0 + do i = 1,8 + do j = 1,2 + err = err + (vo(j,i) - ice_intersection(j,i))**2 + end do + end do + err = sqrt(err) + if (no /= 8) err = err + 1 print *, 'err', err end program main From 8cca23dd4c0f082b479d5e95904fa871949a23dd Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Tue, 12 Jul 2016 10:55:24 -0600 Subject: [PATCH 13/28] ICE: Tweaks; add to the Fortran interface; add more tests. 
--- siqk/si/Makefile | 6 +- siqk/si/fsi.cpp | 15 +++++ siqk/si/fsi.h | 18 +++++- siqk/si/icetest.py | 28 +++++++++ siqk/si/siqp.hpp | 144 ++++++++++++++++++++++++++++++++++++++++----- siqk/si/test.cpp | 48 ++++++++++----- siqk/si/testf.f90 | 31 ++++++++-- 7 files changed, 253 insertions(+), 37 deletions(-) create mode 100755 siqk/si/icetest.py diff --git a/siqk/si/Makefile b/siqk/si/Makefile index fddabe1..76198fd 100644 --- a/siqk/si/Makefile +++ b/siqk/si/Makefile @@ -3,7 +3,7 @@ CXX=g++-4.7 FC=gfortran-4.7 CXXFLAGS=$(opt) -Wall -pedantic -std=c++11 -DSIQK_FORTRAN FFLAGS=$(opt) -ffixed-line-length-none -LDFLAGS=-lgfortran +LDFLAGS=-lgfortran -fopenmp CXXSOURCES=test.cpp fsi.cpp F90SOURCES=testf.f90 @@ -28,3 +28,7 @@ test: $(CXXOBJECTS) $(F90OBJECTS) clean: rm -f *.o test.exe testf.exe + +test.o: siqp.hpp +fsi.o: fsi.h siqp.hpp +testf.o: siqp.hpp fsi.h fsi.cpp diff --git a/siqk/si/fsi.cpp b/siqk/si/fsi.cpp index c56a610..e9f6961 100644 --- a/siqk/si/fsi.cpp +++ b/siqk/si/fsi.cpp @@ -16,6 +16,21 @@ extern "C" void clipagainstpolysphere_ ( *info = success ? 0 : 1; } +extern "C" void clipagainstpolyplane_ ( + double const* const clip_poly, int const* const clip_poly_n_vertices, + double const* const clip_edge_normals, double const* const vi, int const* const ni, + double* const vo, int* const no, double* const wrk, int const* const n_vertices, + int* const info) +{ + Array2D avo(3, *n_vertices, vo); + const bool success = siqp::sh::clip_against_poly( + Array2D(3, *clip_poly_n_vertices, clip_poly), + Array2D(3, *clip_poly_n_vertices, clip_edge_normals), + Array2D(3, *ni, vi), *ni, + avo, *no, wrk, *n_vertices); + *info = success ? 
0 : 1; +} + extern "C" void iceclipagainstpolyplane_( double const* const clip_poly, int const* const clip_poly_n_vertices, double const* const clip_edge_normals, diff --git a/siqk/si/fsi.h b/siqk/si/fsi.h index a82c9d0..3798db2 100644 --- a/siqk/si/fsi.h +++ b/siqk/si/fsi.h @@ -1,4 +1,4 @@ -// Fortran interface to simple polygon clipping routine. +// Fortran interface to polygon clipping routines. extern "C" void clipagainstpolysphere_( // 3 x clip_poly_n_vertices clip spherical polygon vertex list. @@ -14,6 +14,20 @@ extern "C" void clipagainstpolysphere_( // info = 0 on success. info = 1 if n_vertices is not large enough. int* const info); +extern "C" void clipagainstpolyplane_( + // 3 x clip_poly_n_vertices clip spherical polygon vertex list. + double const* const clip_poly, int const* const clip_poly_n_vertices, + // 3 x clip_poly_n_vertices clip polygon's inward-facing edge normals. + double const* const clip_edge_normals, + // 3 x ni polygon to clip. + double const* const to_clip_poly, int const* const ni, + // On output, a 3 x no clipped polygon. + double* const vo, int* const no, + // Workspace. Both vo and wrk must have n_vertices of space available. + double* const wrk, int const* const n_vertices, + // info = 0 on success. info = 1 if n_vertices is not large enough. + int* const info); + // ICE: Intersection with curved edges. // // Some terminology: @@ -38,7 +52,7 @@ extern "C" void clipagainstpolysphere_( // a in [0,1] is the parameter in the curve // x(a) = (1-a)^2 s + a (1-a) m + a^2 p. (1) // s and p sit on the curve, but m in general does not. We can define m by -// x(1/2) = M => m = 4 n - s - p, +// x(1/2) = M => m = 4 M - s - p, // where M is a point that is intended to be on the curve and serves as a useful // midpoint reference. This construction has the essential property that the // curve is invariant to the swapping of s and p. 
However, the clip routines are diff --git a/siqk/si/icetest.py b/siqk/si/icetest.py new file mode 100755 index 0000000..5d30aac --- /dev/null +++ b/siqk/si/icetest.py @@ -0,0 +1,28 @@ +#!/usr/bin/python + +import os + +stride = 1 + +xlates = [4.2*10**f for f in range(-17, 0, stride)] +xlates.append(0) + +ylates = [0] + +angles = xlates + +fails = [] +cnt = 0 + +for n in [21]: + for angle in angles: + for xlate in xlates: + for ylate in ylates: + cmd = ('./test.exe --plane --xlate {xlate:1.15e} --ylate {ylate:1.14e} --angle {angle:1.15e} -n {n:d}'. + format(xlate=xlate, ylate=ylate, angle=angle, n=n)) + stat = os.system(cmd + ' |& grep PASSED &> /dev/null') + if stat: + fails.append(cmd) + else: + cnt += 1 + print len(fails) diff --git a/siqk/si/siqp.hpp b/siqk/si/siqp.hpp index 444cdfd..6fc9cd8 100644 --- a/siqk/si/siqp.hpp +++ b/siqk/si/siqp.hpp @@ -247,7 +247,7 @@ struct SphereGeometry { normalize(en); } - // Is v inside the lined (a1,a2) having normal n? + // Is v inside the line (a1,a2) having normal n? template static bool inside (const CV v, const CV a1, const CV a2, const CV n) { return dot_c_amb(n, v, a1) > 0 && dot_c_amb(n, v, a2) > 0; @@ -308,8 +308,7 @@ struct SphereGeometry { } //todo Handle non-convex case. - // This uses a terrible formula, but it's just for testing. - + // This uses a terrible formula, but it's just for testing. static double calc_area (const Array2D& v) { double area = 0; for (int i = 1; i < v.n() - 1; ++i) { @@ -330,6 +329,13 @@ struct SphereGeometry { if (d >= 1) return 0; return acos(d); } + + // For quadratic edges. + template + static int intersect (const CV s, const CV m, const CV p, const CV e1, + const CV nml, V as) { + assert(0); // Not yet. + } }; // Sutherland-Hodgmann polygon clipping algorithm. Follow Foley, van Dam, @@ -514,7 +520,7 @@ bool clip_against_poly ( // a in [0,1] is the parameter in the curve // x(a) = (1-a)^2 s + a (1-a) m + a^2 p. (1) // s and p sit on the curve, but m in general does not. 
We can define m by -// x(1/2) = M => m = 4 n - s - p, +// x(1/2) = M => m = 4 M - s - p, // where M is a point that is intended to be on the curve and serves as a useful // midpoint reference. This construction has the essential property that the // curve is invariant to the swapping of s and p. However, the clip routines are @@ -660,7 +666,7 @@ struct ice { return true; } - // sed must have >= 2 slices alocated. + // sed must have >= 2 slices allocated. template static UInt clip_sed_against_sed (const CV se1, const CV se2, const CV sen, const CV s, const CV p, Array sed) { @@ -685,15 +691,21 @@ struct ice { } } - // eds and evts must have >= 5 slices alocated. + // eds and evts must have >= 5 slices allocated. template static UInt clip_ced_against_sed (const CV se1, const CV se2, const CV sen, const CV s, const CV m, const CV p, Array eds, IV edvts) { - Real as[2]; - const UInt nas = intersect(s, m, p, se1, sen, as); - const bool s_inside = geo::inside(s, se1, se2, sen); - const bool p_inside = geo::inside(p, se1, se2, sen); + Real as[2] = {0}; + UInt nas = intersect(s, m, p, se1, sen, as); + bool s_inside = geo::inside(s, se1, se2, sen); + bool p_inside = geo::inside(p, se1, se2, sen); + // Handle cases where FP fails to lead to a consistent state. + if (p_inside != s_inside && nas == 0) { + // There is no FP intersection, so p and s might as well both be inside. + p_inside = s_inside = true; + } + if (p_inside) { if (s_inside) { if (nas < 2) { @@ -734,6 +746,7 @@ struct ice { } } } + assert(0); return 0; } @@ -782,9 +795,8 @@ struct ice { // Create m in (s,m,p) so that the curve hits n. 
template static void middle_matches (const CV s, const CV p, const CV n, V m) { - const Real fn = 4*n; for (UInt i = 0; i < dim; ++i) - m[i] = fn*n[i] - s[i] - p[i]; + m[i] = 4*n[i] - s[i] - p[i]; } template @@ -1197,9 +1209,106 @@ class TestAreaOTFunctor { } }; +#ifdef SIKQ_DEBUG_CRITICAL +static void +write_matlab (const std::string& name, const Array2D& p) { + printf("mat=1; %s = [", name.c_str()); + for (int ip = zero(p); ip < p.n(); ++ip) + printf(" %1.15e %1.15e %1.15e;", p(0,ip), p(1,ip), p(2,ip)); + printf("].';\n"); +} +#endif + template -double test_area_ot (const Array2D& cp, const Array2D& ce, - const Array2D& p, const Array2D& e) { +class IceTestAreaOTFunctor { + sh::Mesh cm; + const Array2D p; + const Array2D e; + Octree ot; + +public: + typedef double value_type; + + IceTestAreaOTFunctor (const sh::Mesh& cm, const Array2D& p, + const Array2D& e, const Octree& ot) + : cm(cm), p(p), e(e), ot(ot) + {} + + // k indexes (p,e). + void operator() (const int k, double& area) const { + // Clipped element bounding box. + double ebb[6]; + Octree::calc_bb(p, e(k), e.m(), ebb); + // Get list of possible overlaps. + OTSearchFunctor f; + ot.apply(ebb, f); + int ni, no; + // Area of all overlapping regions. 
+ double a = 0; + for (const auto icp : f.hits) { + ni = 0; + static const int N = 2*max_nvert; + double rbuf[15*N]; + Array2D cp(3, N, rbuf), cens(3, N, rbuf + 3*N), + vi(3, N, rbuf + 6*N), ivo(3, N, rbuf + 9*N), vo(3, N, rbuf + 12*N); + int ibuf[2*N]; + Array1D vti(N, ibuf), vto(N, ibuf + N); + int ncp = 0; + for (int i = 0; i < e.m(); ++i) { + if (e(i,icp) == -1) break; + geo::copy(cp(i), cm.p(cm.e(i,icp))); + geo::copy(cens(i), cm.nml(cm.en(i,icp))); + ++ncp; + } + for (int i = 0; i < e.m(); ++i) { + if (e(i,k) == -1) break; + vti[2*i] = 0; vti[2*i+1] = 1; + geo::copy(vi(2*i), p(e(i,k))); + ni += 2; + } + for (int i = 0; i < e.m(); ++i) { + double n[3]; + geo::combine(vi(2*i), vi((2*(i+1)) % ni), 0.5, n); + ice::middle_matches(vi(2*i), vi((2*(i+1)) % ni), n, vi(2*i+1)); + } + double rwrk[3*N]; + int iwrk[N]; + ice::clip_cpoly_against_convex_poly( + Array2D(cp.m(), ncp, cp.data()), + Array2D(cens.m(), ncp, cens.data()), + vi, vti, ni, ivo, vto, no, rwrk, iwrk, N); + int n = 0; + for (int i = 0; i < no; ++i) + if (vto[i] == 0) { + geo::copy(vo(n), ivo(i)); + ++n; + } + no = n; + if (no) { + const double + a1 = geo::calc_area(Array2D(vo.m(), no, vo.data())); + a += a1; +#ifdef SIKQ_DEBUG_CRITICAL + if (a1 < -1e-6) { + write_matlab("cp", Array2D(cp.m(), ncp, cp.data())); + write_matlab("vi", Array2D(vi.m(), ni, vi.data())); + write_matlab("vo", Array2D(vo.m(), no, vo.data())); + exit(-1); + } +#endif + } + } + // Add our area to the reduction. + area += a; + } +}; + +template +double test_area_ot ( + const Array2D& cp, const Array2D& ce, + const Array2D& p, const Array2D& e, + const bool use_ice) +{ // Clip mesh and edge normal calculation. (In practice, we'd like to use // higher-quality edge normals.) sh::Mesh cm; cm.p = cp; cm.e = ce; @@ -1214,7 +1323,10 @@ double test_area_ot (const Array2D& cp, const Array2D& // Compute the area in a silly way to test search and interesection. 
t = tic(); double area = 0; - ko::parallel_reduce(e.n(), TestAreaOTFunctor(cm, p, e, ot), area); + if (use_ice) + ko::parallel_reduce(e.n(), IceTestAreaOTFunctor(cm, p, e, ot), area); + else + ko::parallel_reduce(e.n(), TestAreaOTFunctor(cm, p, e, ot), area); et[1] = toc(t); print_times("test_area_ot", et, 2); return area; @@ -1222,4 +1334,4 @@ double test_area_ot (const Array2D& cp, const Array2D& } // namespace test } // namespace siqp -#endif // INCLUDE_SIK_HPP +#endif // INCLUDE_SIQP_HPP diff --git a/siqk/si/test.cpp b/siqk/si/test.cpp index 439e1c0..f55617a 100644 --- a/siqk/si/test.cpp +++ b/siqk/si/test.cpp @@ -32,6 +32,7 @@ write_matlab (const std::string& name, const Array2D& p, } #ifdef SIQK_FORTRAN +template static int test_fortran (const Array2D& clip_poly, const Array2D& nml, const Array2D& poly) { @@ -39,11 +40,18 @@ static int test_fortran (const Array2D& clip_poly, const int nvi = poly.n(); Array2D vo(3, test::max_nvert), fvo(3, test::max_nvert); double wrk[3*test::max_nvert]; - sh::clip_against_poly(clip_poly, nml, poly, nvi, vo, no, - wrk, test::max_nvert); const int ncp = clip_poly.n(); - clipagainstpolysphere_(clip_poly.data(), &ncp, nml.data(), poly.data(), &nvi, - fvo.data(), &fno, wrk, &test::max_nvert, &info); + if (std::is_same::value) { + sh::clip_against_poly(clip_poly, nml, poly, nvi, vo, no, + wrk, test::max_nvert); + clipagainstpolysphere_(clip_poly.data(), &ncp, nml.data(), poly.data(), &nvi, + fvo.data(), &fno, wrk, &test::max_nvert, &info); + } else { + sh::clip_against_poly(clip_poly, nml, poly, nvi, vo, no, + wrk, test::max_nvert); + clipagainstpolyplane_(clip_poly.data(), &ncp, nml.data(), poly.data(), &nvi, + fvo.data(), &fno, wrk, &test::max_nvert, &info); + } if (info != 0) ++nerr; if (fno != no) ++nerr; for (int i = 0; i < no; ++i) @@ -128,7 +136,7 @@ calc_true_area (const Array2D& cp, const Array2D& ce, #ifdef SIQK_FORTRAN { // Sneak in a test of the Fortran interface. 
- const int nerr = test_fortran(clip_poly, nml, poly); + const int nerr = test_fortran(clip_poly, nml, poly); std::cerr << "Fortran test " << (nerr ? "FAIL" : "PASS") << "ED\n"; } #endif @@ -162,17 +170,29 @@ run (const int n, const double angle, const double xlate, const double ylate, // True intersection area from quadrilateral boundary of the mesh. const double ta = calc_true_area(cp, ce, p, e, wm); - // Area from the sum over the common refinement polygons. - const double a = test::test_area_ot(cp, ce, p, e); - // Report information. - const double re = std::abs(a - ta)/ta; - fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n", ta, a, re); - if (wm) { - write_matlab("cm", cp, ce); - write_matlab("m", p, e); + bool pass = true; + for (int cnt = 0; + // ice works only for PlaneGeometry right now. + cnt < (std::is_same::value ? 2 : 1); + ++cnt) { + const bool use_ice = cnt == 1; + // Area from the sum over the common refinement polygons. Use sh the first + // time and ice the second. When using ice, edges are cedges in data + // structure but geometrically straight. + const double a = test::test_area_ot(cp, ce, p, e, use_ice); + + // Report information. + const double re = std::abs(a - ta)/ta; + pass = pass && re < 1e-8; + fprintf(stderr, "ice %d true area %1.4e mesh area %1.4e relerr %1.4e\n", + use_ice, ta, a, re); + if (wm) { + write_matlab("cm", cp, ce); + write_matlab("m", p, e); + } } - return re < 1e-8 ? 0 : 1; + return pass ? 
0 : 1; } inline bool diff --git a/siqk/si/testf.f90 b/siqk/si/testf.f90 index cf2ce78..8e0354b 100644 --- a/siqk/si/testf.f90 +++ b/siqk/si/testf.f90 @@ -24,7 +24,7 @@ program main -5.000000000000000000d-01, -4.999999999999998890d-01, 7.071067811865474617d-01, & -5.441369567663348894d-01, -3.342826900143283098d-01, 7.695258640473731093d-01, & -3.127479665047677160d-01, 3.127479665047677160d-01, 8.968709042522592378d-01 /), (/3,8/)) - real*8 :: ice_intersection(2,8) = reshape( & + real*8 :: plane_intersection(2,8) = reshape( & (/ -5.000000000000000d-01, -3.397939048350230d-01, & -3.127479665047677d-01, 3.127479665047677d-01, & 3.397939048350230d-01, 5.000000000000000d-01, & @@ -48,17 +48,40 @@ program main end do err = sqrt(err) if (no /= 8) err = err + 1 - print *, 'err', err + print *, 'sh sphere err', err + + do i = 1,8 + do j = 1,3 + vo(j,i) = 0 + end do + end do + + call clipagainstpolyplane(clip, ncp, nml, poly, np, vo, no, rwrk, nvert, info) + err = 0 + do i = 1,8 + do j = 1,2 + err = err + (vo(j,i) - plane_intersection(j,mod(i+1,8)+1))**2 + end do + end do + err = sqrt(err) + if (no /= 8) err = err + 1 + print *, 'sh plane err', err + + do i = 1,8 + do j = 1,3 + vo(j,i) = 0 + end do + end do call iceclipagainstpolyplane(clip, ncp, nml, poly, polyt, np, vo, vto, no, & rwrk, iwrk, nvert, info) err = 0 do i = 1,8 do j = 1,2 - err = err + (vo(j,i) - ice_intersection(j,i))**2 + err = err + (vo(j,i) - plane_intersection(j,i))**2 end do end do err = sqrt(err) if (no /= 8) err = err + 1 - print *, 'err', err + print *, 'ice plane err', err end program main From 06965d9bd0646541260b54612bb1e1781cdf6997 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Sat, 25 Feb 2017 19:15:48 -0700 Subject: [PATCH 14/28] Rework the source code layout of this repo, and update SLMMIR. * Separate SIQK and SLMMIR. * Add readme and Makefile for SIQK standalone. * Separate out "si" code (no-Kokkos SIQK prototype and selective Fortran interface) from SIQK. 
--- siqk/Makefile | 17 + siqk/make.inc.amb | 3 + siqk/readme.txt | 13 + siqk/si/Array_raw.hpp | 151 --- siqk/si/Makefile | 34 - siqk/si/README.md | 25 - siqk/si/fsi.cpp | 51 -- siqk/si/fsi.h | 86 -- siqk/si/icetest.py | 28 - siqk/si/siqp.hpp | 1337 --------------------------- siqk/si/test.cpp | 243 ----- siqk/si/testf.f90 | 87 -- siqk/siqk.cpp | 153 ---- siqk/siqk_defs.hpp | 5 +- siqk/siqk_geometry.hpp | 51 +- siqk/siqk_runtests.py | 53 ++ siqk/siqk_search.hpp | 1 + siqk/siqk_sqr.hpp | 9 +- siqk/siqk_test.cpp | 4 +- siqk/slmm/Makefile | 45 - siqk/slmm/slmm_debug.hpp | 37 - siqk/slmm/slmm_defs.hpp | 36 - siqk/slmm/slmm_gallery.cpp | 14 - siqk/slmm/slmm_gallery.hpp | 312 ------- siqk/slmm/slmm_gll.hpp | 75 -- siqk/slmm/slmm_io.cpp | 314 ------- siqk/slmm/slmm_io.hpp | 73 -- siqk/slmm/slmm_mesh.cpp | 486 ---------- siqk/slmm/slmm_mesh.hpp | 69 -- siqk/slmm/slmm_runtests.py | 71 -- siqk/slmm/slmm_test.cpp | 201 ---- siqk/slmm/slmm_time_int.cpp | 156 ---- siqk/slmm/slmm_time_int.hpp | 424 --------- siqk/slmm/slmm_util.cpp | 30 - siqk/slmm/slmm_util.hpp | 153 ---- siqk/slmm/slmmir.cpp | 1712 ----------------------------------- 36 files changed, 138 insertions(+), 6421 deletions(-) create mode 100644 siqk/Makefile create mode 100644 siqk/make.inc.amb create mode 100644 siqk/readme.txt delete mode 100644 siqk/si/Array_raw.hpp delete mode 100644 siqk/si/Makefile delete mode 100644 siqk/si/README.md delete mode 100644 siqk/si/fsi.cpp delete mode 100644 siqk/si/fsi.h delete mode 100755 siqk/si/icetest.py delete mode 100644 siqk/si/siqp.hpp delete mode 100644 siqk/si/test.cpp delete mode 100644 siqk/si/testf.f90 delete mode 100644 siqk/siqk.cpp create mode 100644 siqk/siqk_runtests.py delete mode 100644 siqk/slmm/Makefile delete mode 100644 siqk/slmm/slmm_debug.hpp delete mode 100644 siqk/slmm/slmm_defs.hpp delete mode 100644 siqk/slmm/slmm_gallery.cpp delete mode 100644 siqk/slmm/slmm_gallery.hpp delete mode 100644 siqk/slmm/slmm_gll.hpp delete mode 100644 
siqk/slmm/slmm_io.cpp delete mode 100644 siqk/slmm/slmm_io.hpp delete mode 100644 siqk/slmm/slmm_mesh.cpp delete mode 100644 siqk/slmm/slmm_mesh.hpp delete mode 100755 siqk/slmm/slmm_runtests.py delete mode 100644 siqk/slmm/slmm_test.cpp delete mode 100644 siqk/slmm/slmm_time_int.cpp delete mode 100644 siqk/slmm/slmm_time_int.hpp delete mode 100644 siqk/slmm/slmm_util.cpp delete mode 100644 siqk/slmm/slmm_util.hpp delete mode 100644 siqk/slmm/slmmir.cpp diff --git a/siqk/Makefile b/siqk/Makefile new file mode 100644 index 0000000..525107a --- /dev/null +++ b/siqk/Makefile @@ -0,0 +1,17 @@ +include make.inc + +CXXFLAGS=$(opt) -Wall -pedantic -fopenmp -std=c++11 -I$(KOKKOS)/include -DSIQK_TIME -Wno-unused-function +LDFLAGS=-fopenmp -L$(KOKKOS)/lib -lkokkos -ldl + +OBJECTS=$(SOURCES:.cpp=.o) + +.cpp.o: + $(CXX) $(CFLAGS) $(CXXFLAGS) -c $< -o $@ + +all: siqk_test + +siqk_test: $(OBJECTS) siqk_test.o + $(CXX) $(OBJECTS) siqk_test.o $(LDFLAGS) -o siqk_test + +clean: + rm -f *.o siqk_test diff --git a/siqk/make.inc.amb b/siqk/make.inc.amb new file mode 100644 index 0000000..457e199 --- /dev/null +++ b/siqk/make.inc.amb @@ -0,0 +1,3 @@ +opt=-O3 +CXX=g++-4.7 +KOKKOS=/home/ambradl/lib/kokkos/cpu diff --git a/siqk/readme.txt b/siqk/readme.txt new file mode 100644 index 0000000..2ace5d7 --- /dev/null +++ b/siqk/readme.txt @@ -0,0 +1,13 @@ +For clarity, suppose your C++ compiler is g++-4.7 in what follows. But it +can be something else. + +1. Get and install the standalone Kokkos TPL: + +$ git clone https://github.com/kokkos/kokkos.git +$ ./kokkos/generate_makefile.bash --with-openmp --ldflags=-fPIC --prefix=/path/to/desired/installation --compiler=g++-4.7 + +2. cp an existing make.inc.* file to one for your machine, say, +make.inc.mymachine. Edit it with machine-specific information.
Then + $ ln -s make.inc.mymachine make.inc + $ make -j8 + $ ./siqk_runtests.py diff --git a/siqk/si/Array_raw.hpp b/siqk/si/Array_raw.hpp deleted file mode 100644 index 7926398..0000000 --- a/siqk/si/Array_raw.hpp +++ /dev/null @@ -1,151 +0,0 @@ -#ifndef INCLUDE_ARRAY_RAW_HPP -#define INCLUDE_ARRAY_RAW_HPP - -#include -#include -#include -#include - -static inline void error(const std::string& msg) -{ throw std::runtime_error(msg.c_str()); } - -template static inline void share_nodelete_delete (T* p) {} -template inline std::shared_ptr share_nodelete (T* o) -{ return std::shared_ptr(o, share_nodelete_delete); } - -template -class Array1D { - typedef typename std::remove_const::type T_nonconst; - friend class Array1D; - int n_; - std::shared_ptr a_p_; - T* a_; -public: - typedef int size_type; - Array1D () : n_(0) {} - Array1D (const int n) { reset(n); } - Array1D (const int n, T* const a) { reset(n, a); } - Array1D (const Array1D& v) - : n_(v.n_), a_p_(v.a_p_), a_(v.a_) - {} - void reset (const int n) { - n_ = n; - a_p_ = std::shared_ptr(new T[n], std::default_delete()); - a_ = a_p_.get(); - } - void reset (const int n, T* const a) { n_ = n; a_p_ = nullptr; a_ = a; } - const int& n () const { return n_; } - T* data () { return a_; } - const T* data () const { return a_; } - T& operator[] (const int i) { debug(i); return a_[i]; } - const T& operator[] (const int i) const { debug(i); return a_[i]; } - void set (const T& init) { for (int i = 0; i < n_; ++i) a_[i] = init; } - Array1D& device () { return *this; } - const Array1D& device () const { return *this; } - void sync () {} - void modify () {} -private: -#ifdef SIQK_DEBUG - void debug (const int& i) const { - if (i < 0 || i >= n_) { - std::stringstream ss; - ss << "Array1D: i is " << i << " but n_ is " << n_ << "\n"; - error(ss.str().c_str()); - } - } -#else - static void debug (const int& i) {} -#endif -}; - -template -class Array2D { - typedef typename std::remove_const::type T_nonconst; - friend class Array2D; -
int m_, n_; - std::shared_ptr a_p_; - T* a_; -public: - typedef int size_type; - Array2D () : m_(0), n_(0) {} - Array2D (const int m, const int n) { reset(m, n); } - Array2D (const int m, const int n, T* const a) { reset(m, n, a); } - Array2D (const Array2D& v) - : m_(v.m_), n_(v.n_), a_p_(v.a_p_), a_(v.a_) - {} - void reset (const int m, const int n) { - m_ = m; n_ = n; - a_p_ = std::shared_ptr(new T[m*n], std::default_delete()); - a_ = a_p_.get(); - } - void reset (const int m, const int n, T* const a) { m_ = m; n_ = n; a_p_ = nullptr; a_ = a; } - const int& m () const { return m_; } - const int& n () const { return n_; } - T* data () { return a_; } - const T* data () const { return a_; } - T& operator() (const int r, const int c) { debug(r, c); return a_[c*m_ + r]; } - const T& operator() (const int r, const int c) const { debug(r, c); return a_[c*m_ + r]; } - T* operator() (const int c) { debug(0, c); return a_ + m_*c; } - const T* operator() (const int c) const { debug(0, c); return a_ + m_*c; } - void set (const T& init) { for (int i = 0; i < m_*n_; ++i) a_[i] = init; } - Array2D& device () { return *this; } - const Array2D& device () const { return *this; } - void sync () {} - void modify () {} -private: -#ifdef SIQK_DEBUG - void debug (const int& r, const int& c) const { - if (r < 0 || r >= m_) { - std::stringstream ss; - ss << "Array2D: r is " << r << " but m_ is " << m_ << "\n"; - error(ss.str().c_str()); - } - if (c < 0 || c >= n_) { - std::stringstream ss; - ss << "Array2D: c is " << c << " but n_ is " << n_ << "\n"; - error(ss.str().c_str()); - } - } -#else - static void debug (const int& r, const int& c) {} -#endif -}; - -template inline T* slice (Array2D& a, const int c) { return a(c); } -template inline const T* slice (const Array2D& a, const int c) { return a(c); } -template inline int nslices (const Array2D& a) { return a.n(); } -template inline int szslice (const Array2D& a) { return a.m(); } - -template inline Array1D offset (Array1D& a, const 
int os) -{ return Array1D(a.n() - os, a.data() + os); } -template inline Array2D offset (Array2D& a, const int os) -{ return Array2D(a.m(), a.n() - os, a.data() + a.m()*os); } - -// Define a few things to minimize KOKKOS guards. -# ifndef KOKKOS_FUNCTION -# define KOKKOS_FUNCTION -# endif -# ifndef KOKKOS_INLINE_FUNCTION -# define KOKKOS_INLINE_FUNCTION inline -# endif -# ifndef KOKKOS_FORCEINLINE_FUNCTION -# define KOKKOS_FORCEINLINE_FUNCTION inline -# endif - -namespace Kokkos { -typedef void DefaultExecutionSpace; -inline void fence() {} -} - -namespace ko { -using std::min; -using std::max; - -template -void parallel_reduce (const int n, Functor f, Scalar& r) { - for (int i = 0; i < n; ++i) - f(i, r); -} -} - -#endif // INCLUDE_ARRAY_RAW_HPP diff --git a/siqk/si/Makefile b/siqk/si/Makefile deleted file mode 100644 index 76198fd..0000000 --- a/siqk/si/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -opt= -CXX=g++-4.7 -FC=gfortran-4.7 -CXXFLAGS=$(opt) -Wall -pedantic -std=c++11 -DSIQK_FORTRAN -FFLAGS=$(opt) -ffixed-line-length-none -LDFLAGS=-lgfortran -fopenmp - -CXXSOURCES=test.cpp fsi.cpp -F90SOURCES=testf.f90 - -CXXOBJECTS=$(CXXSOURCES:.cpp=.o) -F90OBJECTS=$(F90SOURCES:.f90=.o) - -.SUFFIXES: -.SUFFIXES: .cpp .f .f90 .o - -.cpp.o: - $(CXX) $(CFLAGS) $(CXXFLAGS) -c $< -o $@ - -.f90.o: - $(FC) $(FFLAGS) -c $< -o $@ - -all: test - -test: $(CXXOBJECTS) $(F90OBJECTS) - $(CXX) $(CXXOBJECTS) $(LDFLAGS) -o test.exe - $(CXX) $(F90OBJECTS) fsi.o $(LDFLAGS) -o testf.exe - -clean: - rm -f *.o test.exe testf.exe - -test.o: siqp.hpp -fsi.o: fsi.h siqp.hpp -testf.o: siqp.hpp fsi.h fsi.cpp diff --git a/siqk/si/README.md b/siqk/si/README.md deleted file mode 100644 index b465b36..0000000 --- a/siqk/si/README.md +++ /dev/null @@ -1,25 +0,0 @@ -Simple sphere interesection prototype with no-Kokkos build. 
- -Basic build and test run: - -$ g++ -std=c++11 test.cpp -$ ./a.out -n 20 - -For performance profiling, - -$ g++ -O3 -DSIQK_TIME -std=c++11 test.cpp -$ ./a.out -n 20 - -You should see something like - -n (-n): 20 - test_area_ot 1.276e-02 s 1.4 MB 1.228e-03 s 1.153e-02 s -true area 1.0196e+00 mesh area 1.0196e+00 relerr 3.2447e-13 - -The first line is the input. The second line shows total test time, memory -highwater, octree construction time, and (search, clip, and area calculation) -time. The third line shows the true overlap area, the area based on the meshes, -and the relative error. As the mesh is refined, the relative error increases -because (a) the sphere polygon area calculation is naive and (b) the edge -normals have increasing cancellation error. Each is part of the test setup and -are not used in practice. diff --git a/siqk/si/fsi.cpp b/siqk/si/fsi.cpp deleted file mode 100644 index e9f6961..0000000 --- a/siqk/si/fsi.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include "Array_raw.hpp" -#include "siqp.hpp" - -extern "C" void clipagainstpolysphere_ ( - double const* const clip_poly, int const* const clip_poly_n_vertices, - double const* const clip_edge_normals, double const* const vi, int const* const ni, - double* const vo, int* const no, double* const wrk, int const* const n_vertices, - int* const info) -{ - Array2D avo(3, *n_vertices, vo); - const bool success = siqp::sh::clip_against_poly( - Array2D(3, *clip_poly_n_vertices, clip_poly), - Array2D(3, *clip_poly_n_vertices, clip_edge_normals), - Array2D(3, *ni, vi), *ni, - avo, *no, wrk, *n_vertices); - *info = success ? 
0 : 1; -} - -extern "C" void clipagainstpolyplane_ ( - double const* const clip_poly, int const* const clip_poly_n_vertices, - double const* const clip_edge_normals, double const* const vi, int const* const ni, - double* const vo, int* const no, double* const wrk, int const* const n_vertices, - int* const info) -{ - Array2D avo(3, *n_vertices, vo); - const bool success = siqp::sh::clip_against_poly( - Array2D(3, *clip_poly_n_vertices, clip_poly), - Array2D(3, *clip_poly_n_vertices, clip_edge_normals), - Array2D(3, *ni, vi), *ni, - avo, *no, wrk, *n_vertices); - *info = success ? 0 : 1; -} - -extern "C" void iceclipagainstpolyplane_( - double const* const clip_poly, int const* const clip_poly_n_vertices, - double const* const clip_edge_normals, - double const* const vi, int const* const vti, int const* const ni, - double* const vo, int* const vto, int* const no, - double* const rwrk, int* const iwrk, int const* const nwrk, - int* const info) -{ - Array2D avo(3, *nwrk, vo); - Array1D avto(*nwrk, vto); - const bool success = - siqp::ice::clip_cpoly_against_convex_poly( - Array2D(3, *clip_poly_n_vertices, clip_poly), - Array2D(3, *clip_poly_n_vertices, clip_edge_normals), - Array2D(3, *ni, vi), Array1D(*ni, vti), *ni, - avo, avto, *no, rwrk, iwrk, *nwrk); - *info = success ? 0 : 1; -} diff --git a/siqk/si/fsi.h b/siqk/si/fsi.h deleted file mode 100644 index 3798db2..0000000 --- a/siqk/si/fsi.h +++ /dev/null @@ -1,86 +0,0 @@ -// Fortran interface to polygon clipping routines. - -extern "C" void clipagainstpolysphere_( - // 3 x clip_poly_n_vertices clip spherical polygon vertex list. - double const* const clip_poly, int const* const clip_poly_n_vertices, - // 3 x clip_poly_n_vertices clip polygon's inward-facing edge normals. - double const* const clip_edge_normals, - // 3 x ni polygon to clip. - double const* const to_clip_poly, int const* const ni, - // On output, a 3 x no clipped polygon. - double* const vo, int* const no, - // Workspace. 
Both vo and wrk must have n_vertices of space available. - double* const wrk, int const* const n_vertices, - // info = 0 on success. info = 1 if n_vertices is not large enough. - int* const info); - -extern "C" void clipagainstpolyplane_( - // 3 x clip_poly_n_vertices clip spherical polygon vertex list. - double const* const clip_poly, int const* const clip_poly_n_vertices, - // 3 x clip_poly_n_vertices clip polygon's inward-facing edge normals. - double const* const clip_edge_normals, - // 3 x ni polygon to clip. - double const* const to_clip_poly, int const* const ni, - // On output, a 3 x no clipped polygon. - double* const vo, int* const no, - // Workspace. Both vo and wrk must have n_vertices of space available. - double* const wrk, int const* const n_vertices, - // info = 0 on success. info = 1 if n_vertices is not large enough. - int* const info); - -// ICE: Intersection with curved edges. -// -// Some terminology: -// s, m, p: start, middle, end points of a curved edge. m is not really the -// middle or midpoint; indeed, it is unlikely to be on the curve. Rather, -// it's a point that defines the curve. See below for more. -// straight: straight on a plane, or a great arc on the sphere. -// curved: quadratic on a plane, projected quadratic on the sphere. -// cedge: a curved edge. -// sedge: a straight edge, including a great arc. -// cpoly, spoly: similar terminology; but note that a cpoly can contain a mix -// of cedges and sedges. -// (vs, vts, n): Vertex list. vs is an array of vertices. There are n -// vertices. vts is a list of vertex types. s, p are endpoint vertices (0); -// m is a midpoint node (1). If an edge is straight, then vts(k:k+1) = [0 -// 0]; if an edge is curved, then vts(k:k+2) = [0 1 0]. Keep in mind that -// there is an edge that wraps around the end of the list. The wrap can -// occur like 0|1 0 or like 0 1|0. As an example, [1 0 0 1 0] is a vertex -// type list for a triangle containing two cedges and one sedge. -// -// Some math. 
-// a in [0,1] is the parameter in the curve -// x(a) = (1-a)^2 s + a (1-a) m + a^2 p. (1) -// s and p sit on the curve, but m in general does not. We can define m by -// x(1/2) = M => m = 4 M - s - p, -// where M is a point that is intended to be on the curve and serves as a useful -// midpoint reference. This construction has the essential property that the -// curve is invariant to the swapping of s and p. However, the clip routines are -// independent of this definition; (s,m,p) are used as in equation (1), and that -// is all that is needed. -// When segments are extracted from x(a) in a clip, we use c in the segment -// [c,1], d in the segment [0,d], and both in [c,d]. Similarly, x(c) = r is the -// new start point, and x(d) = q is the new end point. A segment requires a -// midpoint so that the resulting parameterized curve sits on the original; this -// is n. Hence a segment of (s,m,p) defined by [c,d] subset [0,1] is (r,n,q). n -// is given by -// n = 2 (c d - c - d + 1) s + (c + d - 2 c d) m + 2 c d p, -// which satisfies -// x(a) = (1-a)^2 s + a(1-a) m + a^2 p = (1-b)^2 r + b(1-b) n + b^2 q -// for all b in [0,1], where b = (a-c)/(d-c), r = x(c), q = x(d). -// -// Even though this is a 2D routine, the vertices and vectors must still be -// 3D. The third value is ignored. -extern "C" void iceclipagainstpolyplane_( - // 3 x clip_poly_n_vertices clip spherical polygon vertex list. - double const* const clip_poly, int const* const clip_poly_n_vertices, - // 3 x clip_poly_n_vertices clip polygon's inward-facing edge normals. - double const* const clip_edge_normals, - // 3 x ni curved polygon to clip. - double const* const vi, int const* const vti, int const* const ni, - // On output, a 3 x no clipped polygon. - double* const vo, int* const vto, int* const no, - // Workspace. rwrk is 3*nwrk; iwrk is 1*nwrk. - double* const rwrk, int* const iwrk, int const* const nwrk, - // info = 0 on success. info = 1 if workspace is not large enough. 
- int* const info); diff --git a/siqk/si/icetest.py b/siqk/si/icetest.py deleted file mode 100755 index 5d30aac..0000000 --- a/siqk/si/icetest.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/python - -import os - -stride = 1 - -xlates = [4.2*10**f for f in range(-17, 0, stride)] -xlates.append(0) - -ylates = [0] - -angles = xlates - -fails = [] -cnt = 0 - -for n in [21]: - for angle in angles: - for xlate in xlates: - for ylate in ylates: - cmd = ('./test.exe --plane --xlate {xlate:1.15e} --ylate {ylate:1.14e} --angle {angle:1.15e} -n {n:d}'. - format(xlate=xlate, ylate=ylate, angle=angle, n=n)) - stat = os.system(cmd + ' |& grep PASSED &> /dev/null') - if stat: - fails.append(cmd) - else: - cnt += 1 - print len(fails) diff --git a/siqk/si/siqp.hpp b/siqk/si/siqp.hpp deleted file mode 100644 index 6fc9cd8..0000000 --- a/siqk/si/siqp.hpp +++ /dev/null @@ -1,1337 +0,0 @@ -#ifndef INCLUDE_SIQP_HPP -#define INCLUDE_SIQP_HPP - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef SIQP_TIME -# include -# include -# include -#endif - -#define pr(m) do { \ - std::stringstream _ss_; \ - _ss_ << m << std::endl; \ - std::cerr << _ss_.str(); \ - } while (0) -#define prc(m) pr(#m << " | " << (m)) -#define puf(m)"(" << #m << " " << (m) << ")" -#define pu(m) << " " << puf(m) -template -static void prarr (const std::string& name, const T* const v, const size_t n) { - std::cerr << name << ": "; - for (size_t i = 0; i < n; ++i) std::cerr << " " << v[i]; - std::cerr << "\n"; -} - -namespace siqp { -#ifdef SIQP_TIME -static timeval tic () { - timeval t; - gettimeofday(&t, 0); - return t; -} -static double calc_et (const timeval& t1, const timeval& t2) { - static const double us = 1.0e6; - return (t2.tv_sec * us + t2.tv_usec - t1.tv_sec * us - t1.tv_usec) / us; -} -static double toc (const timeval& t1) { - timeval t; - gettimeofday(&t, 0); - return calc_et(t1, t); -} -static double get_memusage () { - static const double scale = 1.0 / (1 << 10); // 
Memory in MB. - rusage ru; - getrusage(RUSAGE_SELF, &ru); - return ru.ru_maxrss*scale; -} -#else -static inline int tic () { return 0; } -static inline double toc (const int&) { return 0; } -#endif -static void print_times (const std::string& name, const double* const parts, - const int nparts) { -#ifdef SIQP_TIME - double total = 0; for (int i = 0; i < nparts; ++i) total += parts[i]; - printf("%20s %1.3e s %7.1f MB", name.c_str(), total, get_memusage()); - for (int i = 0; i < nparts; ++i) printf(" %1.3e s", parts[i]); - printf("\n"); -#endif -} - -template -static void copy (V dst, CV src, const int n) { - for (int i = 0; i < n; ++i) dst[i] = src[i]; -} - -// A decorator function so that a for loop's counter can be auto typed. -template -typename V::size_type zero(const V& v) { return 0; } - -template -int solve_quadratic_equation (const CV a, const CV b, const CV c, V xs) { - const double disc = b*b - 4*a*c; - if (disc < 0) return 0; - if (disc == 0) { - xs[0] = -0.5*b/a; - return 1; - } - if (b >= 0) { - const double t = b + std::sqrt(disc); - xs[0] = -0.5*t/a; - xs[1] = -2*c/t; - } else { - const double t = std::sqrt(disc) - b; - xs[0] = 2*c/t; - xs[1] = 0.5*t/a; - } - return 2; -} - -// Planar geometry calculations. 
-struct PlaneGeometry { - enum { dim = 3 }; - - template - static void scale (const double a, V v) { - v[0] *= a; v[1] *= a; - } - template - static double dot (const CA a, const CB b) { - return a[0]*b[0] + a[1]*b[1]; - } - template - static double dot_c_amb (const CV c, const CV a, const CV b) { - return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]); - } - template - static void copy (V d, const CV s) { - d[0] = s[0]; - d[1] = s[1]; - } - template - static void combine (const CV u, const CV v, const double a, V x) { - const double oma = 1 - a; - x[0] = oma*u[0] + a*v[0]; - x[1] = oma*u[1] + a*v[1]; - } - - template - static void edge_normal (const CV e1, const CV e2, V en) { - en[0] = e1[1] - e2[1]; - en[1] = e2[0] - e1[0]; - } - - template - static bool inside (const CV v, const CV e1, const CV e2, const CV en) { - return dot_c_amb(en, v, e1) > 0 && dot_c_amb(en, v, e2) > 0; - } - - template - static void intersect (const CV v1, const CV v2, const CV e1, const CV en, - V intersection) { - double a; { - const double - num = dot_c_amb(en, e1, v1), - den = dot_c_amb(en, v2, v1); - a = num == 0 || den == 0 ? 0 : num/den; - a = a < 0 ? 0 : a > 1 ? 1 : a; - } - combine(v1, v2, a, intersection); - } - - template - static bool output (const CV v, int& no, Array2D& vo) { -#ifdef SIKQ_DEBUG - if (no >= vo.n()) { - std::stringstream ss; - ss << "output: No room in vo; vo.n() is " << vo.n() << " but no is " - << no << "\n"; - error(ss.str().c_str()); - } -#endif - if (no >= vo.n()) return false; - vo(0,no) = v[0]; - vo(1,no) = v[1]; - ++no; - return true; - } - - //todo Handle non-convex case. - - static double calc_area (const Array2D& v) { - double area = 0; - for (int i = 1; i < v.n() - 1; ++i) { - double v1[2], v2[2]; - v1[0] = v(0,i) - v(0,0); - v1[1] = v(1,i) - v(1,0); - v2[0] = v(0,i+1) - v(0,0); - v2[1] = v(1,i+1) - v(1,0); - const double a = v1[0]*v2[1] - v1[1]*v2[0]; - area += a; - } - return 0.5*area; - } - - // For quadratic edges. 
- template - static int intersect (const CV s, const CV m, const CV p, const CV e1, - const CV nml, V as) { - double w[3]; - for (int i = 0; i < 3; ++i) w[i] = s[i] - m[i] + p[i]; - const double a = dot(nml, w); - for (int i = 0; i < 3; ++i) w[i] = m[i] - 2*s[i]; - const double b = dot(nml, w); - for (int i = 0; i < 3; ++i) w[i] = s[i] - e1[i]; - const double c = dot(nml, w); - return solve_quadratic_equation(a, b, c, as); - } -}; - -// Geometry on the sphere. All inputs and outputs are relative to the -// unit-radius sphere. -struct SphereGeometry { - enum { dim = 3 }; - - template - static void cross (const CV a, const CV b, V c) { - c[0] = a[1]*b[2] - a[2]*b[1]; - c[1] = a[2]*b[0] - a[0]*b[2]; - c[2] = a[0]*b[1] - a[1]*b[0]; - } - template - static double dot (const CV a, const CV b) { - return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; - } - template - static double norm2 (const CV v) { - return dot(v, v); - } - template - static void scale (const double a, V v) { - v[0] *= a; v[1] *= a; v[2] *= a; - } - template - static void normalize (V v) { - scale(1.0/std::sqrt(norm2(v)), v); - } - template - static double dot_c_amb (const CV c, const CV a, const CV b) { - return c[0]*(a[0] - b[0]) + c[1]*(a[1] - b[1]) + c[2]*(a[2] - b[2]); - } - template - static void copy (V d, const CV s) { - d[0] = s[0]; - d[1] = s[1]; - d[2] = s[2]; - } - template - static void combine (const CV u, const CV v, const double a, V x) { - const double oma = 1 - a; - x[0] = oma*u[0] + a*v[0]; - x[1] = oma*u[1] + a*v[1]; - x[2] = oma*u[2] + a*v[2]; - } - - template - static void edge_normal (const CV a, const CV b, V en) { - cross(a, b, en); - normalize(en); - } - - // Is v inside the line (a1,a2) having normal n? - template - static bool inside (const CV v, const CV a1, const CV a2, const CV n) { - return dot_c_amb(n, v, a1) > 0 && dot_c_amb(n, v, a2) > 0; - } - - /* Let - n = edge normal - c = edge point - d = n' c - v(a) = (1 - a) v1 + a v2. 
- Solve n' v = d for a: - a = (d - n' v1) / (n' (v2 - v1)). - Then uvec(v(a)) is the intersection point on the unit sphere. Assume - intersection exists. (Already filtered by 'inside'.) - */ - template - static void intersect (const CV v1, const CV v2, const CV e1, const CV en, - V intersection) { - /* Consider the case where e1 == v1 or e1 == v2. All == are FP. - If e1 == v1, then num = 0, a = 0, and intersection is set to v1. - If e2 == v1, then num == den, a = 1, and intersection is set to v2. - These two cases I believe are the only ones that matter to the bow-tie - issue in Dave's use case. - */ - double a; { - const double - num = dot_c_amb(en, e1, v1), - den = dot_c_amb(en, v2, v1); - a = num == 0 || den == 0 ? 0 : num/den; - a = a < 0 ? 0 : a > 1 ? 1 : a; - } - if (a == 0) - copy(intersection, v1); - else if (a == 1) - copy(intersection, v2); - else { - combine(v1, v2, a, intersection); - normalize(intersection); - } - } - - template - static bool output (const CV v, int& no, Array2D& vo) { -#ifdef SIKQ_DEBUG - if (no >= vo.n()) { - std::stringstream ss; - ss << "output: No room in vo; vo.n() is " << vo.n() << " but no is " - << no << "\n"; - error(ss.str().c_str()); - } -#endif - if (no >= vo.n()) return false; - vo(0,no) = v[0]; - vo(1,no) = v[1]; - vo(2,no) = v[2]; - ++no; - return true; - } - - //todo Handle non-convex case. - // This uses a terrible formula, but it's just for testing. 
- static double calc_area (const Array2D& v) { - double area = 0; - for (int i = 1; i < v.n() - 1; ++i) { - const double a = calc_arc_length(v(0), v(i)); - const double b = calc_arc_length(v(i), v(i+1)); - const double c = calc_arc_length(v(i+1), v(0)); - const double s = 0.5*(a + b + c); - const double d = (std::tan(0.5*s)*std::tan(0.5*(s-a))* - std::tan(0.5*(s-b))*std::tan(0.5*(s-c))); - if (d <= 0) continue; - area += 4*std::atan(std::sqrt(d)); - } - return area; - } - template - static double calc_arc_length (const CV a, const CV b) { - const double d = dot(a, b); - if (d >= 1) return 0; - return acos(d); - } - - // For quadratic edges. - template - static int intersect (const CV s, const CV m, const CV p, const CV e1, - const CV nml, V as) { - assert(0); // Not yet. - } -}; - -// Sutherland-Hodgmann polygon clipping algorithm. Follow Foley, van Dam, -// Feiner, Hughes Fig 3.49. -namespace sh { -/* A mesh is described by the following arrays: - p: 3 x #nodes, the array of vertices. - e: max(#verts) x #elems, the array of element base-0 indices. - nml: 3 x #edges, the array of edge normals. - en: max(#verts) x #elems, the array of edge-normal base-0 indices. - e. e indexes p. e(i,j) == -1 in column j indicates that j:end are not used. - nml. As a mesh is refined, cancellation error makes an edge normal based - off of an element's vertices increasingly inaccurate. Roughly, if an edge - subtends angle phi of the sphere, -log10(phi/(2 pi)) digits are lost in the - edge normal. Therefore, we compute edge normals offline, since in certain - meshes, they can be computed by an accurate means. E.g., in a cubed-sphere - mesh, the whole line of a square face can be used to compute the edge - normal. Furthermore, there are far fewer unique edge normals than edges. - */ -struct Mesh { - Array2D p, nml; - Array2D e, en; -}; - -// Generally not a user routine. -template -bool clip_against_edge ( - // Input vertex list. 
- const Array2D& vi, const int ni, - // Output vertex list. - Array2D& vo, int& no, - // The end points of the clip edge segment. - const CV ce1, const CV ce2, - // Clip edge's inward-facing normal. - const CV cen) -{ - const double* s, * p; - double intersection[3]; - no = 0; - s = vi(ni-1); - for (int j = 0; j < ni; ++j) { - p = vi(j); - if (geo::inside(p, ce1, ce2, cen)) { - if (geo::inside(s, ce1, ce2, cen)) { - if ( ! geo::output(p, no, vo)) return false; - } else { - geo::intersect(s, p, ce1, cen, intersection); - if ( ! geo::output(intersection, no, vo)) return false; - if ( ! geo::output(p, no, vo)) return false; - } - } else if (geo::inside(s, ce1, ce2, cen)) { - geo::intersect(s, p, ce1, cen, intersection); - if ( ! geo::output(intersection, no, vo)) return false; - } - s = p; - } - return true; -} - -// Efficient user routine that uses the mesh data structure. -template -bool clip_against_poly ( - // Clip mesh. m.e(:,cp_e) is the element, and m.en(:,cp_e) is the - // corresponding list of normal indices. - const Mesh& m, const int cp_e, - // A list of vertices describing the polygon to clip. The vertices must be in - // a convention-determined order, such as CCW. vi(:,1:ni-1) are valid entries. - const Array2D& vi, const int ni, - // On output, vo(:,0:no-1) are vertices of the clipped polygon. no is 0 if - // there is no intersection. - Array2D& vo, int& no, - // Workspace. nvertwrk applies to both wrk and vo.n(). If nvertwrk is not - // large enough, false is returned. - double* const wrk, const int nvertwrk) -{ - Array2D vo1(3, nvertwrk, wrk); - int nos[] = { 0, 0 }; - Array2D* vs[] = { &vo, &vo1 }; - - const auto e = m.e(cp_e); - const auto en = m.en(cp_e); - - auto nv = m.e.m(); // Number of vertices in clip polygon. - while (e[nv-1] == -1) --nv; - - no = 0; - if (nv % 2 == 0) { - // Make sure the final vertex output list is in the caller's buffer. - std::swap(vs[0], vs[1]); - std::swap(nos[0], nos[1]); - } - - if ( ! 
clip_against_edge(vi, ni, *vs[0], nos[0], m.p(e[0]), m.p(e[1]), - m.nml(en[0]))) - return false; - if ( ! nos[0]) return true; - - for (int ie = 1, ielim = nv - 1; ; ++ie) { - if ( ! clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], m.p(e[ie]), - m.p(e[(ie+1) % nv]), m.nml(en[ie]))) - return false; - if ( ! nos[1]) return true; - if (ie == ielim) break; - std::swap(vs[0], vs[1]); - std::swap(nos[0], nos[1]); - } - - no = nos[1]; - return true; -} - -// Not used for real stuff; just a convenient version for testing. In this -// version, clip_poly is a list of clip polygon vertices. This is instead of the -// mesh data structure. -template -bool clip_against_poly ( - // Clip polygon. - const Array2D& clip_poly, - // Clip polygon edges' inward-facing normals. - const Array2D& clip_edge_normals, - const Array2D& vi, const int ni, - Array2D& vo, int& no, - double* const wrk, const int nvertwrk) -{ - Array2D vo1(3, nvertwrk, wrk); - int nos[] = { 0, 0 }; - Array2D* vs[] = { &vo, &vo1 }; - - no = 0; - const auto nv = clip_poly.n(); - if (nv % 2 == 0) { - // Make sure the final vertex output list is in the caller's buffer. - std::swap(vs[0], vs[1]); - std::swap(nos[0], nos[1]); - } - - if ( ! clip_against_edge(vi, ni, *vs[0], nos[0], clip_poly(0), clip_poly(1), - clip_edge_normals(0))) - return false; - if ( ! nos[0]) return true; - - for (int ie = 1, ielim = nv - 1; ; ++ie) { - if ( ! clip_against_edge(*vs[0], nos[0], *vs[1], nos[1], clip_poly(ie), - clip_poly((ie+1) % nv), clip_edge_normals(ie))) - return false; - if ( ! nos[1]) return true; - if (ie == ielim) break; - std::swap(vs[0], vs[1]); - std::swap(nos[0], nos[1]); - } - - no = nos[1]; - return true; -} -} // namespace sh - -// ICE: Intersection with curved edges. -// -// Some terminology: -// s, m, p: start, middle, end points of a curved edge. m is not really the -// middle or midpoint; indeed, it is unlikely to be on the curve. Rather, -// it's a point that defines the curve. 
-// smp-plane: plane defined by s, m, p. -// straight: straight on a plane, or a great arc on the sphere. -// curved: quadratic on a plane, projected quadratic on the sphere. -// cedge: a curved edge. -// sedge: a straight edge, including a great arc. -// cpoly, spoly: similar terminology; but note that a cpoly can contain a mix -// of cedges and sedges. -// ed: short for edge -// vt: node type. s, p are endpoint vertices (0); m is a midpoint node (1). -// ced is [s m p]. -// (vs, vts, n): Vertex list. vs is an array of vertices. There are n -// vertices. vts is a list of vertex types. s, p are endpoint vertices (0); -// m is a midpoint node (1). If an edge is straight, then vts(k:k+1) = [0 -// 0]; if an edge is curved, then vts(k:k+2) = [0 1 0]. Keep in mind that -// there is an edge that wraps around the end of the list. The wrap can -// occur like 0|1 0 or like 0 1|0. As an example, [1 0 0 1 0] is a vertex -// type list for a triangle containing two cedges and one sedge. -// -// Some math. -// a in [0,1] is the parameter in the curve -// x(a) = (1-a)^2 s + a (1-a) m + a^2 p. (1) -// s and p sit on the curve, but m in general does not. We can define m by -// x(1/2) = M => m = 4 M - s - p, -// where M is a point that is intended to be on the curve and serves as a useful -// midpoint reference. This construction has the essential property that the -// curve is invariant to the swapping of s and p. However, the clip routines are -// independent of this definition; (s,m,p) are used as in equation (1), and that -// is all that is needed. -// When segments are extracted from x(a) in a clip, we use c in the segment -// [c,1], d in the segment [0,d], and both in [c,d]. Similarly, x(c) = r is the -// new start point, and x(d) = q is the new end point. A segment requires a -// midpoint so that the resulting parameterized curve sits on the original; this -// is n. Hence a segment of (s,m,p) defined by [c,d] subset [0,1] is (r,n,q). 
n -// is given by -// n = 2 (c d - c - d + 1) s + (c + d - 2 c d) m + 2 c d p, -// which satisfies -// x(a) = (1-a)^2 s + a(1-a) m + a^2 p = (1-b)^2 r + b(1-b) n + b^2 q -// for all b in [0,1], where b = (a-c)/(d-c), r = x(c), q = x(d). -template -struct ice { - typedef double Real; - typedef int Int; - typedef unsigned int UInt; - enum { dim = geo::dim }; - - // Follow the outer loop of the Sutherland-Hodgmann algorithm. - static bool clip_cpoly_against_convex_poly ( - // Clip polygon. - const Array2D& clip_poly, - // Clip polygon edges' inward-facing normals. - const Array2D& clip_edge_normals, - // Input vertex and vertex type lists. - const Array2D& vi, const Array1D& vti, const Int ni, - // Outputs. - Array2D& vo, Array1D& vto, Int& no, - // Workspace. n*wrk applies to both *wrk and v*o. If workspace is not large - // enough, false is returned. - Real* const rwrk, Int* const iwrk, const Int nwrk) - { - Array2D vo1(dim, nwrk, rwrk); - Array2D* vs[] = { &vo, &vo1 }; - Array1D vto1(nwrk, iwrk); - Array1D* vts[] = { &vto, &vto1 }; - int nos[] = { 0, 0 }; - - no = 0; - const auto nv = nslices(clip_poly); - if (nv % 2 == 0) { - // Make sure the final vertex output list is in the caller's buffer. - std::swap(vs[0], vs[1]); - std::swap(vts[0], vts[1]); - std::swap(nos[0], nos[1]); - } - - if ( ! clip_cpoly_against_sed(vi, vti, ni, *vs[0], *vts[0], nos[0], - slice(clip_poly, 0), slice(clip_poly, 1), - slice(clip_edge_normals, 0))) - return false; - if ( ! nos[0]) return true; - - for (Int ie = 1, ielim = nv - 1; ; ++ie) { - if ( ! clip_cpoly_against_sed(*vs[0], *vts[0], nos[0], *vs[1], *vts[1], nos[1], - slice(clip_poly, ie), slice(clip_poly, (ie+1) % nv), - slice(clip_edge_normals, ie))) - return false; - if ( ! 
nos[1]) return true; - if (ie == ielim) break; - std::swap(vs[0], vs[1]); - std::swap(vts[0], vts[1]); - std::swap(nos[0], nos[1]); - } - - no = nos[1]; - return true; - } - - template - static bool clip_cpoly_against_sed ( - // Input vertex and vertex type lists. - const Array2D& vi, const Array1D& vti, const Int ni, - // Outputs. - Array2D& vo, Array1D& vto, Int& no, - // The end points of the clip edge segment. - const CV se1, const CV se2, - // Clip edge's inward-facing normal. - const CV sen) - { - bool ends_connected = false; - Int k = 0; - no = 0; - for (;;) { - if (no + 5 > nslices(vo)) return false; - if (vti[k] == 1) { - // 1. Start of vi list, with [1 0] as the two first vts. - assert(k == 0); - assert(vti[ni-1] == 0); - no += clip_ced_against_sed(se1, se2, sen, - slice(vi, ni-1), slice(vi, 0), slice(vi, 1), - vo, vto); - k++; - ends_connected = true; - } else if (vti[k] == 0 && k+1 == ni) { - // 2. Last vertex in the list. - if (ends_connected) break; - if (vti[0] == 0) { - // 2a. A sedge connects end of list to start. - const UInt nv = clip_sed_against_sed(se1, se2, sen, - slice(vi, ni-1), slice(vi, 0), - offset(vo, no)); - for (UInt i = 0; i < nv; ++i) vto[no+i] = 0; - no += nv; - } else { - // 2b. Block 1 took care of this cedge. - assert(false); - } - break; - } else if (vti[k+1] == 1) { - assert(vti[k] == 0); - if (k+2 == ni) { - // 3. [0 1] at the end of the list connects to 0 at the start. - assert(vti[0] == 0); - no += clip_ced_against_sed(se1, se2, sen, - slice(vi, k), slice(vi, k+1), slice(vi, 0), - offset(vo, no), offset(vto, no)); - break; - } else { - assert(k+2 < ni); - // 4. General case: [0 1 0] in the middle of the list. - no += clip_ced_against_sed(se1, se2, sen, - slice(vi, k), slice(vi, k+1), slice(vi, k+2), - offset(vo, no), offset(vto, no)); - k += 2; - } - } else { - // 5. General case: [0 0] in the middle of the list. 
- assert(k+1 < ni); - assert(vti[k+1] == 0); - const UInt nv = clip_sed_against_sed(se1, se2, sen, - slice(vi, k), slice(vi, k+1), - offset(vo, no)); - for (UInt i = 0; i < nv; ++i) vto[no+i] = 0; - no += nv; - k++; - } - } - return true; - } - - // sed must have >= 2 slices allocated. - template - static UInt clip_sed_against_sed (const CV se1, const CV se2, const CV sen, - const CV s, const CV p, Array sed) { - const bool s_inside = geo::inside(s, se1, se2, sen); - const bool p_inside = geo::inside(p, se1, se2, sen); - if (p_inside) { - if (s_inside) { - copy(slice(sed, 0), p); - return 1; - } else { - geo::intersect(s, p, se1, sen, slice(sed, 0)); - copy(slice(sed, 1), p); - return 2; - } - } else { - if (s_inside) { - geo::intersect(s, p, se1, sen, slice(sed, 0)); - return 1; - } else { - return 0; - } - } - } - - // eds and evts must have >= 5 slices allocated. - template - static UInt clip_ced_against_sed (const CV se1, const CV se2, const CV sen, - const CV s, const CV m, const CV p, - Array eds, IV edvts) { - Real as[2] = {0}; - UInt nas = intersect(s, m, p, se1, sen, as); - bool s_inside = geo::inside(s, se1, se2, sen); - bool p_inside = geo::inside(p, se1, se2, sen); - // Handle cases where FP fails to lead to a consistent state. - if (p_inside != s_inside && nas == 0) { - // There is no FP intersection, so p and s might as well both be inside. 
- p_inside = s_inside = true; - } - - if (p_inside) { - if (s_inside) { - if (nas < 2) { - copy(slice(eds, 0), m); edvts[0] = 1; - copy(slice(eds, 1), p); edvts[1] = 0; - return 2; - } else { - middle_for_segment_0d(s, m, as[0], slice(eds, 0)); - eval(s, m, p, as[0], slice(eds, 1)); - eval(s, m, p, as[1], slice(eds, 2)); - middle_for_segment_c1(m, p, as[1], slice(eds, 3)); - copy(slice(eds, 4), p); - edvts[0] = 1; edvts[1] = 0; edvts[2] = 0; edvts[3] = 1; edvts[4] = 0; - return 5; - } - } else { - eval(s, m, p, as[0], slice(eds, 0)); - middle_for_segment_c1(m, p, as[0], slice(eds, 1)); - copy(slice(eds, 2), p); - edvts[0] = 0; edvts[1] = 1; edvts[2] = 0; - return 3; - } - } else { - if (s_inside) { - middle_for_segment_0d(s, m, as[0], slice(eds, 0)); - eval(s, m, p, as[0], slice(eds, 1)); - edvts[0] = 1; edvts[1] = 0; - return 2; - } else { - if (nas < 2) { - return 0; - } else { - eval(s, m, p, as[0], slice(eds, 0)); - middle_for_segment(s, m, p, as[0], as[1], slice(eds, 1)); - eval(s, m, p, as[1], slice(eds, 2)); - edvts[0] = 0; edvts[1] = 1; edvts[2] = 0; - return 3; - } - } - } - - assert(0); - return 0; - } - - template - static UInt intersect (const CV s, const CV m, const CV p, - const CV e1, const CV en, Real as[2]) { - Int nas = geo::intersect(s, m, p, e1, en, as); - if (nas == 2 && (as[1] < 0 || as[1] > 1)) --nas; - if (nas >= 1 && (as[0] < 0 || as[0] > 1)) { as[0] = as[1]; --nas; } - if (nas == 2 && as[0] > as[1]) std::swap(as[0], as[1]); - assert(nas >= 0 && nas <= 2); - return static_cast(nas); - } - - // Create m in (s,m,p) for the segment a in [c,d]. 
- template - static void middle_for_segment (const CV s, const CV m, const CV p, - const Real c, const Real d, V n) { - if (c == 0) middle_for_segment_0d(s, m, d, n); - else if (d == 1) middle_for_segment_c1(m, p, c, n); - else middle_for_segment_cd(s, m, p, c, d, n); - } - template - static void middle_for_segment_0d (const CV s, const CV m, const Real d, - V n) { - const Real tomd = 2*(1 - d); - for (UInt i = 0; i < dim; ++i) - n[i] = tomd*s[i] + d*m[i]; - } - template - static void middle_for_segment_c1 (const CV m, const CV p, const Real c, - V n) { - const Real omc = 1 - c, tc = 2*c; - for (UInt i = 0; i < dim; ++i) - n[i] = omc*m[i] + tc*p[i]; - } - template - static void middle_for_segment_cd (const CV s, const CV m, const CV p, - const Real c, const Real d, V n) { - const Real cd = c*d, tcd = 2*cd, c0 = 2*(cd - c - d + 1), c1 = c + d - tcd; - for (UInt i = 0; i < dim; ++i) - n[i] = c0*s[i] + c1*m[i] + tcd*p[i]; - } - - // Create m in (s,m,p) so that the curve hits n. - template - static void middle_matches (const CV s, const CV p, const CV n, V m) { - for (UInt i = 0; i < dim; ++i) - m[i] = 4*n[i] - s[i] - p[i]; - } - - template - static void eval (const CV s, const CV m, const CV p, const Real a, V v) { - const Real oma = 1 - a, oma2 = oma*oma, omaa = oma*a, a2 = a*a; - for (UInt i = 0; i < dim; ++i) - v[i] = oma2*s[i] + omaa*m[i] + a2*p[i]; - } - - template - static void copy (V d, const CV s) { - for (UInt i = 0; i < dim; ++i) d[i] = s[i]; - } -}; - -template constexpr T square (const T& x) { return x*x; } - -// Octree for search. -template -class Octree { -public: - typedef double BoundingBox[6]; - - struct Options { - // Do not go beyond max_depth depth, including the root and leaf. With this - // constraint, try to go deep enough so that a leaf has no more than - // max_nelem elements. - int max_depth, max_nelem; - Options () : max_depth(10), max_nelem(8) {} - }; - - // Bounding box for a cluster of points ps (possibly vertices). 
- static void calc_bb (const Array2D& ps, const int np, - BoundingBox bb) { - if (np == 0) return; - for (int j = 0; j < 3; ++j) - bb[j] = bb[j+3] = ps(j,0); - for (int i = 1; i < np; ++i) - for (int j = 0; j < 3; ++j) { - bb[j] = std::min(bb[j], ps(j,i)); - bb[j+3] = std::max(bb[j+3], ps(j,i)); - } - pad_bb(bb); - } - - static void calc_bb (const Array2D& ps, BoundingBox bb) { - calc_bb(ps, ps.n(), bb); - } - - template - static void calc_bb (const Array2D& p, const CIV e, - const int ne, V ebb) { - for (int j = 0; j < 3; ++j) - ebb[j] = ebb[j+3] = p(j, e[0]); - for (int i = 1; i < ne; ++i) { - if (e[i] == -1) break; - for (int j = 0; j < 3; ++j) { - ebb[j] = ko::min(ebb[j], p(j, e[i])); - ebb[j+3] = ko::max(ebb[j+3], p(j, e[i])); - } - } - pad_bb(ebb); - } - - static void calc_bb (const Array2D& p, const Array2D& e, - Array2D& ebbs) { - assert(ebbs.n() == e.n()); - for (int k = 0; k < e.n(); ++k) - calc_bb(p, e(k), e.m(), ebbs(k)); - } - - // If a bounding box was constructed from vertices of a spherical polygon, - // expand it to account for the possible protrusion of the sphere. - template - static void pad_bb (BB bb) { - if (std::is_same::value) return; - double hl = 0.5*std::sqrt(square(bb[3] - bb[0]) + square(bb[4] - bb[1]) + - square(bb[5] - bb[2])); - // Limit the half-length to the circle's radius. - hl = std::min(1.0, hl); - // Max distance from a chord of length 2 hl to the unit circle: - // hl = sin theta - // pad = 1 - cos theta = 1 - sqrt(1 - sin^2 theta) = 1 - sqrt(1 - hl^2). - const double pad = 1 - std::sqrt(1 - square(hl)); - for (int i = 0; i < 3; ++i) bb[ i] -= pad; - for (int i = 0; i < 3; ++i) bb[3+i] += pad; - } - - // p is a 3xNp array of points. e is a KxNe array of elements. An entry <0 is - // ignored. All <0 entries must be at the end of an element's list. 
- Octree (const Array2D& p, const Array2D& e, - const Options& o) { - init(p, e, o); - } - Octree (const Array2D& p, const Array2D& e) { - Options o; - init(p, e, o); - } - - // Apply f to every element in leaf nodes with which bb overlaps. f must have - // function - // void operator(const int element_index). - // element_index indexes e. - template - void apply (const CV bb, Functor& f) const { - if (nodes_.n() == 0) { - for (int i = 0; i < offset_[1]; ++i) - f(elems_[i]); - return; - } - apply_r(0, bb_, bb, f); - } - -private: - /* Each node in the oct-tree contains 8 integers, stored in 'nodes'. - - >0 is an index into 'nodes', pointing to a child node. - - A <=0 entry in 'nodes' indicates a leaf node. If 0, there are no elements - in the leaf. If <0, the negative of the entry minus 1 is the index of an - offset array indexing 'elems'. - - Each segment of 'elems' contains a list of element indices covered by a - leaf node. Element indices refer to the list of elements the caller - provides during oct-tree construction. - */ - - // nodes(:,i) is a list. The list includes children of node i (>0) and leaf - // node data (<=0). - Array2D nodes_; - // A leaf node corresponding to -k covers elements - // elems[offset[k] : offset[k]-1]. - Array1D offset_, elems_; - // Root node's bounding box. 
- BoundingBox bb_; - - class IntList { - int* const buf_; - int i_; - public: - IntList (int* const buf) : buf_(buf), i_(0) {} - void reset () { i_ = 0; } - void push (const int& i) { buf_[i_++] = i; } - int* data () { return buf_; } - int n () const { return i_; } - const int& operator[] (const int& i) const { return buf_[i]; } - }; - - class DynIntList { - std::vector buf_; - public: - DynIntList () {} - void push (const int& i) { buf_.push_back(i); } - int& back () { return buf_.back(); } - int& operator[] (const size_t i) { - if (i >= buf_.size()) - buf_.resize(i+1); - return buf_[i]; - } - int n () const { return static_cast(buf_.size()); } - const int* data () const { return buf_.data(); } - }; - - class Nodes { - std::vector buf_; - public: - int n () const { return static_cast(buf_.size()) >> 3; } - const int* data () const { return buf_.data(); } - int& operator() (const int& r, const int& c) { - const size_t ec = (c+1) << 3; - if (ec >= buf_.size()) - buf_.resize(ec); - assert(((c << 3) + r) >= 0); - assert(((c << 3) + r) < (int) buf_.size()); - return buf_[(c << 3) + r]; - } - }; - - void init (const Array2D& p, const Array2D& e, - const Options& o) { - if (e.n() == 0) return; - // Get OT's bounding box. - calc_bb(p, bb_); - // Get elements' bounding boxes. - Array2D ebbs(6, e.n()); - calc_bb(p, e, ebbs); - // Static element lists for work. Each level has active work space. - std::vector buf((o.max_depth - 1)*e.n()); - IntList es(buf.data()), wrk(buf.data() + e.n()); - for (int i = 0; i < e.n(); ++i) - es.push(i); - // Dynamic element lists. - DynIntList offset, elems; - offset[0] = 0; - // Dynamic node data structure. - Nodes nodes; - // Recurse. We don't care about the return value. If it's 0 and nodes.n() == - // 0, we'll detect as much in 'apply'. - init_r(1, bb_, ebbs, o, es, wrk, offset, elems, nodes); - // Build the static data structures. 
- if (elems.n() == 0) return; - offset_.reset(offset.n()); - elems_.reset(elems.n()); - memcpy(offset_.data(), offset.data(), offset.n() * sizeof(*offset_.data())); - memcpy(elems_.data(), elems.data(), elems.n() * sizeof(*offset_.data())); - if (nodes.n() == 0) return; - nodes_.reset(8, nodes.n()); - memcpy(nodes_.data(), nodes.data(), (nodes.n() << 3) * sizeof(*offset_.data())); - // Move them to the device. - nodes_.modify(); nodes_.device().sync(); - offset_.modify(); offset_.device().sync(); - elems_.modify(); elems_.device().sync(); - } - - int init_r (const int depth, // Tree's depth at this point, including root. - const BoundingBox& nbb, // My bounding box. - const Array2D& ebbs, // All elements' bounding boxes. - const Options& o, // Options controlling construct of the tree. - IntList& es, // List of elements in my bounding box. - IntList& wrk, // Work space to store working element lists. - DynIntList& offset, // Offsets into elems. - DynIntList& elems, // Elements belonging to leaf nodes. - Nodes& nodes) // Dynamic nodes data structure. - { - const int my_idx = nodes.n(); // My node index. - // Decide what to do. - if (es.n() == 0) { - // I have no elements, so return 0 to indicate I'm a leaf node containing - // nothing. - return 0; - } else if (es.n() <= o.max_nelem || depth == o.max_depth) { - // I'm a leaf node with elements. Store my list of elements and return the - // storage location. - const int os = offset.back(); - offset.push(os + es.n()); - for (int i = 0, n = es.n(); i < n; ++i) - elems[os + i] = es[i]; - return 1 - offset.n(); - } else { - // I'm not a leaf node. - nodes(0, my_idx) = 0; // Insert myself into the nodes array. - for (int ic = 0; ic < 8; ++ic) { - BoundingBox child_bb; - fill_child_bb(nbb, ic, child_bb); - // Find the elements that are in this child's bb. - IntList ces(wrk.data()); - for (int i = 0, n = es.n(); i < n; ++i) - if (do_bb_overlap(child_bb, ebbs(es[i]))) - ces.push(es[i]); - // Create some work space. 
- IntList cwrk(wrk.data() + ces.n()); - // Recurse. - const int child_idx = init_r(depth+1, child_bb, ebbs, o, ces, cwrk, - offset, elems, nodes); - nodes(ic, my_idx) = child_idx; - } - return my_idx; - } - } - - // Using parent bb p, fill child bb c, with child_idx in 0:7. - static void fill_child_bb (const BoundingBox& p, const int& child_idx, - BoundingBox& c) { - const double m[] = { 0.5*(p[0] + p[3]), - 0.5*(p[1] + p[4]), - 0.5*(p[2] + p[5]) }; - switch (child_idx) { - case 0: c[0] = p[0]; c[1] = p[1]; c[2] = p[2]; c[3] = m[0]; c[4] = m[1]; c[5] = m[2]; break; - case 1: c[0] = m[0]; c[1] = p[1]; c[2] = p[2]; c[3] = p[3]; c[4] = m[1]; c[5] = m[2]; break; - case 2: c[0] = m[0]; c[1] = m[1]; c[2] = p[2]; c[3] = p[3]; c[4] = p[4]; c[5] = m[2]; break; - case 3: c[0] = p[0]; c[1] = m[1]; c[2] = p[2]; c[3] = m[0]; c[4] = p[4]; c[5] = m[2]; break; - case 4: c[0] = p[0]; c[1] = p[1]; c[2] = m[2]; c[3] = m[0]; c[4] = m[1]; c[5] = p[5]; break; - case 5: c[0] = m[0]; c[1] = p[1]; c[2] = m[2]; c[3] = p[3]; c[4] = m[1]; c[5] = p[5]; break; - case 6: c[0] = m[0]; c[1] = m[1]; c[2] = m[2]; c[3] = p[3]; c[4] = p[4]; c[5] = p[5]; break; - case 7: c[0] = p[0]; c[1] = m[1]; c[2] = m[2]; c[3] = m[0]; c[4] = p[4]; c[5] = p[5]; break; - default: - // impossible - error("fill_child_bb: The impossible has happened."); - } - } - - // Do bounding boxes a and b overlap? - static bool do_bb_overlap (const BoundingBox a, const BoundingBox b) { - for (int i = 0; i < 3; ++i) - if ( ! do_lines_overlap(a[i], a[i+3], b[i], b[i+3])) - return false; - return true; - } - - static bool do_lines_overlap (const double& a1, const double& a2, - const double& b1, const double& b2) { - return ! (a2 < b1 || a1 > b2); - } - - template - void apply_r (const int ni, const BoundingBox& nbb, const CV bb, - Functor& f) const { - for (int i = 0; i < 8; ++i) { - BoundingBox child_bb; - fill_child_bb(nbb, i, child_bb); - if ( ! 
do_bb_overlap(child_bb, bb)) continue; - int e = nodes_(i,ni); - if (e > 0) - apply_r(e, child_bb, bb, f); - else if (e < 0) { - e = std::abs(e + 1); - for (int k = offset_[e]; k < offset_[e+1]; ++k) - f(elems_[k]); - } - } - } -}; - -namespace test { -static constexpr int max_nvert = 20; - -// In practice, we want to form high-quality normals using information about the -// mesh, such as that it is a CS mesh. For testing, form the normals from edge -// vertices. (This leads to increasing cancellation error with mesh refinement.) -template -void fill_normals (sh::Mesh& m) { - // Count number of edges. - int ne = 0; - for (auto ip = zero(m.e); ip < m.e.n(); ++ip) - for (auto iv = zero(m.e); iv < m.e.m(); ++iv) - if (m.e(iv,ip) == -1) break; else ++ne; - // Fill. - Array2D en(m.e.m(), m.e.n()); - en.set(-1); - Array2D nml(3, ne); - int ie = 0; - for (auto ip = zero(m.e); ip < m.e.n(); ++ip) - for (auto iv = zero(m.e); iv < m.e.m(); ++iv) - if (m.e(iv,ip) == -1) - break; - else { - // Somewhat complicated next node index. - const int iv_next = (iv+1 == m.e.m() ? 0 : - (m.e(iv+1,ip) == -1 ? 0 : iv+1)); - geo::edge_normal(m.p(m.e(iv, ip)), m.p(m.e(iv_next, ip)), nml(ie)); - en(iv,ip) = ie; - ++ie; - } - m.en = en; - m.nml = nml; -} - -// Used in Octree::apply to gather a set of possibly intersecting polygons. -struct OTSearchFunctor { - std::set hits; - void operator() (const int i) { hits.insert(i); } -}; - -// Find the area of the overlapping part of two meshes by summing over the areas -// of the common refinement polygons. Obviously a silly thing to do, but a good -// test and demonstration problem. -template -class TestAreaOTFunctor { - // Mesh against which to clip. ("Eulerian mesh".) - sh::Mesh cm; - // Mesh of clipped polygons. ("Departure mesh".) - const Array2D p; // 3 x #verts array of polygon vertices. - const Array2D e; // Array of polygons. e(:,k) is the k'th polygon. - // Already initialized octree used to search for possibly intersecting - // polygons. 
- Octree ot; - -public: - typedef double value_type; - - TestAreaOTFunctor (const sh::Mesh& cm, const Array2D& p, - const Array2D& e, const Octree& ot) - : cm(cm), p(p), e(e), ot(ot) - {} - - // k indexes (p,e). - void operator() (const int k, double& area) const { - // Clipped element bounding box. - double ebb[6]; - Octree::calc_bb(p, e(k), e.m(), ebb); - // Get list of possible overlaps. - OTSearchFunctor f; - ot.apply(ebb, f); - // In and out vertex lists. - double buf[6*max_nvert]; - Array2D - vi(3, max_nvert, buf), - vo(3, max_nvert, buf + 3*max_nvert); - int ni, no; - // Workspace. - double wrk[3*max_nvert]; - // Area of all overlapping regions. - double a = 0; - for (const auto icp : f.hits) { - // Create the polygon to be clipped. - ni = 0; - for (int i = 0; i < e.m(); ++i) { - if (e(i,k) == -1) break; - copy(vi(i), p(e(i,k)), 3); - ++ni; - } - sh::clip_against_poly(cm, icp, vi, ni, vo, no, wrk, max_nvert); - if (no) { - // A non-0 intersection was found. Accumulate the area. - a += geo::calc_area(Array2D(vo.m(), no, vo.data())); - } - } - // Add our area to the reduction. - area += a; - } -}; - -#ifdef SIKQ_DEBUG_CRITICAL -static void -write_matlab (const std::string& name, const Array2D& p) { - printf("mat=1; %s = [", name.c_str()); - for (int ip = zero(p); ip < p.n(); ++ip) - printf(" %1.15e %1.15e %1.15e;", p(0,ip), p(1,ip), p(2,ip)); - printf("].';\n"); -} -#endif - -template -class IceTestAreaOTFunctor { - sh::Mesh cm; - const Array2D p; - const Array2D e; - Octree ot; - -public: - typedef double value_type; - - IceTestAreaOTFunctor (const sh::Mesh& cm, const Array2D& p, - const Array2D& e, const Octree& ot) - : cm(cm), p(p), e(e), ot(ot) - {} - - // k indexes (p,e). - void operator() (const int k, double& area) const { - // Clipped element bounding box. - double ebb[6]; - Octree::calc_bb(p, e(k), e.m(), ebb); - // Get list of possible overlaps. - OTSearchFunctor f; - ot.apply(ebb, f); - int ni, no; - // Area of all overlapping regions. 
- double a = 0; - for (const auto icp : f.hits) { - ni = 0; - static const int N = 2*max_nvert; - double rbuf[15*N]; - Array2D cp(3, N, rbuf), cens(3, N, rbuf + 3*N), - vi(3, N, rbuf + 6*N), ivo(3, N, rbuf + 9*N), vo(3, N, rbuf + 12*N); - int ibuf[2*N]; - Array1D vti(N, ibuf), vto(N, ibuf + N); - int ncp = 0; - for (int i = 0; i < e.m(); ++i) { - if (e(i,icp) == -1) break; - geo::copy(cp(i), cm.p(cm.e(i,icp))); - geo::copy(cens(i), cm.nml(cm.en(i,icp))); - ++ncp; - } - for (int i = 0; i < e.m(); ++i) { - if (e(i,k) == -1) break; - vti[2*i] = 0; vti[2*i+1] = 1; - geo::copy(vi(2*i), p(e(i,k))); - ni += 2; - } - for (int i = 0; i < e.m(); ++i) { - double n[3]; - geo::combine(vi(2*i), vi((2*(i+1)) % ni), 0.5, n); - ice::middle_matches(vi(2*i), vi((2*(i+1)) % ni), n, vi(2*i+1)); - } - double rwrk[3*N]; - int iwrk[N]; - ice::clip_cpoly_against_convex_poly( - Array2D(cp.m(), ncp, cp.data()), - Array2D(cens.m(), ncp, cens.data()), - vi, vti, ni, ivo, vto, no, rwrk, iwrk, N); - int n = 0; - for (int i = 0; i < no; ++i) - if (vto[i] == 0) { - geo::copy(vo(n), ivo(i)); - ++n; - } - no = n; - if (no) { - const double - a1 = geo::calc_area(Array2D(vo.m(), no, vo.data())); - a += a1; -#ifdef SIKQ_DEBUG_CRITICAL - if (a1 < -1e-6) { - write_matlab("cp", Array2D(cp.m(), ncp, cp.data())); - write_matlab("vi", Array2D(vi.m(), ni, vi.data())); - write_matlab("vo", Array2D(vo.m(), no, vo.data())); - exit(-1); - } -#endif - } - } - // Add our area to the reduction. - area += a; - } -}; - -template -double test_area_ot ( - const Array2D& cp, const Array2D& ce, - const Array2D& p, const Array2D& e, - const bool use_ice) -{ - // Clip mesh and edge normal calculation. (In practice, we'd like to use - // higher-quality edge normals.) - sh::Mesh cm; cm.p = cp; cm.e = ce; - fill_normals(cm); - - double et[2]; - auto t = tic(); - // Build an octree over the clip mesh. - Octree ot(cp, ce); - et[0] = toc(t); - - // Compute the area in a silly way to test search and interesection. 
- t = tic(); - double area = 0; - if (use_ice) - ko::parallel_reduce(e.n(), IceTestAreaOTFunctor(cm, p, e, ot), area); - else - ko::parallel_reduce(e.n(), TestAreaOTFunctor(cm, p, e, ot), area); - et[1] = toc(t); - print_times("test_area_ot", et, 2); - return area; -} -} // namespace test -} // namespace siqp - -#endif // INCLUDE_SIQP_HPP diff --git a/siqk/si/test.cpp b/siqk/si/test.cpp deleted file mode 100644 index f55617a..0000000 --- a/siqk/si/test.cpp +++ /dev/null @@ -1,243 +0,0 @@ -#include "Array_raw.hpp" -#include "siqp.hpp" -using namespace siqp; -#include "fsi.h" - -template -void copy (Array2D& d, const Array2D& s) { - for (auto i = zero(s); i < s.n(); ++i) - for (auto j = zero(s); j < s.m(); ++j) - d(j,i) = s(j,i); -} - -static void -write_matlab (const std::string& name, const Array2D& p) { - printf("mat=1; %s = [", name.c_str()); - for (int ip = zero(p); ip < p.n(); ++ip) - printf(" %1.15e %1.15e %1.15e;", p(0,ip), p(1,ip), p(2,ip)); - printf("].';\n"); -} - -static void -write_matlab (const std::string& name, const Array2D& p, - const Array2D& e) { - printf("mat=1; %s.p = [", name.c_str()); - for (int ip = zero(p); ip < p.n(); ++ip) - printf(" %1.15e %1.15e %1.15e;", p(0,ip), p(1,ip), p(2,ip)); - printf("].';\n"); - printf("mat=1; %s.n = [", name.c_str()); - for (int ie = zero(e); ie < e.n(); ++ie) - printf(" %d %d %d %d;", e(0,ie)+1, e(1,ie)+1, e(2,ie)+1, e(3,ie)+1); - printf("].';\n"); -} - -#ifdef SIQK_FORTRAN -template -static int test_fortran (const Array2D& clip_poly, - const Array2D& nml, - const Array2D& poly) { - int nerr = 0, no, fno, info; - const int nvi = poly.n(); - Array2D vo(3, test::max_nvert), fvo(3, test::max_nvert); - double wrk[3*test::max_nvert]; - const int ncp = clip_poly.n(); - if (std::is_same::value) { - sh::clip_against_poly(clip_poly, nml, poly, nvi, vo, no, - wrk, test::max_nvert); - clipagainstpolysphere_(clip_poly.data(), &ncp, nml.data(), poly.data(), &nvi, - fvo.data(), &fno, wrk, &test::max_nvert, &info); - } else 
{ - sh::clip_against_poly(clip_poly, nml, poly, nvi, vo, no, - wrk, test::max_nvert); - clipagainstpolyplane_(clip_poly.data(), &ncp, nml.data(), poly.data(), &nvi, - fvo.data(), &fno, wrk, &test::max_nvert, &info); - } - if (info != 0) ++nerr; - if (fno != no) ++nerr; - for (int i = 0; i < no; ++i) - for (int j = 0; j < 3; ++j) - if (fvo(j,i) != vo(j,i)) ++nerr; - return nerr; -} -#endif - -static void make_planar_mesh (Array2D& p, Array2D& e, - const int n) { - const double d = std::sqrt(0.5); - e.reset(4, n*n); - p.reset(3, (n+1)*(n+1)); - p.set(0); - for (int iy = 0; iy < n+1; ++iy) - for (int ix = 0; ix < n+1; ++ix) { - const auto idx = (n+1)*iy + ix; - p(0,idx) = 2*(static_cast(ix)/n - 0.5)*d; - p(1,idx) = 2*(static_cast(iy)/n - 0.5)*d; - } - for (int iy = 0; iy < n; ++iy) - for (int ix = 0; ix < n; ++ix) { - const auto idx = n*iy + ix; - e(0,idx) = (n+1)*iy + ix; - e(1,idx) = (n+1)*iy + ix+1; - e(2,idx) = (n+1)*(iy+1) + ix+1; - e(3,idx) = (n+1)*(iy+1) + ix; - } -} - -static void project_onto_sphere (Array2D& p) { - for (auto ip = zero(p); ip < p.n(); ++ip) { - p(2,ip) = 1; - SphereGeometry::normalize(p(ip)); - } -} - -static void -perturb_mesh (Array2D& p, Array2D& e, const double angle, - const double xlate, const double ylate) { - const double cr = std::cos(angle), sr = std::sin(angle); - for (auto ip = zero(p); ip < p.n(); ++ip) { - const double x = p(0,ip), y = p(1,ip); - p(0,ip) = cr*x - sr*y + xlate; - p(1,ip) = -sr*x + cr*y + ylate; - } -} - -static void fill_quad (const Array2D& p, Array2D& poly) { - const int n = static_cast(std::sqrt(p.n() - 1)); - copy(poly(0), p(0), 3); - copy(poly(1), p(n), 3); - copy(poly(2), p(p.n() - 1), 3); - copy(poly(3), p(p.n() - 1 - n), 3); -} - -// Area of the outline of (p,e) clipped against the outline of (cp,ce). 
-template -static double -calc_true_area (const Array2D& cp, const Array2D& ce, - const Array2D& p, const Array2D& e, - const bool wm) { - Array2D clip_poly(3, 4), poly(3, 4), nml(3, 4); - fill_quad(cp, clip_poly); - fill_quad(p, poly); - for (int i = 0; i < 4; ++i) - Geo::edge_normal(clip_poly(i), clip_poly((i+1) % 4), nml(i)); - Array2D vo(3, test::max_nvert); - int no; - { - double wrk[3*test::max_nvert]; - sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no, - wrk, test::max_nvert); - } - Array2D intersection(3, no, vo.data()); - if (wm) { - write_matlab("clip_poly", clip_poly); - write_matlab("poly", poly); - write_matlab("intersection", intersection); - } -#ifdef SIQK_FORTRAN - { - // Sneak in a test of the Fortran interface. - const int nerr = test_fortran(clip_poly, nml, poly); - std::cerr << "Fortran test " << (nerr ? "FAIL" : "PASS") << "ED\n"; - } -#endif - return Geo::calc_area(intersection); -} - -template void finalize_mesh (Array2D& p) {} -template <> void finalize_mesh (Array2D& p) { - project_onto_sphere(p); -} - -template -static int -run (const int n, const double angle, const double xlate, const double ylate, - const bool wm) { - // Make the clip mesh. - Array2D cp; - Array2D ce; - make_planar_mesh(cp, ce, n); - - // Make a perturbed mesh. - Array2D p(cp.m(), cp.n()); - Array2D e(ce.m(), ce.n()); - copy(p, cp); - copy(e, ce); - perturb_mesh(p, e, angle, xlate, ylate); - - // Project these meshes onto the sphere. - finalize_mesh(cp); - finalize_mesh(p); - - // True intersection area from quadrilateral boundary of the mesh. - const double ta = calc_true_area(cp, ce, p, e, wm); - - bool pass = true; - for (int cnt = 0; - // ice works only for PlaneGeometry right now. - cnt < (std::is_same::value ? 2 : 1); - ++cnt) { - const bool use_ice = cnt == 1; - // Area from the sum over the common refinement polygons. Use sh the first - // time and ice the second. When using ice, edges are cedges in data - // structure but geometrically straight. 
- const double a = test::test_area_ot(cp, ce, p, e, use_ice); - - // Report information. - const double re = std::abs(a - ta)/ta; - pass = pass && re < 1e-8; - fprintf(stderr, "ice %d true area %1.4e mesh area %1.4e relerr %1.4e\n", - use_ice, ta, a, re); - if (wm) { - write_matlab("cm", cp, ce); - write_matlab("m", p, e); - } - } - return pass ? 0 : 1; -} - -inline bool -eq (const std::string& a, const char* const b1, const char* const b2 = 0) { - return (a == std::string(b1) || (b2 && a == std::string(b2)) || - a == std::string("-") + std::string(b1)); -} - -struct Input { - int n; - double angle, xlate, ylate; - bool write_matlab, geo_sphere; - - Input (int argc, char** argv) - : n(5), angle(M_PI*1e-1), xlate(1e-1), ylate(1e-1), write_matlab(false), - geo_sphere(true) - { - for (int i = 1; i < argc; ++i) { - const std::string& token = argv[i]; - if (eq(token, "-n")) n = atoi(argv[++i]); - if (eq(token, "-m", "--write-matlab")) write_matlab = true; - if (eq(token, "--plane")) geo_sphere = false; - if (eq(token, "--xlate")) xlate = atof(argv[++i]); - if (eq(token, "--ylate")) ylate = atof(argv[++i]); - if (eq(token, "--angle")) angle = atof(argv[++i]); - } - - print(std::cout); - } - - void print (std::ostream& os) { - os << "n (-n): " << n << "\n" - << "write matlab (-m): " << write_matlab << "\n" - << "planar geometry (--plane): " << ! geo_sphere << "\n" - << "angle (--angle): " << angle << "\n" - << "xlate (--xlate): " << xlate << "\n" - << "ylate (--ylate): " << ylate << "\n"; - } -}; - -int main (int argc, char** argv) { - Input in(argc, argv); - int nerr = 0; - nerr += (in.geo_sphere ? - run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab) : - run(in.n, in.angle, in.xlate, in.ylate, in.write_matlab)); - std::cerr << (nerr ? 
"FAIL" : "PASS") << "ED\n"; -} diff --git a/siqk/si/testf.f90 b/siqk/si/testf.f90 deleted file mode 100644 index 8e0354b..0000000 --- a/siqk/si/testf.f90 +++ /dev/null @@ -1,87 +0,0 @@ -program main - implicit none - real*8 :: clip(3,4) = reshape( & - (/ -5.000000000000000000d-01, -5.000000000000000000d-01, 7.071067811865474617d-01, & - 5.000000000000000000d-01, -5.000000000000000000d-01, 7.071067811865474617d-01, & - 5.000000000000000000d-01, 5.000000000000000000d-01, 7.071067811865474617d-01, & - -5.000000000000000000d-01, 5.000000000000000000d-01, 7.071067811865474617d-01 /), (/3,4/)) - real*8 :: nml(3,4) = reshape( & - (/ 0.000000000000000000d+00, 8.164965809277260345d-01, 5.773502691896258421d-01, & - -8.164965809277260345d-01, 0.000000000000000000d+00, 5.773502691896258421d-01, & - 0.000000000000000000d+00, -8.164965809277260345d-01, 5.773502691896258421d-01, & - 8.164965809277260345d-01, 0.000000000000000000d+00, 5.773502691896258421d-01 /), (/3,4/)) - real*8 :: poly(3,4) = reshape( & - (/ 5.644736133437637804d-01, 5.644736133437637804d-01, 6.022782410790465946d-01, & - 3.127479665047677160d-01, -3.127479665047677160d-01, 8.968709042522592378d-01, & - -5.644736133437637804d-01, -5.644736133437637804d-01, 6.022782410790465946d-01, & - -3.127479665047677160d-01, 3.127479665047677160d-01, 8.968709042522592378d-01 /), (/3,4/)) - real*8 :: intersection(3,8) = reshape( & - (/ 3.342826900143281987d-01, 5.441369567663348894d-01, 7.695258640473731093d-01, & - 4.999999999999998890d-01, 5.000000000000001110d-01, 7.071067811865476838d-01, & - 5.441369567663348894d-01, 3.342826900143283098d-01, 7.695258640473731093d-01, & - 3.127479665047677160d-01, -3.127479665047677160d-01, 8.968709042522592378d-01, & - -3.342826900143280877d-01, -5.441369567663347784d-01, 7.695258640473732203d-01, & - -5.000000000000000000d-01, -4.999999999999998890d-01, 7.071067811865474617d-01, & - -5.441369567663348894d-01, -3.342826900143283098d-01, 7.695258640473731093d-01, & - 
-3.127479665047677160d-01, 3.127479665047677160d-01, 8.968709042522592378d-01 /), (/3,8/)) - real*8 :: plane_intersection(2,8) = reshape( & - (/ -5.000000000000000d-01, -3.397939048350230d-01, & - -3.127479665047677d-01, 3.127479665047677d-01, & - 3.397939048350230d-01, 5.000000000000000d-01, & - 5.000000000000000d-01, 5.000000000000000d-01, & - 5.000000000000000d-01, 3.397939048350231d-01, & - 3.127479665047677d-01, -3.127479665047677d-01, & - -3.397939048350231d-01, -5.000000000000000d-01, & - -5.000000000000000d-01, -5.000000000000000d-01 /), (/2,8/)) - integer :: polyt(4) = (/ 0, 0, 0, 0 /) - real*8 :: vo(3,20), rwrk(3,20) - integer :: vto(20), iwrk(20) - integer :: ncp = 4, np = 4, nvert = 20, no, info, i, j, cnt - real*8 :: err - - call clipagainstpolysphere(clip, ncp, nml, poly, np, vo, no, rwrk, nvert, info) - err = 0 - do i = 1,8 - do j = 1,3 - err = err + (vo(j,i) - intersection(j,i))**2 - end do - end do - err = sqrt(err) - if (no /= 8) err = err + 1 - print *, 'sh sphere err', err - - do i = 1,8 - do j = 1,3 - vo(j,i) = 0 - end do - end do - - call clipagainstpolyplane(clip, ncp, nml, poly, np, vo, no, rwrk, nvert, info) - err = 0 - do i = 1,8 - do j = 1,2 - err = err + (vo(j,i) - plane_intersection(j,mod(i+1,8)+1))**2 - end do - end do - err = sqrt(err) - if (no /= 8) err = err + 1 - print *, 'sh plane err', err - - do i = 1,8 - do j = 1,3 - vo(j,i) = 0 - end do - end do - - call iceclipagainstpolyplane(clip, ncp, nml, poly, polyt, np, vo, vto, no, & - rwrk, iwrk, nvert, info) - err = 0 - do i = 1,8 - do j = 1,2 - err = err + (vo(j,i) - plane_intersection(j,i))**2 - end do - end do - err = sqrt(err) - if (no /= 8) err = err + 1 - print *, 'ice plane err', err -end program main diff --git a/siqk/siqk.cpp b/siqk/siqk.cpp deleted file mode 100644 index 870bdbe..0000000 --- a/siqk/siqk.cpp +++ /dev/null @@ -1,153 +0,0 @@ -#include -#include "siqk_intersect.hpp" -#include "mexutil.hpp" -using namespace siqk; - -static void make_elems (const 
mexutil::ConstDenseMexMat& me, Idxs& e) { - for (size_t i = 0; i < me.n; ++i) - for (size_t j = 0; j < me.m; ++j) - e(i,j) = static_cast(me.a[me.m*i + j]) - 1; -} - -static void merror (const std::string& msg) { - Kokkos::finalize(); - mexErrMsgTxt(msg.c_str()); -} - -void mexFunction (int nlhs, mxArray** plhs, int nrhs, const mxArray** prhs) { - omp_set_num_threads(4); - Kokkos::initialize(); - using namespace mexutil; - std::string cmd = init_mex(nrhs, prhs); - try { - typedef PlaneGeometry geo; - if (cmd == "inside") { - if (nlhs != 1 || nrhs != 2) merror("in = inside(edge, points)"); - ConstDenseMexMat edge(prhs[0]); - reqorexit(edge.m == 3 && edge.n == 2); - ConstDenseMexMat points(prhs[1]); - reqorexit(points.m == 3); - DenseMexMat in(1, points.n); - plhs[0] = in.ma; - for (size_t i = 0; i < points.n; ++i) { - double en[3]; - geo::edge_normal(edge.a, edge.a + 3, en); - in.a[i] = geo::inside(points.a + points.m*i, edge.a, - const_cast(en)); - } - } else if (cmd == "intersect") { - // Assumption: Intersection exists. 
- if (nlhs != 1 || nrhs != 2) - merror("points = intersect(edge, edges)"); - ConstDenseMexMat edge(prhs[0]); - reqorexit(edge.m == 3 && edge.n == 2); - ConstDenseMexMat edges(prhs[1]); - DenseMexMat points(edge.m, edges.n), exists(1, edges.n); - plhs[0] = points.ma; - for (size_t i = 0; i < edges.n; ++i) { - double en[3]; - geo::edge_normal(edge.a, edge.a + 3, en); - geo::intersect(edges.a + 6*i, edges.a + 6*i + 3, edge.a, - const_cast(en), points.a + points.m*i); - } - } else if (cmd == "clip_against_edge") { - if (nlhs != 1 || nrhs != 2) - merror("vo = clip_against_edge(edge, vi)"); - ConstDenseMexMat edge(prhs[0]); - reqorexit(edge.m == 3 && edge.n == 2); - ConstDenseMexMat vi(prhs[1]); - reqorexit(vi.m == 3); - Vec3s vo("vo", test::max_nvert, 3); - int no; - double en[3]; - geo::edge_normal(edge.a, edge.a + 3, en); - sh::clip_against_edge(RawConstVec3s(vi.a, vi.n, vi.m), vi.n, - vo, no, edge.a, const_cast(en)); - DenseMexMat vom(vi.n, no); - memcpy(vom.a, vo.ptr_on_device(), vi.n*no*sizeof(double)); - plhs[0] = vom.ma; - } else if (cmd == "clip_against_poly") { - if (nlhs != 1 || nrhs != 2) - merror("vo = clip_against_poly(clip_polygon, vi)"); - ConstDenseMexMat mcp(prhs[0]); - reqorexit(mcp.m == 3); - ConstDenseMexMat vi(prhs[1]); - reqorexit(vi.m == 3); - RawConstVec3s cp(mcp.a, mcp.n, mcp.m); - Vec3s cens("cens", nslices(cp), 3); - for (int i = 0; i < nslices(cp); ++i) - geo::edge_normal(slice(cp,i), slice(cp, (i + 1) % nslices(cp)), slice(cens,i)); - Vec3s vo("vo", test::max_nvert, 3), wrk("wrk", test::max_nvert, 3); - int no; - sh::clip_against_poly(cp, cens, - RawConstVec3s(vi.a, vi.n, vi.m), vi.n, - vo, no, wrk); - DenseMexMat vom(vi.m, no); - memcpy(vom.a, vo.ptr_on_device(), vi.m*no*sizeof(double)); - plhs[0] = vom.ma; - } else if (cmd == "clip_against_poly_sphere") { - if (nlhs != 1 || nrhs != 2) - merror("vo = clip_against_poly_sphere(clip_polygon, vi)"); - ConstDenseMexMat mcp(prhs[0]); - reqorexit(mcp.m == 3); - ConstDenseMexMat vi(prhs[1]); - 
reqorexit(vi.m == 3); - RawConstVec3s cp(mcp.a, mcp.n, mcp.m); - Vec3s cens("cens", nslices(cp), 3); - for (int i = 0; i < nslices(cp); ++i) - SphereGeometry::edge_normal(slice(cp,i), slice(cp, (i + 1) % nslices(cp)), - slice(cens,i)); - Vec3s vo("vo", test::max_nvert, 3), wrk("wrk", test::max_nvert, 3); - int no; - sh::clip_against_poly(cp, cens, - RawConstVec3s(vi.a, vi.n, vi.m), vi.n, - vo, no, wrk); - DenseMexMat vom(vi.m, no); - memcpy(vom.a, vo.ptr_on_device(), vi.m*no*sizeof(double)); - plhs[0] = vom.ma; -#if 0 - } else if (cmd == "test_area_ot") { - // Test using oct-tree. - if (nlhs != 1 || nrhs != 4) - merror("area = test_area_ot(cp, ce, p, e)"); - ConstDenseMexMat mcp(prhs[0]); - reqorexit(mcp.m == 3); - ConstDenseMexMat mce(prhs[1]); - ConstDenseMexMat mp(prhs[2]); - reqorexit(mp.m == 3); - ConstDenseMexMat me(prhs[3]); - Array2D cp(3, mcp.n, mcp.a); - Array2D p(3, mp.n, mp.a); - Array2D ce(mce.m, mce.n), e(me.m, me.n); - make_elems(mce, ce); - make_elems(me, e); - DenseMexMat area(1, 1); - plhs[0] = area.ma; - area.a[0] = test::test_area_ot(cp, ce, p, e); - } else if (cmd == "test_area_ot_sphere") { - // Test using oct-tree. 
- if (nlhs != 1 || nrhs != 4) - merror("area = test_area_ot(cp, ce, p, e)"); - ConstDenseMexMat mcp(prhs[0]); - reqorexit(mcp.m == 3); - ConstDenseMexMat mce(prhs[1]); - ConstDenseMexMat mp(prhs[2]); - reqorexit(mp.m == 3); - ConstDenseMexMat me(prhs[3]); - Array2D cp(3, mcp.n, mcp.a); - Array2D p(3, mp.n, mp.a); - Array2D ce(mce.m, mce.n), e(me.m, me.n); - make_elems(mce, ce); - make_elems(me, e); - DenseMexMat area(1, 1); - plhs[0] = area.ma; - area.a[0] = test::test_area_ot(cp, ce, p, e); -#endif - } else { - merror((string("Invalid function: ") + cmd).c_str()); - } - } catch (const std::exception& e) { - merror(e.what()); - } - Kokkos::finalize(); -} diff --git a/siqk/siqk_defs.hpp b/siqk/siqk_defs.hpp index 0a0ae89..5653417 100644 --- a/siqk/siqk_defs.hpp +++ b/siqk/siqk_defs.hpp @@ -64,8 +64,9 @@ static double get_memusage () { return ru.ru_maxrss*scale; } #else -static inline int tic () { return 0; } -static inline double toc (const int&) { return 0; } +inline int tic () { return 0; } +inline double toc (const int&) { return 0; } +inline double get_memusage () { return 0; } #endif static void print_times (const std::string& name, const double* const parts, const int nparts) { diff --git a/siqk/siqk_geometry.hpp b/siqk/siqk_geometry.hpp index 0fc7fb6..c916406 100644 --- a/siqk/siqk_geometry.hpp +++ b/siqk/siqk_geometry.hpp @@ -6,6 +6,8 @@ namespace siqk { +// Vectors and points are 2D. Thus, if you're working on planes in 3D, project +// to a 2D space before calling these. struct PlaneGeometry { template KOKKOS_INLINE_FUNCTION static void scale (const Real& a, V v) { @@ -21,6 +23,11 @@ struct PlaneGeometry { x[0] = oma*u[0] + a*v[0]; x[1] = oma*u[1] + a*v[1]; } + template KOKKOS_INLINE_FUNCTION + static void axpy (const Real& a, const CV x, V y) { + y[0] += a*x[0]; + y[1] += a*x[1]; + } template KOKKOS_INLINE_FUNCTION static void edge_normal (const CV e1, const CV e2, V en) { @@ -64,31 +71,47 @@ struct PlaneGeometry { } //todo Handle non-convex case. 
- template + template KOKKOS_INLINE_FUNCTION - static Real calc_area (const TriangleQuadrature& , const CV3s& v, + static Real calc_area (const TriangleQuadrature& , const CV2s& v, const Int n) { return calc_area_formula(v, n); } - template + template KOKKOS_INLINE_FUNCTION - static Real calc_area_formula (const CV3s& v, const Int n) { + static Real calc_area_formula (const CV2s& v, const Int n) { Real area = 0; - for (Int i = 1, ilim = n - 1; i < ilim; ++i) { - Real v1[2], v2[2]; - v1[0] = v(i,0) - v(0,0); - v1[1] = v(i,1) - v(0,1); - v2[0] = v(i+1,0) - v(0,0); - v2[1] = v(i+1,1) - v(0,1); - const Real a = v1[0]*v2[1] - v1[1]*v2[0]; - area += a; - } + for (Int i = 1, ilim = n - 1; i < ilim; ++i) + area += calc_tri_jacobian(slice(v,0), slice(v,i), slice(v,i+1)); return 0.5*area; } + + template + KOKKOS_INLINE_FUNCTION + static void bary2coord (const CV v1, const CV v2, const CV v3, const CA alpha, + Real u[2]) { + for (Int k = 0; k < 2; ++k) u[k] = 0; + axpy(alpha[0], v1, u); + axpy(alpha[1], v2, u); + axpy(alpha[2], v3, u); + } + + template + KOKKOS_INLINE_FUNCTION + static Real calc_tri_jacobian (const CV v1, const CV v2, const CV v3) { + Real r1[2], r2[2]; + r1[0] = v2[0] - v1[0]; + r1[1] = v2[1] - v1[1]; + r2[0] = v3[0] - v1[0]; + r2[1] = v3[1] - v1[1]; + const Real a = r1[0]*r2[1] - r1[1]*r2[0]; + return a; + } }; -// All inputs and outputs are relative to the unit-radius sphere. +// All inputs and outputs are relative to the unit-radius sphere. Vectors and +// points are 3D. 
struct SphereGeometry { template KOKKOS_INLINE_FUNCTION static void cross (const CV a, const CV b, V c) { diff --git a/siqk/siqk_runtests.py b/siqk/siqk_runtests.py new file mode 100644 index 0000000..d5783b9 --- /dev/null +++ b/siqk/siqk_runtests.py @@ -0,0 +1,53 @@ +#!/usr/bin/python + +import os + +quick = True + +stride = 1 +biggest = 1111 + +xlates = [4.2*10**f for f in range(-17, 0, stride)] +xlates.append(0) + +ylates = [0] + +angles = xlates + +fails = [] +cnt = 0 + +# Test 1 +for n in [4, 20, 40, 79]: + if quick and n > 20: break + for angle in angles: + cmd = ('OMP_NUM_THREADS=8 ./a.out --testno 1 --angle {angle:1.15e} -n {n:d}'. + format(angle=angle, n=n)) + stat = os.system(cmd + ' |& grep PASSED &> /dev/null') + if stat: + fails.append(cmd) + else: + cnt += 1 + print len(fails) + +# Test 0 +for n in [4, 50, 511, biggest]: + if quick and n > 50: break + for angle in angles: + for xlate in xlates: + for ylate in ylates: + cmd = ('OMP_NUM_THREADS=8 ./a.out --testno 0 --xlate {xlate:1.15e} --ylate {ylate:1.14e} --angle {angle:1.15e} -n {n:d}'. + format(xlate=xlate, ylate=ylate, angle=angle, n=n)) + stat = os.system(cmd + ' |& grep PASSED &> /dev/null') + if stat: + fails.append(cmd) + else: + cnt += 1 + print len(fails) + +if len(fails) > 0: + print 'FAILED' + for f in fails: + print f +else: + print 'PASSED ({0:d})'.format(cnt) diff --git a/siqk/siqk_search.hpp b/siqk/siqk_search.hpp index b9eab6a..c9d70b8 100644 --- a/siqk/siqk_search.hpp +++ b/siqk/siqk_search.hpp @@ -3,6 +3,7 @@ #include "siqk_defs.hpp" #include "siqk_geometry.hpp" +#include namespace siqk { diff --git a/siqk/siqk_sqr.hpp b/siqk/siqk_sqr.hpp index 3d0f9fd..8a20cf7 100644 --- a/siqk/siqk_sqr.hpp +++ b/siqk/siqk_sqr.hpp @@ -173,6 +173,13 @@ void calc_sphere_to_ref ( } } +// Ref coords, packed (x,y), CCW, starting from (-1,-1). 
+KOKKOS_INLINE_FUNCTION +const Real* get_ref_vertices () { + static const Real c[] = {-1, -1, 1, -1, 1, 1, -1, 1}; + return c; +} + namespace test { struct Info { Int sum_nits, max_nits, nfails; @@ -240,7 +247,7 @@ class TestSphereToRefKernel { } }; -static Int test_sphere_to_ref (const ConstVec3s::HostMirror& p, +inline Int test_sphere_to_ref (const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e) { TestSphereToRefKernel k(p, e); Info info; diff --git a/siqk/siqk_test.cpp b/siqk/siqk_test.cpp index 1b37a59..7d73cdf 100644 --- a/siqk/siqk_test.cpp +++ b/siqk/siqk_test.cpp @@ -7,6 +7,8 @@ #include "siqk.hpp" using namespace siqk; +#define INSTANTIATE_PLANE + //> Code that will likely be moved to library files. template @@ -506,7 +508,7 @@ int main (int argc, char** argv) { nerr += run(in); else { #ifdef INSTANTIATE_PLANE - run(in); + nerr += run(in); #else Kokkos::abort("PlaneGeometry not instantiated."); #endif diff --git a/siqk/slmm/Makefile b/siqk/slmm/Makefile deleted file mode 100644 index 7dd003b..0000000 --- a/siqk/slmm/Makefile +++ /dev/null @@ -1,45 +0,0 @@ -opt= -CXX=g++-4.7 - -KOKKOS=/home/ambradl/lib/kokkos/cpu -SIQK=.. -LINK_LAPACK_BLAS=-llapack -lblas -# Optional. Comment out if no TPL available. -NETCDF=/home/ambradl/lib/netcdf - -# Should not have to change the rest. 
- -CXXFLAGS=$(opt) -Wall -pedantic -fopenmp -std=c++11 -I$(SIQK) -I$(KOKKOS)/include -DSIQK_TIME -LDFLAGS=-fopenmp -L$(KOKKOS)/lib -lkokkos -ldl - -ifdef NETCDF - CXXFLAGS+=-I$(NETCDF)/include -DSLMM_HAVE_NETCDF - LDFLAGS+=-L$(NETCDF)/lib -lnetcdf_c++ -lnetcdf -Wl,-rpath=$(NETCDF)/lib -endif - -SOURCES=slmm_mesh.cpp slmm_io.cpp slmm_time_int.cpp slmm_gallery.cpp slmm_util.cpp - -OBJECTS=$(SOURCES:.cpp=.o) - -.cpp.o: - $(CXX) $(CFLAGS) $(CXXFLAGS) -c $< -o $@ - -all: slmm_test slmmir - -slmm_test: $(OBJECTS) slmm_test.o - $(CXX) $(OBJECTS) slmm_test.o $(LDFLAGS) -o slmm_test - -slmmir: $(OBJECTS) slmmir.o - $(CXX) $(OBJECTS) slmmir.o $(LDFLAGS) $(LINK_LAPACK_BLAS) -o slmmir - -clean: - rm -f *.o slmm_test slmmir - -$(SIQK)/siqk.hpp: $(SIQK)/siqk_intersect.hpp $(SIQK)/siqk_geometry.hpp $(SIQK)/siqk_sqr.hpp $(SIQK)/siqk_search.hpp $(SIQK)/siqk_quadrature.hpp -slmm_test.o: slmm_defs.hpp slmm_mesh.hpp slmm_gll.hpp slmm_io.hpp slmm_time_int.hpp slmm_gallery.hpp $(SIQK)/siqk.hpp -slmmir.o: slmm_defs.hpp slmm_util.hpp slmm_mesh.hpp slmm_gll.hpp slmm_io.hpp slmm_time_int.hpp slmm_gallery.hpp $(SIQK)/siqk.hpp -slmm_mesh.o: slmm_mesh.hpp $(SIQK)/siqk.hpp -slmm_io.o: slmm_io.hpp -slmm_time_int.o: slmm_time_int.hpp -slmm_gallery.o: slmm_gallery.hpp -slmm_util.o: slmm_util.hpp diff --git a/siqk/slmm/slmm_debug.hpp b/siqk/slmm/slmm_debug.hpp deleted file mode 100644 index 010e549..0000000 --- a/siqk/slmm/slmm_debug.hpp +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef INCLUDE_SLMM_DEBUG_HPP -#define INCLUDE_SLMM_DEBUG_HPP - -#include -#include - -namespace slmm { - -template -void write_matlab (const std::string& name, const CV3s& p) { - std::cout << "mat=1; " << name << " = ["; - for (Int ip = 0; ip < nslices(p); ++ip) { - for (Int k = 0; k < szslice(p); ++k) - std::cout << " " << p(ip,k); - std::cout << ";"; - } - std::cout << "].';\n"; -} - -template -void write_matlab (const std::string& name, const CV3s& p, const CIs& e) { - printf("mat=1; %s.p = [", name.c_str()); - for (Int ip = 0; 
ip < nslices(p); ++ip) - printf(" %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); - printf("].';\n"); - printf("mat=1; %s.n = [", name.c_str()); - for (Int ie = 0; ie < nslices(e); ++ie) { - for (Int k = 0; k < szslice(e); ++k) - printf(" %d", e(ie,k)+1); - printf(";"); - } - printf("].';\n"); -} - -} // namespace slmm - -#endif diff --git a/siqk/slmm/slmm_defs.hpp b/siqk/slmm/slmm_defs.hpp deleted file mode 100644 index e7409b2..0000000 --- a/siqk/slmm/slmm_defs.hpp +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef INCLUDE_SLMM_DEFS_HPP -#define INCLUDE_SLMM_DEFS_HPP - -#include "siqk.hpp" - -namespace slmm { -using siqk::Int; -using siqk::Real; -typedef Int Size; - -namespace ko = Kokkos; -using geometry = siqk::SphereGeometry; - -using siqk::Vec3s; -using siqk::ConstVec3s; -using siqk::Idxs; -using siqk::ConstIdxs; -typedef ko::View IdxArray; -typedef ko::View ConstIdxArray; -typedef ko::View RealArray; -typedef ko::View ConstRealArray; -typedef ko::View RealArray2; -typedef ko::View ConstRealArray2; - -// A 2D array A can be thought of as having nslices(A) rows and szslice(A) -// columns. A slice can be obtained by -// auto ak = slice(A, k); -// We use this format for arrays of vertices and adjacency arrays, for -// example. In most or all cases, the intention is to parallelize over slices, -// so a Kokkos operator() will do work on a particular slice. 
-using siqk::nslices; -using siqk::szslice; -using siqk::slice; -} // namespace slmm - -#endif diff --git a/siqk/slmm/slmm_gallery.cpp b/siqk/slmm/slmm_gallery.cpp deleted file mode 100644 index 3cc4956..0000000 --- a/siqk/slmm/slmm_gallery.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include "slmm_gallery.hpp" - -namespace slmm { -namespace gallery { - -const char* InitialCondition::inputs[] = - {"xyztrig", "gaussianhills", "cosinebells", "slottedcylinders", - "correlatedcosinebells"}; - -const char* WindFieldType::inputs[] = - {"dcmip1d3ll", "nondivergent", "divergent", "rotate", "nondivergenthack"}; - -} // namespace gallery -} // namespace slmm diff --git a/siqk/slmm/slmm_gallery.hpp b/siqk/slmm/slmm_gallery.hpp deleted file mode 100644 index c4214f5..0000000 --- a/siqk/slmm/slmm_gallery.hpp +++ /dev/null @@ -1,312 +0,0 @@ -#ifndef INCLUDE_SLMM_GALLERY_HPP -#define INCLUDE_SLMM_GALLERY_HPP - -#include "slmm_defs.hpp" -#include "slmm_time_int.hpp" - -namespace slmm { -namespace gallery { - -class OdeFnBasicRecorder { - mutable int ne_; - bool xyz_form_; -public: - OdeFnBasicRecorder () : ne_(0), xyz_form_(false) {} - void record (const Real t, const Real* const y) const { ++ne_; } - int ne () const { return ne_; } - void set_xyz_form (const bool use_xyz_form) { xyz_form_ = use_xyz_form; } - bool use_xyz_form () const { return xyz_form_; } -}; - -// From Lauritzen et al, A standard test case suite for two-dimensional linear -// transport on the sphere, Geosci. Model Dev., 2012. 
-class InitialCondition { - static const char* inputs[]; - - static inline Real GH (const Real x, const Real y, const Real z, - const Real xi, const Real yi, const Real zi) { - const Real h_max = 0.95, b = 5; - return h_max*std::exp(-b*slmm::square(x - xi) + slmm::square(y - yi) + - slmm::square(z - zi)); - } - - static inline Real CB (const Real ri, const Real r) { - const Real h_max = 1; - return 0.5*h_max*(1 + std::cos(M_PI*ri/r)); - } - -public: - enum Shape { - XYZTrig, GaussianHills, CosineBells, SlottedCylinders, - CorrelatedCosineBells - }; - - static Shape from_string (const std::string& si) { - std::string s(si); - slmm::tolower(s); - if (s == inputs[0]) return XYZTrig; - if (s == inputs[1]) return GaussianHills; - if (s == inputs[2]) return CosineBells; - if (s == inputs[3]) return SlottedCylinders; - if (s == inputs[4]) return CorrelatedCosineBells; - throw std::runtime_error(si + " is not an initial condition."); - } - - static void init (const Shape shape, const Size n, const Real* const lat, - const Real* const lon, Real* const u) { - const Real lon1 = 5*(M_PI/6), lat1 = 0, lon2 = 7*(M_PI/6), lat2 = 0; - Real x1, y1, z1, x2, y2, z2; - slmm::ll2xyz(lat1, lon1, x1, y1, z1); - slmm::ll2xyz(lat2, lon2, x2, y2, z2); - switch (shape) { - case XYZTrig: { - for (Size i = 0; i < n; ++i) { - Real x, y, z; - slmm::ll2xyz(lat[i], lon[i], x, y, z, 1); - u[i] = std::sin(3*x)*std::sin(3*y)*std::sin(4*z); - } - } break; - case GaussianHills: { - for (Size i = 0; i < n; ++i) { - Real x, y, z; - slmm::ll2xyz(lat[i], lon[i], x, y, z, 1); - u[i] = GH(x, y, z, x1, y1, z1) + GH(x, y, z, x2, y2, z2); - } - } break; - case CosineBells: { - const Real r = 0.5, b = 0.1, c = 0.9; - for (Size i = 0; i < n; ++i) { - const Real r1 = slmm::great_circle_dist(lat[i], lon[i], lat1, lon1); - Real h = 0; - if (r1 < r) - h = CB(r1, r); - else { - const Real r2 = slmm::great_circle_dist(lat[i], lon[i], lat2, lon2); - if (r2 < r) - h = CB(r2, r); - } - u[i] = b + c*h; - } - } break; - 
case SlottedCylinders: { - const Real b = 0.1, c = 1, R = 1, r = 0.5*R, lon_thr = r/(6*R), - lat_thr = 5*(r/(12*R)); - for (Size i = 0; i < n; ++i) { - const Real r1 = slmm::great_circle_dist(lat[i], lon[i], lat1, lon1); - if (r1 <= r) { - if (std::abs(lon[i] - lon1) >= lon_thr) { - u[i] = c; - continue; - } - if (std::abs(lon[i] - lon1) < lon_thr && lat[i] - lat1 < -lat_thr) { - u[i] = c; - continue; - } - } - const Real r2 = slmm::great_circle_dist(lat[i], lon[i], lat2, lon2); - if (r2 <= r) { - if (std::abs(lon[i] - lon2) >= lon_thr) { - u[i] = c; - continue; - } - if (std::abs(lon[i] - lon2) < lon_thr && lat[i] - lat2 > lat_thr) { - u[i] = c; - continue; - } - } - u[i] = b; - } - } break; - case CorrelatedCosineBells: { - const Real a = -0.8, b = 0.9; - init(CosineBells, n, lat, lon, u); - for (Size i = 0; i < n; ++i) - u[i] = a*slmm::square(u[i]) + b; - } break; - default: assert(0); - } - } - - static std::string get_inputs () - { return slmm::format_strings_as_list(inputs, 5); } -}; - -// Convert from (u,v), where u is velocity along latitude and v is velocity -// along longitude, to (x,y,z), which is velocity in the global cartesian -// coordinate system. Add a w (local vertical) component to push the position -// (X,Y,Z) back to the unit sphere. -inline void uv2xyz ( - const Real X, const Real Y, const Real Z, // position - const Real u, const Real v, // velocity in tangent coord system - Real& x, Real& y, Real& z) // velocity in global coord system -{ - // r should be 1 but will numerically drift, so measure it ... - const Real r = std::sqrt(X*X + Y*Y + Z*Z); - // ... and then add a local vertical velocity to project back to the sphere. - const Real w = (1 - r)/slmm::consts::earth_radius_m; - Real R[9]; // Row major. - // The local vertical is just the position vector. - R[2] = X/r; R[5] = Y/r; R[8] = Z/r; - // The local along-latitude vector. 
- R[0] = -Y; R[3] = X; R[6] = 0; - const Real den = std::sqrt(R[0]*R[0] + R[3]*R[3]); - R[0] /= den; R[3] /= den; - // Local vertical x along-latitude. - R[1] = R[5]*R[6] - R[8]*R[3]; - R[4] = R[8]*R[0] - R[2]*R[6]; - R[7] = R[2]*R[3] - R[5]*R[0]; - // Transform. - x = R[0]*u + R[1]*v + R[2]*w; - y = R[3]*u + R[4]*v + R[5]*w; - z = R[6]*u + R[7]*v + R[8]*w; -} - -// Integrate the ODE in lat-lon space. Not good numerically in the lon direction -// because of the poles. -struct Dcmip1d3llOdeFn : public OdeFnBasicRecorder { - bool eval (const Real t, const Real* const d, Real* const f) const { - assert ( ! use_xyz_form()); - const Real - a = M_PI/6, - a_ref = slmm::consts::earth_radius_m, - tau = 1036800, - u0 = 2*M_PI*a_ref/tau, - sina = std::sin(a), - cosa = std::sqrt(1 - slmm::square(sina)), - lat = d[0], - lon = d[1], - sinp = std::sin(lat), - cosp = std::cos(lat), - sinl = std::sin(lon), - cosl = std::cos(lon); - // In what follows, - // u = u0*(cosp*cosa + sinp*cosl*sina) - // v = -u0*sinl*sina - // w = 0 - // lat_t = slmm::m2radlat(v) - // lon_t = slmm::m2radlon(lat, u). - // For numerical reasons, write this a little differently. - const Real v = -u0*sinl*sina; - f[0] = slmm::m2radlat(v); - // tan(phi) is singular at the pole. We could introduce a cutoff so the wind - // speed is not infinite, but for now it does not matter. - f[1] = slmm::m2radlat(u0*(slmm::sign(cosp)*cosa + - sinp*cosl*sina/std::abs(cosp))); - return true; - } -}; - -// Also from Lauritzen et al. 
-struct NonDivergentWindField : public OdeFnBasicRecorder { - bool eval (const Real t, const Real* const d, Real* const f) const { - Real theta, lambda; - if (use_xyz_form()) - xyz2ll(d[0], d[1], d[2], theta, lambda); - else { - theta = d[0]; // latitude - lambda = d[1]; // longitude - } - const Real - T = slmm::day2sec(12), - R = slmm::consts::earth_radius_m, - lambda_p = lambda - 2*M_PI*t/T, - costh = std::cos(theta), - cost = std::cos(M_PI*t/T); - // v - f[0] = 10*R/T*std::sin(2*lambda_p)*costh*cost; - // u - f[1] = R/T*(10*slmm::square(std::sin(lambda_p))*std::sin(2*theta)*cost + - 2*M_PI*costh); - if (use_xyz_form()) - uv2xyz(d[0], d[1], d[2], f[1]/R, f[0]/R, f[0], f[1], f[2]); - else { - f[0] = slmm::m2radlat(f[0]); - f[1] = slmm::m2radlon(theta, f[1]); - } - return true; - } -}; - -// Also from Lauritzen et al. -struct DivergentWindField : public OdeFnBasicRecorder { - bool eval (const Real t, const Real* const d, Real* const f) const { - Real theta, lambda; - if (use_xyz_form()) - xyz2ll(d[0], d[1], d[2], theta, lambda); - else { - theta = d[0]; // latitude - lambda = d[1]; // longitude - } - const Real - T = slmm::day2sec(12), - R = slmm::consts::earth_radius_m, - lambda_p = lambda - 2*M_PI*t/T, - costh = std::cos(theta), - cost = std::cos(M_PI*t/T); - // v - f[0] = 2.5*R/T*std::sin(lambda_p)*slmm::cube(costh)*cost; - // u - f[1] = R/T*(-5*slmm::square(std::sin(0.5*lambda_p))*std::sin(2*theta)* - slmm::square(costh)*cost + 2*M_PI*costh); - if (use_xyz_form()) - uv2xyz(d[0], d[1], d[2], f[1]/R, f[0]/R, f[0], f[1], f[2]); - else { - f[0] = slmm::m2radlat(f[0]); - f[1] = slmm::m2radlon(theta, f[1]); - } - return true; - } -}; - -struct NonDivergentWindFieldHack : public OdeFnBasicRecorder { - bool eval (const Real t, const Real* const d, Real* const f) const { - Real theta, lambda; - if (use_xyz_form()) - xyz2ll(d[0], d[1], d[2], theta, lambda); - else { - theta = d[0]; // latitude - lambda = d[1]; // longitude - } - const Real - T = slmm::day2sec(12), - R = 
slmm::consts::earth_radius_m, - lambda_p = lambda, - costh = std::cos(theta), - cost = std::cos(M_PI*t/T); - // v - f[0] = 10*R/T*std::sin(2*lambda_p)*costh*cost; - // u - f[1] = 10*R/T*slmm::square(std::sin(lambda_p))*std::sin(2*theta)*cost; - if (use_xyz_form()) - uv2xyz(d[0], d[1], d[2], f[1]/R, f[0]/R, f[0], f[1], f[2]); - else { - f[0] = slmm::m2radlat(f[0]); - f[1] = slmm::m2radlon(theta, f[1]); - } - return true; - } -}; - -struct WindFieldType { - static const char* inputs[]; -public: - enum Enum { Dcmip1d3ll, NonDivergentWindField, DivergentWindField, Rotate, - NonDivergentWindFieldHack }; - static Enum from_string (const std::string& si) { - std::string s(si); - slmm::tolower(s); - if (s == inputs[0]) return Dcmip1d3ll; - if (s == inputs[1]) return NonDivergentWindField; - if (s == inputs[2]) return DivergentWindField; - if (s == inputs[3]) return Rotate; - if (s == inputs[4]) return NonDivergentWindFieldHack; - throw std::runtime_error(si + " is not an ODE function."); - } - static std::string get_inputs () - { return slmm::format_strings_as_list(inputs, 4); } -}; - -} // namespace gallery -} // namespace slmm - -#endif diff --git a/siqk/slmm/slmm_gll.hpp b/siqk/slmm/slmm_gll.hpp deleted file mode 100644 index b653686..0000000 --- a/siqk/slmm/slmm_gll.hpp +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDE_SLMM_GLL_HPP -#define INCLUDE_SLMM_GLL_HPP - -#include "slmm_defs.hpp" - -namespace slmm { - -class GLL { - const Real oo3 = 1.0/3.0; - const Real to3 = 2.0/3.0; - const Real sqrt5 = std::sqrt(5.0); - const Real oo6 = 1.0/6.0; - const Real np2_coord[2] = {-1.0, 1.0}; - const Real np2_wt[2] = {1.0, 1.0}; - const Real np3_coord[3] = {-1.0, 0.0, 1.0}; - const Real np3_wt[3] = {oo3, 2.0 - to3, oo3}; - const Real np4_coord[4] = {-1.0, -1.0/sqrt5, 1.0/sqrt5, 1.0}; - const Real np4_wt[4] = {oo6, 1.0 - oo6, 1.0 - oo6, oo6}; - -public: - enum { max_np = 4 }; - - KOKKOS_INLINE_FUNCTION GLL () {} - - KOKKOS_INLINE_FUNCTION - void get_coef (const int np, const Real*& 
coord, const Real*& wt) { - switch (np) { - case 2: - coord = np2_coord; - wt = np2_wt; - break; - case 3: - coord = np3_coord; - wt = np3_wt; - break; - case 4: - coord = np4_coord; - wt = np4_wt; - break; - default: - ko::abort("GLL::get_coef: order not supported."); - } - } - - // x in [-1, 1]. - KOKKOS_INLINE_FUNCTION - void eval (const int np, const Real& x, Real* const ge) const { - switch (np) { - case 2: { - ge[0] = 0.5*(1.0 - x); - ge[1] = 0.5*(1.0 + x); - } break; - case 3: { - const Real x2 = x*x; - ge[0] = 0.5*(x2 - x); - ge[1] = 1.0 - x2; - ge[2] = 0.5*(x2 + x); - } break; - case 4: { - const Real oo8 = 1.0/8.0; - const Real x2 = x*x; - ge[0] = (1.0 - x)*(5.0*x2 - 1.0)*oo8; - ge[1] = -sqrt5*oo8*(sqrt5 - 5.0*x)*(x2 - 1.0); - ge[2] = -sqrt5*oo8*(sqrt5 + 5.0*x)*(x2 - 1.0); - ge[3] = (1.0 + x)*(5.0*x2 - 1.0)*oo8; - } break; - default: - ko::abort("GLL::eval: order not supported."); - } - } -}; - -} // namespace slmm - -#endif diff --git a/siqk/slmm/slmm_io.cpp b/siqk/slmm/slmm_io.cpp deleted file mode 100644 index ca6062e..0000000 --- a/siqk/slmm/slmm_io.cpp +++ /dev/null @@ -1,314 +0,0 @@ -#include "slmm_io.hpp" - -#include - -#ifdef SLMM_HAVE_NETCDF -# include -#endif - -namespace slmm { -namespace io { - -NetcdfWriter::NetcdfWriter ( - const Vec3s::HostMirror& p, const Idxs::HostMirror& c2n, - const std::string& out_fn, const Int np, const Int monotone_type) -{ - init(p, c2n, out_fn, np, monotone_type); -} - -void NetcdfWriter::init ( - const Vec3s::HostMirror& p, const Idxs::HostMirror& c2n, - const std::string& out_fn, const Int np, const Int monotone_type) -{ -#ifdef SLMM_HAVE_NETCDF - nn_ = nslices(p); - nc_ = nslices(c2n); - - time_idx_ = 0; - time_ = 0; - define_done_ = false; - - //todo Do I need this? NcError error(NcError::silent_nonfatal); - ncf_ = std::make_shared(out_fn.c_str(), NcFile::Replace); - if ( ! 
ncf_->is_valid()) - throw std::runtime_error(std::string("Could not open file ") + out_fn + - " for writing."); - - // Thank you, TempestRemap, for figuring out the Exodus stuff. - static const int len_str = 33; - auto nodes_dim = ncf_->add_dim("num_nodes", nn_); - auto len_str_dim = ncf_->add_dim("len_string", len_str); - auto time_dim = ncf_->add_dim("time_step"); - auto cells_dim = ncf_->add_dim("num_elem", nc_); - auto num_el_blk_dim = ncf_->add_dim("num_el_blk", 1); - auto nodes_per_cell_dim = ncf_->add_dim("num_nod_per_el1", szslice(c2n)); - auto att_block1_dim = ncf_->add_dim("num_att_in_blk1", 1); - ncf_->add_dim("len_line", 81); - ncf_->add_dim("num_dim", 3); - ncf_->add_dim("num_el_in_blk1", nc_); - ncf_->add_att("api_version", 4.98f); - ncf_->add_att("version", 4.98f); - ncf_->add_att("floating_point_word_size", 8); - ncf_->add_att("file_size", 1); - ncf_->add_att("title", "slmm::io::NetcdfWriter::init"); - - ncf_->add_var("time_whole", ncDouble, time_dim); - ncf_->add_var("eb_names", ncChar, num_el_blk_dim, len_str_dim); - { // elem map - std::vector elem(nc_); - for (Int i = 0; i < nc_; ++i) elem[i] = i+1; - ncf_->add_var("elem_map", ncInt, cells_dim)->put(elem.data(), nc_); - } - { // c2n - auto v = ncf_->add_var("connect1", ncInt, cells_dim, nodes_per_cell_dim); - v->add_att("elem_type", "SHELL4"); - std::vector connect(nc_*szslice(c2n)); - for (Int i = 0, k = 0; i < nslices(c2n); ++i) - for (Int j = 0; j < szslice(c2n); ++j, ++k) - connect[k] = c2n(i,j) + 1; - v->set_cur(0, 0); - v->put(connect.data(), nc_, szslice(c2n)); - } - { // coords - std::vector buf(nn_); - double* const d = buf.data(); - for (Int i = 0; i < nn_; ++i) d[i] = p(i,0); - ncf_->add_var("coordx", ncDouble, nodes_dim)->put(d, nn_); - for (Int i = 0; i < nn_; ++i) d[i] = p(i,1); - ncf_->add_var("coordy", ncDouble, nodes_dim)->put(d, nn_); - for (Int i = 0; i < nn_; ++i) d[i] = p(i,2); - ncf_->add_var("coordz", ncDouble, nodes_dim)->put(d, nn_); - } - { // various other things - int 
one = 1; - ncf_->add_var("eb_status", ncInt, num_el_blk_dim)->put(&one, 1); - auto v = ncf_->add_var("eb_prop1", ncInt, num_el_blk_dim); - v->put(&one, 1); - v->add_att("name", "ID"); - std::vector buf(nc_, 1.0); - v = ncf_->add_var("attrib1", ncDouble, cells_dim, att_block1_dim); - v->put(buf.data(), nc_, 1); - } - - add_att("np", np); - add_att("monotone_type", monotone_type); -#else - std::cerr << "Warning: NetcdfWriter::init: Netcdf was not compiled in.\n"; -#endif -} - -template -void NetcdfWriter::add_att (const char* name, const T& val) { -#ifdef SLMM_HAVE_NETCDF - ncf_->add_att(name, val); -#endif -} - -void NetcdfWriter::add_nodal_field (const std::string& name, const Int dim) { -#ifdef SLMM_HAVE_NETCDF - if (define_done_) - throw std::runtime_error( - "Can't add a new field after end_definition() was called."); - const auto& it = name2field_.find(name); - if (it != name2field_.end()) - throw std::runtime_error("Field name was already added."); - name2field_[name] = FieldIdx(FieldType::node, node_fields_.size()); - node_fields_.push_back(Field(name, dim)); -#endif -} - -void NetcdfWriter::add_element_field (const std::string& name, const Int dim) { -#ifdef SLMM_HAVE_NETCDF - if (define_done_) - throw std::runtime_error( - "Can't add a new field after end_definition() was called."); - const auto& it = name2field_.find(name); - if (it != name2field_.end()) - throw std::runtime_error("Field name was already added."); - name2field_[name] = FieldIdx(FieldType::elem, elem_fields_.size()); - elem_fields_.push_back(Field(name, dim)); -#endif -} - -void NetcdfWriter::end_definition () { -#ifdef SLMM_HAVE_NETCDF - NcDim* const str_d = ncf_->get_dim("len_string"); - NcDim* const time_d = ncf_->get_dim("time_step"); - - do { - Int num_vars = 0; - for (auto f: node_fields_) - num_vars += static_cast(f.ncvars.size()); - if ( ! 
num_vars) break; - - NcDim* const nodes_d = ncf_->get_dim("num_nodes"); - NcDim* const nv_d = ncf_->add_dim("num_nod_var", num_vars); - NcVar* const name_v = ncf_->add_var("name_nod_var", ncChar, nv_d, str_d); - Int varno = 1; - for (std::size_t i = 0; i < node_fields_.size(); ++i) { - Field& f = node_fields_[i]; - if (f.ncvars.size() == 1) { - name_v->set_cur(i, 0); - name_v->put(f.name.c_str(), 1, f.name.size()); - - std::stringstream ss; - ss << "vals_nod_var" << varno++; - f.ncvars[0] = ncf_->add_var(ss.str().c_str(), ncDouble, time_d, nodes_d); - } else { - //todo dim > 1 - throw std::runtime_error("dim > 1 not impl'ed."); - } - } - } while (0); - - do { - Int num_vars = 0; - for (auto f: elem_fields_) - num_vars += static_cast(f.ncvars.size()); - if ( ! num_vars) break; - - NcDim* const elem_d = ncf_->get_dim("num_elem"); - NcDim* const ev_d = ncf_->add_dim("num_elem_var", num_vars); - NcVar* const name_v = ncf_->add_var("name_elem_var", ncChar, ev_d, str_d); - Int varno = 1; - for (std::size_t i = 0; i < elem_fields_.size(); ++i) { - Field& f = elem_fields_[i]; - if (f.ncvars.size() == 1) { - name_v->set_cur(i, 0); - name_v->put(f.name.c_str(), 1, f.name.size()); - - std::stringstream ss; - ss << "vals_elem_var" << varno++ << "eb1"; - f.ncvars[0] = ncf_->add_var(ss.str().c_str(), ncDouble, time_d, elem_d); - } else { - //todo dim > 1 - throw std::runtime_error("dim > 1 not impl'ed."); - } - } - } while (0); - - time_ = -1; - time_idx_ = -1; - time_v_ = ncf_->get_var("time_whole"); - - define_done_ = true; -#endif -} - -static void check_state (const Int time_idx, const bool define_done) { -#ifdef SLMM_HAVE_NETCDF - if (time_idx == -1) - throw std::runtime_error( - "Need to advance_time_to before writing fields."); - if ( ! 
define_done) - throw std::runtime_error( - "Can't write a field until end_definition() is called."); -#endif -} - -void NetcdfWriter::write_field (const std::string& name, const double* field) { -#ifdef SLMM_HAVE_NETCDF - check_state(time_idx_, define_done_); - const auto& it = name2field_.find(name); - if (it == name2field_.end()) - throw std::runtime_error("Invalid field."); - Field& f = it->second.first == FieldType::node ? - node_fields_[it->second.second] : elem_fields_[it->second.second]; - assert(f.ncvars.size() == 1); //todo dim > 1 - f.ncvars[0]->set_rec(time_idx_); - f.ncvars[0]->put_rec(field); - ncf_->sync(); -#endif -} - -void NetcdfWriter::advance_time_to (const double t) { -#ifdef SLMM_HAVE_NETCDF - ++time_idx_; - if (t <= time_) - throw std::runtime_error("t must be > current time."); - time_ = t; - time_v_->set_rec(time_idx_); - time_v_->put_rec(&time_); -#endif -} - -NetcdfWriter::Field::Field (const std::string& name, const Int dim) - : name(name), ncvars(dim, nullptr) -{} - -void get_field_vals (const NcFile& ncr, FieldType::Enum ft, const int field_idx, - const int time_idx, double* vals) { -#ifdef SLMM_HAVE_NETCDF - std::stringstream ss; - int nvals; - if (ft == FieldType::node) { - ss << "vals_nod_var" << field_idx + 1; - NcDim* const nodes_dim = ncr.get_dim("num_nodes"); - nvals = nodes_dim->size(); - } else { - ss << "vals_elem_var" << field_idx + 1 << "eb1"; - NcDim* const cell_dim = ncr.get_dim("num_elem"); - nvals = cell_dim->size(); - } - NcVar* const f_v = ncr.get_var(ss.str().c_str()); - f_v->set_cur(time_idx, 0); - f_v->get(vals, 1, nvals); -#endif -} - -void get_field_names ( - const NcFile& ncr, std::vector& node_names, - std::vector& elem_names) -{ -#ifdef SLMM_HAVE_NETCDF - NcDim* const str_d = ncr.get_dim("len_string"); - std::vector str(str_d->size()); - str.back() = '\0'; - do { - NcDim* const nv_d = ncr.get_dim("num_nod_var"); - if ( ! 
nv_d) break; - NcVar* const name_v = ncr.get_var("name_nod_var"); - for (int i = 0; i < nv_d->size(); ++i) { - name_v->set_cur(i, 0); - name_v->get(str.data(), 1, str.size()); - node_names.push_back(std::string(str.data())); - } - } while (0); - do { - NcDim* const ev_d = ncr.get_dim("num_elem_var"); - if ( ! ev_d) break; - NcVar* const name_v = ncr.get_var("name_elem_var"); - for (int i = 0; i < ev_d->size(); ++i) { - name_v->set_cur(i, 0); - name_v->get(str.data(), 1, str.size()); - elem_names.push_back(std::string(str.data())); - } - } while (0); -#endif -} - -#ifdef SLMM_HAVE_NETCDF -static NcValues* get_att_val (const NcFile& ncr, const char* name) { - NcAtt* att; - NcValues* vals; - if ( ! (att = ncr.get_att(name)) || - ! (vals = att->values())) - throw std::runtime_error(std::string("No attribute ") + name); - delete att; - return vals; -} -#endif - -Int get_np (const NcFile& ncr) { -#ifdef SLMM_HAVE_NETCDF - NcValues* vals = get_att_val(ncr, "np"); - const Int np = vals->as_int(0); - delete vals; - return np; -#else - return 0; -#endif -} - -} // namespace io -} // namespace slmm diff --git a/siqk/slmm/slmm_io.hpp b/siqk/slmm/slmm_io.hpp deleted file mode 100644 index 83728a4..0000000 --- a/siqk/slmm/slmm_io.hpp +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDE_SLMM_IO_HPP -#define INCLUDE_SLMM_IO_HPP - -#include "slmm_defs.hpp" - -#include -#include -#include - -class NcFile; -class NcVar; - -namespace slmm { -namespace io { - -struct FieldType { enum Enum { node, elem }; }; - -class NetcdfWriter { - struct Field { - std::string name; - std::vector ncvars; - Field(const std::string& name, const Int dim); - }; - - Size nn_, nc_; - Int time_idx_; - double time_; - bool define_done_; - std::shared_ptr ncf_; - NcVar* time_v_; - std::vector node_fields_, elem_fields_; - typedef std::pair FieldIdx; - std::map name2field_; - - void init(const Vec3s::HostMirror& p, const Idxs::HostMirror& c2n, - const std::string& out_fn, const Int np, const Int monotone_type); - 
template void add_att(const char* name, const T& val); - -public: - // Open a Netcdf file for writing. - NetcdfWriter(const Vec3s::HostMirror& p, const Idxs::HostMirror& c2n, - const std::string& out_fn, - const Int np = 4, const Int monotone_type = 0); - - // Add fields on the mesh to the file. - void add_nodal_field(const std::string& name, const Int dim = 1); - void add_element_field(const std::string& name, const Int dim = 1); - - // After adding all the fields, end the definition phase, providing the first - // time at which fields will be recorded. - void end_definition(); - - // Advance time forward before writing fields for a time step. - void advance_time_to(const double t); - - // If multidimensional, the fast index is the mesh dimension. - void write_field(const std::string& name, const double* field); -}; - -// vals must be preallocated. -void get_field_vals(const NcFile& ncr, FieldType::Enum ft, const int field_idx, - const int time_idx, double* vals); - -void get_field_names( - const NcFile& ncr, std::vector& nodal_field_names, - std::vector& element_field_names); - -Int get_np(const NcFile& ncr); - -} // namespace io -} // namespace slmm - -#endif diff --git a/siqk/slmm/slmm_mesh.cpp b/siqk/slmm/slmm_mesh.cpp deleted file mode 100644 index 93e18ca..0000000 --- a/siqk/slmm/slmm_mesh.cpp +++ /dev/null @@ -1,486 +0,0 @@ -#include "slmm_mesh.hpp" -#include "slmm_gll.hpp" -#include "slmm_util.hpp" - -#include -#include - -namespace slmm { -namespace mesh { - -static void make_equiangular_nodes (const Int ne, std::vector& x) { - const Real d = 1.0 / std::sqrt(3.0); - const Real dtheta = 0.5*M_PI / ne; - x.resize(ne+1); - if (ne % 2 == 1) { - const Int n = (ne + 1) / 2; - for (Int i = 0; i < n; ++i) - x[n + i] = d*std::tan((i + 0.5)*dtheta); - for (Int i = 0; i < n; ++i) - x[n - 1 - i] = -x[n + i]; - } else { - const Int n = ne / 2; - x[n] = 0; - for (Int i = 1; i <= n; ++i) - x[n + i] = d*std::tan(i*dtheta); - for (Int i = 1; i <= n; ++i) - x[n - i] = -x[n 
+ i]; - } -} - -static void make_planar_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, - const Int n) { - std::vector x; - make_equiangular_nodes(n, x); - ko::resize(e, n*n, 4); - ko::resize(p, (n+1)*(n+1), 3); - for (Int iy = 0; iy < n+1; ++iy) - for (Int ix = 0; ix < n+1; ++ix) { - const auto idx = (n+1)*iy + ix; - p(idx,0) = x[ix]; - p(idx,1) = x[iy]; - p(idx,2) = 0; - } - for (Int iy = 0; iy < n; ++iy) - for (Int ix = 0; ix < n; ++ix) { - const auto idx = n*iy + ix; - e(idx,0) = (n+1)*iy + ix; - e(idx,1) = (n+1)*iy + ix+1; - e(idx,2) = (n+1)*(iy+1) + ix+1; - e(idx,3) = (n+1)*(iy+1) + ix; - } -} - -template -static void rotate (const Real R[9], V p) { - const Real x = p[0], y = p[1], z = p[2]; - p[0] = R[0]*x + R[1]*y + R[2]*z; - p[1] = R[3]*x + R[4]*y + R[5]*z; - p[2] = R[6]*x + R[7]*y + R[8]*z; -} - -template -static void translate (const Real xlate[3], V p) { - for (Int i = 0; i < 3; ++i) p[i] += xlate[i]; -} - -static void transform_planar_mesh (const Real R[9], const Real xlate[3], - Vec3s::HostMirror& p) { - for (Int i = 0; i < nslices(p); ++i) { - rotate(R, slice(p, i)); - translate(xlate, slice(p, i)); - } -} - -// Remove vertices marked unused and adjust numbering. -static void remove_unused_vertices (Vec3s::HostMirror& p, Idxs::HostMirror& e, - const Real unused) { - // adjust[i] is the number to subtract from i. Hence if e(ei,0) was originally - // i, it is adjusted to i - adjust[i]. - std::vector adjust(nslices(p), 0); - Int rmcnt = 0; - for (Int i = 0; i < nslices(p); ++i) { - if (p(i,0) != unused) continue; - adjust[i] = 1; - ++rmcnt; - } - // Cumsum. - for (Int i = 1; i < nslices(p); ++i) - adjust[i] += adjust[i-1]; - // Adjust e. - for (Int ei = 0; ei < nslices(e); ++ei) - for (Int k = 0; k < szslice(e); ++k) - e(ei,k) -= adjust[e(ei,k)]; - // Remove unused from p. 
- Vec3s::HostMirror pc("copy", nslices(p), szslice(p)); - ko::deep_copy(pc, p); - ko::resize(p, nslices(p) - rmcnt, szslice(p)); - for (Int i = 0, j = 0; i < nslices(pc); ++i) { - if (pc(i,0) == unused) continue; - for (Int k = 0; k < szslice(pc); ++k) p(j,k) = pc(i,k); - ++j; - } -} - -void make_cubedsphere (Vec3s::HostMirror& p, Idxs::HostMirror& e, const Int n) { - // Transformation of the reference mesh make_planar_mesh to make each of the - // six faces. - const Real d = 1.0 / std::sqrt(3.0); - static Real R[6][9] = {{ 1, 0, 0, 0, 0, 0, 0, 1, 0}, // face 0, -y - { 0, 0, 0, 1, 0, 0, 0, 1, 0}, // 1, +x - {-1, 0, 0, 0, 0, 0, 0, 1, 0}, // 2, +y - { 0, 0, 0,-1, 0, 0, 0, 1, 0}, // 3, -x - { 1, 0, 0, 0, 1, 0, 0, 0, 0}, // 4, +z - {-1, 0, 0, 0, 1, 0, 0, 0, 0}}; // 5, -z - static Real xlate[6][3] = {{ 0,-d, 0}, { d, 0, 0}, { 0, d, 0}, - {-d, 0, 0}, { 0, 0, d}, { 0, 0,-d}}; - // Construct 6 uncoupled faces. - Vec3s::HostMirror ps[6]; - Vec3s::HostMirror& p_ref = ps[0]; - Idxs::HostMirror es[6]; - Idxs::HostMirror& e_ref = es[0]; - make_planar_mesh(p_ref, e_ref, n); - ko::resize(e, 6*nslices(e_ref), 4); - ko::resize(p, 6*nslices(p_ref), 3); - for (Int i = 1; i < 6; ++i) { - ko::resize(es[i], nslices(e_ref), 4); - ko::deep_copy(es[i], e_ref); - ko::resize(ps[i], nslices(p_ref), 3); - ko::deep_copy(ps[i], p_ref); - transform_planar_mesh(R[i], xlate[i], ps[i]); - } - transform_planar_mesh(R[0], xlate[0], ps[0]); - // Pack (p,e), accounting for equivalent vertices. For the moment, keep the p - // slot for an equivalent vertex to make node numbering simpler, but make the - // value bogus so we know if there's a problem in the numbering. 
- const Real unused = -2; - ko::deep_copy(p, unused); - Int p_base = 0, e_base = 0; - { // -y face - const Vec3s::HostMirror& fp = ps[0]; - Idxs::HostMirror& fe = es[0]; - for (Int j = 0; j < nslices(fp); ++j) - for (Int k = 0; k < 3; ++k) p(j,k) = fp(j,k); - for (Int j = 0; j < nslices(fe); ++j) - for (Int k = 0; k < 4; ++k) e(j,k) = fe(j,k); - p_base += nslices(p_ref); - e_base += nslices(e_ref); - } - for (Int fi = 1; fi <= 2; ++fi) { // +x, +y faces - const Vec3s::HostMirror& fp = ps[fi]; - Idxs::HostMirror& fe = es[fi]; - for (Int j = 0; j < nslices(fp); ++j) { - if (j % (n+1) == 0) continue; // equiv vertex - for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); - } - for (Int j = 0; j < nslices(fe); ++j) { - for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; - // Left 2 vertices of left elem on face fi equiv to right 2 vertices of - // right elem on face fi-1. Write to the face, then copy to e, so that - // other faces can use these updated data. - if (j % n == 0) { - fe(j,0) = es[fi-1](j+n-1,1); - fe(j,3) = es[fi-1](j+n-1,2); - } - for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); - } - p_base += nslices(p_ref); - e_base += nslices(e_ref); - } - { // -x face - const Vec3s::HostMirror& fp = ps[3]; - Idxs::HostMirror& fe = es[3]; - for (Int j = 0; j < nslices(fp); ++j) { - if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; - for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); - } - for (Int j = 0; j < nslices(fe); ++j) { - for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; - if (j % n == 0) { - fe(j,0) = es[2](j+n-1,1); - fe(j,3) = es[2](j+n-1,2); - } else if ((j+1) % n == 0) { - fe(j,1) = es[0]((j+1)-n,0); - fe(j,2) = es[0]((j+1)-n,3); - } - for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); - } - p_base += nslices(p_ref); - e_base += nslices(e_ref); - } - { // +z face - const Vec3s::HostMirror& fp = ps[4]; - Idxs::HostMirror& fe = es[4]; - for (Int j = n+1; j < nslices(fp) - (n+1); ++j) { - if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; - for (Int k = 0; 
k < 3; ++k) p(p_base+j,k) = fp(j,k); - } - for (Int j = 0; j < nslices(fe); ++j) - for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; - for (Int j = 0; j < n; ++j) { // -y - fe(j,0) = es[0](n*(n-1)+j,3); - fe(j,1) = es[0](n*(n-1)+j,2); - } - for (Int j = 0; j < n; ++j) { // +y - fe(n*(n-1)+j,2) = es[2](n*n-1-j,3); - fe(n*(n-1)+j,3) = es[2](n*n-1-j,2); - } - for (Int j = 0, i3 = 0; j < nslices(fe); j += n, ++i3) { // -x - fe(j,0) = es[3](n*n-1-i3,2); - fe(j,3) = es[3](n*n-1-i3,3); - } - for (Int j = n-1, i1 = 0; j < nslices(fe); j += n, ++i1) { // +x - fe(j,1) = es[1](n*(n-1)+i1,3); - fe(j,2) = es[1](n*(n-1)+i1,2); - } - for (Int j = 0; j < nslices(fe); ++j) - for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); - p_base += nslices(p_ref); - e_base += nslices(e_ref); - } - { // -z face - const Vec3s::HostMirror& fp = ps[5]; - Idxs::HostMirror& fe = es[5]; - for (Int j = n+1; j < nslices(fp) - (n+1); ++j) { - if (j % (n+1) == 0 || (j+1) % (n+1) == 0) continue; - for (Int k = 0; k < 3; ++k) p(p_base+j,k) = fp(j,k); - } - for (Int j = 0; j < nslices(fe); ++j) - for (Int k = 0; k < 4; ++k) fe(j,k) += p_base; - for (Int j = 0; j < n; ++j) { // -y - fe(j,0) = es[0](n-1-j,1); - fe(j,1) = es[0](n-1-j,0); - } - for (Int j = 0; j < n; ++j) { // +y - fe(n*(n-1)+j,2) = es[2](j,1); - fe(n*(n-1)+j,3) = es[2](j,0); - } - for (Int j = 0, i3 = 0; j < nslices(fe); j += n, ++i3) { // -x - fe(j,0) = es[1](i3,0); - fe(j,3) = es[1](i3,1); - } - for (Int j = n-1, i1 = 0; j < nslices(fe); j += n, ++i1) { // +x - fe(j,1) = es[3](n-1-i1,1); - fe(j,2) = es[3](n-1-i1,0); - } - for (Int j = 0; j < nslices(fe); ++j) - for (Int k = 0; k < 4; ++k) e(e_base+j,k) = fe(j,k); - } - // Now go back and remove the unused vertices and adjust the numbering. - remove_unused_vertices(p, e, unused); - // Project to the unit sphere. 
- for (Int i = 0; i < nslices(p); ++i) - geometry::normalize(slice(p, i)); -} - -void make_cgll_from_geo ( - const Vec3s::HostMirror& geo_p, const Idxs::HostMirror& geo_c2n, const Int np, - Vec3s::HostMirror& cgll_p, Idxs::HostMirror& cgll_c2n) -{ - Idxs::HostMirror geo_c2e, geo_e2n; - impl::make_c2e_from_c2n(np, geo_c2n, geo_c2e, geo_e2n); - ko::resize(cgll_p, - nslices(geo_p) + // corner nodes - (np-2)*nslices(geo_e2n) + // np-2 per edge - siqk::square(np-2)*nslices(geo_c2n), // nodes inside a geo cell - 3); - ko::resize(cgll_c2n, nslices(geo_c2n), siqk::square(np)); - Int pi = 0; - // Geo cell vertices. - for ( ; pi < nslices(geo_p); ++pi) - for (Int k = 0; k < 3; ++k) cgll_p(pi,k) = geo_p(pi,k); - ko::View nodes("nodes", np, np); - const Real* gll_x = nullptr, * gll_wt = nullptr; - GLL gll; - gll.get_coef(np, gll_x, gll_wt); - // Add new edge nodes. - for (Int gci = 0; gci < nslices(geo_c2n); ++gci) { - const auto geo_nodes = slice(geo_c2n, gci); - for (Int ei = 0; ei < 4; ++ei) { - // If my edge is i -> j and j > i, then I'm responsible for adding these. - if (geo_nodes[ei] > geo_nodes[(ei+1) % 4]) continue; - // edge[0] -> edge[np-1] is the geo edge. - auto edge = slice(geo_e2n, geo_c2e(gci,ei)); - assert(edge[0] == geo_nodes[ei]); - assert(edge[np-1] == geo_nodes[(ei+1) % 4]); - // Add the new nodes. - const auto p0 = slice(cgll_p, edge[0]); - const auto p1 = slice(cgll_p, edge[np-1]); - for (Int i = 1; i < np-1; ++i) { - auto p = slice(cgll_p, pi); - const Real alpha = 0.5*(gll_x[i] + 1); - for (Int k = 0; k < 3; ++k) - p[k] = (1 - alpha)*p0[k] + alpha*p1[k]; - edge[i] = pi; - ++pi; - } - } - } - for (Int gci = 0; gci < nslices(geo_c2n); ++gci) { - const auto geo_nodes = slice(geo_c2n, gci); - // Record the newly created edge nodes. 
- for (Int ei = 0; ei < 4; ++ei) { - const auto edge = slice(geo_e2n, geo_c2e(gci,ei)); - if (geo_nodes[ei] < geo_nodes[(ei+1) % 4]) { - assert(edge[0] == geo_nodes[ei]); - assert(edge[np-1] == geo_nodes[(ei+1) % 4]); - switch (ei) { - case 0: for (Int i = 0; i < np; ++i) nodes(i,0) = edge[i]; break; - case 1: for (Int i = 0; i < np; ++i) nodes(np-1,i) = edge[i]; break; - case 2: for (Int i = 0; i < np; ++i) nodes(i,np-1) = edge[np-1-i]; break; - case 3: for (Int i = 0; i < np; ++i) nodes(0,i) = edge[np-1-i]; break; - default: assert(0); - } - } else { - assert(edge[np-1] == geo_nodes[ei]); - assert(edge[0] == geo_nodes[(ei+1) % 4]); - switch (ei) { - case 0: for (Int i = 0; i < np; ++i) nodes(i,0) = edge[np-1-i]; break; - case 1: for (Int i = 0; i < np; ++i) nodes(np-1,i) = edge[np-1-i]; break; - case 2: for (Int i = 0; i < np; ++i) nodes(i,np-1) = edge[i]; break; - case 3: for (Int i = 0; i < np; ++i) nodes(0,i) = edge[i]; break; - default: assert(0); - } - } - } - // Add new internal nodes. - for (Int j = 1; j < np-1; ++j) { - const auto p0 = slice(cgll_p, nodes(0,j)); - const auto p1 = slice(cgll_p, nodes(np-1,j)); - for (Int i = 1; i < np-1; ++i) { - assert(pi < nslices(cgll_p)); - auto p = slice(cgll_p, pi); - const Real alpha = 0.5*(gll_x[i] + 1); - for (Int k = 0; k < 3; ++k) - p[k] = (1 - alpha)*p0[k] + alpha*p1[k]; - nodes(i,j) = pi; - ++pi; - } - } - // Fill CGLL cell with nodes. - { - auto cell = slice(cgll_c2n, gci); - for (Int j = 0, k = 0; j < np; ++j) - for (Int i = 0; i < np; ++i, ++k) - cell[k] = nodes(i,j); - } - } - // Project to the unit sphere. 
- for (Int i = 0; i < nslices(cgll_p); ++i) - geometry::normalize(slice(cgll_p, i)); -} - -void make_io_cgll_from_internal_cgll ( - const Vec3s::HostMirror& cgll_p, const Idxs::HostMirror& cgll_c2n, - Idxs::HostMirror& cgll_io_c2n) -{ - const Int np2 = szslice(cgll_c2n), np = std::sqrt(np2), - nsc = siqk::square(np-1); - ko::resize(cgll_io_c2n, nslices(cgll_c2n)*nsc, 4); - for (Int ci = 0; ci < nslices(cgll_c2n); ++ci) { - const auto cell = slice(cgll_c2n, ci); - for (Int scj = 0; scj < np-1; ++scj) - for (Int sci = 0; sci < np-1; ++sci) { - auto subcell = slice(cgll_io_c2n, nsc*ci + (np-1)*scj + sci); - subcell[0] = cell[np* scj + sci ]; - subcell[1] = cell[np* scj + sci+1]; - subcell[2] = cell[np*(scj+1) + sci+1]; - subcell[3] = cell[np*(scj+1) + sci ]; - } - } -} - -void make_dgll_from_cgll ( - const Vec3s::HostMirror& cgll_p, const Idxs::HostMirror& cgll_c2n, - IdxArray::HostMirror& dglln2cglln, Idxs::HostMirror& dgll_c2n) -{ - const Int np2 = szslice(cgll_c2n); - ko::resize(dglln2cglln, np2*nslices(cgll_c2n)); - ko::resize(dgll_c2n, nslices(cgll_c2n), szslice(cgll_c2n)); - IdxArray::HostMirror cgll_c2n_used("used", nslices(cgll_p)); - ko::deep_copy(cgll_c2n_used, 0); - Int pi = nslices(cgll_p); - for (Int ci = 0; ci < nslices(cgll_c2n); ++ci) { - const auto cgll_cell = slice(cgll_c2n, ci); - auto dgll_cell = slice(dgll_c2n, ci); - for (Int ni = 0; ni < np2; ++ni) { - const Int cgll_node_nmbr = cgll_cell[ni]; - dglln2cglln[ci*np2 + ni] = cgll_node_nmbr; - if (cgll_c2n_used[cgll_node_nmbr]) - dgll_cell[ni] = pi++; - else { - dgll_cell[ni] = cgll_node_nmbr; - cgll_c2n_used[cgll_node_nmbr] = 1; - } - } - } -#ifndef NDEBUG - assert(pi == nslices(cgll_c2n) * np2); - for (Int i = 0; i < nslices(cgll_c2n_used); ++i) - assert(cgll_c2n_used[i]); -#endif -} - -namespace impl { -void calc_elem_ctr (const Vec3s::HostMirror& p, const Idxs::HostMirror& e, - const Int ei, Real ctr[3]) { - for (Int j = 0; j < 3; ++j) ctr[j] = 0; - Int n = 0; - for (Int i = 0; i < szslice(e); 
++i) { - if (e(ei,i) < 0) break; - for (Int j = 0; j < 3; ++j) ctr[j] += p(e(ei,i),j); - ++n; - } - for (Int j = 0; j < 3; ++j) ctr[j] /= n; -} - -struct Edge { - const Int lo, hi; - Edge (const Int& n0, const Int& n1) - : lo(n0 < n1 ? n0 : n1), - hi(n0 < n1 ? n1 : n0) - {} - bool operator< (const Edge& e) const { - if (lo < e.lo) return true; - if (lo == e.lo) return hi < e.hi; - return false; - } -}; - -void make_c2e_from_c2n (const Int np, const Idxs::HostMirror& c2n, - Idxs::HostMirror& c2e, Idxs::HostMirror& e2n) { - const Int nnode = szslice(c2n); - // Number the edges. - std::map edge2nmbr; - Int nmbr = 0; - for (Int ci = 0; ci < nslices(c2n); ++ci) { - const auto cell = slice(c2n, ci); - for (Int ni = 0; ni < nnode; ++ni) { - Edge e(cell[ni], cell[(ni+1) % nnode]); - const auto it = edge2nmbr.find(e); - if (it == edge2nmbr.end()) - edge2nmbr[e] = nmbr++; - } - } - // Fill the adjacency arrays. - ko::resize(c2e, nslices(c2n), szslice(c2n)); - ko::resize(e2n, nmbr, np); - for (Int ci = 0; ci < nslices(c2n); ++ci) { - const auto cell = slice(c2n, ci); - for (Int ni = 0; ni < nnode; ++ni) { - Edge e(cell[ni], cell[(ni+1) % nnode]); - const auto it = edge2nmbr.find(e); - assert(it != edge2nmbr.end()); - const Int nmbr = it->second; - c2e(ci, ni) = nmbr; - e2n(nmbr, 0) = it->first.lo; - e2n(nmbr, np-1) = it->first.hi; - } - } -} - -Int check_elem_normal_against_sphere (const Vec3s::HostMirror& p, - const Idxs::HostMirror& e) { - Int nerr = 0; - for (Int ei = 0; ei < nslices(e); ++ei) { // for each element - Real sphere[3]; // ray through elem ctr - calc_elem_ctr(p, e, ei, sphere); - for (Int ti = 0; ti < szslice(e) - 2; ++ti) { // for each tri - if (e(ei,ti+2) < 0) break; - Real tri_normal[3]; { - Real v[2][3]; - for (Int j = 0; j < 2; ++j) { - geometry::copy(v[j], slice(p, e(ei,ti+j+1))); - geometry::axpy(-1, slice(p, e(ei,0)), v[j]); - } - geometry::cross(v[0], v[1], tri_normal); - } - if (geometry::dot(tri_normal, sphere) <= 0) - ++nerr; - } - } - return nerr; 
-} -} // namespace impl -} // namespace mesh -} // namespace slmm diff --git a/siqk/slmm/slmm_mesh.hpp b/siqk/slmm/slmm_mesh.hpp deleted file mode 100644 index 895b30c..0000000 --- a/siqk/slmm/slmm_mesh.hpp +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef INCLUDE_SLMM_MESH_HPP -#define INCLUDE_SLMM_MESH_HPP - -#include "slmm_defs.hpp" - -namespace slmm { -namespace mesh { - -// c is cell (aka element). n is node. e is edge. Hence c2e is the cell-to-edge -// adjacency array. -// geo is the basic geometric mesh. cgll is a continuous GLL mesh induced by -// the geometric mesh and the reference map. dgll is a discontinuous GLL map -// induced by the CGLL mesh. -// In a geo cell, the four nodes are ordered CCW. When a geometric mesh is -// converted to GLL, geometric cell i is divided into n = (np-1)^2 subcells. A -// GLL cell is 1-1 with a geo cell and contains GLL subcells. -// For netcdf I/O, we'll need to make GLL subcells explicitly, and they will -// be numbered n*i : n*(i+1)-1. make_io_cgll_from_internal_cgll does this. We'll -// use 'io' vs 'internal' decoration to distinguish these when necessary. -// For internal use, we don't need to form these cells explicitly. Instead, -// cgll_c2n has (np-1)^2 slots per slice. Nodes are ordered, e.g. with np=4, -// 12 13 14 15 -// 8 9 10 11 -// 4 5 6 7 -// 0 1 2 3. -// Hence cgll_c2n(i_cell, k) gives the k'th node of cell i_cell. -// With respect to the reference square (e.g., in siqk::sqr), in a quad the -// bottom-left node is (-1,-1), the bottom-right is (1,0), the top-right is -// (1,1), and the top-left is (-1,1). -// DGLL topology looks the same except that edge nodes are not -// shared. dglln2cglln(k) maps the k'th DGLL node to the corresponding CGLL -// node. cglln2dglln(k,:) is the list of DGLL nodes associated with CGLL node k. -// In all topology arrays, -1 indicates the end of a list. E.g., if CGLL node -// k corresponds to 2 DGLL nodes, then cglln2dglln(k,{0,1}) have values, and the -// rest are -1. 
- -void make_cubedsphere( - Vec3s::HostMirror& geo_p, Idxs::HostMirror& geo_c2n, const Int ne); - -void make_cgll_from_geo( - const Vec3s::HostMirror& geo_p, const Idxs::HostMirror& geo_c2n, - const Int np, Vec3s::HostMirror& cgll_p, Idxs::HostMirror& cgll_c2n); - -void make_io_cgll_from_internal_cgll( - const Vec3s::HostMirror& cgll_p, const Idxs::HostMirror& cgll_c2n, - Idxs::HostMirror& cgll_io_c2n); - -// dgll_c2n(cell_nmbr, :) contains node numbers for the DGLL mesh. However, a -// separate coordinate array (with redundant coordinates) is not -// created. Instead, use dglln2cglln as follows. The coordinates of the node -// dgll_c2n(cell_nmbr, k) are cgll_p(dglln2cglln(dgll_c2n(cell_nmbr, k)), :). -void make_dgll_from_cgll( - const Vec3s::HostMirror& cgll_p, const Idxs::HostMirror& cgll_c2n, - IdxArray::HostMirror& dglln2cglln, Idxs::HostMirror& dgll_c2n); - -namespace impl { -// slice(e2n,i) has np slots, and slots 0 and np-1 are filled. -void make_c2e_from_c2n(const Int np, const Idxs::HostMirror& c2n, - Idxs::HostMirror& c2e, Idxs::HostMirror& e2n); - -// Return 0 if all elements' subtri normals point outward relative to the -// sphere. -Int check_elem_normal_against_sphere( - const Vec3s::HostMirror& p, const Idxs::HostMirror& e); -} // namespace impl -} // namespace mesh -} // namespace slmm - -#endif diff --git a/siqk/slmm/slmm_runtests.py b/siqk/slmm/slmm_runtests.py deleted file mode 100755 index d2d361a..0000000 --- a/siqk/slmm/slmm_runtests.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/python - -import os, sys, re - -def readall (fn): - # Shorthand for reading in all the text in a file. - try: - with open(fn, 'r') as f: - text = f.read() - except: - text = '' - return text - -def writeall (text, fn, for_real): - if for_real: - with open(fn, 'w') as f: - f.write(text) - -def parse_one_liner (text): - class struct: - pass - hits = re.findall('
        .*', text) - hits = re.findall('l2 (?P[^ ]*) .* cv re (?P[^ ]*) ', hits[0]) - o = struct - o.l2 = float(hits[0][0]) - o.cv = float(hits[0][1]) - return o - -def runtest (cmd): - outfn = 'runtests.tmp' - os.system(cmd + ' &> ' + outfn) - return readall(outfn) - -def print_test (cmd): - print '{0:.<70s}'.format(cmd + ' '), - -def print_result (passed): - if not passed: - print '***FAILED' - return 1 - else: - print ' PASSED' - return 0 - -def check_passed (cmd): - print_test(cmd) - out = runtest(cmd) - hits = re.findall('PASSED', out) - passed = len(hits) > 0 - return print_result(passed) - -def check_errs (cmd, l2_err, cv=10): - print_test(cmd) - out = runtest(cmd) - o = parse_one_liner(out) - passed = o.l2 <= l2_err and o.cv <= cv - return print_result(passed) - -nerr = 0 -nerr += check_passed('./slmm_test -q -c test_make_cubedsphere') -nerr += check_passed('./slmm_test -q -c test_gll') -nerr += check_passed('./slmm_test -q -c test_time_int') -nerr += check_passed('./slmm_test -q -c test_make_gll_mesh') - -base = './slmmir -nsteps 12 -ne 10 -we 0 -ode divergent -ic gaussianhills ' -nerr += check_errs(base + '-np 3', 6.3e-3, 1e-5) -nerr += check_errs(base + '-np 3 -xyz', 6.3e-3, 1e-5) -nerr += check_errs(base + '-np 3 -xyz -d2c', 8.8e-3, 1e-5) -nerr += check_errs(base + '-np 4 -xyz -d2c', 5e-3, 2e-7) - -print '{0:d} tests failed'.format(nerr) diff --git a/siqk/slmm/slmm_test.cpp b/siqk/slmm/slmm_test.cpp deleted file mode 100644 index 8a48375..0000000 --- a/siqk/slmm/slmm_test.cpp +++ /dev/null @@ -1,201 +0,0 @@ -#include "slmm_defs.hpp" -#include "slmm_mesh.hpp" -#include "slmm_gll.hpp" -#include "slmm_io.hpp" -#include "slmm_time_int.hpp" -#include "slmm_gallery.hpp" -#include "slmm_debug.hpp" -using namespace slmm; - -struct Command { - enum Enum { - test_make_cubedsphere, test_gll, test_make_gll_mesh, test_time_int - }; - static Enum from_string (const std::string& s) { - if (s == "test_make_cubedsphere") return test_make_cubedsphere; - if (s == 
"test_gll") return test_gll; - if (s == "test_make_gll_mesh") return test_make_gll_mesh; - if (s == "test_time_int") return test_time_int; - throw std::runtime_error(s + " is not a command."); - } -}; - -struct Input { - Command::Enum command; - Int n; - Real angle; - bool write_matlab, quiet; - std::string fn_pre_out; - - Input(Int argc, char** argv); - void print(std::ostream& os) const; -}; - -static Int test_make_cubedsphere (const Input& in) { - const Int np = 4; - Vec3s::HostMirror cp; - Idxs::HostMirror c2n; - mesh::make_cubedsphere(cp, c2n, in.n); - Int nerr = 0; - { - const Int ne = mesh::impl::check_elem_normal_against_sphere(cp, c2n); - if (ne) std::cerr << "FAIL: check_elem_normal_against_sphere\n"; - nerr += ne; - } - { - Idxs::HostMirror c2e, e2n; - mesh::impl::make_c2e_from_c2n(np, c2n, c2e, e2n); - Int ne = 0; - // Every edge has two cells, and each cell is a quad. - if (nslices(e2n) != 4/2*nslices(c2n)) { - ++ne; - std::cerr << "FAIL: make_c2e_from_c2n\n"; - } - nerr += ne; - } - if (in.write_matlab) - write_matlab("cm", cp, c2n); - return nerr; -} - -static Int test_gll (const Input& in) { - Int nerr = 0; - const Real tol = 1e2*std::numeric_limits::epsilon(); - GLL gll; - const Real* x, * wt; - const Int np = 4; - gll.get_coef(np, x, wt); - Real sum = 0; - for (Int i = 0; i < np; ++i) - sum += wt[i]; - if (std::abs(2 - sum) > tol) ++nerr; - for (Int j = 0; j < np; ++j) { - Real gj[GLL::max_np]; gll.eval(np, x[j], gj); - for (Int i = 0; i < np; ++i) { - if (j == i) continue; - if (std::abs(gj[i]) > tol) ++nerr; - } - } - return nerr; -} - -static Int test_make_gll_mesh (const Input& in) { - const Int np = 4; - Vec3s::HostMirror geo_p, cgll_p; - Idxs::HostMirror geo_c2n, cgll_c2n, cgll_io_c2n; - mesh::make_cubedsphere(geo_p, geo_c2n, in.n); - Int nerr = 0; - mesh::make_cgll_from_geo(geo_p, geo_c2n, np, cgll_p, cgll_c2n); - mesh::make_io_cgll_from_internal_cgll(cgll_p, cgll_c2n, cgll_io_c2n); - { // Clip the mesh against itself and get the total 
area. - const Real - area = siqk::test::test_area_ot(cgll_p, cgll_io_c2n, - cgll_p, cgll_io_c2n), - true_area = 4*M_PI, - re = std::abs(area - true_area)/true_area; - if (re >= 1e-10) { - fprintf(stderr, "true area %1.4e mesh area %1.4e relerr %1.4e\n", - true_area, area, re); - ++nerr; - } - } - { - const Int ne = mesh::impl::check_elem_normal_against_sphere( - cgll_p, cgll_io_c2n); - if (ne) std::cerr << "FAIL: check_elem_normal_against_sphere\n"; - nerr += ne; - } - { - IdxArray::HostMirror dglln2cglln; - Idxs::HostMirror dgll_c2n; - mesh::make_dgll_from_cgll(cgll_p, cgll_c2n, dglln2cglln, dgll_c2n); - const Int np2 = szslice(cgll_c2n); - Int ne = 0; - for (Int ci = 0; ci < nslices(cgll_c2n); ++ci) { - const auto cgll_cell = slice(cgll_c2n, ci); - for (Int ni = 0; ni < siqk::square(np); ++ni) - if (dglln2cglln[ci*np2 + ni] != cgll_cell[ni]) - ++ne; - } - if (ne) { - nerr += ne; - std::cerr << "FAIL: make_dgll_from_cgll\n"; - } - } - if ( ! in.fn_pre_out.empty()) { - io::NetcdfWriter ncw(cgll_p, cgll_io_c2n, in.fn_pre_out + ".g"); - ncw.add_nodal_field("x"); - ncw.end_definition(); - const Int n = nslices(cgll_p); - std::vector x(n), lat(n), lon(n); - for (Int i = 0; i < n; ++i) { - const auto p = slice(cgll_p, i); - xyz2ll(p[0], p[1], p[2], lat[i], lon[i]); - } - ncw.advance_time_to(1); - gallery::InitialCondition::init( - gallery::InitialCondition::CosineBells, - nslices(cgll_p), lat.data(), lon.data(), x.data()); - ncw.write_field("x", x.data()); - ncw.advance_time_to(1.5); - gallery::InitialCondition::init( - gallery::InitialCondition::SlottedCylinders, - nslices(cgll_p), lat.data(), lon.data(), x.data()); - ncw.write_field("x", x.data()); - ncw.advance_time_to(2.5); - gallery::InitialCondition::init( - gallery::InitialCondition::CorrelatedCosineBells, - nslices(cgll_p), lat.data(), lon.data(), x.data()); - ncw.write_field("x", x.data()); - } - if (in.write_matlab) { - write_matlab("cm", geo_p, geo_c2n); - write_matlab("m", cgll_p, cgll_io_c2n); - 
write_matlab("gll", cgll_p, cgll_c2n); - } - return nerr; -} - -static Int test_time_int (const Input& in) { - return timeint::test::test_ark( ! in.quiet); -} - -Input::Input (Int argc, char** argv) - : command(Command::test_make_cubedsphere), n(10), angle(M_PI*1e-1), - write_matlab(false), quiet(false) -{ - for (Int i = 1; i < argc; ++i) { - const std::string& token = argv[i]; - if (eq(token, "-c", "--command")) command = Command::from_string(argv[++i]); - else if (eq(token, "-n")) n = atoi(argv[++i]); - else if (eq(token, "-q", "--quiet")) quiet = true; - else if (eq(token, "-m", "--write-matlab")) write_matlab = true; - else if (eq(token, "-o", "--output-prefix")) fn_pre_out = argv[++i]; - else if (eq(token, "--angle")) angle = atof(argv[++i]); - } - - if ( ! quiet) print(std::cout); -} - -void Input::print (std::ostream& os) const { - os << "command " << command << "\n" - << "n (-n): " << n << "\n" - << "write matlab (-m): " << write_matlab << "\n" - << "angle (--angle): " << angle << "\n"; -} - -int main (int argc, char** argv) { - Kokkos::initialize(argc, argv); - { - Input in(argc, argv); - Int nerr = 0; - switch (in.command) { - case Command::test_make_cubedsphere: nerr = test_make_cubedsphere(in); break; - case Command::test_gll: nerr = test_gll(in); break; - case Command::test_make_gll_mesh: nerr = test_make_gll_mesh(in); break; - case Command::test_time_int: nerr = test_time_int(in); break; - } - std::cerr << (nerr ? 
"FAIL" : "PASS") << "ED\n"; - } - Kokkos::finalize_all(); -} diff --git a/siqk/slmm/slmm_time_int.cpp b/siqk/slmm/slmm_time_int.cpp deleted file mode 100644 index 96b2e05..0000000 --- a/siqk/slmm/slmm_time_int.cpp +++ /dev/null @@ -1,156 +0,0 @@ -#include "slmm_time_int.hpp" -#include "slmm_util.hpp" - -namespace slmm { -namespace timeint { -namespace test { -class TestFunctor { - mutable Size nsteps_; - -protected: - Real tspan_[2], ys_[2]; - -public: - TestFunctor (const Real tspan[2], const Real ys[2]) - : nsteps_(0) - { - copy(2, tspan, tspan_); - copy(2, ys, ys_); - } - Size nsteps () const { return nsteps_; } - void reset () { nsteps_ = 0; } - const Real* tspan () const { return tspan_; } - const Real* ys () const { return ys_; } - void record (const Real t, const Real* const y) const { ++nsteps_; } - virtual bool eval (const Real t, const Real* const y, Real* const f) const = 0; - virtual void eval_solution(const Real t, Real* const f) const = 0; -}; - -// ODE -// y'(t) = lambda y(t) -// with solution -// y(tf) = y(ts) e^(lambda (tf - ts)). 
-class LambdaFunctor : public TestFunctor { - Real lambda_[2]; -public: - LambdaFunctor (const Real lambda[2], const Real tspan[2], const Real ys[2]) - : TestFunctor(tspan, ys) - { - copy(2, lambda, lambda_); - } - virtual bool eval (const Real t, const Real* const y, Real* const f) const { - f[0] = lambda_[0]*y[0] - lambda_[1]*y[1]; - f[1] = lambda_[0]*y[1] + lambda_[1]*y[0]; - return true; - } - virtual void eval_solution (const Real t, Real* const y) const { - const Real dt = t - tspan_[0]; - const Real - c = std::cos(dt*lambda_[1]), - s = std::sin(dt*lambda_[1]), - mag = std::exp(dt*lambda_[0]); - y[0] = mag*(ys_[0]*c - ys_[1]*s); - y[1] = mag*(ys_[0]*s + ys_[1]*c); - } -}; - -class TimeDepFunctor : public TestFunctor { - const Real a_; -public: - TimeDepFunctor (const Real a, const Real tspan[2], const Real ys[2]) - : TestFunctor(tspan, ys), a_(a) - {} - virtual bool eval (const Real t, const Real* const y, Real* const f) const { - f[0] = a_*t; - f[1] = -0.5*a_*t; - return true; - } - virtual void eval_solution (const Real t, Real* const y) const { - const Real dst = square(t) - square(tspan_[0]); - y[0] = ys_[0] + 0.5*a_*dst; - y[1] = ys_[1] - 0.25*a_*dst; - } -}; - -enum ARKMethod { method_ark23, method_ark45 }; - -bool test_ark_y2 (TestFunctor& fun, const ARKMethod method, - const bool verbose = true) { - std::ostream& os = std::cout; - auto ios_state = save_ios(os); - - Options opts; - opts.set_initial_step(1e-3); - opts.set_abs_tol(1e-20); - Workspace w; - Real ya[2]; - fun.eval_solution(fun.tspan()[1], ya); - - Real rtol = 1e-1; - Real rds[6]; - const Size ntrial = static_cast(sizeof(rds)/sizeof(*rds)); - const Real rtol_increase = 100; - const Real den = std::sqrt(square(ya[0]) + square(ya[1])); - - for (Size trial = 0; trial < ntrial; ++trial) { - rtol *= 1/rtol_increase; - opts.set_rel_tol(rtol); - fun.reset(); - Real y[2]; - copy(2, fun.ys(), y); - if (method == method_ark23) - ark23(opts, fun, y, 2, fun.tspan()[0], fun.tspan()[1], w); - else - 
ark45(opts, fun, y, 2, fun.tspan()[0], fun.tspan()[1], w); - - rds[trial] = std::sqrt(square(y[0] - ya[0]) + square(y[1] - ya[1])) / den; - if (verbose) { - os.precision(2); - os << " trial " << std::setw(2) << trial - << " nsteps " << std::setw(6) << fun.nsteps() - << " rtol " << std::scientific << rtol - << " reldif " << rds[trial] << "\n"; - } - } - - const Real improvement = rds[ntrial-2] / rds[ntrial-1]; - const bool pass = - rds[ntrial-1] <= 1e2*rtol && - (improvement >= 0.9*rtol_increase || - rds[ntrial-1] <= 1e2*std::numeric_limits::epsilon()); - return pass; -} - -Int test_ark (const bool verbose) { - if (verbose) - std::cout << "> Adaptive Runge-Kutta 2-3 unit test\n"; - static const Real tspan[] = {0.5, 71.2}, ys[] = {3.6, -0.7}; - bool pass = true; - { - static const Real lambda[] = {-0.02, 0.25}; - { - if (verbose) std::cout << " Standard test function.\n"; - LambdaFunctor fun(lambda, tspan, ys); - pass = pass && test_ark_y2(fun, method_ark23, verbose); - pass = pass && test_ark_y2(fun, method_ark45, verbose); - } - { - if (verbose) std::cout << " Standard test function backwards in time.\n"; - const Real tspanb[] = {8, -3}; - LambdaFunctor fun(lambda, tspanb, ys); - pass = pass && test_ark_y2(fun, method_ark23, verbose); - pass = pass && test_ark_y2(fun, method_ark45, verbose); - } - } - { - if (verbose) std::cout << " Exact time-dependent function.\n"; - TimeDepFunctor fun(0.1, tspan, ys); - pass = pass && test_ark_y2(fun, method_ark23, verbose); - pass = pass && test_ark_y2(fun, method_ark45, verbose); - } - return pass ? 
0 : 1; -} - -} // namespace test -} // namespace timeint -} // namespace slmm diff --git a/siqk/slmm/slmm_time_int.hpp b/siqk/slmm/slmm_time_int.hpp deleted file mode 100644 index 1da226a..0000000 --- a/siqk/slmm/slmm_time_int.hpp +++ /dev/null @@ -1,424 +0,0 @@ -#ifndef INCLUDE_SLMM_TIME_INT_HPP -#define INCLUDE_SLMM_TIME_INT_HPP - -#include "slmm_defs.hpp" -#include "slmm_util.hpp" - -#include -#include -#include -#include -#include -#include -#include - -namespace slmm { -namespace timeint { -class Options { - Real initial_step_, rel_tol_, abs_tol_, max_step_size_; - -public: - Options () - : initial_step_(1e-3), rel_tol_(1e-3), abs_tol_(1e-6), max_step_size_(1e300) - {} - - void set_initial_step (const Real is) { initial_step_ = is; } - void set_rel_tol (const Real rt) { rel_tol_ = rt; } - void set_abs_tol (const Real at) { abs_tol_ = at; } - void set_max_step_size (const Real mss) { max_step_size_ = mss; } - - Real initial_step () const { return initial_step_; } - Real rel_tol () const { return rel_tol_; } - Real abs_tol () const { return abs_tol_; } - Real max_step_size () const { return max_step_size_; } -}; - -struct Workspace { - std::vector r; -}; - -struct Info { - Real good_initial_step; -}; - -struct ReturnState { - enum Enum { success, function_eval_failed, step_too_small }; -}; - -template -inline void copy (const Size n, const T* const s, T* const d) -{ for (Size i = 0; i < n; ++i) d[i] = s[i]; } - -inline void aixiy (const Size n, - const Real a0, const Real* const x0, - const Real a1, const Real* const x1, - Real* const y) { - for (Size i = 0; i < n; ++i) - y[i] = a0*x0[i] + a1*x1[i]; -} -inline void aixiy (const Size n, - const Real a0, const Real* const x0, - const Real a1, const Real* const x1, - const Real a2, const Real* const x2, - Real* const y) { - for (Size i = 0; i < n; ++i) - y[i] = a0*x0[i] + a1*x1[i] + a2*x2[i]; -} -inline void aixiy (const Size n, - const Real a0, const Real* const x0, - const Real a1, const Real* const x1, - const 
Real a2, const Real* const x2, - const Real a3, const Real* const x3, - Real* const y) { - for (Size i = 0; i < n; ++i) - y[i] = a0*x0[i] + a1*x1[i] + a2*x2[i] + a3*x3[i]; -} -inline void aixiy (const Size n, - const Real a0, const Real* const x0, - const Real a1, const Real* const x1, - const Real a2, const Real* const x2, - const Real a3, const Real* const x3, - const Real a4, const Real* const x4, - Real* const y) { - for (Size i = 0; i < n; ++i) - y[i] = a0*x0[i] + a1*x1[i] + a2*x2[i] + a3*x3[i] + a4*x4[i]; -} -inline void aixiy (const Size n, - const Real a0, const Real* const x0, - const Real a1, const Real* const x1, - const Real a2, const Real* const x2, - const Real a3, const Real* const x3, - const Real a4, const Real* const x4, - const Real a5, const Real* const x5, - Real* const y) { - for (Size i = 0; i < n; ++i) - y[i] = a0*x0[i] + a1*x1[i] + a2*x2[i] + a3*x3[i] + a4*x4[i] + a5*x5[i]; -} - -/*! \brief Implements the same RK3(2) pair as Matlab's ode23. - * - * A Functor f has - * - method - * bool eval(Real t, const Real* y, Real* f) const - * to evaluate f(t), the ODE at time t. Return false on failure. - * - method - * record(Real t, const Real* y) - * to optionally record y(t). - * - * \param opts [in] Options struct. - * \param fun [in] ODE Functor. - * \param y_caller [in/out] On input, y(t_s); on output, y(t_f). - * \param n [in] length(y). - * \param ts [in] t_s. - * \param tf [in] t_f. - * \param w [in/out] Workspace. Reuse between calls to minimize allocations. - * - * Cite: - * P. Bogacki, L.F. Shampine, "A 3(2) Pair of Runge-Kutta Formulas", - * Appl. Math Lett. 2(4), 321-325, 1989. - * and - * The MATLAB ODE Suite, L. F. Shampine and M. W. Reichelt, SIAM Journal on - * Scientific Computing, 18-1, 1997. 
- */ -template -ReturnState::Enum -ark23 (const Options& opts, const Functor& fun, - Real* const y_caller, const Size n, - const Real ts, const Real tf, - Workspace& w, Info* info=0) { - static const Real pow = 1.0/3.0; - - const Real threshold = opts.abs_tol() / opts.rel_tol(); - const int tdir = tf >= ts ? 1 : -1; - - w.r.resize(5*n); - Real* f0 = w.r.data(); - Real* const f1 = f0 + n; - Real* const f2 = f1 + n; - Real* f3 = f2 + n; - Real* y0 = y_caller; - Real* y1 = f3 + n; - - Real t = ts; - const Real sgn = sign(tf - ts); - Real absh = std::abs(opts.initial_step()); - if (info) info->good_initial_step = absh; - bool fgood = fun.eval(t, y0, f0); - fun.record(t, y0); - if ( ! fgood) return ReturnState::function_eval_failed; - - while (sgn*t < sgn*tf) { - const double hmin = 16*std::numeric_limits::epsilon()*t; - bool no_failed = true; - Real err, tnew; - for (;;) { // Integrate one step; loop until success. - // Get tnew and sanitized h. - absh = std::min(absh, opts.max_step_size()); - Real h = tdir*absh; - if (sgn*(t + h) > sgn*tf) { - h = tf - t; - absh = std::abs(h); - } - tnew = t + h; - h = tnew - t; - - // Integration rule. - do { - aixiy(n, 1, y0, 0.5*h, f0, y1); - fgood = fun.eval(t + 0.5*h, y1, f1); - if ( ! fgood) break; - aixiy(n, 1, y0, 0.75*h, f1, y1); - fgood = fun.eval(t + 0.75*h, y1, f2); - if ( ! fgood) break; - aixiy(n, 1, y0, 2.0*h/9.0, f0, h/3.0, f1, 4.0*h/9.0, f2, y1); - fgood = fun.eval(tnew, y1, f3); - } while (0); - - // Determine error. - err = 0; - if ( ! 
fgood) { - err = opts.rel_tol() + 1; - no_failed = false; - } else { - // Coefficients from subtracting the order-2 prediction from the order-3 - // prediction: - static const Real E[] = {-5.0/72.0, 1.0/12.0, 1.0/9.0, -1.0/8.0}; - // Element-wise error control: - // err = absh * norm( (f*E) ./ max( max(abs(y), abs(yt)), - // threshold), - // inf ); - for (Size i = 0; i < n; ++i) { - const Real fE = - std::abs(E[0]*f0[i] + E[1]*f1[i] + E[2]*f2[i] + E[3]*f3[i]); - const Real den = - std::max(std::max(std::abs(y0[i]), std::abs(y1[i])), - threshold); - err = std::max(err, fE / den); - } - err *= absh; - } - - // Determine if the step succeeded. If it did not, compute a smaller step - // size and try again. - if (err > opts.rel_tol()) { - if (absh <= hmin) { - fun.record(t, y1); - return ReturnState::step_too_small; - } - if (no_failed) { - no_failed = false; - absh = std::max( - hmin, absh*std::max( - 0.5, 0.8*std::pow(opts.rel_tol()/err, pow))); - } else { - absh = std::max(hmin, 0.5*absh); - } - } else { - // Successful step. Break from the integration loop. - break; - } - } // One integration step. - if (info) info->good_initial_step = absh; - - if (no_failed) { - // Integration step succeeded on first try. Increase the step size. - const Real fac = 0.8*std::pow(opts.rel_tol()/err, pow); - // Don't increase the step size by more than 5x. - absh = std::min(5, fac)*absh; - } - - t = tnew; - // Swap pointers. - std::swap(y0, y1); - std::swap(f0, f3); - - fun.record(t, y0); - } - - // On output, y_caller contains y(tf). If the pointers don't agree (because of - // swapping above), copy. - if (y_caller != y0) - memcpy(y_caller, y0, n*sizeof(*y_caller)); - - return ReturnState::success; -} - -/*! \brief Implements the same RK5(4) pair as Matlab's ode45. - * - * Cite: - * Dormand, J. R.; Prince, P. J. (1980), "A family of embedded Runge-Kutta - * formulae", Journal of Computational and Applied Mathematics 6 (1): 19–26. - * and - * The MATLAB ODE Suite, L. F. 
Shampine and M. W. Reichelt, SIAM Journal on - * Scientific Computing, 18-1, 1997. - * - * The Butcher tableau is - * - * 0 | - * 1/5 | 1/5 - * 3/10 | 3/40 9/40 - * 4/5 | 44/45 -56/15 32/9 - * 8/9 | 19372/656 -25360/2187 64448/6561 -212/729 - * 1 | 9017/3168 -355/33 46732/5247 49/176 -5103/18656 - * 1 | 35/384 0 500/1113 125/192 -2187/6784 11/84 - * -------------------------------------------------------------------------------- - * | 35/384 0 500/1113 125/192 -2187/6784 11/84 0 - * | 5179/57600 0 7571/16695 393/640 -92097/339200 187/2100 1/40 - * - * and the corresponding E array, obtained from subtracting the first row of b - * from the second, is - * - * -71/57600 0 71/16695 -71/1920 17253/339200 -88/2100 1/40 - */ -template -ReturnState::Enum -ark45 (const Options& opts, const Functor& fun, - Real* const y_caller, const Size n, - const Real ts, const Real tf, - Workspace& w, Info* info=0) { - static const Real - c2 = 0.2, c3 = 0.3, c4 = 0.8, c5 = 8.0/9.0; - static const Real - a21 = c2, - a31 = 3.0/40.0, a32 = 9.0/40.0, - a41 = 44.0/45.0, a42 = -56.0/15.0, a43 = 32.0/9.0, - a51 = 19372.0/6561.0, a52 = -25360.0/2187.0, a53 = 64448.0/6561.0, - a54 = -212.0/729.0, - a61 = 9017.0/3168.0, a62 = -355.0/33.0, a63 = 46732.0/5247.0, - a64 = 49.0/176.0, a65 = -5103.0/18656.0, - a71 = 35.0/384.0, a73 = 500.0/1113.0, a74 = 125.0/192.0, - a75 = -2187.0/6784.0, a76 = 11.0/84.0; - static const Real pow = 1.0/5.0; - // Coefficients from subtracting the order-4 prediction from the order-5 - // prediction: - static const Real E[] = {-71.0/57600.0, 0.0, 71.0/16695.0, -71.0/1920.0, - 17253.0/339200.0, -88.0/2100.0, 1.0/40.0}; - - const Real threshold = opts.abs_tol() / opts.rel_tol(); - const int tdir = tf >= ts ? 
1 : -1; - - w.r.resize(9*n); - Real* f0 = w.r.data(); - Real* const f1 = f0 + n; - Real* const f2 = f1 + n; - Real* const f3 = f2 + n; - Real* const f4 = f3 + n; - Real* const f5 = f4 + n; - Real* f6 = f5 + n; - Real* y0 = y_caller; - Real* y1 = f6 + n; - - Real t = ts; - const Real sgn = sign(tf - ts); - Real absh = std::abs(opts.initial_step()); - if (info) info->good_initial_step = absh; - bool fgood = fun.eval(t, y0, f0); - fun.record(t, y0); - if ( ! fgood) return ReturnState::function_eval_failed; - - while (sgn*t < sgn*tf) { - const double hmin = 16*std::numeric_limits::epsilon()*t; - bool no_failed = true; - Real err, tnew; - for (;;) { // Integrate one step; loop until success. - // Get tnew and sanitized h. - absh = std::min(absh, opts.max_step_size()); - Real h = tdir*absh; - if (sgn*(t + h) > sgn*tf) { - h = tf - t; - absh = std::abs(h); - } - tnew = t + h; - h = tnew - t; - - // Integration rule. - do { - aixiy(n, 1, y0, a21*h, f0, y1); - fgood = fun.eval(t + c2*h, y1, f1); - if ( ! fgood) break; - aixiy(n, 1, y0, a31*h, f0, a32*h, f1, y1); - fgood = fun.eval(t + c3*h, y1, f2); - if ( ! fgood) break; - aixiy(n, 1, y0, a41*h, f0, a42*h, f1, a43*h, f2, y1); - fgood = fun.eval(t + c4*h, y1, f3); - if ( ! fgood) break; - aixiy(n, 1, y0, a51*h, f0, a52*h, f1, a53*h, f2, a54*h, f3, y1); - fgood = fun.eval(t + c5*h, y1, f4); - if ( ! fgood) break; - aixiy(n, 1, y0, a61*h, f0, a62*h, f1, a63*h, f2, a64*h, f3, a65*h, f4, y1); - fgood = fun.eval(tnew, y1, f5); - if ( ! fgood) break; - aixiy(n, 1, y0, a71*h, f0, a73*h, f2, a74*h, f3, a75*h, f4, a76*h, f5, y1); - fgood = fun.eval(tnew, y1, f6); - } while (0); - - // Determine error. - err = 0; - if ( ! 
fgood) { - err = opts.rel_tol() + 1; - no_failed = false; - } else { - for (Size i = 0; i < n; ++i) { - const Real fE = - std::abs(E[0]*f0[i] + E[1]*f1[i] + E[2]*f2[i] + E[3]*f3[i] + - E[4]*f4[i] + E[5]*f5[i] + E[6]*f6[i]); - const Real den = - std::max(std::max(std::abs(y0[i]), std::abs(y1[i])), - threshold); - err = std::max(err, fE / den); - } - err *= absh; - } - - // Determine if the step succeeded. If it did not, compute a smaller step - // size and try again. - if (err > opts.rel_tol()) { - if (absh <= hmin) { - fun.record(t, y1); - return ReturnState::step_too_small; - } - if (no_failed) { - no_failed = false; - absh = std::max( - hmin, absh*std::max( - 0.5, 0.8*std::pow(opts.rel_tol()/err, pow))); - } else { - absh = std::max(hmin, 0.5*absh); - } - } else { - // Successful step. Break from the integration loop. - break; - } - } // One integration step. - if (info) info->good_initial_step = absh; - - if (no_failed) { - // Integration step succeeded on first try. Increase the step size. - const Real fac = 0.8*std::pow(opts.rel_tol()/err, pow); - // Don't increase the step size by more than 5x. - absh = std::min(5, fac)*absh; - } - - t = tnew; - // Swap pointers. - std::swap(y0, y1); - std::swap(f0, f6); - - fun.record(t, y0); - } - - // On output, y_caller contains y(tf). If the pointers don't agree (because of - // swapping above), copy. 
- if (y_caller != y0) - memcpy(y_caller, y0, n*sizeof(*y_caller)); - - return ReturnState::success; -} - -namespace test { -Int test_ark(const bool verbose); -} // namespace test -} // namespace timeint -} // namespace slmm - -#endif diff --git a/siqk/slmm/slmm_util.cpp b/siqk/slmm/slmm_util.cpp deleted file mode 100644 index d1c4480..0000000 --- a/siqk/slmm/slmm_util.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include "slmm_util.hpp" - -#include -#include -#include - -namespace slmm { - -double wall_time () { - static const double us = 1.0e6; - timeval t; - gettimeofday(&t, 0); - return (t.tv_sec*us + t.tv_usec)/us; -} - -std::string& tolower (std::string& s) { - for (auto& c: s) - c = std::tolower(c); - return s; -} - -std::string format_strings_as_list (const char** strings, const Size n) { - std::stringstream ss; - ss << "{"; - for (Size i = 0; i < n-1; ++i) ss << strings[i] << ", "; - ss << strings[n-1] << "}"; - return ss.str(); -} - -} // namespace slmm diff --git a/siqk/slmm/slmm_util.hpp b/siqk/slmm/slmm_util.hpp deleted file mode 100644 index a192d6c..0000000 --- a/siqk/slmm/slmm_util.hpp +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef INCLUDE_SLMM_UTIL_HPP -#define INCLUDE_SLMM_UTIL_HPP - -#include "slmm_defs.hpp" - -#include - -namespace slmm { -using siqk::square; -template inline constexpr T cube (const T& x) { return x*x*x; } - -struct consts { - static constexpr Real earth_radius_m = 6.37122e6; -}; - -template inline T sign (const T& a) { return a >= 0 ? 1 : -1; } - -inline Real sec2day (const Real sec) { return sec/(24*3600); } -inline Real day2sec (const Real day) { return day*(24*3600); } - -// Output is in radians. -//todo Make a version that lets you pass R = mag(x,y,z). -inline void xyz2ll (const Real x, const Real y, const Real z, - Real& lat, Real& lon) { - const Real r = std::sqrt(square(x) + square(y) + square(z)); - lat = std::asin(z/r); - lon = std::atan2(y, x); -} - -// Input is in radians. 
-inline void ll2xyz (const Real lat, const Real lon, Real& x, Real& y, Real& z, - const Real radius = 1) { - const Real sinl = std::sin(lat), cosl = std::cos(lat); - x = radius*std::cos(lon)*cosl; - y = radius*std::sin(lon)*cosl; - z = radius*sinl; -} - -// Eq after eq 10 in Lauritzen et al test cases paper. -inline Real great_circle_dist ( - const Real lat1, const Real lon1, const Real lat2, const Real lon2, - const Real R = 1) -{ - return R*std::acos(std::sin(lat1)*std::sin(lat2) + - std::cos(lat1)*std::cos(lat2)*std::cos(lon1 - lon2)); -} - -inline constexpr Real m2radlat (const Real m) -{ return m/consts::earth_radius_m; } - -inline Real m2radlon(const Real lat, const Real m) -{ return m2radlat(m)/std::abs(std::cos(lat)); } - -inline constexpr Real deg2rad (const Real v) { return v * (M_PI/180); } -inline constexpr Real rad2deg (const Real v) { return v * (180/M_PI); } - -inline Real reldif (const Real a, const Real b, const Real abstol = 0) -{ return std::abs(b - a)/(abstol + std::abs(a)); } - -// Row-major R. -inline void form_rotation (const Real axis[3], const Real angle, Real r[9]) { - const Real nrm = std::sqrt(square(axis[0]) + square(axis[1]) + - square(axis[2])); - const Real& x = axis[0] / nrm, & y = axis[1] / nrm, & z = axis[2] / nrm, - & th = angle; - const Real cth = std::cos(th), sth = std::sin(th), omcth = 1 - cth; - r[0] = cth + x*x*omcth; - r[3] = y*x*omcth + z*sth; - r[6] = z*x*omcth - y*sth; - r[1] = x*y*omcth - z*sth; - r[4] = cth + y*y*omcth; - r[7] = z*y*omcth + x*sth; - r[2] = x*z*omcth + y*sth; - r[5] = y*z*omcth - x*sth; - r[8] = cth + z*z*omcth; -} - -/*! \brief RAII std stream state saver. - * - * Example: Preserve std::cout's state so manipulations don't affect others' use - * of cout. 
- */ -template class IosSaver { - Stream& s_; - std::ios state_; -public: - IosSaver (Stream& s) : s_(s), state_(nullptr) { state_.copyfmt(s); } - IosSaver (const IosSaver& ios) : s_(ios.s_), state_(nullptr) - { state_.copyfmt(ios.state_); } - IosSaver operator= (const IosSaver&) = delete; - ~IosSaver () { s_.copyfmt(state_); } -}; -template inline IosSaver save_ios (Stream& s) -{ return IosSaver(s); } - -template -inline T* tin (T* const p, const char* const msg="") { - if ( ! p) - throw std::runtime_error(std::string(std::string("Null pointer: ") + msg)); - return p; -} - -inline bool -eq (const std::string& a, const char* const b1, const char* const b2 = 0) { - return (a == std::string(b1) || (b2 && a == std::string(b2)) || - a == std::string("-") + std::string(b1)); -} - -std::string& tolower(std::string& s); - -std::string format_strings_as_list(const char** strings, const Size n); - -double wall_time(); - -template inline Int len (const V& v) -{ return static_cast(v.dimension_0()); } - -template inline Int len (const std::vector& v) -{ return static_cast(v.size()); } - -class ProgressBar { - std::string name_; - const Int nits_; // total # iterations - const Real wf_; // write frequency in percentage points - Int it_; - Real next_; - std::ostream& os_; - -public: - ProgressBar (const std::string& name, const Int niterations, - const Real write_freq = 1.0, std::ostream& os = std::cout) - : name_(name), nits_(niterations), wf_(write_freq), it_(0), next_(0), - os_(os) - { - os_ << name_ << ":"; - os_.flush(); - } - - void update () { - ++it_; - const Real p = 100 * it_ / nits_; - if (p >= next_ || it_ == nits_) { - os_ << " " << p; - if (it_ == nits_) os_ << "\n"; - os_.flush(); - next_ += wf_; - } - } -}; - -} // namespace slmm - -#endif diff --git a/siqk/slmm/slmmir.cpp b/siqk/slmm/slmmir.cpp deleted file mode 100644 index 160380e..0000000 --- a/siqk/slmm/slmmir.cpp +++ /dev/null @@ -1,1712 +0,0 @@ -#include "slmm_defs.hpp" -#include "slmm_mesh.hpp" -#include 
"slmm_gll.hpp" -#include "slmm_io.hpp" -#include "slmm_time_int.hpp" -#include "slmm_gallery.hpp" -#include "slmm_debug.hpp" -using namespace slmm; - -// ----------------------------------------------------------------------------- -// NLA stuff taken from cflexp1 tr_gll. All of this needs to be rewritten for -// Kokkos. My plan is to get the program running end to end correctly, and then -// I'll go back and transition things to Kokkos and to running on the GPU. - -template class Array { - T* p_; - std::size_t n_, cap_; -public: - Array () { init(); } - Array(std::size_t n); - Array(std::size_t n, const T& init); - ~Array () { clear(); } - // Initialize the object with the assumption that all variables are uninit'ed - // prior to calling. - void init(); - void clear(); - // optclear means optionally clear. The function has the semantics of - // clearing, but it may not actually release the memory. - void optclear_and_resize(std::size_t n); - // _ft indicates first touch. - void optclear_and_resize_ft(std::size_t n); - void optclear_and_resize(std::size_t n, const T& i); - void optclear_and_reserve(std::size_t n); - void optclear_and_reserve_ft(std::size_t n); - T& operator[] (std::size_t i) { return p_[i]; } - const T& operator[] (std::size_t i) const { return p_[i]; } - T& back () { return p_[n_-1]; } - const T& back () const { return p_[n_-1]; } - std::size_t size () const { return n_; } - bool empty () const { return size() == 0; } - T* data () const { return p_; } - // This does not realloc; reserve must provide the necessary memory. It does - // not throw, either. It asserts. - void unsafe_push_back(const T& e); - T* begin () { return p_; } - T* end () { return p_ + n_; } - void set (const T& v) { for (std::size_t i = 0; i < n_; ++i) p_[i] = v; } -}; - -// All indices and sizes are relative to blocks except m() and n(). -// Whether each block is row- or col-major is up to the caller. -// Each row's cols must be sorted. 
-template -class BlockMatrix { -public: - typedef ScalarT Scalar; - typedef SizeT Size; - typedef IntT Int; - - typedef BlockMatrix Me; - - // Don't need N, really, but it's handy for assertions/debugging. - Int M_, N_, m_, n_; - std::shared_ptr rowptr_p_; - std::shared_ptr colidx_p_; - std::shared_ptr d_p_; - Size* rowptr_; - Int* colidx_; - Scalar* d_; - -public: - BlockMatrix () - : M_(0), m_(0), n_(0), rowptr_(nullptr), colidx_(nullptr), d_(nullptr) - {} - - BlockMatrix (const Int M, const Int N, const Int m, const Int n, - const Size* rowptr, const Int* colidx) { - init(M, N, m, n, rowptr, colidx); - } - - void init (const Int M, const Int N, const Int m, const Int n, - const Size* rowptr, const Int* colidx) { - M_ = M; N_ = N; m_ = m; n_ = n; - rowptr_p_ = std::shared_ptr(new Size[M_ + 1], - std::default_delete()); - rowptr_ = rowptr_p_.get(); - memcpy(rowptr_, rowptr, (M_ + 1)*sizeof(Size)); - colidx_p_ = std::shared_ptr(new Int[rowptr[M_]], - std::default_delete()); - colidx_ = colidx_p_.get(); - memcpy(colidx_, colidx, rowptr[M_]*sizeof(Int)); - d_p_ = std::shared_ptr(new Scalar[rowptr_[M_]*m_*n_], - std::default_delete()); - d_ = d_p_.get(); - } - - const Size* rowptr () const { return rowptr_; } - const Int* colidx () const { return colidx_; } - - const Int M () const { return M_; } - const Int N () const { return N_; } - const Int m () const { return m_; } - const Int n () const { return n_; } - - const Scalar* blockrow (const Int br) const { - assert(br < M_); - return d_ + rowptr_[br]*m_*n_; - } - Scalar* blockrow (const Int br) { - return const_cast(const_cast(this)->blockrow(br)); - } - - const Scalar* block (const Int br, const Int bc) const { - assert(br < M_); - assert(bc < N_); - const Int* const beg = colidx_ + rowptr_[br]; - const Int* const end = colidx_ + rowptr_[br+1]; - const Int* const idx = std::lower_bound(beg, end, bc); - if (idx == end) return nullptr; - const Int i = static_cast(idx - colidx_); - return d_ + i*m_*n_; - } - Scalar* 
block (const Int br, const Int bc) { - return const_cast(const_cast(this)->block(br, bc)); - } - - void zero () { for (Size i = 0; i < rowptr_[M_]*m_*n_; ++i) d_[i] = 0; } - - static void test(); -}; - -class FullMassMatrix { - typedef BlockMatrix MT; - - int np_; - MT m_; - -public: - typedef std::shared_ptr Ptr; - - FullMassMatrix () : np_(0) {} - FullMassMatrix (const int nelem, const int np) { init(nelem, np); } - - void init(const int nelem, const int np); - - int np2 () const { return np_*np_; } - int np4 () const { return np2()*np2(); } - - const Real* block(const int i) const; - Real* block(const int i); - - const MT& get_M () const { return m_; } - - void factor(); - void solve(const int elem, Real* const bx, const int nrhs, - const int ldbx) const; -}; - -class RemapData { -public: - typedef std::shared_ptr Ptr; - typedef BlockMatrix MT; - typedef Array VT; - typedef siqk::Octree Octree; - - // Full block-diag target-target mass matrix, factored. - FullMassMatrix fmm_; - // Search tree over Eulerian mesh. - Octree ot_; - // Target-source matrix. - MT T_; - // Jacobian(ref square -> sphere). - RealArray::HostMirror Jt_; - // Eulerian mesh basis function integrals. - RealArray::HostMirror dgbfi_, cgbfi_; - -public: - // Set up. - FullMassMatrix& fmm () { return fmm_; } - Octree& octree () { return ot_; } - MT& T () { return T_; } - RealArray::HostMirror& Jt () { return Jt_; } - RealArray::HostMirror& dgbfi () { return dgbfi_; } - RealArray::HostMirror& cgbfi () { return cgbfi_; } - - // Apply. - Int T_nrows () const { return T_.M()*T_.m(); } - Int T_ncols () const { return T_.N()*T_.n(); } - const Octree& octree () const { return ot_; } - const ConstRealArray::HostMirror& Jt () const { return Jt_; } - const ConstRealArray::HostMirror& dgbfi () const { return dgbfi_; } - const ConstRealArray::HostMirror& cgbfi () const { return cgbfi_; } - - // y = T x. 
- void apply_T(const Real* x, const int ldx, Real* y, const int ldy, - const int nrhs) const; - // y = T' x. Not needed in practice, but used in check(). - void apply_T_transp(const Real* x, const int ldx, Real* y, const int ldy, - const int nrhs) const; - // x = M_full \ b. - void solve_M_full(Real* bx, const int nrhs, const int ldxb) const; - // y = R_full x - void apply_R_full(const Real* x, const int ldx, Real* y, const int ldy, - const int nrhs) const; - // y = R_lump x - void apply_R_lump(const Real* x, const int ldx, Real* y, const int ldy, - const int nrhs) const; - - // Perform and print some checks. Each entry of these Jacobians is the - // integral over the spherical quad of a basis function. So it's really more - // than just a Jacobian. - void check(const Real* Js, const Real* Jt) const; - // If T is expected to be identical to M (analytically), check how close it - // really is. Works only before 'factor' is called. - void compare_MT() const; -}; - -template inline void touch (T* const p, const size_t n, - const T& init = T()) { - // 1 KB should be a safe lower bound on page size. Touch enough to touch every - // page; I don't think there's any need to touch more memory than that. - for (size_t i = 0; i < n; i += 1024 / sizeof(T)) - p[i] = init; - // Make sure the last part is touched. - if (n) p[n-1] = init; -} -template inline T* -allocn (const size_t n, const bool first_touch = false) { - if ( ! 
n) return 0; - T* p = new T[n]; - if (first_touch) touch(p, n); - return p; -} -template inline void deln (T*& p) { - if (p) delete[] p; - p = 0; -} -template inline void deln_const (const T* p) { - if (p) delete[] p; -} -template inline void del (T*& p) { - if (p) delete p; - p = 0; -} - -template -inline void Array::init () { - n_ = cap_ = 0; - p_ = 0; -} - -template -inline Array::Array (std::size_t n) - : p_(0), n_(0), cap_(0) -{ optclear_and_resize(n); } - -template -inline Array::Array (std::size_t n, const T& init) - : p_(0), n_(0), cap_(0) -{ optclear_and_resize(n, init); } - -template -inline void Array::clear () { - n_ = cap_ = 0; - deln(p_); -} - -template -inline void Array::optclear_and_reserve (std::size_t n) { - n_ = 0; - if (n <= cap_) return; - clear(); - p_ = allocn(n); - cap_ = n; -} - -template -inline void Array::optclear_and_reserve_ft (std::size_t n) { - n_ = 0; - if (n <= cap_) return; - clear(); - p_ = allocn(n, true); - cap_ = n; -} - -template -inline void Array::optclear_and_resize (std::size_t n) { - if (n <= cap_) { - n_ = n; - return; - } - optclear_and_reserve(n); - n_ = n; -} - -template -inline void Array::optclear_and_resize_ft (std::size_t n) { - if (n <= cap_) { - n_ = n; - return; - } - optclear_and_reserve_ft(n); - n_ = n; -} - -template -inline void Array::optclear_and_resize (std::size_t n, const T& init) { - optclear_and_resize(n); - for (std::size_t i = 0; i < n_; ++i) - memcpy(p_ + i, &init, sizeof(init)); -} - -template -inline void Array::unsafe_push_back (const T& e) { - assert(n_ < cap_); - p_[n_++] = e; -} - -template -void BlockMatrix::test () { - static const Size rowptr[] = {0, 2, 3, 6, 8 }; - static const Size colidx[] = {0, 1, 1, 0, 2, 3, 1, 3}; - static const Int M = sizeof(rowptr)/sizeof(Size) - 1; - static const int m = 3, n = 4; - - { - BlockMatrix a(M, M, m, n, rowptr, colidx); - - assert(a.M() == M); - assert(a.m() == m); - assert(a.n() == n); - - const auto rowptr = a.rowptr(); - const auto colidx = 
a.colidx(); - for (Int r = 0, ctr = 1; r < a.M(); ++r) { - Scalar* d = a.blockrow(r); - for (Int j = 0; j < rowptr[r+1] - rowptr[r]; ++j, ++ctr) { - for (Int i = 0; i < a.m()*a.n(); ++i) - d[i] = ctr; - d += a.m()*a.n(); - } - } - - for (Int r = 0, ctr = 1; r < M; ++r) - for (Int j = rowptr[r]; j < rowptr[r+1]; ++j, ++ctr) { - Scalar const* const d = a.block(r, colidx[j]); - assert(d); - for (Int i = 0; i < m*n; ++i) - assert(d[i] == ctr); - } - } -} - -extern "C" { - void dgemm_(const char* transa, const char* transb, const int* m, - const int* n, const int* k, const double* alpha, const double* a, - const int* lda, const double* b, const int* ldb, - const double* beta, double* c, const int* ldc); - void dpotrf_(const char* uplo, const int* n, double* a, const int* lda, - int* info); - void dpotrs_(const char* uplo, const int* n, const int* nrhs, const double* a, - const int* lda, double* b, const int* ldb, int* info); -} - -inline void dgemm ( - char transa, char transb, int m, int nrhs, int n, double alpha, - const double* a, int lda, const double* b, int ldb, double beta, - const double* c, int ldc) -{ - dgemm_(&transa, &transb, &m, &nrhs, &n, &alpha, const_cast(a), &lda, - const_cast(b), &ldb, &beta, const_cast(c), &ldc); -} - -void FullMassMatrix::init (const int nelem, const int np) { - np_ = np; - Array rowptr(nelem + 1), colidx(nelem); - for (int i = 0; i < nelem; ++i) { - rowptr[i] = i; - colidx[i] = i; - } - rowptr[nelem] = nelem; - m_.init(nelem, nelem, np2(), np2(), rowptr.data(), colidx.data()); - m_.zero(); - assert(m_.m() == np2() && m_.n() == np2()); - assert(m_.M() == m_.N() && m_.M() == nelem); - assert(m_.blockrow(0) + np4() == m_.blockrow(1)); -} - -const double* FullMassMatrix::block (const int i) const { - assert(m_.blockrow(i) - m_.blockrow(0) == i*np4()); - return m_.blockrow(i); -} -double* FullMassMatrix::block (const int i) { - return const_cast(const_cast(m_).blockrow(i)); -} - -void FullMassMatrix::factor () { - const int n = np2(); -# 
pragma omp parallel for - for (int i = 0; i < m_.M(); ++i) { - double* const d = block(i); - const char uplo = 'L'; - int info; - dpotrf_(&uplo, &n, d, &n, &info); - if (info != 0) { - fprintf(stderr, "M() %d i %d info %d\n", m_.M(), i, info); - fprintf(stderr, "a = ["); - for (int c = 0; c < n; ++c) { - for (int r = 0; r < n; ++r) - fprintf(stderr, " %1.15e", d[n*c + r]); - fprintf(stderr, ";"); - } - fprintf(stderr, "];\n"); - } - assert(info == 0); - } -} - -void FullMassMatrix:: -solve (const int elem, double* const bx, const int nrhs, const int ldbx) const { - const int n = np2(); - const double* const d = block(elem); - const char uplo = 'L'; - int info; - dpotrs_(&uplo, &n, &nrhs, const_cast(d), &n, bx, &ldbx, &info); - assert(info == 0); -} - -void RemapData::apply_T (const double* x, const int ldx, double* y, - const int ldy, const int nrhs) const { - const MT::Scalar* const d = T_.blockrow(0); - const MT::Size* const rowptr = T_.rowptr(); - const MT::Int* const colidx = T_.colidx(); -# pragma omp parallel - { - const MT::Int n = T_.N()*T_.n(); -# pragma omp for - for (MT::Int i = 0; i < n; ++i) - y[i] = 0; -# pragma omp for - for (MT::Size br = 0; br < T_.M(); ++br) - for (MT::Int j = rowptr[br]; j < rowptr[br+1]; ++j) { - const MT::Int bc = colidx[j]; - const MT::Scalar* const b = d + j*T_.m()*T_.n(); - dgemm('t', 'n', T_.m(), nrhs, T_.n(), 1, b, T_.m(), x + bc*T_.n(), ldx, - 1, y + br*T_.m(), ldy); - } - } -} - -void RemapData::apply_T_transp (const double* x, const int ldx, double* y, - const int ldy, const int nrhs) const { - const MT::Scalar* const d = T_.blockrow(0); - const MT::Size* const rowptr = T_.rowptr(); - const MT::Int* const colidx = T_.colidx(); - for (MT::Int i = 0, n = T_.M()*T_.m(); i < n; ++i) - y[i] = 0; - for (MT::Size br = 0; br < T_.M(); ++br) - for (MT::Int j = rowptr[br]; j < rowptr[br+1]; ++j) { - const MT::Int bc = colidx[j]; - const MT::Scalar* const b = d + j*T_.m()*T_.n(); - dgemm('n', 'n', T_.m(), nrhs, T_.n(), 1, b, 
T_.m(), x + br*T_.m(), ldx, 1, - y + bc*T_.n(), ldy); - } -} - -void RemapData::solve_M_full (double* bx, const int nrhs, - const int ldxb) const { -# pragma omp parallel for - for (MT::Int br = 0; br < T_.M(); ++br) - fmm_.solve(br, bx + br*fmm_.np2(), nrhs, ldxb); -} - -void RemapData::apply_R_full (const double* x, const int ldx, double* y, - const int ldy, const int nrhs) const { - const MT::Int n = T_nrows(); - apply_T(x, n, y, n, 1); - solve_M_full(y, 1, n); -} - -static void report (const std::string label, const Real* const x_t, - const Real* const x, const Int n) { - Real me = 0, den = 0; - for (Int i = 0; i < n; ++i) { - me = std::max(me, std::abs(x[i] - x_t[i])); - den = std::max(den, std::abs(x_t[i])); - } - printf("> RemapData %21s: %1.3e\n", label.c_str(), me/den); -} - -void RemapData::check (const Real* Js, const Real* Jt) const { - const int n = T_nrows(); - // This routine assumes T is nxn. - Array e(n), x(n), y(n); - e.set(1); - - memcpy(x.data(), Jt, n*sizeof(Real)); - solve_M_full(x.data(), 1, n); - report("M_full \\ Jt = e", e.data(), x.data(), n); - - apply_T_transp(e.data(), n, x.data(), n, 1); - report("e' T = Js'", Js, x.data(), n); - - apply_T(e.data(), n, x.data(), n, 1); - report("T e = Jt", Jt, x.data(), n); - - apply_R_full(e.data(), n, x.data(), n, 1); - report("[ct] R_full e = e", e.data(), x.data(), n); - - memcpy(x.data(), Jt, n*sizeof(Real)); - solve_M_full(x.data(), 1, n); - apply_T_transp(x.data(), n, y.data(), n, 1); - report("[cv] Jt' R_full = Js'", Js, y.data(), n); -} - -void RemapData::compare_MT () const { - Real diag_num = 0, diag_den = 0; - const auto& M = fmm_.get_M(); - const auto& T = T_; - assert(M.M() == T.M()); - assert(M.m() == T.m()); - for (Int br = 0; br < T.M(); ++br) { - const auto Mb = M.block(br, br); - const auto Tb = T.block(br, br); - for (Int k = 0; k < square(M.m()); ++k) { - diag_num += square(Tb[k] - Mb[k]); - diag_den += square(Mb[k]); - } - } - printf("> rd(M,T) %1.3e\n", 
std::sqrt(diag_num/diag_den)); -} - -// ----------------------------------------------------------------------------- -// fwd = forward: The mesh at t_{n-1} is the departure mesh and is integrated -// forward in time. It is the source mesh. -// bwd = backward: The mesh at t_n is the departure mesh and is integrated -// backward in time. It is the target mesh. -// R = M \ T. M is the mass matrix. T is the mixed mass matrix mapping source -// to target. - -// Some debug and code stuff. -namespace { -class Debug { - int index_; - std::string filename_; - bool on_; - -public: - Debug () - : index_(1), filename_("dbgout.m"), on_(true) - { -#ifdef SLMM_DEBUG - FILE* fid = fopen(filename_.c_str(), "w"); - fclose(fid); -#endif - } - - void advance () { ++index_; } - - void set_on (const bool set) { on_ = set; } - - template - void write_p (const std::string& name, const CV3s& p) { -#ifdef SLMM_DEBUG - if ( ! on_) return; - FILE* fid = fopen(filename_.c_str(), "a"); - fprintf(fid, "%s{%d} = [", name.c_str(), index_); - for (Int ip = 0; ip < nslices(p); ++ip) - fprintf(fid, " %1.15e %1.15e %1.15e;", p(ip,0), p(ip,1), p(ip,2)); - fprintf(fid, "].';\n"); - fclose(fid); -#endif - } - - template - void write_c2n (const std::string& name, const CIs& e) { -#ifdef SLMM_DEBUG - if ( ! on_) return; - FILE* fid = fopen(filename_.c_str(), "a"); - fprintf(fid, "%s{%d} = [", name.c_str(), index_); - for (Int ie = 0; ie < nslices(e); ++ie) { - for (Int k = 0; k < szslice(e); ++k) - fprintf(fid, " %d", e(ie,k)+1); - fprintf(fid, ";"); - } - fprintf(fid, "].';\n"); - fclose(fid); -#endif - } - - void write (const std::string& name, const BlockMatrix& m) { -#ifdef SLMM_DEBUG - if ( ! 
on_) return; - FILE* fid = fopen(filename_.c_str(), "a"); - fprintf(fid, "tmp = ["); - const Size* rowptr = m.rowptr(); - const Int* colidx = m.colidx(); - for (Int R = 0; R < m.M(); ++R) - for (Int J = rowptr[R]; J < rowptr[R+1]; ++J) { - const Int C = colidx[J]; - const Real* const block = m.block(R, C); - for (Int r = 0, k = 0; r < m.m(); ++r) - for (Int c = 0; c < m.n(); ++c, ++k) - fprintf(fid, "%d %d %1.15e\n", m.m()*R + r + 1, - m.n()*C + c + 1, block[k]); - } - fprintf(fid, "];\n"); - fprintf(fid, "%s{%d} = sparse(tmp(:,1),tmp(:,2),tmp(:,3),%d,%d);\n", - name.c_str(), index_, m.M()*m.m(), m.N()*m.n()); - fclose(fid); -#endif - } - - void write (const std::string& name, const Real* const a, const Int n) { -#ifdef SLMM_DEBUG - if ( ! on_) return; - FILE* fid = fopen(filename_.c_str(), "a"); - fprintf(fid, "%s{%d} = [", name.c_str(), index_); - for (Int i = 0; i < n; ++i) - fprintf(fid, " %1.15e", a[i]); - fprintf(fid, "].';\n"); - fclose(fid); -#endif - } -}; -static Debug gdbg; - -class Timer { -public: - enum Op { ts_setup, ts, ts_integrate, ts_remap, ts_rest, ts_error, - ts_remap_T, ts_remap_node_jac, - ts_remap_T_geometry, ts_remap_T_crs, ts_remap_T_fill, - total, NTIMERS }; - static inline void init () { -#ifdef SLMM_TIME - for (int i = 0; i < NTIMERS; ++i) et_[i] = 0; -#endif - } - static inline void start (const Op op) { -#ifdef SLMM_TIME - gettimeofday(&t_start_[op], 0); -#endif - } - static inline void stop (const Op op) { -#ifdef SLMM_TIME - timeval t2; - gettimeofday(&t2, 0); - const timeval& t1 = t_start_[op]; - static const double us = 1.0e6; - et_[op] += (t2.tv_sec*us + t2.tv_usec - t1.tv_sec*us - t1.tv_usec)/us; -#endif - } -# define tpr(op) do { \ - printf("%-20s %10.3e %10.1f\n", #op, et_[op], 100*et_[op]/tot); \ - } while (0) - static void print () { -#ifdef SLMM_TIME - const double tot = et_[total]; - tpr(ts_setup); tpr(ts); tpr(ts_integrate); tpr(ts_remap); - tpr(ts_remap_T); tpr(ts_remap_T_geometry); tpr(ts_remap_T_crs); - 
tpr(ts_remap_T_fill); tpr(ts_remap_node_jac); tpr(ts_rest); - tpr(ts_error); - printf("%-20s %10.3e %10.1f\n", "total", et_[total], 100.0); -#endif - } -#undef tpr -private: -#ifdef SLMM_TIME - static timeval t_start_[NTIMERS]; - static double et_[NTIMERS]; -#endif -}; -#ifdef SLMM_TIME -timeval Timer::t_start_[Timer::NTIMERS]; -double Timer::et_[Timer::NTIMERS]; -#endif -} // anon namespace - -static constexpr Int max_nvert = 8; -static constexpr Int max_hits = 25; // Covers at least a 2-halo. - -class MeshIntegrator { -protected: - std::vector ll_; -public: - MeshIntegrator (const Int nnodes) - : ll_(2*nnodes) - {} - virtual ~MeshIntegrator () {} - std::vector& get_ll () { return ll_; } - // Must be called from inside ||{}. - virtual void integrate(const Real ts, const Real tf, Vec3s::HostMirror& p) =0; -}; - -template -class MeshIntegratorWithOdeFn : public MeshIntegrator { - std::vector ws_; - std::vector initial_step_; - bool use_xyz_form_; - -public: - MeshIntegratorWithOdeFn (const Int nnodes, const bool use_xyz_form = false) - : MeshIntegrator(nnodes), initial_step_(nnodes, 1e-3), - use_xyz_form_(use_xyz_form) - {} - - virtual void integrate (const Real ts, const Real tf, Vec3s::HostMirror& p) { - const Int nn = nslices(p); - assert(2*nn == static_cast(ll_.size())); - ws_.resize(omp_get_max_threads()); -# pragma omp parallel for schedule(static, 4) - for (Int i = 0; i < nn; ++i) { - const int tid = omp_get_thread_num(); - - // Our primary interest in these numerical experiments is order of - // accuracy when the flow field is exact. Hence here we use extremely - // tight error tolerances. - timeint::Options opts; - opts.set_abs_tol(std::numeric_limits::epsilon()); - opts.set_rel_tol(1e2*std::numeric_limits::epsilon()); - opts.set_initial_step(initial_step_[i]); - - timeint::Info info; - OdeFn fun; - fun.set_xyz_form(use_xyz_form_); - if ( ! 
use_xyz_form_) { - Real lli[] = {ll_[2*i], ll_[2*i+1]}; - timeint::ark45(opts, fun, lli, 2, ts, tf, ws_[tid], &info); - auto n = slice(p, i); - ll2xyz(lli[0], lli[1], n[0], n[1], n[2]); - } else { - Real u[3]; - ll2xyz(ll_[2*i], ll_[2*i+1], u[0], u[1], u[2]); - timeint::ark45(opts, fun, u, 3, ts, tf, ws_[tid], &info); - geometry::normalize(u); - auto n = slice(p, i); - for (Int j = 0; j < 3; ++j) n[j] = u[j]; - } - initial_step_[i] = info.good_initial_step; - } - } -}; - -class MeshRotator : public MeshIntegrator { - Vec3s::HostMirror p_; - Real axis_[3]; - -public: - MeshRotator (const ConstVec3s::HostMirror& p) - : MeshIntegrator(nslices(p)) - { - axis_[0] = 0.2; axis_[1] = 0.7; axis_[2] = 1; - geometry::normalize(axis_); - ko::resize(p_, nslices(p), szslice(p)); - ko::deep_copy(p_, p); - } - - virtual void integrate (const Real ts, const Real tf, Vec3s::HostMirror& p) { - const Int nn = nslices(p); - assert(2*nn == static_cast(ll_.size())); - const Real - period = day2sec(12), - a = 2*M_PI*(tf - ts)/period; - Real r[9]; - form_rotation(axis_, a, r); -# pragma omp parallel for - for (Int i = 0; i < nn; ++i) { - auto n = slice(p_, i); - const Real x = n[0], y = n[1], z = n[2]; - n = slice(p, i); - n[0] = r[0]*x + r[1]*y + r[2]*z; - n[1] = r[3]*x + r[4]*y + r[5]*z; - n[2] = r[6]*x + r[7]*y + r[8]*z; - } - } -}; - -struct MeshIntegratorFactory : public gallery::WindFieldType { - static std::shared_ptr - create (const std::string& ode, const bool use_xyz_form, - const ConstVec3s::HostMirror& p) - { return create(from_string(ode), use_xyz_form, p); } - - static std::shared_ptr - create (const Enum& ode, const bool use_xyz_form, - const ConstVec3s::HostMirror& p) { - const Int nnodes = nslices(p); - switch (ode) { - case Dcmip1d3ll: - return std::make_shared >(nnodes, use_xyz_form); - case NonDivergentWindField: - return std::make_shared >(nnodes, use_xyz_form); - case DivergentWindField: - return std::make_shared >(nnodes, use_xyz_form); - case 
NonDivergentWindFieldHack: - return std::make_shared >(nnodes, use_xyz_form); - case Rotate: - return std::make_shared(p); - default: - assert(0); - } - } -}; - -struct IntegrateOptions { - enum Enum { fwd, bwd, test_looa }; - Enum stepping; - bool d2c; // Each step, and in error, convert dgll <-> cgll. -}; - -struct Input { - std::string output_fn, ode, initial_condition, program_name; - Real T; - Int ne, nsteps, write_every, monotone_type, np, tq_order; - bool debug, write_matlab; - bool xyz_form; // Integrate in (x,y,z) space instead of (lat,lon). - IntegrateOptions integrate_options; - - Input(Int argc, char** argv); - void print(std::ostream& os) const; -}; - -// _s is start and _e is end. -struct Output { - Real - l2_err, max_err, mass_s, mass_e, min_s, max_s, min_e, max_e, - et_timestep, - mass_gll_s, mass_gll_e; -}; - -struct RemapOptions { - Int np, monotone_type; - - RemapOptions () - : np(4), monotone_type(0) - {} -}; - -struct Mesh { - Int np, tq_order; - Vec3s::HostMirror geo_p, geo_nml, cgll_p; - Idxs::HostMirror geo_c2n, geo_c2nml, cgll_c2n, dgll_c2n, cgll_io_c2n; - IdxArray::HostMirror dglln2cglln; -}; - -static void copy_vertices ( - const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& c2n, - const Int ci, Real* ps) -{ - const auto cell = slice(c2n, ci); - for (Int i = 0; i < szslice(c2n); ++i) { - const auto n = slice(p, cell[i]); - for (Int k = 0; k < 3; ++k) ps[k] = n[k]; - ps += 3; - } -} - -static void calc_node_jacobians ( - const Mesh& m, const ConstVec3s::HostMirror& p, RealArray::HostMirror& J_dg) -{ - const Int np2 = square(m.np); - ko::resize(J_dg, nslices(m.geo_c2n)*np2); - GLL gll; - const Real* gll_x, * gll_wt; - gll.get_coef(m.np, gll_x, gll_wt); -# pragma omp parallel for - for (Int ci = 0; ci < nslices(m.geo_c2n); ++ci) { - const auto cell = slice(m.geo_c2n, ci); - for (Int j = 0, basis_idx = 0; j < m.np; ++j) { - const Real b = 0.5*(gll_x[j] + 1); - for (Int i = 0; i < m.np; ++i, ++basis_idx) { - const Real a = 
0.5*(gll_x[i] + 1); - Real J[9]; - siqk::sqr::impl::calc_Jacobian(p, cell, a, b, J); - geometry::cross(J, J+3, J+6); - const Real jac = std::sqrt(geometry::norm2(J+6)); - J_dg(ci*np2 + basis_idx) = jac; - } - } - } -} - -static void calc_basis_function_integrals ( - const Int np, const Int tq_order, const ConstVec3s::HostMirror& p, - const ConstIdxs::HostMirror& c2n, RealArray::HostMirror& dgbfi) -{ - const Int np2 = square(np); - ko::resize(dgbfi, nslices(c2n)*np2); - ko::deep_copy(dgbfi, 0); - siqk::TriangleQuadrature tq; - siqk::RawConstVec3s tq_bary; - siqk::RawConstArray tq_w; - tq.get_coef(tq_order, tq_bary, tq_w); - const Int nq = len(tq_w); - GLL gll; -# pragma omp parallel for - for (Int ci = 0; ci < nslices(c2n); ++ci) { // cell - Real ps[12]; - copy_vertices(p, c2n, ci, ps); - const auto cell = slice(c2n, ci); - for (Int k = 1; k <= 2; ++k) // 2 triangles per quad cell - for (Int q = 0; q < nq; ++q) { // quad point - Real sphere_coord[3]; - const Real jac = geometry::calc_tri_jacobian( - ps, ps+3*k, ps+3*(k+1), slice(tq_bary, q), sphere_coord); - Real gj[GLL::max_np], gi[GLL::max_np]; { - Real a, b; - siqk::sqr::calc_sphere_to_ref(p, cell, sphere_coord, a, b); - gll.eval(np, b, gj); - gll.eval(np, a, gi); - } - const Real d0 = 0.5 * tq_w[q] * jac; - for (Int j = 0, basis_idx = 0; j < np; ++j) { // along ref y dir - const Real d1 = d0 * gj[j]; - for (Int i = 0; i < np; ++i, ++basis_idx) // along ref x dir - dgbfi(ci*np2 + basis_idx) += d1 * gi[i]; - } - } - } -} - -static void calc_basis_function_integrals ( - const Mesh& m, const ConstVec3s::HostMirror& p, RealArray::HostMirror& dgbfi, - RealArray::HostMirror& cgbfi) -{ - calc_basis_function_integrals(m.np, m.tq_order, p, m.geo_c2n, dgbfi); - ko::resize(cgbfi, nslices(m.cgll_p)); - ko::deep_copy(cgbfi, 0); - for (Int i = 0; i < len(m.dglln2cglln); ++i) - cgbfi(m.dglln2cglln(i)) += dgbfi(i); -} - -static void calc_gll_basis_function_integrals ( - const Mesh& m, const ConstVec3s::HostMirror& p, 
RealArray::HostMirror& J_dg) -{ - const Int np2 = square(m.np); - ko::resize(J_dg, nslices(m.geo_c2n)*np2); - GLL gll; - const Real* gll_x, * gll_wt; - gll.get_coef(m.np, gll_x, gll_wt); -# pragma omp parallel for - for (Int ci = 0; ci < nslices(m.geo_c2n); ++ci) { - const auto cell = slice(m.geo_c2n, ci); - for (Int j = 0, basis_idx = 0; j < m.np; ++j) { - const Real b = 0.5*(gll_x[j] + 1); - for (Int i = 0; i < m.np; ++i, ++basis_idx) { - const Real a = 0.5*(gll_x[i] + 1); - Real J[9]; - siqk::sqr::impl::calc_Jacobian(p, cell, a, b, J); - geometry::cross(J, J+3, J+6); - const Real jac = std::sqrt(geometry::norm2(J+6)); - // Product of weights is the integral of the 2D basis function on the - // ref square. Multiply by Jacobian of the map bilinear quad -> - // sphere. Since this is GLL quadrature, there's exactly one quadrature - // point. - J_dg(ci*np2 + basis_idx) = 0.25 * jac * gll_wt[i] * gll_wt[j]; - } - } - } -} - -static void map_cgll2dgll ( - const IdxArray::HostMirror& dglln2cglln, const Real* const cg_data, - Real* const dg_data) -{ -# pragma omp parallel for - for (Int i = 0; i < len(dglln2cglln); ++i) - dg_data[i] = cg_data[dglln2cglln[i]]; -} - -static void map_dgll2cgll ( - const IdxArray::HostMirror& dglln2cglln, - const ConstRealArray::HostMirror& dgbfi, - const ConstRealArray::HostMirror& cgbfi, - const Real* const dg_data, Real* const cg_data, const Int cnn) -{ - for (Int i = 0; i < cnn; ++i) cg_data[i] = 0; - for (Int i = 0; i < len(dglln2cglln); ++i) { - const Int i_cgll = dglln2cglln(i); - cg_data[i_cgll] += (dgbfi(i) / cgbfi(i_cgll)) * dg_data[i]; - } -} - -static void calc_M_fwd (const Mesh& m, RemapData& rd) { - const auto& p = m.geo_p; - const auto& c2n = m.geo_c2n; - auto& fmm = rd.fmm(); - fmm.init(nslices(c2n), m.np); - const Int np = m.np, np2 = square(np); - siqk::TriangleQuadrature tq; - siqk::RawConstVec3s tq_bary; - siqk::RawConstArray tq_w; - tq.get_coef(m.tq_order, tq_bary, tq_w); - const Int nq = len(tq_w); - GLL gll; -# pragma 
omp parallel for - for (Int ci = 0; ci < nslices(c2n); ++ci) { - Real ps[12]; - copy_vertices(p, c2n, ci, ps); - const auto cell = slice(c2n, ci); - Real* block = fmm.block(ci); - for (Int k = 1; k <= 2; ++k) - for (Int q = 0; q < nq; ++q) { - Real sphere_coord[3]; - const Real jac = geometry::calc_tri_jacobian( - ps, ps+3*k, ps+3*(k+1), slice(tq_bary, q), sphere_coord); - Real gj[GLL::max_np], gi[GLL::max_np]; { - Real a, b; - siqk::sqr::calc_sphere_to_ref(p, cell, sphere_coord, a, b); - gll.eval(np, b, gj); - gll.eval(np, a, gi); - } - const Real d0 = 0.5 * tq_w[q] * jac; - for (Int aj = 0, a_basis_idx = 0; aj < np; ++aj) { - const Real d1 = d0 * gj[aj]; - for (Int ai = 0; ai < np; ++ai, ++a_basis_idx) { - const Real d2 = d1 * gi[ai]; - for (Int bj = 0, b_basis_idx = 0; bj < np; ++bj) { - const Real d3 = d2 * gj[bj]; - for (Int bi = 0; bi < np; ++bi, ++b_basis_idx) { - if (b_basis_idx < a_basis_idx) continue; - const Real d = d3 * gi[bi]; - block[np2*a_basis_idx + b_basis_idx] += d; - if (a_basis_idx != b_basis_idx) - block[np2*b_basis_idx + a_basis_idx] += d; - } - } - } - } - } - } - gdbg.write("M", rd.fmm().get_M()); - //fmm.factor(); -} - -class CountIntersectionsFunctor { -protected: - const siqk::sh::Mesh& cm_; - const ConstVec3s::HostMirror p_; - const ConstIdxs::HostMirror e_; - Int hits_[max_hits]; - Int k_, nh_; - -public: - CountIntersectionsFunctor ( - const siqk::sh::Mesh& cm, const ConstVec3s::HostMirror& p, - const ConstIdxs::HostMirror& c2n) - : cm_(cm), p_(p), e_(c2n), nh_(0) - {} - - void reset (const Int clipped_ci) { - k_ = clipped_ci; - nh_ = 0; - } - - void operator() (const Int clip_ci) { - // Check whether we've clipped against this polygon before and there was a - // non-0 intersection. - for (Int i = 0; i < nh_; ++i) - if (hits_[i] == clip_ci) - return; - // We have not, so do the intersection. - Int no = 0; - { - // Area of all overlapping regions. - // In and out vertex lists. 
- Real buf[9*max_nvert]; - siqk::RawVec3s - vi(buf, max_nvert, 3), - vo(buf + 3*max_nvert, max_nvert, 3), - wrk(buf + 6*max_nvert, max_nvert, 3); - Int ni; - ni = 0; - for (Int i = 0; i < szslice(e_); ++i) { - if (e_(k_,i) == -1) break; - geometry::copy(slice(vi, i), slice(p_, e_(k_,i))); - ++ni; - } - siqk::sh::clip_against_poly(cm_, clip_ci, vi, ni, vo, no, wrk); - } - if (no) { - // Non-0 intersection, so record. - if (nh_ == max_hits) Kokkos::abort("max_hits is too small."); - hits_[nh_++] = clip_ci; - } - } - - Int get_nhits () const { return nh_; } - const Int* get_hits () const { return hits_; } -}; - -static void calc_T_pattern_fwd ( - const Mesh& m, const ConstVec3s::HostMirror& depart_p, - const RemapData::Octree& ot, std::vector& rowptr, - std::vector& colidx) -{ - Timer::start(Timer::ts_remap_T_geometry); - const Int ncell = nslices(m.geo_c2n); - Idxs::HostMirror hits("hits", ncell, max_hits); - { - siqk::sh::Mesh cm; - cm.p = m.geo_p; cm.e = m.geo_c2n; cm.nml = m.geo_nml; cm.en = m.geo_c2nml; -# pragma omp parallel for schedule(static, 20) - for (Int ci = 0; ci < ncell; ++ci) { - Real bb[6]; - RemapData::Octree::calc_bb(depart_p, slice(m.geo_c2n, ci), - szslice(m.geo_c2n), bb); - CountIntersectionsFunctor cif(cm, depart_p, m.geo_c2n); - cif.reset(ci); - ot.apply(bb, cif); - const Int* ci_hits = cif.get_hits(); - const Int hin = cif.get_nhits(); - for (Int hi = 0; hi < hin; ++hi) - hits(ci, hi) = ci_hits[hi]; - if (hin < max_hits) - hits(ci, hin) = -1; - } - } - Timer::stop(Timer::ts_remap_T_geometry); Timer::start(Timer::ts_remap_T_crs); - // Need to form transpose of the matrix that is most naturally created by the - // above if using CRS format. - rowptr.resize(ncell + 1, 0); - for (Int ci = 0; ci < ncell; ++ci) - for (Int hi = 0; hi < max_hits; ++hi) { - if (hits(ci, hi) == -1) break; - ++rowptr[hits(ci, hi) + 1]; - } - // Cumsum. - for (Int ci = 1; ci <= ncell; ++ci) - rowptr[ci] += rowptr[ci-1]; - colidx.resize(rowptr[ncell]); - // Shift up 1. 
- for (Int ci = ncell; ci > 0; --ci) - rowptr[ci] = rowptr[ci-1]; - for (Int ci = 0; ci < ncell; ++ci) - for (Int hi = 0; hi < max_hits; ++hi) { - const Int row = hits(ci, hi); - if (row == -1) break; - colidx[rowptr[row+1]] = ci; - ++rowptr[row+1]; - } -# pragma omp parallel for - for (Int ci = 0; ci < ncell; ++ci) - std::sort(colidx.data() + rowptr[ci], colidx.data() + rowptr[ci+1]); - Timer::stop(Timer::ts_remap_T_crs); -} - -static void fill_T_fwd (const Mesh& m, const ConstVec3s::HostMirror& depart_p, - RemapData::MT& T) { - const Int ncell = nslices(m.geo_c2n); - const Int np = m.np, np2 = square(np), np4 = square(np2); - siqk::TriangleQuadrature tq; - siqk::RawConstVec3s tq_bary; - siqk::RawConstArray tq_w; - tq.get_coef(m.tq_order, tq_bary, tq_w); - const Int nq = len(tq_w); - GLL gll; - const Size* rowptr = T.rowptr(); - const Int* colidx = T.colidx(); - siqk::sh::Mesh cm; - cm.p = m.geo_p; cm.e = m.geo_c2n; cm.nml = m.geo_nml; cm.en = m.geo_c2nml; -# pragma omp parallel for schedule(static, 1) - for (Int tci = 0; tci < ncell; ++tci) { - Real* block = T.blockrow(tci); - const auto tcell = slice(m.geo_c2n, tci); - for (Int cj = rowptr[tci]; cj < rowptr[tci+1]; ++cj) { - const Int sci = colidx[cj]; - const auto scell = slice(m.geo_c2n, sci); - Real buf[9*max_nvert]; - siqk::RawVec3s - vi(buf, max_nvert, 3), - vo(buf + 3*max_nvert, max_nvert, 3), - wrk(buf + 6*max_nvert, max_nvert, 3); - Int ni = 0, no; - for (Int i = 0; i < szslice(m.geo_c2n); ++i) { - if (scell[i] == -1) break; - geometry::copy(slice(vi, i), slice(depart_p, scell[i])); - ++ni; - } - siqk::sh::clip_against_poly(cm, tci, vi, ni, vo, no, wrk); - assert(no); - { - for (Int i = 0; i < np4; ++i) block[i] = 0; - for (Int ktri = 1; ktri < no-1; ++ktri) // triangles in vo - for (Int q = 0; q < nq; ++q) { // quad point - Real sphere_coord[3]; - const Real jac = geometry::calc_tri_jacobian( - slice(vo,0), slice(vo,ktri), slice(vo,ktri+1), slice(tq_bary, q), - sphere_coord); - Real tgj[GLL::max_np], 
tgi[GLL::max_np], - sgj[GLL::max_np], sgi[GLL::max_np]; - { - Real ta, tb, sa, sb; - siqk::sqr::calc_sphere_to_ref(m.geo_p, tcell, sphere_coord, - ta, tb); - siqk::sqr::calc_sphere_to_ref(depart_p, scell, sphere_coord, - sa, sb); - gll.eval(np, tb, tgj); - gll.eval(np, ta, tgi); - gll.eval(np, sb, sgj); - gll.eval(np, sa, sgi); - } - const Real d0 = 0.5 * tq_w[q] * jac; - for (Int tj = 0, t_basis_idx = 0; tj < np; ++tj) { - const Real d1 = d0 * tgj[tj]; - for (Int ti = 0; ti < np; ++ti, ++t_basis_idx) { - const Real d2 = d1 * tgi[ti]; - for (Int sj = 0, s_basis_idx = 0; sj < np; ++sj) { - const Real d3 = d2 * sgj[sj]; - for (Int si = 0; si < np; ++si, ++s_basis_idx) { - const Real d = d3 * sgi[si]; - block[np2*t_basis_idx + s_basis_idx] += d; - } - } - } - } - } - } - block += np4; - } - } -} - -static void calc_T_fwd (const Mesh& m, const Vec3s::HostMirror& depart_p, - RemapData& rd) -{ - { // Build T's sparse matrix nonzero pattern. - std::vector rowptr, colidx; - calc_T_pattern_fwd(m, depart_p, rd.octree(), rowptr, colidx); - const Int N = len(rowptr)-1, n = square(m.np); - rd.T().init(N, N, n, n, rowptr.data(), colidx.data()); - } - Timer::start(Timer::ts_remap_T_fill); - fill_T_fwd(m, depart_p, rd.T()); - Timer::stop(Timer::ts_remap_T_fill); -} - -// On input, src_tracer is rho*tracer. On output, it is just the updated -// tracer. Density is removed for output and error checking. -static void remap ( - RemapData& rd, const Mesh& m, const Vec3s::HostMirror& depart_p, - Real* const src_tracer, Real* const tgt_tracer, const Int ntracers, - Real* const src_density, Real* const tgt_density, - // If in_dgll, we're working in DGLL space the whole time; otherwise, we're - // doing CGLL -> DGLL -> remap -> CGLL. If in_dgll, wrk can be null; - // otherwise, it must have length >= 2 dnn. - const bool in_dgll, Real* const wrk) -{ - // For debugging and analysis, factor here. 
- static bool first = true; if (first) { - //rd.compare_MT(); - rd.fmm().factor(); - first = false; - } - - Timer::start(Timer::ts_remap_T); - const Int dnn = len(m.dglln2cglln), cnn = nslices(m.cgll_p), - len = in_dgll ? dnn : cnn; - calc_T_fwd(m, depart_p, rd); - Timer::stop(Timer::ts_remap_T); Timer::start(Timer::ts_remap_node_jac); - RealArray::HostMirror Js; - calc_node_jacobians(m, depart_p, Js); - Timer::stop(Timer::ts_remap_node_jac); - - for (Int ti = 0; ti < ntracers; ++ti) { - Real* src, * tgt; - if (in_dgll) { - src = src_tracer + ti*len; - tgt = tgt_tracer + ti*len; - } else { - src = wrk; - tgt = wrk + dnn; - map_cgll2dgll(m.dglln2cglln, src_tracer + ti*len, src); - } - // Adjust density according to the flow. At this point, the tracer field has - // density in it. -# pragma omp parallel for - for (Int i = 0; i < dnn; ++i) { - const Real q = rd.Jt()[i]/Js[i]; - src[i] *= q; - } - // L2 project. - rd.apply_R_full(src, dnn, tgt, dnn, 1); - if ( ! in_dgll) - map_dgll2cgll(m.dglln2cglln, rd.dgbfi(), rd.cgbfi(), tgt, - tgt_tracer + ti*len, cnn); - } - - { - Real* src, * tgt; - if (in_dgll) { - src = src_density; - tgt = tgt_density; - } else { - src = wrk; - tgt = wrk + dnn; - map_cgll2dgll(m.dglln2cglln, src_density, src); - } -# pragma omp parallel for - for (Int i = 0; i < dnn; ++i) { - const Real q = rd.Jt()[i]/Js[i]; - src[i] *= q; - } - rd.apply_R_full(src, dnn, tgt, dnn, 1); - if ( ! in_dgll) - map_dgll2cgll(m.dglln2cglln, rd.dgbfi(), rd.cgbfi(), tgt, - tgt_density, cnn); - } - - // For output, remove density from tracer field. 
- for (Int ti = 0; ti < ntracers; ++ti) { -# pragma omp parallel for - for (Int i = 0; i < len; ++i) - tgt_tracer[ti*len + i] /= tgt_density[i]; - } -} - -static void print_error ( - const Mesh& m, const ConstRealArray::HostMirror& J_gll, const bool in_dgll, - const Real* const fs, const Real* const ds, - const Real* const fe, const Real* const de, Output& out) -{ - Real l2_num = 0, l2_den = 0, max_num = 0, max_den = 0; - out.max_s = -1e300; out.min_s = 1e300; - out.max_e = -1e300; out.min_e = 1e300; - out.mass_s = 0; out.mass_e = 0; - out.mass_gll_s = 0; out.mass_gll_e = 0; - siqk::TriangleQuadrature tq; - siqk::RawConstVec3s tq_bary; - siqk::RawConstArray tq_w; - tq.get_coef(m.tq_order, tq_bary, tq_w); - const Int nq = len(tq_w); - GLL gll; - const auto& c2n = m.geo_c2n; - const auto& p = m.geo_p; - const Int np = m.np, np2 = square(np); - // GLL mass conservation. - for (Int ci = 0; ci < nslices(c2n); ++ci) - for (Int j = 0, basis_idx = 0; j < m.np; ++j) - for (Int i = 0; i < m.np; ++i, ++basis_idx) { - const Int k = ci*np2 + basis_idx; - const Real w = J_gll[k]; - const Int idx = in_dgll ? k : m.cgll_c2n(ci, basis_idx); - out.mass_gll_s += w * ds[idx]; - out.mass_gll_e += w * de[idx]; - } - // Mass conservation wrt quadrature approximation of exact integrals. 
- for (Int ci = 0; ci < nslices(c2n); ++ci) { - const auto cell = slice(c2n, ci); - Real ps[12]; - copy_vertices(p, c2n, ci, ps); - for (Int k = 1; k <= 2; ++k) - for (Int q = 0; q < nq; ++q) { - Real sphere_coord[3]; - const Real jac = geometry::calc_tri_jacobian( - ps, ps+3*k, ps+3*(k+1), slice(tq_bary, q), sphere_coord); - Real gj[GLL::max_np], gi[GLL::max_np]; { - Real a, b; - siqk::sqr::calc_sphere_to_ref(p, cell, sphere_coord, a, b); - gll.eval(np, b, gj); - gll.eval(np, a, gi); - } - const Real d0 = 0.5 * tq_w[q] * jac; - for (Int j = 0, basis_idx = 0; j < np; ++j) { - const Real d1 = d0 * gj[j]; - for (Int i = 0; i < np; ++i, ++basis_idx) { - const Int k = ci*np2 + basis_idx; - const Int idx = in_dgll ? k : m.cgll_c2n(ci, basis_idx); - const Real w = d1 * gi[i]; - const Real e = fe[idx] - fs[idx]; - out.mass_s += w * ds[idx]; - out.mass_e += w * de[idx]; - l2_num += w * square(e); - l2_den += w * square(fs[idx]); - max_num = std::max(max_num, std::abs(e)); - max_den = std::max(max_den, std::abs(fs[idx])); - out.min_s = std::min(out.min_s, fs[idx]); - out.max_s = std::max(out.max_s, fs[idx]); - out.min_e = std::min(out.min_e, fe[idx]); - out.max_e = std::max(out.max_e, fe[idx]); - } - } - } - } - out.l2_err = std::sqrt(l2_num/l2_den); - out.max_err = max_num/max_den; - printf("> re l2 %9.3e max %9.3e\n", out.l2_err, out.max_err); - printf("> [cv] re %10.3e\n", reldif(out.mass_s, out.mass_e)); - printf("> [cv gll] re %10.3e\n", reldif(out.mass_gll_s, out.mass_gll_e)); - printf("> [mo] min %10.3e %10.3e [%10.3e] max %10.3e %10.3e [%10.3e]\n", - out.min_s, out.min_e, out.min_e - out.min_s, - out.max_s, out.max_e, out.max_e - out.max_s); -} - -static void print_one_liner (const Input& in, const Output& out) { - std::cout << "
          method " << in.integrate_options.stepping - << " ode " << in.ode << " ic " << in.initial_condition - << " T " << in.T << " np " << in.np << " ne " << in.ne - << " tq " << in.tq_order << " nsteps " << in.nsteps - << " mono " << in.monotone_type; - printf(" re l2 %9.3e max %9.3e", out.l2_err, out.max_err); - printf(" cv re %9.3e", reldif(out.mass_s, out.mass_e)); - printf(" cvgll re %9.3e", reldif(out.mass_gll_s, out.mass_gll_e)); - printf(" mo min %9.3e %9.3e %9.3e max %9.3e %9.3e %9.3e", - out.min_s, out.min_e, out.min_e - out.min_s, - out.max_s, out.max_e, out.max_e - out.max_s); - printf(" et ts %9.3e nthr %d", out.et_timestep, omp_get_max_threads()); - std::cout << " prog " << in.program_name; - std::cout << " xyz " << in.xyz_form; - std::cout << " d2c " << in.integrate_options.d2c; - std::cout << "\n"; -} - -static void init_mesh (const Int np, const Int tq_order, const Int ne, - Mesh& m) { - m.np = np; - m.tq_order = tq_order; - mesh::make_cubedsphere(m.geo_p, m.geo_c2n, ne); - mesh::make_cgll_from_geo(m.geo_p, m.geo_c2n, np, m.cgll_p, m.cgll_c2n); - mesh::make_dgll_from_cgll(m.cgll_p, m.cgll_c2n, m.dglln2cglln, m.dgll_c2n); - mesh::make_io_cgll_from_internal_cgll(m.cgll_p, m.cgll_c2n, m.cgll_io_c2n); - { - siqk::sh::Mesh sm; sm.p = m.geo_p; sm.e = m.geo_c2n; - siqk::test::fill_normals(sm); - m.geo_nml = sm.nml; m.geo_c2nml = sm.en; - } -} - -// A bit of complication in this routine is opts.d2c. The natural thing to do -// is to work in DGLL space the whole time, except possibly when writing to the -// netcdf file. However, we need to mimic the intended application: at each -// step, we get CGLL fields, convert to DGLL, L2 project, then convert back. If -// opts.d2c, mimic this behavior; if ! opts.d2c, stay in DGLL space the whole -// time except when writing to the file. We support both behaviors so we can -// analyze the impact of going back and forth on accuracy. 
-static void integrate ( - const Mesh& m, const std::shared_ptr& mi, - const RemapOptions& ro, const Real T, const Int nsteps, - gallery::InitialCondition::Shape ic, const std::string& out_fn, - const Int write_every, const IntegrateOptions opts, Output& out) -{ - Timer::start(Timer::ts_setup); - const Int dnn = len(m.dglln2cglln), cnn = nslices(m.cgll_p), - len = opts.d2c ? cnn : dnn; - - // Initialize I/O. - std::shared_ptr ncw; - if (write_every > 0) { - ncw = std::make_shared( - m.cgll_p, m.cgll_io_c2n, out_fn + ".g", ro.np, ro.monotone_type); - ncw->add_nodal_field("tracer"); - ncw->add_nodal_field("density"); - ncw->end_definition(); - } - - // Eulerian mesh remap data. - RealArray::HostMirror Jt_gll; - calc_gll_basis_function_integrals(m, m.geo_p, Jt_gll); - RemapData rd; - calc_M_fwd(m, rd); - rd.octree().init(m.geo_p, m.geo_c2n); - calc_node_jacobians(m, m.cgll_p, rd.Jt()); - calc_basis_function_integrals(m, m.geo_p, rd.dgbfi(), rd.cgbfi()); - - // Initialize data and workspace. - std::vector tracer[2], density[2]; - std::vector* tracer_p[2], * density_p[2]; - for (Int i = 0; i < 2; ++i) { - tracer[i].resize(len); - tracer_p[i] = &tracer[i]; - density[i].resize(len); - density_p[i] = &density[i]; - } - for (Int k = 0; k < 2; ++k) - for (Int i = 0; i < len; ++i) - (*density_p[k])[i] = 1; - std::vector wrk(opts.d2c ? 2*dnn : cnn); - // Record the initial and final states. - std::vector error_data(4*len); - - { - // Get the initial conditions. - std::vector lat(cnn), lon(cnn); - for (Int i = 0; i < cnn; ++i) { - const auto n = slice(m.cgll_p, i); - xyz2ll(n[0], n[1], n[2], lat[i], lon[i]); - } - Real* data = opts.d2c ? tracer_p[0]->data() : wrk.data(); - gallery::InitialCondition::init( - ic, nslices(m.cgll_p), lat.data(), lon.data(), data); - // Record the ICs. - if ( ! 
opts.d2c) - map_cgll2dgll(m.dglln2cglln, data, tracer_p[0]->data()); - memcpy(error_data.data(), tracer_p[0]->data(), len*sizeof(Real)); - if (ncw) { - ncw->advance_time_to(0); - ncw->write_field("tracer", data); - } - memcpy(error_data.data() + len, density_p[0]->data(), len*sizeof(Real)); - if (ncw) { - data = opts.d2c ? density_p[0]->data() : wrk.data(); - if ( ! opts.d2c) - map_dgll2cgll(m.dglln2cglln, rd.dgbfi(), rd.cgbfi(), - density_p[0]->data(), data, cnn); - ncw->write_field("density", data); - } - } - // Remap is done on density*tracer, but sometimes the tracer field doesn't - // have the density rho in it. - for (Int i = 0; i < len; ++i) - (*tracer_p[0])[i] *= (*density_p[0])[i]; - - // Time step. - Vec3s::HostMirror departure_p; - ko::resize(departure_p, nslices(m.geo_p), szslice(m.geo_p)); - const Real dt = T/nsteps; - const Int last_step = - opts.stepping == IntegrateOptions::test_looa ? 1 : nsteps - 1; - ProgressBar progress_bar("integrate", last_step+1, 10); - const auto step_t = siqk::tic(); - Timer::stop(Timer::ts_setup); Timer::start(Timer::ts); - for (Int step = 0; step <= last_step; ++step) { - Timer::start(Timer::ts_integrate); - const Real tf = step == last_step ? T : dt*(step + 1); - switch (opts.stepping) { - case IntegrateOptions::fwd: - // Integrate mesh forward in time. - mi->integrate(dt*step, tf, departure_p); - break; - case IntegrateOptions::bwd: - throw std::runtime_error("IntegrateOptions::bwd is not impl'ed."); - break; - case IntegrateOptions::test_looa: - switch (step) { - case 0: mi->integrate(dt*step, tf, departure_p); break; - case 1: mi->integrate(dt*step, dt*(step - 1), departure_p); break; - default: assert(0); break; - } - break; - } - Timer::stop(Timer::ts_integrate); Timer::start(Timer::ts_remap); - remap(rd, m, departure_p, tracer_p[0]->data(), tracer_p[1]->data(), 1, - density_p[0]->data(), density_p[1]->data(), ! 
opts.d2c, wrk.data()); - Timer::stop(Timer::ts_remap); Timer::start(Timer::ts_rest); - if (step == 0) { - // Analyze the remap operator R = M \ T. - RealArray::HostMirror dgbfi_s; - calc_basis_function_integrals(m.np, m.tq_order, departure_p, m.geo_c2n, - dgbfi_s); - printf("\n> triangle quadrature jacobians\n"); - rd.check(dgbfi_s.ptr_on_device(), rd.dgbfi().ptr_on_device()); - RealArray::HostMirror Js_gll; - calc_gll_basis_function_integrals(m, departure_p, Js_gll); - printf("> GLL jacobians\n"); - rd.check(Js_gll.ptr_on_device(), Jt_gll.ptr_on_device()); - } - gdbg.write("T", rd.T()); - gdbg.write_p("geo_p", m.geo_p); gdbg.write_c2n("geo_c2n", m.geo_c2n); - gdbg.write_p("departure_p", departure_p); - - // Netcdf I/O. - if (ncw && (step % write_every == 0 || step == last_step)) { - ncw->advance_time_to(tf); - if (opts.d2c) { - ncw->write_field("tracer", tracer_p[1]->data()); - ncw->write_field("density", density_p[1]->data()); - } else { - Real* const data = wrk.data(); - map_dgll2cgll(m.dglln2cglln, rd.dgbfi(), rd.cgbfi(), - tracer_p[1]->data(), data, cnn); - ncw->write_field("tracer", data); - map_dgll2cgll(m.dglln2cglln, rd.dgbfi(), rd.cgbfi(), - density_p[1]->data(), data, cnn); - ncw->write_field("density", data); - } - } - // Record data for error analysis. - if (step == last_step) { - memcpy(error_data.data() + 2*len, tracer_p[1]->data(), len*sizeof(Real)); - memcpy(error_data.data() + 3*len, density_p[1]->data(), len*sizeof(Real)); - } - -# pragma omp parallel for - for (Int i = 0; i < len; ++i) - (*tracer_p[1])[i] *= (*density_p[1])[i]; - - std::swap(tracer_p[0], tracer_p[1]); - std::swap(density_p[0], density_p[1]); - progress_bar.update(); - gdbg.advance(); - gdbg.set_on(false); - } - const Real step_et = siqk::toc(step_t); - Timer::stop(Timer::ts); - siqk::print_times("timestep", step_et); - out.et_timestep = step_et; - - { Timer::start(Timer::ts_error); - const Real* const d = error_data.data(); - print_error(m, Jt_gll, ! 
opts.d2c, d, d + len, d + 2*len, - d + 3*len, out); - Timer::stop(Timer::ts_error); - } -} - -static void run (const Input& in) { - const Real T = day2sec(in.T); - const auto ic = gallery::InitialCondition::from_string( - in.initial_condition); - RemapOptions ro; - ro.np = in.np; - - Mesh m; - init_mesh(in.np, in.tq_order, in.ne, m); - - auto mi = MeshIntegratorFactory::create(in.ode, in.xyz_form, m.geo_p); - // Get lat-lon of geo mesh nodes. - const Int nn = nslices(m.geo_p); -# pragma omp parallel for - for (Int i = 0; i < nn; ++i) { - const auto n = slice(m.geo_p, i); - Real* const lli = mi->get_ll().data() + 2*i; - xyz2ll(n[0], n[1], n[2], lli[0], lli[1]); - } - - Output out; - integrate(m, mi, ro, T, in.nsteps, ic, in.output_fn, in.write_every, - in.integrate_options, out); - print_one_liner(in, out); -} - -Input::Input (int argc, char** argv) - : output_fn("tmp/out"), ode("divergent"), - initial_condition("xyztrig"), T(12), ne(5), nsteps(120), write_every(1), - monotone_type(0), np(4), tq_order(12), debug(false), xyz_form(false) -{ - program_name = argv[0]; - integrate_options.stepping = IntegrateOptions::fwd; - integrate_options.d2c = false; - for (int i = 1; i < argc; ++i) { - const std::string& token = argv[i]; - if (eq(token, "-o", "--output")) - output_fn = argv[++i]; - else if (eq(token, "-T")) - T = atof(argv[++i]); - else if (eq(token, "-nsteps")) - nsteps = atoi(argv[++i]); - else if (eq(token, "-ode")) - ode = argv[++i]; - else if (eq(token, "-ic")) - initial_condition = argv[++i]; - else if (eq(token, "-mono", "--monotone")) - monotone_type = atoi(argv[++i]); - else if (eq(token, "-np")) - np = atoi(argv[++i]); - else if (eq(token, "-tq")) - tq_order = atoi(argv[++i]); - else if (eq(token, "-ne")) - ne = atoi(argv[++i]); - else if (eq(token, "-we", "--write-every")) - write_every = atoi(argv[++i]); - else if (eq(token, "-looa", "--looa")) - integrate_options.stepping = IntegrateOptions::test_looa; - else if (eq(token, "-xyz", "--xyz")) - xyz_form = 
true; - else if (eq(token, "-d2c", "--d2c")) - integrate_options.d2c = true; - else if (eq(token, "-d", "--debug")) - debug = true; - } - - if (np == 4) tq_order = 20; - - print(std::cout); -} - -void Input::print (std::ostream& os) const { - os << "output filename (-o): " << output_fn << "\n" - << "ode (-ode, " << MeshIntegratorFactory::get_inputs() << "): " - << ode << "\n" - << "xyz_form (-xyz): " << xyz_form << "\n" - << "initial condition (-ic, " - << gallery::InitialCondition::get_inputs() << "): " - << initial_condition << "\n" - << "T (-T): " << T << " [day]\n" - << "nsteps (-nsteps): " << nsteps << "\n" - << "np (-np): " << np << "\n" - << "tq (-tq): " << tq_order << "\n" - << "ne (-ne): " << ne << "\n" - << "monotone_type (-mono, {0,1,2,3}): " << monotone_type << "\n" - << "write every (-we): " << write_every << "\n" - << "test_looa (--looa): " - << (integrate_options.stepping == IntegrateOptions::test_looa) << "\n" - << "d2c (-d2c): " << integrate_options.d2c << "\n" - << "debug (-d): " << debug << "\n"; -} - -int main (int argc, char** argv) { - Kokkos::initialize(argc, argv); { - Timer::init(); - Timer::start(Timer::total); - BlockMatrix<>::test(); - Input in(argc, argv); - run(in); - Timer::stop(Timer::total); - Timer::print(); - } Kokkos::finalize_all(); -} From ee19e98f4d83e72c0b20a1273e995f10d8962f29 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 13 Mar 2017 09:28:18 -0600 Subject: [PATCH 15/28] Fix siqk_runtests.py; was using wrong exe name. --- siqk/siqk_runtests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/siqk/siqk_runtests.py b/siqk/siqk_runtests.py index d5783b9..eea9e21 100644 --- a/siqk/siqk_runtests.py +++ b/siqk/siqk_runtests.py @@ -21,7 +21,7 @@ for n in [4, 20, 40, 79]: if quick and n > 20: break for angle in angles: - cmd = ('OMP_NUM_THREADS=8 ./a.out --testno 1 --angle {angle:1.15e} -n {n:d}'. + cmd = ('OMP_NUM_THREADS=8 ./siqk_test --testno 1 --angle {angle:1.15e} -n {n:d}'. 
format(angle=angle, n=n)) stat = os.system(cmd + ' |& grep PASSED &> /dev/null') if stat: @@ -36,7 +36,7 @@ for angle in angles: for xlate in xlates: for ylate in ylates: - cmd = ('OMP_NUM_THREADS=8 ./a.out --testno 0 --xlate {xlate:1.15e} --ylate {ylate:1.14e} --angle {angle:1.15e} -n {n:d}'. + cmd = ('OMP_NUM_THREADS=8 ./siqk_test --testno 0 --xlate {xlate:1.15e} --ylate {ylate:1.14e} --angle {angle:1.15e} -n {n:d}'. format(xlate=xlate, ylate=ylate, angle=angle, n=n)) stat = os.system(cmd + ' |& grep PASSED &> /dev/null') if stat: From 5448f436c94938c3fe7007b822767bf1c4448e66 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 21 Aug 2017 15:38:13 -0600 Subject: [PATCH 16/28] Update SIQK and SLMMIR. QLT coming soon. --- siqk/make.inc.amb | 2 +- siqk/siqk_defs.hpp | 22 ++++++++++++++++++++-- siqk/siqk_geometry.hpp | 2 +- siqk/siqk_intersect.hpp | 2 +- siqk/siqk_quadrature.hpp | 36 +++++++++++++++++++++++++++++++++--- siqk/siqk_search.hpp | 2 +- siqk/siqk_sqr.hpp | 5 ++++- 7 files changed, 61 insertions(+), 10 deletions(-) diff --git a/siqk/make.inc.amb b/siqk/make.inc.amb index 457e199..71ed8b5 100644 --- a/siqk/make.inc.amb +++ b/siqk/make.inc.amb @@ -1,3 +1,3 @@ opt=-O3 -CXX=g++-4.7 +CXX=g++ KOKKOS=/home/ambradl/lib/kokkos/cpu diff --git a/siqk/siqk_defs.hpp b/siqk/siqk_defs.hpp index 5653417..ad3c896 100644 --- a/siqk/siqk_defs.hpp +++ b/siqk/siqk_defs.hpp @@ -41,6 +41,20 @@ static void prarr (const std::string& name, const T* const v, const size_t n) { std::cerr << "\n"; } +#define SIQK_THROW_IF(condition, message) do { \ + if (condition) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": The condition:\n" << #condition \ + "\nled to the exception\n" << message << "\n"; \ + throw std::logic_error(_ss_.str()); \ + } \ + } while (0) + +#define SIQK_STDERR_IF(condition, message) do { \ + try { SIQK_THROW_IF(condition, message); } \ + catch (const std::logic_error& e) { std::cerr << e.what(); } \ +} while (0) + #ifdef SIQK_TIME 
static timeval tic () { timeval t; @@ -196,6 +210,10 @@ void swap (T& a, T&b) { b = tmp; } template KOKKOS_INLINE_FUNCTION constexpr T square (const T& x) { return x*x; } -} -#endif +template KOKKOS_INLINE_FUNCTION +T sign (const T& a) { return a > 0 ? 1 : (a < 0 ? -1 : 0); } + +} // namespace siqk + +#endif // INCLUDE_SIQK_DEFS_HPP diff --git a/siqk/siqk_geometry.hpp b/siqk/siqk_geometry.hpp index c916406..9ad9ecd 100644 --- a/siqk/siqk_geometry.hpp +++ b/siqk/siqk_geometry.hpp @@ -307,4 +307,4 @@ struct SphereGeometry { } // namespace siqk -#endif +#endif // INCLUDE_SIQK_GEOMETRY_HPP diff --git a/siqk/siqk_intersect.hpp b/siqk/siqk_intersect.hpp index fbc6dfd..d380002 100644 --- a/siqk/siqk_intersect.hpp +++ b/siqk/siqk_intersect.hpp @@ -335,4 +335,4 @@ template Real test_area_ot ( } // namespace test } // namespace siqk -#endif // INCLUDE_SIQK_HPP +#endif // INCLUDE_SIQK_INTERSECT_HPP diff --git a/siqk/siqk_quadrature.hpp b/siqk/siqk_quadrature.hpp index 8f2a300..18e5ccb 100644 --- a/siqk/siqk_quadrature.hpp +++ b/siqk/siqk_quadrature.hpp @@ -5,9 +5,12 @@ namespace siqk { -/* See, e.g., +/* For the TRISYM entries, see, e.g., Zhang, Linbo, Tao Cui, and Hui Liu. "A set of symmetric quadrature rules on - triangles and tetrahedra." J. of Computational Mathematics (2009): 89-96. + triangles and tetrahedra." J. of Computational Mathematics (2009): 89-96. + For the TRITAYLOR, see + Day, David M. and Mark A. Taylor, "A new 11 point degree 6 cubature formula + for the triangle", PAMM 7 (2007). 
*/ #define SIQK_QUADRATURE_TRISYM_ORDER4_COORD \ {0.108103018168070, 0.445948490915965, 0.445948490915965, \ @@ -278,9 +281,30 @@ namespace siqk { 0.0053678057381874528034004789844857,0.0053678057381874528034004789844857,0.0053678057381874528034004789844857, \ 0.0053678057381874528034004789844857} +#define SIQK_QUADRATURE_TRITAY_ORDER6_COORD \ + {4.724686653264358e-02, 5.725498667747682e-02, 8.954981467898796e-01, \ + 4.280913872509884e-02, 8.953626400245792e-01, 6.182822125032195e-02, \ + 2.921805130458027e-01, 6.844757484565146e-01, 2.334373849768268e-02, \ + 8.712234683377076e-01, 6.874625591502949e-02, 6.003027574726293e-02, \ + 5.086198608278325e-02, 6.156762055758400e-01, 3.334618083413767e-01, \ + 2.128646728100595e-01, 6.279461411977890e-01, 1.591891859921515e-01, \ + 2.817957679526839e-01, 6.290913834186361e-02, 6.552950937054525e-01, \ + 6.225041026512227e-01, 6.837821192050995e-02, 3.091176854282673e-01, \ + 7.604403244598745e-02, 2.875294583743921e-01, 6.364265091796204e-01, \ + 5.941924379444020e-01, 3.287835564131346e-01, 7.702400564246337e-02, \ + 3.353648085404556e-01, 3.122904050136449e-01, 3.523447864458995e-01} + +#define SIQK_QUADRATURE_TRITAY_ORDER6_WEIGHT \ + {3.806807185295551e-02, 3.837935530775279e-02, 4.620045674456197e-02, \ + 5.346758944419899e-02, 8.375582696574595e-02, 1.016448330255167e-01, \ + 1.018615244613670e-01, 1.114218316600018e-01, 1.120094502629461e-01, \ + 1.247875714375583e-01, 1.884034888373949e-01} + class TriangleQuadrature { const Real trisym_order4_coord_ [ 18] = SIQK_QUADRATURE_TRISYM_ORDER4_COORD; const Real trisym_order4_weight_ [ 6] = SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT; + const Real tritay_order6_coord_ [ 33] = SIQK_QUADRATURE_TRITAY_ORDER6_COORD; + const Real tritay_order6_weight_ [ 11] = SIQK_QUADRATURE_TRITAY_ORDER6_WEIGHT; const Real trisym_order8_coord_ [ 48] = SIQK_QUADRATURE_TRISYM_ORDER8_COORD; const Real trisym_order8_weight_ [ 16] = SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT; const Real trisym_order12_coord_ 
[ 99] = SIQK_QUADRATURE_TRISYM_ORDER12_COORD; @@ -301,6 +325,10 @@ class TriangleQuadrature { coord = RawConstVec3s(trisym_order4_coord_, 6, 3); weight = RawConstArray(trisym_order4_weight_, 6); break; + case 6: + coord = RawConstVec3s(tritay_order6_coord_, 11, 3); + weight = RawConstArray(tritay_order6_weight_, 11); + break; case 8: coord = RawConstVec3s(trisym_order8_coord_, 16, 3); weight = RawConstArray(trisym_order8_weight_, 16); @@ -317,10 +345,12 @@ class TriangleQuadrature { coord = RawConstVec3s(trisym_order20_coord_, 88, 3); weight = RawConstArray(trisym_order20_weight_, 88); break; + default: + ko::abort("TriangleQuadrature::get_coef: order not supported."); } } }; } // namespace siqk -#endif +#endif // INCLUDE_SIQK_QUADRATURE_HPP diff --git a/siqk/siqk_search.hpp b/siqk/siqk_search.hpp index c9d70b8..381ac99 100644 --- a/siqk/siqk_search.hpp +++ b/siqk/siqk_search.hpp @@ -375,4 +375,4 @@ class Octree { } // namespace siqk -#endif +#endif // INCLUDE_SIQK_SEARCH_HPP diff --git a/siqk/siqk_sqr.hpp b/siqk/siqk_sqr.hpp index 8a20cf7..abace0a 100644 --- a/siqk/siqk_sqr.hpp +++ b/siqk/siqk_sqr.hpp @@ -66,6 +66,9 @@ void calc_residual (const ConstVec3sT& p, const Quad& e, const Real a, // Compute the Jacobian matrix of the residual function: Jacobian(ref square -> // sphere). +// TODO Consider rewriting this in terms of the p=1 basis isoparametric +// interpolation formulation. Better performance? See +// calc_isoparametric_jacobian in slmmir.cpp. template KOKKOS_INLINE_FUNCTION void calc_Jacobian (const ConstVec3sT& p, const Quad& e, const Real a, @@ -263,4 +266,4 @@ inline Int test_sphere_to_ref (const ConstVec3s::HostMirror& p, } // namespace sqr } // namespace siqk -#endif +#endif // INCLUDE_SIQK_SQR_HPP From 2da5afc19248db8a121acb44e3332837627618d8 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 21 Aug 2017 21:18:09 -0600 Subject: [PATCH 17/28] Expose draft of QLT library. 
--- qlt/Makefile | 23 + qlt/make.inc.ws | 4 + qlt/qlt.cpp | 1740 ++++++++++++++++++++++++++++++++++++++++++++ qlt/qlt.hpp | 258 +++++++ qlt/qlt_inline.hpp | 392 ++++++++++ qlt/qlt_kokkos.hpp | 43 ++ qlt/qlt_test.cpp | 89 +++ qlt/readme.txt | 13 + siqk/readme.txt | 4 +- 9 files changed, 2564 insertions(+), 2 deletions(-) create mode 100644 qlt/Makefile create mode 100644 qlt/make.inc.ws create mode 100644 qlt/qlt.cpp create mode 100644 qlt/qlt.hpp create mode 100644 qlt/qlt_inline.hpp create mode 100644 qlt/qlt_kokkos.hpp create mode 100644 qlt/qlt_test.cpp create mode 100644 qlt/readme.txt diff --git a/qlt/Makefile b/qlt/Makefile new file mode 100644 index 0000000..fcd2b0e --- /dev/null +++ b/qlt/Makefile @@ -0,0 +1,23 @@ +include make.inc + +CXXFLAGS=$(opt) -Wall -pedantic -fopenmp -std=c++11 -I$(KOKKOS)/include -DQLT_TIME +LDFLAGS=-fopenmp -L$(KOKKOS)/lib -lkokkos -ldl +LINK_LAPACK_BLAS=-llapack -lblas + +SOURCES=qlt.cpp qlt_test.cpp + +OBJECTS=$(SOURCES:.cpp=.o) + +.cpp.o: + $(MPICXX) $(CFLAGS) $(CXXFLAGS) -c $< -o $@ + +all: testqlt + +testqlt: qlt_test.o qlt.o + $(MPICXX) qlt_test.o qlt.o $(LDFLAGS) -o testqlt + +clean: + rm -f *.o testqlt + +qlt.o: qlt.hpp qlt_inline.hpp qlt_kokkos.hpp +qlt_test.o: qlt.hpp diff --git a/qlt/make.inc.ws b/qlt/make.inc.ws new file mode 100644 index 0000000..daeeeff --- /dev/null +++ b/qlt/make.inc.ws @@ -0,0 +1,4 @@ +opt= +MPICXX=mpicxx +KOKKOS=/home/ambradl/lib/kokkos/cpu +#KOKKOS=/home/ambradl/lib/kokkos/cpu-serial diff --git a/qlt/qlt.cpp b/qlt/qlt.cpp new file mode 100644 index 0000000..fab38b0 --- /dev/null +++ b/qlt/qlt.cpp @@ -0,0 +1,1740 @@ +#include "qlt.hpp" + +#include + +#include +#include + +#include +#include +#include +#include + +namespace qlt { +namespace mpi { +template MPI_Datatype get_type(); +template <> MPI_Datatype get_type() { return MPI_INT; } +template <> MPI_Datatype get_type() { return MPI_DOUBLE; } + +template +int reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op, + int 
root) { + MPI_Datatype dt = get_type(); + return MPI_Reduce(const_cast(sendbuf), rcvbuf, count, dt, op, root, p.comm()); +} + +template +int all_reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op) { + MPI_Datatype dt = get_type(); + return MPI_Allreduce(const_cast(sendbuf), rcvbuf, count, dt, op, p.comm()); +} + +template +int isend (const Parallel& p, const T* buf, int count, int dest, int tag, + MPI_Request* ireq) { + MPI_Datatype dt = get_type(); + MPI_Request ureq; + MPI_Request* req = ireq ? ireq : &ureq; + int ret = MPI_Isend(const_cast(buf), count, dt, dest, tag, p.comm(), req); + if ( ! ireq) MPI_Request_free(req); + return ret; +} + +template +int irecv (const Parallel& p, T* buf, int count, int src, int tag, MPI_Request* ireq) { + MPI_Datatype dt = get_type(); + MPI_Request ureq; + MPI_Request* req = ireq ? ireq : &ureq; + int ret = MPI_Irecv(buf, count, dt, src, tag, p.comm(), req); + if ( ! ireq) MPI_Request_free(req); + return ret; +} + +int waitany (int count, MPI_Request* reqs, int* index, MPI_Status* stats = nullptr) { + return MPI_Waitany(count, reqs, index, stats ? stats : MPI_STATUS_IGNORE); +} + +int waitall (int count, MPI_Request* reqs, MPI_Status* stats = nullptr) { + return MPI_Waitall(count, reqs, stats ? 
stats : MPI_STATUS_IGNORE); +} + +template +int gather (const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, int recvcount, int root) { + MPI_Datatype dt = get_type(); + return MPI_Gather(sendbuf, sendcount, dt, recvbuf, recvcount, dt, root, p.comm()); +} + +template +int gatherv (const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, const int* recvcounts, const int* displs, int root) { + MPI_Datatype dt = get_type(); + return MPI_Gatherv(sendbuf, sendcount, dt, recvbuf, recvcounts, displs, dt, root, + p.comm()); +} + +bool all_ok (const Parallel& p, bool im_ok) { + int ok = im_ok, msg; + all_reduce(p, &ok, &msg, 1, MPI_LAND); + return static_cast(msg); +} +} // namespace mpi + +Parallel::Ptr make_parallel (MPI_Comm comm) { + return std::make_shared(comm); +} + +Int Parallel::size () const { + int sz = 0; + MPI_Comm_size(comm_, &sz); + return sz; +} + +Int Parallel::rank () const { + int pid = 0; + MPI_Comm_rank(comm_, &pid); + return pid; +} + +namespace impl { +#define pr(m) do { \ + int _pid_ = 0; \ + MPI_Comm_rank(MPI_COMM_WORLD, &_pid_); \ + std::stringstream _ss_; \ + _ss_.precision(15); \ + _ss_ << "pid " << _pid_ << " " << m << std::endl; \ + std::cerr << _ss_.str(); \ + } while (0) +#define pr0(m) do { \ + int _pid_; MPI_Comm_rank(MPI_COMM_WORLD, &_pid_); \ + if (_pid_ != 0) break; \ + std::stringstream _ss_; \ + _ss_ << "pid " << _pid_ << " " << m << std::endl; \ + std::cerr << _ss_.str(); \ + } while (0) +#define prc(m) pr(#m << " | " << (m)) +#define pr0c(m) pr0(#m << " | " << (m)) +#define puf(m) "(" << #m << " " << (m) << ")" +#define pu(m) << " " << puf(m) +template +void prarr (const std::string& name, const T* const v, const size_t n) { + std::stringstream ss; + ss.precision(15); + ss << name << " = ["; + for (size_t i = 0; i < n; ++i) ss << " " << v[i]; + ss << "];"; + pr(ss.str()); +} +#define mprarr(m) qlt::impl::prarr(#m, m.data(), m.size()) + +#define qlt_assert(condition) do { \ + if ( ! 
(condition)) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": FAIL:\n" << #condition \ + << "\n"; \ + throw std::logic_error(_ss_.str()); \ + } \ + } while (0) +#define qlt_throw_if(condition, message) do { \ + if (condition) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": The condition:\n" \ + << #condition "\nled to the exception\n" << message << "\n"; \ + throw std::logic_error(_ss_.str()); \ + } \ + } while (0) +#define qlt_kernel_assert(condition) do { \ + if ( ! (condition)) \ + Kokkos::abort(#condition); \ + } while (0) +#define qlt_kernel_throw_if(condition, message) do { \ + if (condition) \ + Kokkos::abort(#condition " led to the exception\n" message); \ + } while (0) + +inline Real reldif (const Real a, const Real b) +{ return std::abs(b - a)/std::max(std::abs(a), std::abs(b)); } + +struct FILECloser { void operator() (FILE* fh) { fclose(fh); } }; +} // namespace impl + +class Timer { +public: + enum Op { tree, analyze, trcrinit, trcrgen, trcrcheck, + qltrun, qltrunl2r, qltrunr2l, snp, waitall, + total, NTIMERS }; + static inline void init () { +#ifdef QLT_TIME + for (int i = 0; i < NTIMERS; ++i) { + et_[i] = 0; + cnt_[i] = 0; + } +#endif + } + static inline void reset (const Op op) { +#ifdef QLT_TIME + et_[op] = 0; + cnt_[op] = 0; +#endif + } + static inline void start (const Op op) { +#ifdef QLT_TIME + gettimeofday(&t_start_[op], 0); + ++cnt_[op]; +#endif + } + static inline void stop (const Op op) { +#ifdef QLT_TIME + timeval t2; + gettimeofday(&t2, 0); + const timeval& t1 = t_start_[op]; + static const double us = 1.0e6; + et_[op] += (t2.tv_sec*us + t2.tv_usec - t1.tv_sec*us - t1.tv_usec)/us; +#endif + } +# define tpr(op) do { \ + printf("%-20s %10.3e %10.1f (%4d %10.3e)\n", \ + #op, et_[op], 100*et_[op]/tot, cnt_[op], et_[op]/cnt_[op]); \ + } while (0) + static void print () { +#ifdef QLT_TIME + const double tot = et_[total]; + tpr(tree); tpr(analyze); + tpr(trcrinit); tpr(trcrgen); 
tpr(trcrcheck); + tpr(qltrun); tpr(qltrunl2r); tpr(qltrunr2l); tpr(snp); tpr(waitall); + printf("%-20s %10.3e %10.1f\n", "total", tot, 100.0); +#endif + } +#undef tpr +private: +#ifdef QLT_TIME + static timeval t_start_[NTIMERS]; + static double et_[NTIMERS]; + static int cnt_[NTIMERS]; +#endif +}; +#ifdef QLT_TIME +timeval Timer::t_start_[Timer::NTIMERS]; +double Timer::et_[Timer::NTIMERS]; +int Timer::cnt_[Timer::NTIMERS]; +#endif + +namespace impl { +struct NodeSets { + typedef std::shared_ptr ConstPtr; + + enum : int { mpitag = 42 }; + + // A node in the tree that is relevant to this rank. + struct Node { + // Rank of the node. If the node is in a level, then its rank is my rank. If + // it's not in a level, then it is a comm partner of a node on this rank. + Int rank; + // Globally unique identifier; cellidx if leaf node, ie, if nkids == 0. + Int id; + // This node's parent, a comm partner, if such a partner is required. + const Node* parent; + // This node's kids, comm partners, if such partners are required. Parent + // and kid nodes are pruned relative to the full tree over the mesh to + // contain just the nodes that matter to this rank. + Int nkids; + const Node* kids[2]; + // Offset factor into bulk data. An offset is a unit; actual buffer sizes + // are multiples of this unit. + Int offset; + + Node () : rank(-1), id(-1), parent(nullptr), nkids(0), offset(-1) {} + }; + + // A level in the level schedule that is constructed to orchestrate + // communication. A node in a level depends only on nodes in lower-numbered + // levels (l2r) or higher-numbered (r2l). + // + // The communication patterns are as follows: + // > l2r + // MPI rcv into kids + // sum into node + // MPI send from node + // > r2l + // MPI rcv into node + // solve QP for kids + // MPI send from kids + struct Level { + struct MPIMetaData { + Int rank; // Rank of comm partner. + Int offset; // Offset to start of buffer for this comm. + Int size; // Size of this buffer in units of offsets. 
+ }; + + // The nodes in the level. + std::vector nodes; + // MPI information for this level. + std::vector me, kids; + // Have to keep requests separate so we can call waitall if we want to. + mutable std::vector me_req, kids_req; + }; + + // Levels. nodes[0] is level 0, the leaf level. + std::vector levels; + // Number of data slots this rank needs. Each node owned by this rank, plus + // kids on other ranks, have an associated slot. + Int nslots; + + // Allocate a node. The list node_mem_ is the mechanism for memory ownership; + // node_mem_ isn't used for anything other than owning nodes. + Node* alloc () { + node_mem_.push_front(Node()); + return &node_mem_.front(); + } + + void print(std::ostream& os) const; + +private: + std::list node_mem_; +}; + +void NodeSets::print (std::ostream& os) const { + std::stringstream ss; + if (levels.empty()) return; + const Int myrank = levels[0].nodes[0]->rank; + ss << "pid " << myrank << ":"; + ss << " #levels " << levels.size(); + for (size_t i = 0; i < levels.size(); ++i) { + const auto& lvl = levels[i]; + ss << "\n " << i << ": " << lvl.nodes.size(); + std::set ps, ks; + for (size_t j = 0; j < lvl.nodes.size(); ++j) { + const auto n = lvl.nodes[j]; + for (Int k = 0; k < n->nkids; ++k) + if (n->kids[k]->rank != myrank) + ks.insert(n->kids[k]->rank); + if (n->parent && n->parent->rank != myrank) + ps.insert(n->parent->rank); + } + ss << " |"; + for (const auto& e : ks) ss << " " << e; + if ( ! lvl.kids.empty()) ss << " (" << lvl.kids.size() << ") |"; + for (const auto& e : ps) ss << " " << e; + if ( ! lvl.me.empty()) ss << " (" << lvl.me.size() << ")"; + } + ss << "\n"; + os << ss.str(); +} + +// Find tree depth, assign ranks to non-leaf nodes, and init 'reserved'. 
+Int init_tree (const tree::Node::Ptr& node, Int& id) { + node->reserved = nullptr; + Int depth = 0; + for (Int i = 0; i < node->nkids; ++i) { + qlt_assert(node.get() == node->kids[i]->parent); + depth = std::max(depth, init_tree(node->kids[i], id)); + } + if (node->nkids) { + node->rank = node->kids[0]->rank; + node->cellidx = id++; + } else { + qlt_throw_if(node->cellidx < 0 || node->cellidx >= id, + "cellidx is " << node->cellidx << " but should be between " << + 0 << " and " << id); + } + return depth + 1; +} + +void level_schedule_and_collect ( + NodeSets& ns, const Int& my_rank, const tree::Node::Ptr& node, Int& level, + bool& need_parent_ns_node) +{ + qlt_assert(node->rank != -1); + level = -1; + bool make_ns_node = false; + for (Int i = 0; i < node->nkids; ++i) { + Int kid_level; + bool kid_needs_ns_node; + level_schedule_and_collect(ns, my_rank, node->kids[i], kid_level, + kid_needs_ns_node); + level = std::max(level, kid_level); + if (kid_needs_ns_node) make_ns_node = true; + } + ++level; + // Is parent node needed for isend? + const bool node_is_owned = node->rank == my_rank; + need_parent_ns_node = node_is_owned; + if (node_is_owned || make_ns_node) { + qlt_assert( ! node->reserved); + NodeSets::Node* ns_node = ns.alloc(); + // Levels hold only owned nodes. + if (node_is_owned) ns.levels[level].nodes.push_back(ns_node); + node->reserved = ns_node; + ns_node->rank = node->rank; + ns_node->id = node->cellidx; + ns_node->parent = nullptr; + if (node_is_owned) { + // If this node is owned, it needs to have information about all kids. + ns_node->nkids = node->nkids; + for (Int i = 0; i < node->nkids; ++i) { + const auto& kid = node->kids[i]; + if ( ! kid->reserved) { + // This kid isn't owned by this rank. But need it for irecv. + NodeSets::Node* ns_kid; + kid->reserved = ns_kid = ns.alloc(); + ns_node->kids[i] = ns_kid; + qlt_assert(kid->rank != my_rank); + ns_kid->rank = kid->rank; + ns_kid->id = kid->cellidx; + ns_kid->parent = nullptr; // Not needed. 
+ // The kid may have kids in the original tree, but in the tree pruned + // according to rank, it does not. + ns_kid->nkids = 0; + } else { + // This kid is owned by this rank, so fill in its parent pointer. + NodeSets::Node* ns_kid = static_cast(kid->reserved); + ns_node->kids[i] = ns_kid; + ns_kid->parent = ns_node; + } + } + } else { + // This node is not owned. Update the owned kids with its parent. + ns_node->nkids = 0; + for (Int i = 0; i < node->nkids; ++i) { + const auto& kid = node->kids[i]; + if (kid->reserved && kid->rank == my_rank) { + NodeSets::Node* ns_kid = static_cast(kid->reserved); + ns_node->kids[ns_node->nkids++] = ns_kid; + ns_kid->parent = ns_node; + } + } + } + } +} + +void level_schedule_and_collect (NodeSets& ns, const Int& my_rank, + const tree::Node::Ptr& tree) { + Int iunused; + bool bunused; + level_schedule_and_collect(ns, my_rank, tree, iunused, bunused); +} + +void consolidate (NodeSets& ns) { + auto levels = ns.levels; + ns.levels.clear(); + for (const auto& level : levels) + if ( ! level.nodes.empty()) + ns.levels.push_back(level); +} + +typedef std::pair RankNode; + +void init_offsets (const Int my_rank, std::vector& rns, + std::vector& mmds, Int& offset) { + // Set nodes on my rank to have rank -1 so that they sort first. + for (auto& rn : rns) + if (rn.first == my_rank) + rn.first = -1; + + // Sort so that all comms with a given rank are contiguous. Stable sort so + // that rns retains its order, in particular in the leaf node level. + std::stable_sort(rns.begin(), rns.end()); + + // Collect nodes into groups by rank and set up comm metadata for each group. 
+ Int prev_rank = -1; + for (auto& rn : rns) { + const Int rank = rn.first; + if (rank == -1) { + if (rn.second->offset == -1) + rn.second->offset = offset++; + continue; + } + if (rank != prev_rank) { + qlt_assert(rank > prev_rank); + prev_rank = rank; + mmds.push_back(NodeSets::Level::MPIMetaData()); + auto& mmd = mmds.back(); + mmd.rank = rank; + mmd.offset = offset; + mmd.size = 0; + } + ++mmds.back().size; + rn.second->offset = offset++; + } +} + +// Set up comm data. Consolidate so that there is only one message between me +// and another rank per level. Determine an offset for each node, to be +// multiplied by data-size factors later, for use in data buffers. +void init_comm (const Int my_rank, NodeSets& ns) { + ns.nslots = 0; + for (auto& lvl : ns.levels) { + Int nkids = 0; + for (const auto& n : lvl.nodes) + nkids += n->nkids; + + std::vector me(lvl.nodes.size()), kids(nkids); + for (size_t i = 0, mi = 0, ki = 0; i < lvl.nodes.size(); ++i) { + const auto& n = lvl.nodes[i]; + me[mi].first = n->parent ? n->parent->rank : my_rank; + me[mi].second = const_cast(n); + ++mi; + for (Int k = 0; k < n->nkids; ++k) { + kids[ki].first = n->kids[k]->rank; + kids[ki].second = const_cast(n->kids[k]); + ++ki; + } + } + + init_offsets(my_rank, me, lvl.me, ns.nslots); + lvl.me_req.resize(lvl.me.size()); + init_offsets(my_rank, kids, lvl.kids, ns.nslots); + lvl.kids_req.resize(lvl.kids.size()); + } +} + +// Analyze the tree to extract levels. Levels are run from 0 to #level - 1. Each +// level has nodes whose corresponding operations depend on only nodes in +// lower-indexed levels. This mechanism prevents deadlock in the general case of +// multiple cells per rank, with multiple ranks appearing in a subtree other +// than the root. +// In addition, the set of nodes collected into levels are just those owned by +// this rank, and those with which owned nodes must communicate. +// Once this function is done, the tree can be deleted. 
+NodeSets::ConstPtr analyze (const Parallel::Ptr& p, const Int& ncells, + const tree::Node::Ptr& tree) { + const auto nodesets = std::make_shared(); + qlt_assert( ! tree->parent); + Int id = ncells; + const Int depth = init_tree(tree, id); + nodesets->levels.resize(depth); + level_schedule_and_collect(*nodesets, p->rank(), tree); + consolidate(*nodesets); + init_comm(p->rank(), *nodesets); + return nodesets; +} + +// Check that the offsets are self consistent. +Int check_comm (const NodeSets::ConstPtr& ns) { + Int nerr = 0; + std::vector offsets(ns->nslots, 0); + for (const auto& lvl : ns->levels) + for (const auto& n : lvl.nodes) { + qlt_assert(n->offset < ns->nslots); + ++offsets[n->offset]; + for (Int i = 0; i < n->nkids; ++i) + if (n->kids[i]->rank != n->rank) + ++offsets[n->kids[i]->offset]; + } + for (const auto& e : offsets) + if (e != 1) ++nerr; + return nerr; +} + +// Check that there are the correct number of leaf nodes, and that their offsets +// all come first and are ordered the same as ns->levels[0]->nodes. +Int check_leaf_nodes (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns, + const Int ncells) { + Int nerr = 0; + qlt_assert( ! ns->levels.empty()); + qlt_assert( ! ns->levels[0].nodes.empty()); + Int my_nleaves = 0; + for (const auto& n : ns->levels[0].nodes) { + qlt_assert( ! n->nkids); + ++my_nleaves; + } + for (const auto& n : ns->levels[0].nodes) { + qlt_assert(n->offset < my_nleaves); + qlt_assert(n->id < ncells); + } + Int glbl_nleaves = 0; + mpi::all_reduce(*p, &my_nleaves, &glbl_nleaves, 1, MPI_SUM); + if (glbl_nleaves != ncells) + ++nerr; + return nerr; +} + +// Sum cellidx using the QLT comm pattern. +Int test_comm_pattern (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns, + const Int ncells) { + Int nerr = 0; + // Rank-wide data buffer. + std::vector data(ns->nslots); + // Sum this rank's cellidxs. + for (auto& n : ns->levels[0].nodes) + data[n->offset] = n->id; + // Leaves to root. 
+ for (size_t il = 0; il < ns->levels.size(); ++il) { + auto& lvl = ns->levels[il]; + // Set up receives. + for (size_t i = 0; i < lvl.kids.size(); ++i) { + const auto& mmd = lvl.kids[i]; + mpi::irecv(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag, + &lvl.kids_req[i]); + } + //todo Replace with simultaneous waitany and isend. + mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data()); + // Combine kids' data. + for (auto& n : lvl.nodes) { + if ( ! n->nkids) continue; + data[n->offset] = 0; + for (Int i = 0; i < n->nkids; ++i) + data[n->offset] += data[n->kids[i]->offset]; + } + // Send to parents. + for (size_t i = 0; i < lvl.me.size(); ++i) { + const auto& mmd = lvl.me[i]; + mpi::isend(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag, + &lvl.me_req[i]); + } + if (il+1 == ns->levels.size()) + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + } + // Root to leaves. + for (size_t il = ns->levels.size(); il > 0; --il) { + auto& lvl = ns->levels[il-1]; + // Get the global sum from parent. + for (size_t i = 0; i < lvl.me.size(); ++i) { + const auto& mmd = lvl.me[i]; + mpi::irecv(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag, + &lvl.me_req[i]); + } + //todo Replace with simultaneous waitany and isend. + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + // Pass to kids. + for (auto& n : lvl.nodes) { + if ( ! n->nkids) continue; + for (Int i = 0; i < n->nkids; ++i) + data[n->kids[i]->offset] = data[n->offset]; + } + // Send. + for (size_t i = 0; i < lvl.kids.size(); ++i) { + const auto& mmd = lvl.kids[i]; + mpi::isend(*p, &data[mmd.offset], mmd.size, mmd.rank, NodeSets::mpitag, + &lvl.kids_req[i]); + } + } + // Wait on sends to clean up. + for (size_t il = 0; il < ns->levels.size(); ++il) { + auto& lvl = ns->levels[il]; + if (il+1 < ns->levels.size()) + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data()); + } + { // Check that all leaf nodes have the right number. 
+ const Int desired_sum = (ncells*(ncells - 1)) / 2; + for (const auto& n : ns->levels[0].nodes) + if (data[n->offset] != desired_sum) ++nerr; + if (p->amroot()) { + std::cout << " " << data[ns->levels[0].nodes[0]->offset]; + std::cout.flush(); + } + } + return nerr; +} + +// Unit tests for NodeSets. +Int unittest (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns, + const Int ncells) { + Int nerr = 0; + nerr += check_comm(ns); + if (nerr) return nerr; + nerr += check_leaf_nodes(p, ns, ncells); + if (nerr) return nerr; + nerr += test_comm_pattern(p, ns, ncells); + if (nerr) return nerr; + return nerr; +} +} // namespace impl + +template +void QLT::init (const std::string& name, IntList& d, + typename IntList::HostMirror& h, size_t n) { + d = IntList(name, n); + h = Kokkos::create_mirror_view(d); +} + +template +int QLT::MetaData::get_problem_type (const int& idx) { + return problem_type_[idx]; +} + +// icpc doesn't let us use problem_type_ here, even though it's constexpr. +template +int QLT::MetaData::get_problem_type_idx (const int& mask) { + switch (mask) { + case CPT::s: case CPT::st: return 0; + case CPT::cs: case CPT::cst: return 1; + case CPT::t: return 2; + case CPT::ct: return 3; + default: qlt_kernel_throw_if(true, "Invalid problem type."); return -1; + } +} + +template +int QLT::MetaData::get_problem_type_l2r_bulk_size (const int& mask) { + if (mask & ProblemType::conserve) return 4; + return 3; +} + +template +int QLT::MetaData::get_problem_type_r2l_bulk_size (const int& mask) { + if (mask & ProblemType::shapepreserve) return 1; + return 3; +} + +template +void QLT::MetaData::init (const MetaDataBuilder& mdb) { + const Int ntracers = mdb.trcr2prob.size(); + + Me::init("trcr2prob", a_d_.trcr2prob, a_h_.trcr2prob, ntracers); + std::copy(mdb.trcr2prob.begin(), mdb.trcr2prob.end(), a_h_.trcr2prob.data()); + Kokkos::deep_copy(a_d_.trcr2prob, a_h_.trcr2prob); + + Me::init("bidx2trcr", a_d_.bidx2trcr, a_h_.bidx2trcr, ntracers); + Me::init("trcr2bl2r", 
a_d_.trcr2bl2r, a_h_.trcr2bl2r, ntracers); + Me::init("trcr2br2l", a_d_.trcr2br2l, a_h_.trcr2br2l, ntracers); + a_h_.prob2trcrptr[0] = 0; + a_h_.prob2bl2r[0] = 1; // rho is at 0. + a_h_.prob2br2l[0] = 0; + for (Int pi = 0; pi < nprobtypes; ++pi) { + a_h_.prob2trcrptr[pi+1] = a_h_.prob2trcrptr[pi]; + const Int l2rbulksz = get_problem_type_l2r_bulk_size(get_problem_type(pi)); + const Int r2lbulksz = get_problem_type_r2l_bulk_size(get_problem_type(pi)); + for (Int ti = 0; ti < ntracers; ++ti) { + const auto problem_type = a_h_.trcr2prob[ti]; + if (problem_type != problem_type_[pi]) continue; + const auto tcnt = a_h_.prob2trcrptr[pi+1] - a_h_.prob2trcrptr[pi]; + a_h_.trcr2bl2r[ti] = a_h_.prob2bl2r[pi] + tcnt*l2rbulksz; + a_h_.trcr2br2l[ti] = a_h_.prob2br2l[pi] + tcnt*r2lbulksz; + a_h_.bidx2trcr[a_h_.prob2trcrptr[pi+1]++] = ti; + } + Int ni = a_h_.prob2trcrptr[pi+1] - a_h_.prob2trcrptr[pi]; + a_h_.prob2bl2r[pi+1] = a_h_.prob2bl2r[pi] + ni*l2rbulksz; + a_h_.prob2br2l[pi+1] = a_h_.prob2br2l[pi] + ni*r2lbulksz; + } + Kokkos::deep_copy(a_d_.bidx2trcr, a_h_.bidx2trcr); + Kokkos::deep_copy(a_d_.trcr2bl2r, a_h_.trcr2bl2r); + Kokkos::deep_copy(a_d_.trcr2br2l, a_h_.trcr2br2l); + + Me::init("trcr2bidx", a_d_.trcr2bidx, a_h_.trcr2bidx, ntracers); + for (Int ti = 0; ti < ntracers; ++ti) + a_h_.trcr2bidx(a_h_.bidx2trcr(ti)) = ti; + Kokkos::deep_copy(a_d_.trcr2bidx, a_h_.trcr2bidx); + + a_h = a_h_; + + // Won't default construct Unmanaged, so have to do pointer stuff and raw + // array copy explicitly. 
+ a_d.trcr2prob = a_d_.trcr2prob; + a_d.bidx2trcr = a_d_.bidx2trcr; + a_d.trcr2bidx = a_d_.trcr2bidx; + a_d.trcr2bl2r = a_d_.trcr2bl2r; + a_d.trcr2br2l = a_d_.trcr2br2l; + std::copy(a_h_.prob2trcrptr, a_h_.prob2trcrptr + nprobtypes + 1, + a_d.prob2trcrptr); + std::copy(a_h_.prob2bl2r, a_h_.prob2bl2r + nprobtypes + 1, a_d.prob2bl2r); + std::copy(a_h_.prob2br2l, a_h_.prob2br2l + nprobtypes + 1, a_d.prob2br2l); + qlt_assert(a_d.prob2trcrptr[nprobtypes] == ntracers); +} + +template +void QLT::BulkData::init (const MetaData& md, const Int& nslots) { + l2r_data_ = RealList("l2r_data", md.a_h.prob2bl2r[md.nprobtypes]*nslots); + r2l_data_ = RealList("r2l_data", md.a_h.prob2br2l[md.nprobtypes]*nslots); + l2r_data = l2r_data_; + r2l_data = r2l_data_; +} + +template +void QLT::init (const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree) { + p_ = p; + Timer::start(Timer::analyze); + ns_ = impl::analyze(p, ncells, tree); + init_ordinals(); + Timer::stop(Timer::analyze); + mdb_ = std::make_shared(); +} + +template +void QLT::init_ordinals () { + for (const auto& n : ns_->levels[0].nodes) + gci2lci_[n->id] = n->offset; +} + +template +QLT::QLT (const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree) { + init(p, ncells, tree); +} + +template +void QLT::print (std::ostream& os) const { + ns_->print(os); +} + +// Number of cells owned by this rank. +template +Int QLT::nlclcells () const { return ns_->levels[0].nodes.size(); } + +// Cells owned by this rank, in order of local numbering. Thus, +// gci2lci(gcis[i]) == i. Ideally, the caller never actually calls gci2lci(), +// and instead uses the information from get_owned_glblcells to determine +// local cell indices. 
+template
+void QLT::get_owned_glblcells (std::vector& gcis) const {
+  // Each level-0 node stores its local slot in n->offset and its global cell
+  // id in n->id; write the ids in local-slot order.
+  gcis.resize(ns_->levels[0].nodes.size());
+  for (const auto& n : ns_->levels[0].nodes)
+    gcis[n->offset] = n->id;
+}
+
+// For global cell index cellidx, i.e., the globally unique ordinal associated
+// with a cell in the caller's tree, return this rank's local index for
+// it. This is not an efficient operation.
+template
+Int QLT::gci2lci (const Int& gci) const {
+  const auto it = gci2lci_.find(gci);
+  if (it == gci2lci_.end()) {
+    // Diagnostic output before the error below: print the requested index and
+    // the global cell indices this rank actually owns.
+    pr(puf(gci));
+    std::vector gcis;
+    get_owned_glblcells(gcis);
+    mprarr(gcis);
+  }
+  qlt_throw_if(it == gci2lci_.end(), "gci " << gci << " not in gci2lci map.");
+  return it->second;
+}
+
+// Set up QLT tracer metadata. Once end_tracer_declarations is called, it is
+// an error to call declare_tracer again. Call declare_tracer in order of the
+// tracer index in the caller's numbering.
+template
+void QLT::declare_tracer (int problem_type) {
+  qlt_throw_if( ! mdb_, "end_tracer_declarations was already called; "
+               "it is an error to call declare_tracer now.");
+  // For its exception side effect, and to get canonical problem type, since
+  // some possible problem types map to the same canonical one:
+  problem_type = md_.get_problem_type(md_.get_problem_type_idx(problem_type));
+  mdb_->trcr2prob.push_back(problem_type);
+}
+
+// Finish tracer setup: build md_ from the builder, drop the builder (a null
+// mdb_ is what makes declare_tracer reject later calls), and initialize the
+// bulk data from md_ and ns_->nslots.
+template
+void QLT::end_tracer_declarations () {
+  md_.init(*mdb_);
+  mdb_ = nullptr;
+  bd_.init(md_, ns_->nslots);
+}
+
+// Return the problem type stored for tracer tracer_idx. qlt_throw_if rejects
+// any index outside [0, extent), where extent is the length of trcr2prob.
+template
+int QLT::get_problem_type (const Int& tracer_idx) const {
+  // Use >=, not >: tracer_idx == extent is already one past the last entry.
+  qlt_throw_if(tracer_idx < 0 || tracer_idx >= md_.a_h.trcr2prob.extent_int(0),
+               "tracer_idx is out of bounds: " << tracer_idx);
+  return md_.a_h.trcr2prob[tracer_idx];
+}
+
+// Number of entries in trcr2prob, which holds one entry per declared tracer.
+template
+Int QLT::get_num_tracers () const {
+  return md_.a_h.trcr2prob.size();
+}
+
+template
+void QLT::run () {
+  Timer::start(Timer::qltrunl2r);
+  using namespace impl;
+  // Number of data per slot.
+ const Int l2rndps = md_.a_d.prob2bl2r[md_.nprobtypes]; + const Int r2lndps = md_.a_d.prob2br2l[md_.nprobtypes]; + // Leaves to root. + for (size_t il = 0; il < ns_->levels.size(); ++il) { + auto& lvl = ns_->levels[il]; + // Set up receives. + for (size_t i = 0; i < lvl.kids.size(); ++i) { + const auto& mmd = lvl.kids[i]; + mpi::irecv(*p_, &bd_.l2r_data(mmd.offset*l2rndps), mmd.size*l2rndps, mmd.rank, + NodeSets::mpitag, &lvl.kids_req[i]); + } + //todo Replace with simultaneous waitany and isend. + Timer::start(Timer::waitall); + mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data()); + Timer::stop(Timer::waitall); + // Combine kids' data. + //todo Kernelize, interacting with waitany todo above. + Timer::start(Timer::snp); + for (const auto& n : lvl.nodes) { + if ( ! n->nkids) continue; + qlt_kernel_assert(n->nkids == 2); + // Total density. + bd_.l2r_data(n->offset*l2rndps) = (bd_.l2r_data(n->kids[0]->offset*l2rndps) + + bd_.l2r_data(n->kids[1]->offset*l2rndps)); + // Tracers. + for (Int pti = 0; pti < md_.nprobtypes; ++pti) { + const Int problem_type = md_.get_problem_type(pti); + const bool sum_only = problem_type & ProblemType::shapepreserve; + const Int bsz = md_.get_problem_type_l2r_bulk_size(problem_type); + const Int bis = md_.a_d.prob2trcrptr[pti], bie = md_.a_d.prob2trcrptr[pti+1]; + for (Int bi = bis; bi < bie; ++bi) { + const Int bdi = md_.a_d.trcr2bl2r(md_.a_d.bidx2trcr(bi)); + Real* const me = &bd_.l2r_data(n->offset*l2rndps + bdi); + const Real* const k0 = &bd_.l2r_data(n->kids[0]->offset*l2rndps + bdi); + const Real* const k1 = &bd_.l2r_data(n->kids[1]->offset*l2rndps + bdi); + me[0] = sum_only ? k0[0] + k1[0] : impl::min(k0[0], k1[0]); + me[1] = k0[1] + k1[1] ; + me[2] = sum_only ? k0[2] + k1[2] : impl::max(k0[2], k1[2]); + if (bsz == 4) + me[3] = k0[3] + k1[3] ; + } + } + } + Timer::stop(Timer::snp); + // Send to parents. 
+ for (size_t i = 0; i < lvl.me.size(); ++i) { + const auto& mmd = lvl.me[i]; + mpi::isend(*p_, &bd_.l2r_data(mmd.offset*l2rndps), mmd.size*l2rndps, mmd.rank, + NodeSets::mpitag, &lvl.me_req[i]); + } + if (il+1 == ns_->levels.size()) { + Timer::start(Timer::waitall); + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + Timer::stop(Timer::waitall); + } + } + Timer::stop(Timer::qltrunl2r); Timer::start(Timer::qltrunr2l); + // Root. + if ( ! ns_->levels.empty() && ns_->levels.back().nodes.size() == 1 && + ! ns_->levels.back().nodes[0]->parent) { + const auto& n = ns_->levels.back().nodes[0]; + for (Int pti = 0; pti < md_.nprobtypes; ++pti) { + const Int problem_type = md_.get_problem_type(pti); + const Int bis = md_.a_d.prob2trcrptr[pti], bie = md_.a_d.prob2trcrptr[pti+1]; + for (Int bi = bis; bi < bie; ++bi) { + const Int l2rbdi = md_.a_d.trcr2bl2r(md_.a_d.bidx2trcr(bi)); + const Int r2lbdi = md_.a_d.trcr2br2l(md_.a_d.bidx2trcr(bi)); + // If QLT is enforcing global mass conservation, set the root's r2l Qm + // value to the l2r Qm_prev's sum; otherwise, copy the l2r Qm value to + // the r2l one. + const Int os = problem_type & ProblemType::conserve ? 3 : 1; + bd_.r2l_data(n->offset*r2lndps + r2lbdi) = + bd_.l2r_data(n->offset*l2rndps + l2rbdi + os); + if ( ! (problem_type & ProblemType::shapepreserve)) { + // We now know the global q_{min,max}. Start propagating it + // leafward. + bd_.r2l_data(n->offset*r2lndps + r2lbdi + 1) = + bd_.l2r_data(n->offset*l2rndps + l2rbdi + 0); + bd_.r2l_data(n->offset*r2lndps + r2lbdi + 2) = + bd_.l2r_data(n->offset*l2rndps + l2rbdi + 2); + } + } + } + } + // Root to leaves. + for (size_t il = ns_->levels.size(); il > 0; --il) { + auto& lvl = ns_->levels[il-1]; + for (size_t i = 0; i < lvl.me.size(); ++i) { + const auto& mmd = lvl.me[i]; + mpi::irecv(*p_, &bd_.r2l_data(mmd.offset*r2lndps), mmd.size*r2lndps, mmd.rank, + NodeSets::mpitag, &lvl.me_req[i]); + } + //todo Replace with simultaneous waitany and isend. 
+ Timer::start(Timer::waitall); + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + Timer::stop(Timer::waitall); + // Solve QP for kids' values. + //todo Kernelize, interacting with waitany todo above. + Timer::start(Timer::snp); + for (const auto& n : lvl.nodes) { + if ( ! n->nkids) continue; + for (Int pti = 0; pti < md_.nprobtypes; ++pti) { + const Int problem_type = md_.get_problem_type(pti); + const Int bis = md_.a_d.prob2trcrptr[pti], bie = md_.a_d.prob2trcrptr[pti+1]; + for (Int bi = bis; bi < bie; ++bi) { + const Int l2rbdi = md_.a_d.trcr2bl2r(md_.a_d.bidx2trcr(bi)); + const Int r2lbdi = md_.a_d.trcr2br2l(md_.a_d.bidx2trcr(bi)); + qlt_assert(n->nkids == 2); + if ( ! (problem_type & ProblemType::shapepreserve)) { + // Pass q_{min,max} info along. l2r data are updated for use in + // solve_node_problem. r2l data are updated for use in isend. + const Real q_min = bd_.r2l_data(n->offset*r2lndps + r2lbdi + 1); + const Real q_max = bd_.r2l_data(n->offset*r2lndps + r2lbdi + 2); + bd_.l2r_data(n->offset*l2rndps + l2rbdi + 0) = q_min; + bd_.l2r_data(n->offset*l2rndps + l2rbdi + 2) = q_max; + for (Int k = 0; k < 2; ++k) { + bd_.l2r_data(n->kids[k]->offset*l2rndps + l2rbdi + 0) = q_min; + bd_.l2r_data(n->kids[k]->offset*l2rndps + l2rbdi + 2) = q_max; + bd_.r2l_data(n->kids[k]->offset*r2lndps + r2lbdi + 1) = q_min; + bd_.r2l_data(n->kids[k]->offset*r2lndps + r2lbdi + 2) = q_max; + } + } + const auto& k0 = n->kids[0]; + const auto& k1 = n->kids[1]; + solve_node_problem( + problem_type, + bd_.l2r_data( n->offset*l2rndps), + &bd_.l2r_data( n->offset*l2rndps + l2rbdi), + bd_.r2l_data( n->offset*r2lndps + r2lbdi), + bd_.l2r_data(k0->offset*l2rndps), + &bd_.l2r_data(k0->offset*l2rndps + l2rbdi), + bd_.r2l_data(k0->offset*r2lndps + r2lbdi), + bd_.l2r_data(k1->offset*l2rndps), + &bd_.l2r_data(k1->offset*l2rndps + l2rbdi), + bd_.r2l_data(k1->offset*r2lndps + r2lbdi)); + } + } + } + Timer::stop(Timer::snp); + // Send. 
+ for (size_t i = 0; i < lvl.kids.size(); ++i) { + const auto& mmd = lvl.kids[i]; + mpi::isend(*p_, &bd_.r2l_data(mmd.offset*r2lndps), mmd.size*r2lndps, mmd.rank, + NodeSets::mpitag, &lvl.kids_req[i]); + } + } + // Wait on sends to clean up. + for (size_t il = 0; il < ns_->levels.size(); ++il) { + auto& lvl = ns_->levels[il]; + if (il+1 < ns_->levels.size()) + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data()); + } + Timer::stop(Timer::qltrunr2l); +} + +template +constexpr Int QLT::MetaData::problem_type_[]; + +namespace test { +using namespace impl; + +class TestQLT { + typedef QLT QLTT; + typedef Kokkos::View R2D; + + struct Tracer { + typedef QLTT::ProblemType PT; + + Int idx; + Int problem_type; + Int perturbation_type; + bool no_change_should_hold, safe_should_hold, local_should_hold; + bool write; + + std::string str () const { + std::stringstream ss; + ss << "(ti " << idx; + if (problem_type & PT::conserve) ss << " c"; + if (problem_type & PT::shapepreserve) ss << " s"; + if (problem_type & PT::consistent) ss << " t"; + ss << " pt " << perturbation_type << " ssh " << safe_should_hold + << " lsh " << local_should_hold << ")"; + return ss.str(); + } + + Tracer () + : idx(-1), problem_type(-1), perturbation_type(-1), no_change_should_hold(false), + safe_should_hold(true), local_should_hold(true), write(false) + {} + }; + + struct Values { + Values (const Int ntracers, const Int ncells) + : ncells_(ncells), v_((4*ntracers + 1)*ncells) + {} + Int ncells () const { return ncells_; } + Real* rhom () { return v_.data(); } + Real* Qm_min (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti ); } + Real* Qm (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 1); } + Real* Qm_max (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 2); } + Real* Qm_prev (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 3); } + const Real* rhom () const { return const_cast(this)->rhom(); } + const Real* Qm_min 
(const Int& ti) const + { return const_cast(this)->Qm_min (ti); } + const Real* Qm (const Int& ti) const + { return const_cast(this)->Qm (ti); } + const Real* Qm_max (const Int& ti) const + { return const_cast(this)->Qm_max (ti); } + const Real* Qm_prev (const Int& ti) const + { return const_cast(this)->Qm_prev(ti); } + private: + Int ncells_; + std::vector v_; + }; + + // For solution output, if requested. + struct Writer { + std::unique_ptr fh; + std::vector ngcis; // Number of i'th rank's gcis_ array. + std::vector displs; // Cumsum of above. + std::vector gcis; // Global cell indices packed by rank's gcis_ vector. + ~Writer () { + if ( ! fh) return; + fprintf(fh.get(), " return s\n"); + } + }; + +private: + const Parallel::Ptr p_; + const Int ncells_; + QLTT qlt_; + // Caller index (local cell index in the app code) -> QLT lclcellidx. + std::vector gcis_, i2lci_; + std::vector tracers_; + // For optional output. + bool write_inited_; + std::shared_ptr w_; // Only on root. + +private: + void init_numbering (const tree::Node::Ptr& node) { + // TestQLT doesn't actually care about a particular ordering, as there is no + // geometry to the test problem. However, use *some* ordering to model what + // a real problem must do. + if ( ! node->nkids) { + if (node->rank == p_->rank()) { + gcis_.push_back(node->cellidx); + i2lci_.push_back(qlt_.gci2lci(gcis_.back())); + } + return; + } + for (Int i = 0; i < node->nkids; ++i) + init_numbering(node->kids[i]); + } + + void init_tracers () { + Timer::start(Timer::trcrinit); + typedef Tracer::PT PT; + static const Int pts[] = { + PT::conserve | PT::shapepreserve | PT::consistent, + PT::shapepreserve, // Test a noncanonical problem type. 
+ PT::conserve | PT::consistent, + PT::consistent + }; + Int tracer_idx = 0; + for (Int perturb = 0; perturb < 6; ++perturb) + for (Int ti = 0; ti < 4; ++ti) { + Tracer t; + t.problem_type = pts[ti]; + const bool shapepreserve = t.problem_type & PT::shapepreserve; + t.idx = tracer_idx++; + t.perturbation_type = perturb; + t.safe_should_hold = true; + t.no_change_should_hold = perturb == 0; + t.local_should_hold = perturb < 4 && shapepreserve; + t.write = perturb == 2 && ti == 2; + tracers_.push_back(t); + qlt_.declare_tracer(t.problem_type); + } + qlt_.end_tracer_declarations(); + qlt_assert(qlt_.get_num_tracers() == static_cast(tracers_.size())); + for (size_t i = 0; i < tracers_.size(); ++i) + qlt_assert(qlt_.get_problem_type(i) == (tracers_[i].problem_type | PT::consistent)); + Timer::stop(Timer::trcrinit); + } + + static Real urand () { return rand() / ((Real) RAND_MAX + 1.0); } + + static void generate_rho (Values& v) { + auto r = v.rhom(); + const Int n = v.ncells(); + for (Int i = 0; i < n; ++i) + r[i] = 0.5 + 1.5*urand(); + } + + static void generate_Q (const Tracer& t, Values& v) { + Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), + * Qm_max = v.Qm_max(t.idx), * Qm_prev = v.Qm_prev(t.idx); + const Int n = v.ncells(); + for (Int i = 0; i < n; ++i) { + const Real + q_min = 0.1 + 0.8*urand(), + q_max = std::min(1, q_min + (0.9 - q_min)*urand()), + q = q_min + (q_max - q_min)*urand(); + // Check correctness up to FP. + assert(q_min >= 0 && + q_max <= 1 + 10*std::numeric_limits::epsilon() && + q_min <= q && q <= q_max); + Qm_min[i] = q_min*rhom[i]; + Qm_max[i] = q_max*rhom[i]; + // Protect against FP error. + Qm[i] = std::max(Qm_min[i], std::min(Qm_max[i], q*rhom[i])); + // Set previous Qm to the current unperturbed value. 
+ Qm_prev[i] = Qm[i]; + } + } + + static void gen_rand_perm (const size_t n, std::vector& p) { + p.resize(n); + for (size_t i = 0; i < n; ++i) + p[i] = i; + for (size_t i = 0; i < n; ++i) { + const int j = urand()*n, k = urand()*n; + std::swap(p[j], p[k]); + } + } + + // Permuting the Qm array, even just on a rank as long as there is > 1 cell, + // produces a problem likely requiring considerable reconstruction, which + // reconstruction assuredly satisfies the properties. But because this is a + // local operation only, it doesn't test the 1 cell/rank case. + static void permute_Q (const Tracer& t, Values& v) { + Real* const Qm = v.Qm(t.idx); + const Int N = v.ncells(); + std::vector p; + gen_rand_perm(N, p); + std::vector Qm_orig(N); + std::copy(Qm, Qm + N, Qm_orig.begin()); + for (Int i = 0; i < N; ++i) + Qm[i] = Qm_orig[p[i]]; + } + + void add_const_to_Q (const Tracer& t, Values& v, + // Move 0 < alpha <= 1 of the way to the QLT or safety + // feasibility bound. + const Real& alpha, + // Whether the modification should be done in a + // mass-conserving way. + const bool conserve_mass, + // Only safety problem is feasible. + const bool safety_problem) { + // Some of these reductions aren't used at present. Might add more test + // options later that use them. 
+ Real rhom, Qm, Qm_max; { + Real Qm_sum_lcl[3] = {0}; + for (Int i = 0; i < v.ncells(); ++i) { + Qm_sum_lcl[0] += v.rhom()[i]; + Qm_sum_lcl[1] += v.Qm(t.idx)[i]; + Qm_sum_lcl[2] += v.Qm_max(t.idx)[i]; + } + Real Qm_sum_gbl[3] = {0}; + mpi::all_reduce(*p_, Qm_sum_lcl, Qm_sum_gbl, 3, MPI_SUM); + rhom = Qm_sum_gbl[0]; Qm = Qm_sum_gbl[1]; Qm_max = Qm_sum_gbl[2]; + } + Real Qm_max_safety = 0; + if (safety_problem) { + Real q_safety_lcl = v.Qm_max(t.idx)[0] / v.rhom()[0]; + for (Int i = 1; i < v.ncells(); ++i) + q_safety_lcl = std::max(q_safety_lcl, v.Qm_max(t.idx)[i] / v.rhom()[i]); + Real q_safety_gbl = 0; + mpi::all_reduce(*p_, &q_safety_lcl, &q_safety_gbl, 1, MPI_MAX); + Qm_max_safety = q_safety_gbl*rhom; + } + const Real dQm = safety_problem ? + ((Qm_max - Qm) + alpha * (Qm_max_safety - Qm_max)) / ncells_ : + alpha * (Qm_max - Qm) / ncells_; + for (Int i = 0; i < v.ncells(); ++i) + v.Qm(t.idx)[i] += dQm; + // Now permute Qm so that it's a little more interesting. + permute_Q(t, v); + // Adjust Qm_prev. Qm_prev is used to test the PT::conserve case, and also + // simply to record the correct total mass. The modification above modified + // Q's total mass. If conserve_mass, then Qm_prev needs to be made to sum to + // the same new mass. If ! conserve_mass, we want Qm_prev to be modified in + // an interesting way, so that PT::conserve doesn't trivially undo the mod + // that was made above when the root fixes the mass discrepancy. + const Real + relax = 0.9, + dQm_prev = (conserve_mass ? dQm : + (safety_problem ? + ((Qm_max - Qm) + relax*alpha * (Qm_max_safety - Qm_max)) / ncells_ : + relax*alpha * (Qm_max - Qm) / ncells_)); + for (Int i = 0; i < v.ncells(); ++i) + v.Qm_prev(t.idx)[i] += dQm_prev; + } + + void perturb_Q (const Tracer& t, Values& v) { + // QLT is naturally mass conserving. But if QLT isn't being asked to impose + // mass conservation, then the caller better have a conservative + // method. 
Here, we model that by saying that Qm_prev and Qm should sum to + // the same mass. + const bool cm = ! (t.problem_type & Tracer::PT::conserve); + // For the edge cases, we cannot be exactly on the edge and still expect the + // q-limit checks to pass to machine precision. Thus, back away from the + // edge by an amount that bounds the error in the global mass due to FP, + // assuming each cell's mass is O(1). + const Real edg = 1 - ncells_*std::numeric_limits::epsilon(); + switch (t.perturbation_type) { + case 0: + // Do nothing, to test that QLT doesn't make any changes if none is + // needed. + break; + case 1: permute_Q(t, v); break; + case 2: add_const_to_Q(t, v, 0.5, cm, false); break; + case 3: add_const_to_Q(t, v, edg, cm, false); break; + case 4: add_const_to_Q(t, v, 0.5, cm, true ); break; + case 5: add_const_to_Q(t, v, edg, cm, true ); break; + } + } + + static std::string get_tracer_name (const Tracer& t) { + std::stringstream ss; + ss << "t" << t.idx; + return ss.str(); + } + + void init_writer () { + if (p_->amroot()) { + w_ = std::make_shared(); + w_->fh = std::unique_ptr(fopen("QLT.py", "w")); + int n = gcis_.size(); + w_->ngcis.resize(p_->size()); + mpi::gather(*p_, &n, 1, w_->ngcis.data(), 1, p_->root()); + w_->displs.resize(p_->size() + 1); + w_->displs[0] = 0; + for (size_t i = 0; i < w_->ngcis.size(); ++i) + w_->displs[i+1] = w_->displs[i] + w_->ngcis[i]; + qlt_assert(w_->displs.back() == ncells_); + w_->gcis.resize(ncells_); + mpi::gatherv(*p_, gcis_.data(), gcis_.size(), w_->gcis.data(), w_->ngcis.data(), + w_->displs.data(), p_->root()); + } else { + int n = gcis_.size(); + mpi::gather(*p_, &n, 1, static_cast(nullptr), 0, p_->root()); + Int* Inull = nullptr; + const int* inull = nullptr; + mpi::gatherv(*p_, gcis_.data(), gcis_.size(), Inull, inull, inull, p_->root()); + } + write_inited_ = true; + } + + void gather_field (const Real* Qm_lcl, std::vector& Qm_gbl, + std::vector& wrk) { + if (p_->amroot()) { + Qm_gbl.resize(ncells_); + 
wrk.resize(ncells_); + mpi::gatherv(*p_, Qm_lcl, gcis_.size(), wrk.data(), w_->ngcis.data(), + w_->displs.data(), p_->root()); + for (Int i = 0; i < ncells_; ++i) + Qm_gbl[w_->gcis[i]] = wrk[i]; + } else { + Real* rnull = nullptr; + const int* inull = nullptr; + mpi::gatherv(*p_, Qm_lcl, gcis_.size(), rnull, inull, inull, p_->root()); + } + } + + void write_field (const std::string& tracer_name, const std::string& field_name, + const std::vector& Qm) { + if ( ! p_->amroot()) return; + fprintf(w_->fh.get(), " s.%s.%s = [", tracer_name.c_str(), field_name.c_str()); + for (const auto& e : Qm) + fprintf(w_->fh.get(), "%1.15e, ", e); + fprintf(w_->fh.get(), "]\n"); + } + + void write_pre (const Tracer& t, Values& v) { + if ( ! t.write) return; + std::vector f, wrk; + if ( ! write_inited_) { + init_writer(); + if (w_) + fprintf(w_->fh.get(), + "def getsolns():\n" + " class Struct:\n" + " pass\n" + " s = Struct()\n" + " s.all = Struct()\n"); + gather_field(v.rhom(), f, wrk); + write_field("all", "rhom", f); + } + const auto name = get_tracer_name(t); + if (w_) + fprintf(w_->fh.get(), " s.%s = Struct()\n", name.c_str()); + gather_field(v.Qm_min(t.idx), f, wrk); + write_field(name, "Qm_min", f); + gather_field(v.Qm_prev(t.idx), f, wrk); + write_field(name, "Qm_orig", f); + gather_field(v.Qm(t.idx), f, wrk); + write_field(name, "Qm_pre", f); + gather_field(v.Qm_max(t.idx), f, wrk); + write_field(name, "Qm_max", f); + } + + void write_post (const Tracer& t, Values& v) { + if ( ! 
t.write) return; + const auto name = get_tracer_name(t); + std::vector Qm, wrk; + gather_field(v.Qm(t.idx), Qm, wrk); + write_field(name, "Qm_qlt", Qm); + } + + static void check (const QLTT& qlt) { + const Int n = qlt.nlclcells(); + std::vector gcis; + qlt.get_owned_glblcells(gcis); + qlt_assert(static_cast(gcis.size()) == n); + for (Int i = 0; i < n; ++i) + qlt_assert(qlt.gci2lci(gcis[i]) == i); + } + + static Int check (const Parallel& p, const std::vector& ts, const Values& v) { + static const bool details = false; + static const Real ulp2 = 2*std::numeric_limits::epsilon(); + Int nerr = 0; + std::vector lcl_mass(2*ts.size()), q_min_lcl(ts.size()), q_max_lcl(ts.size()); + std::vector t_ok(ts.size(), 1), local_violated(ts.size(), 0); + for (size_t ti = 0; ti < ts.size(); ++ti) { + const auto& t = ts[ti]; + + qlt_assert(t.safe_should_hold); + const bool safe_only = ! t.local_should_hold; + const Int n = v.ncells(); + const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), + * Qm_max = v.Qm_max(t.idx), * Qm_prev = v.Qm_prev(t.idx); + + q_min_lcl[ti] = 1; + q_max_lcl[ti] = 0; + for (Int i = 0; i < n; ++i) { + const bool lv = Qm[i] < Qm_min[i] || Qm[i] > Qm_max[i]; + if (lv) local_violated[ti] = 1; + if ( ! safe_only && lv) { + if (details) + pr("check q " << t.str() << ": " << Qm[i] << " " << + (Qm[i] < Qm_min[i] ? 
Qm[i] - Qm_min[i] : Qm[i] - Qm_max[i])); + t_ok[ti] = false; + ++nerr; + } + if (t.no_change_should_hold && Qm[i] != Qm_prev[i]) { + if (details) + pr("Q should be unchanged but is not: " << Qm_prev[i] << " changed to " << + Qm[i] << " in " << t.str()); + t_ok[ti] = false; + ++nerr; + } + lcl_mass[2*ti ] += Qm_prev[i]; + lcl_mass[2*ti + 1] += Qm[i]; + q_min_lcl[ti] = std::min(q_min_lcl[ti], Qm_min[i]/rhom[i]); + q_max_lcl[ti] = std::max(q_max_lcl[ti], Qm_max[i]/rhom[i]); + } + } + + std::vector q_min_gbl(ts.size(), 0), q_max_gbl(ts.size(), 0); + mpi::all_reduce(p, q_min_lcl.data(), q_min_gbl.data(), q_min_lcl.size(), MPI_MIN); + mpi::all_reduce(p, q_max_lcl.data(), q_max_gbl.data(), q_max_lcl.size(), MPI_MAX); + + for (size_t ti = 0; ti < ts.size(); ++ti) { + // Check safety problem. If local_should_hold and it does, then the safety + // problem is by construction also solved (since it's a relaxation of the + // local problem). + const auto& t = ts[ti]; + const bool safe_only = ! t.local_should_hold; + if (safe_only) { + const Int n = v.ncells(); + const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), + * Qm_max = v.Qm_max(t.idx); + const Real q_min = q_min_gbl[ti], q_max = q_max_gbl[ti]; + for (Int i = 0; i < n; ++i) { + if (Qm[i] < q_min*rhom[i]*(1 - ulp2) || + Qm[i] > q_max*rhom[i]*(1 + ulp2)) { + if (details) + pr("check q " << t.str() << ": " << q_min*rhom[i] << " " << Qm_min[i] << + " " << Qm[i] << " " << Qm_max[i] << " " << q_max*rhom[i] << " | " << + (Qm[i] < q_min*rhom[i] ? 
+ Qm[i] - q_min*rhom[i] : + Qm[i] - q_max*rhom[i])); + t_ok[ti] = false; + ++nerr; + } + } + } + } + + std::vector glbl_mass(2*ts.size(), 0); + mpi::reduce(p, lcl_mass.data(), glbl_mass.data(), lcl_mass.size(), MPI_SUM, + p.root()); + std::vector t_ok_gbl(ts.size(), 0); + mpi::reduce(p, t_ok.data(), t_ok_gbl.data(), t_ok.size(), MPI_MIN, p.root()); + // Right now we're not using these: + std::vector local_violated_gbl(ts.size(), 0); + mpi::reduce(p, local_violated.data(), local_violated_gbl.data(), + local_violated.size(), MPI_MAX, p.root()); + + if (p.amroot()) { + const Real tol = 1e3*std::numeric_limits::epsilon(); + for (size_t ti = 0; ti < ts.size(); ++ti) { + // Check mass conservation. + const Real desired_mass = glbl_mass[2*ti], actual_mass = glbl_mass[2*ti+1], + rd = reldif(desired_mass, actual_mass); + const bool mass_failed = rd > tol; + if (mass_failed) { + ++nerr; + t_ok_gbl[ti] = false; + } + if ( ! t_ok_gbl[ti]) { + std::cout << "FAIL " << ts[ti].str(); + if (mass_failed) std::cout << " mass re " << rd; + std::cout << "\n"; + } + } + } + + return nerr; + } + +public: + TestQLT (const Parallel::Ptr& p, const tree::Node::Ptr& tree, + const Int& ncells, const bool verbose = false) + : p_(p), ncells_(ncells), qlt_(p_, ncells, tree), write_inited_(false) + { + check(qlt_); + init_numbering(tree); + init_tracers(); + if (verbose) qlt_.print(std::cout); + } + + Int run (const Int nrepeat = 1, const bool write=false) { + Timer::start(Timer::trcrgen); + const Int nt = qlt_.get_num_tracers(), nlclcells = qlt_.nlclcells(); + Values v(nt, nlclcells); + generate_rho(v); + { + Real* rhom = v.rhom(); + for (Int i = 0; i < nlclcells; ++i) + qlt_.set_rho(i2lci_[i], rhom[i]); + } + for (Int ti = 0; ti < nt; ++ti) { + generate_Q(tracers_[ti], v); + perturb_Q(tracers_[ti], v); + if (write) write_pre(tracers_[ti], v); + } + Timer::stop(Timer::trcrgen); + for (Int trial = 0; trial <= nrepeat; ++trial) { + for (Int ti = 0; ti < nt; ++ti) { + Real* Qm_min = v.Qm_min(ti), * 
Qm = v.Qm(ti), * Qm_max = v.Qm_max(ti), + * Qm_prev = v.Qm_prev(ti); + for (Int i = 0; i < nlclcells; ++i) + qlt_.set_Q(i2lci_[i], ti, Qm[i], Qm_min[i], Qm_max[i], Qm_prev[i]); + } + MPI_Barrier(p_->comm()); + Timer::start(Timer::qltrun); + qlt_.run(); + MPI_Barrier(p_->comm()); + Timer::stop(Timer::qltrun); + if (trial == 0) { + Timer::reset(Timer::qltrun); + Timer::reset(Timer::qltrunl2r); + Timer::reset(Timer::qltrunr2l); + Timer::reset(Timer::waitall); + Timer::reset(Timer::snp); + } + } + Timer::start(Timer::trcrcheck); + Int nerr = 0; + for (Int ti = 0; ti < nt; ++ti) { + Real* Qm = v.Qm(ti); + for (Int i = 0; i < nlclcells; ++i) + Qm[i] = qlt_.get_Q(i2lci_[i], ti); + if (write) write_post(tracers_[ti], v); + } + nerr += check(*p_, tracers_, v); + Timer::stop(Timer::trcrcheck); + return nerr; + } +}; + +// Test all QLT variations and situations. +Int test_qlt (const Parallel::Ptr& p, const tree::Node::Ptr& tree, const Int& ncells, + const int nrepeat = 1, + // Diagnostic output for dev and illustration purposes. To be + // clear, no QLT unit test requires output to be checked; each + // checks in-memory data and returns a failure count. + const bool write = false, + const bool verbose = false) { + return TestQLT(p, tree, ncells, verbose).run(nrepeat, write); +} +} // namespace test + +// Tree for a 1-D periodic domain, for unit testing. +namespace oned { +struct Mesh { + struct ParallelDecomp { + enum Enum { + // The obvious distribution of ranks: 1 rank takes exactly 1 contiguous + // set of cell indices. + contiguous, + // For heavy-duty testing of QLT comm pattern, use a ridiculous assignment + // of ranks to cell indices. This forces the QLT tree to communicate, + // pack, and unpack in silly ways. 
+ pseudorandom + }; + }; + + Mesh (const Int nc, const Parallel::Ptr& p, + const ParallelDecomp::Enum& parallel_decomp = ParallelDecomp::contiguous) { + init(nc, p, parallel_decomp); + } + + void init (const Int nc, const Parallel::Ptr& p, + const ParallelDecomp::Enum& parallel_decomp) { + nc_ = nc; + nranks_ = p->size(); + p_ = p; + pd_ = parallel_decomp; + qlt_assert(nranks_ <= nc_); + } + + Int ncell () const { return nc_; } + + const Parallel::Ptr& parallel () const { return p_; } + + Int rank (const Int& ci) const { + switch (pd_) { + case ParallelDecomp::contiguous: + return std::min(nranks_ - 1, ci / (nc_ / nranks_)); + default: { + const auto chunk = ci / nranks_; + return (ci + chunk) % nranks_; + } + } + } + + static Int unittest (const Parallel::Ptr& p) { + const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::pseudorandom, + Mesh::ParallelDecomp::contiguous }; + Int ne = 0; + for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id) { + Mesh m(std::max(42, 3*p->size()), p, dists[id]); + const Int nc = m.ncell(); + for (Int ci = 0; ci < nc; ++ci) + if (m.rank(ci) < 0 || m.rank(ci) >= p->size()) + ++ne; + } + return ne; + } + +private: + Int nc_, nranks_; + Parallel::Ptr p_; + ParallelDecomp::Enum pd_; +}; + +tree::Node::Ptr make_tree (const Mesh& m, const Int cs, const Int ce, + const tree::Node* parent) { + const Int cn = ce - cs, cn0 = cn/2; + tree::Node::Ptr n = std::make_shared(); + n->parent = parent; + if (cn == 1) { + n->nkids = 0; + n->rank = m.rank(cs); + n->cellidx = cs; + return n; + } + n->nkids = 2; + n->kids[0] = make_tree(m, cs, cs + cn0, n.get()); + n->kids[1] = make_tree(m, cs + cn0, ce, n.get()); + return n; +} + +tree::Node::Ptr make_tree (const Mesh& m) { + return make_tree(m, 0, m.ncell(), nullptr); +} + +tree::Node::Ptr make_tree (const Parallel::Ptr& p, const Int& ncells) { + Mesh m(ncells, p); + return make_tree(m); +} + +namespace test { +void mark_cells (const tree::Node::Ptr& node, std::vector& cells) { + if ( ! 
node->nkids) {
+    // Leaf: count one visit for the cell this node represents.
+    ++cells[node->cellidx];
+    return;
+  }
+  for (Int i = 0; i < node->nkids; ++i)
+    mark_cells(node->kids[i], cells);
+}
+
+// Check make_tree: for each parallel decomposition in dists, build the tree
+// for a mesh and verify that every cell index is reached by exactly one leaf.
+// Returns the number of cells whose visit count is not 1.
+Int unittest (const Parallel::Ptr& p) {
+  const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::pseudorandom,
+                                               Mesh::ParallelDecomp::contiguous };
+  Int ne = 0;
+  for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id) {
+    // Use the decomposition selected by the loop. The mesh used to be built
+    // with ParallelDecomp::pseudorandom on every iteration, so the contiguous
+    // case was never exercised.
+    Mesh m(std::max(42, 3*p->size()), p, dists[id]);
+    tree::Node::Ptr tree = make_tree(m);
+    std::vector cells(m.ncell(), 0);
+    mark_cells(tree, cells);
+    for (Int i = 0; i < m.ncell(); ++i)
+      if (cells[i] != 1) ++ne;
+  }
+  return ne;
+}
+} // namespace test
+} // namespace oned
+
+namespace test {
+// Run impl::unittest on the NodeSets produced by impl::analyze for every
+// combination of mesh size in szs and decomposition in dists. Returns the
+// accumulated error count.
+Int unittest_NodeSets (const Parallel::Ptr& p) {
+  using Mesh = oned::Mesh;
+  const Int szs[] = { p->size(), 3*p->size() };
+  const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::pseudorandom,
+                                               Mesh::ParallelDecomp::contiguous };
+  Int nerr = 0;
+  for (size_t is = 0; is < sizeof(szs)/sizeof(*szs); ++is)
+    for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id) {
+      Mesh m(szs[is], p, dists[id]);
+      tree::Node::Ptr tree = make_tree(m);
+      impl::NodeSets::ConstPtr nodesets = impl::analyze(p, m.ncell(), tree);
+      // Release the tree before running the NodeSets tests.
+      tree = nullptr;
+      nerr += impl::unittest(p, nodesets, m.ncell());
+    }
+  return nerr;
+}
+
+Int unittest_QLT (const Parallel::Ptr& p, const bool write_requested=false) {
+  using Mesh = oned::Mesh;
+  const Int szs[] = { p->size(), 2*p->size(), 7*p->size(), 21*p->size() };
+  const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::contiguous,
+                                               Mesh::ParallelDecomp::pseudorandom };
+  Int nerr = 0;
+  for (size_t is = 0, islim = sizeof(szs)/sizeof(*szs); is < islim; ++is)
+    for (size_t id = 0, idlim = sizeof(dists)/sizeof(*dists); id < idlim; ++id) {
+      if (p->amroot()) {
+        std::cout << " (" << szs[is] << ", " << id << ")";
+        std::cout.flush();
+      }
+      Mesh m(szs[is], p, dists[id]);
+      tree::Node::Ptr tree = make_tree(m);
+      const bool write =
(write_requested && m.ncell() < 3000 && + is == islim-1 && id == idlim-1); + nerr += test::test_qlt(p, tree, m.ncell(), 1, write); + } + return nerr; +} + +Int run (const Parallel::Ptr& p, const Input& in) { + Int nerr = 0; + if (in.unittest) { + Int ne; + ne = oned::Mesh::unittest(p); + if (ne && p->amroot()) std::cerr << "FAIL: Mesh::unittest()\n"; + nerr += ne; + ne = oned::test::unittest(p); + if (ne && p->amroot()) std::cerr << "FAIL: oned::unittest_tree()\n"; + nerr += ne; + ne = unittest_NodeSets(p); + if (ne && p->amroot()) std::cerr << "FAIL: oned::unittest_NodeSets()\n"; + nerr += ne; + ne = unittest_QLT(p, in.write); + if (ne && p->amroot()) std::cerr << "FAIL: oned::unittest_QLT()\n"; + nerr += ne; + if (p->amroot()) std::cout << "\n"; + } + if (nerr) + return nerr; + // Performance test. + if (in.ncells > 0) { + oned::Mesh m(in.ncells, p, + (in.pseudorandom ? + oned::Mesh::ParallelDecomp::pseudorandom : + oned::Mesh::ParallelDecomp::contiguous)); + Timer::init(); + Timer::start(Timer::total); Timer::start(Timer::tree); + tree::Node::Ptr tree = make_tree(m); + Timer::stop(Timer::tree); + test::test_qlt(p, tree, in.ncells, in.nrepeat, false, in.verbose); + Timer::stop(Timer::total); + if (p->amroot()) Timer::print(); + } + return nerr; +} + +} // namespace test +} // namespace qlt + +#ifdef KOKKOS_HAVE_SERIAL +template class qlt::QLT; +#endif +#ifdef KOKKOS_HAVE_OPENMP +template class qlt::QLT; +#endif +#ifdef KOKKOS_HAVE_CUDA +template class qlt::QLT; +#endif diff --git a/qlt/qlt.hpp b/qlt/qlt.hpp new file mode 100644 index 0000000..c59f37a --- /dev/null +++ b/qlt/qlt.hpp @@ -0,0 +1,258 @@ +#ifndef INCLUDE_QLT_HPP +#define INCLUDE_QLT_HPP + +#include + +#include +#include +#include +#include +#include + +#include +#include "qlt_kokkos.hpp" + +// QLT: Quasi-local tree-based non-iterative tracer density reconstructor for +// mass conservation, shape preservation, and tracer consistency. 
+namespace qlt { +typedef int Int; +typedef size_t Size; +typedef double Real; + +namespace impl { class NodeSets; } + +class Parallel { + MPI_Comm comm_; +public: + typedef std::shared_ptr Ptr; + Parallel(MPI_Comm comm) : comm_(comm) {} + MPI_Comm comm () const { return comm_; } + Int size() const; + Int rank() const; + Int root () const { return 0; } + bool amroot () const { return rank() == root(); } +}; + +Parallel::Ptr make_parallel(MPI_Comm comm); + +namespace tree { +// The caller builds a tree of these nodes to pass to QLT. +struct Node { + typedef std::shared_ptr Ptr; + const Node* parent; // (Can't be a shared_ptr: would be a circular dependency.) + Int rank; // Owning rank. + Int cellidx; // If a leaf, the cell to which this node corresponds. + Int nkids; // 0 at leaf, 1 or 2 otherwise. + Node::Ptr kids[2]; + void* reserved; // For internal use. + Node () : parent(nullptr), rank(-1), cellidx(-1), nkids(0), reserved(nullptr) {} +}; +} // namespace tree + +template +class QLT { +public: + typedef typename impl::DeviceType::type Device; + typedef QLT Me; + typedef std::shared_ptr Ptr; + + struct ProblemType { + enum : Int { conserve = 1, shapepreserve = 1 << 1, consistent = 1 << 2 }; + }; + + // Set up QLT topology and communication data structures based on a tree. + QLT(const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree); + + void print(std::ostream& os) const; + + // Number of cells owned by this rank. + Int nlclcells() const; + + // Cells owned by this rank, in order of local numbering. Thus, + // gci2lci(gcis[i]) == i. Ideally, the caller never actually calls gci2lci(), + // and instead uses the information from get_owned_glblcells to determine + // local cell indices. + void get_owned_glblcells(std::vector& gcis) const; + + // For global cell index cellidx, i.e., the globally unique ordinal associated + // with a cell in the caller's tree, return this rank's local index for + // it. This is not an efficient operation. 
+ Int gci2lci(const Int& gci) const; + + // Set up QLT tracer metadata. Once end_tracer_declarations is called, it is + // an error to call declare_tracer again. Call declare_tracer in order of the + // tracer index in the caller's numbering. + void declare_tracer(int problem_type); + + void end_tracer_declarations(); + + int get_problem_type(const Int& tracer_idx) const; + + Int get_num_tracers() const; + + // set_{rho,Q}: Set cell values prior to running the QLT algorithm. + // set_rho must be called before set_Q. + // lclcellidx is gci2lci(cellidx). + // Notation: + // rho: Total density. + // Q: Tracer density. + // q: Tracer mixing ratio = Q/rho. + // *m: Mass corresponding to the density; results from an integral over a + // region, such as a cell. + KOKKOS_INLINE_FUNCTION + void set_rho(const Int& lclcellidx, + // Current total mass in this cell. + const Real& rhom); + + KOKKOS_INLINE_FUNCTION + void set_Q(const Int& lclcellidx, const Int& tracer_idx, + // Current tracer mass in this cell. + const Real& Qm, + // Minimum and maximum permitted tracer mass in this cell. + const Real& Qm_min, const Real& Qm_max, + // If mass conservation is requested, provide the previous Qm, + // which will be summed to give the desired global mass. + const Real Qm_prev = -1); + + // Run the QLT algorithm with the values set by set_{rho,Q}. + void run(); + + // Get a cell's tracer mass Qm after the QLT algorithm has run. + KOKKOS_INLINE_FUNCTION + Real get_Q(const Int& lclcellidx, const Int& tracer_idx); + +private: + typedef Kokkos::View IntList; + typedef impl::Const ConstIntList; + typedef impl::ConstUnmanaged ConstUnmanagedIntList; + + static void init(const std::string& name, IntList& d, + typename IntList::HostMirror& h, size_t n); + + struct MetaDataBuilder { + typedef std::shared_ptr Ptr; + std::vector trcr2prob; + }; + + struct MetaData { + enum : Int { nprobtypes = 4 }; + + template + struct Arrays { + // trcr2prob(i) is the ProblemType of tracer i. 
+ IntListT trcr2prob; + // bidx2trcr(prob2trcrptr(i) : prob2trcrptr(i+1)-1) is the list of + // tracers having ProblemType index i. bidx2trcr is the permutation + // from the user's tracer index to the bulk data's ordering (bidx). + Int prob2trcrptr[nprobtypes+1]; + IntListT bidx2trcr; + // Inverse of bidx2trcr. + IntListT trcr2bidx; + // Points to the start of l2r bulk data for each problem type, within a + // slot. + Int prob2bl2r[nprobtypes + 1]; + // Point to the start of l2r bulk data for each tracer, within a slot. + IntListT trcr2bl2r; + // Same for r2l bulk data. + Int prob2br2l[nprobtypes + 1]; + IntListT trcr2br2l; + }; + + static int get_problem_type(const int& idx); + + // icpc doesn't let us use problem_type_ here, even though it's constexpr. + static int get_problem_type_idx(const int& mask); + + static int get_problem_type_l2r_bulk_size(const int& mask); + + static int get_problem_type_r2l_bulk_size(const int& mask); + + struct CPT { + // We could make the l2r buffer smaller by one entry, Qm. However, the + // l2r comm is more efficient if it's done with one buffer. Similarly, + // we separate the r2l data into a separate buffer for packing and MPI + // efficiency. + // There are 7 possible problems. + // The only problem not supported is conservation alone. It makes very + // little sense to use QLT for conservation alone. + // The remaining 6 fall into 4 categories of details. These 4 categories + // are tracked by QLT; which of the original 6 problems being solved is + // not important. 
+ enum { + // l2r: rhom, (Qm_min, Qm, Qm_max)*; l2r, r2l: Qm* + s = ProblemType::shapepreserve, + st = ProblemType::shapepreserve | ProblemType::consistent, + // l2r: rhom, (Qm_min, Qm, Qm_max, Qm_prev)*; l2r, r2l: Qm* + cs = ProblemType::conserve | s, + cst = ProblemType::conserve | st, + // l2r: rhom, (q_min, Qm, q_max)*; l2r, r2l: Qm* + t = ProblemType::consistent, + // l2r: rhom, (q_min, Qm, q_max, Qm_prev)*; l2r, r2l: Qm* + ct = ProblemType::conserve | t + }; + }; + + Arrays a_h; + Arrays a_d; + + void init(const MetaDataBuilder& mdb); + + private: + static constexpr Int problem_type_[] = { CPT::st, CPT::cst, CPT::t, CPT::ct }; + Arrays a_h_; + Arrays a_d_; + }; + + struct BulkData { + typedef Kokkos::View RealList; + typedef impl::Unmanaged UnmanagedRealList; + + UnmanagedRealList l2r_data, r2l_data; + + void init(const MetaData& md, const Int& nslots); + + private: + RealList l2r_data_, r2l_data_; + }; + +private: + void init(const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree); + + void init_ordinals(); + + KOKKOS_INLINE_FUNCTION + static void solve_node_problem(const Int problem_type, + const Real& rhom, const Real* pd, const Real& Qm, + const Real& rhom0, const Real* k0d, Real& Qm0, + const Real& rhom1, const Real* k1d, Real& Qm1); + +private: + Parallel::Ptr p_; + // Tree and communication topology. + std::shared_ptr ns_; + // Globally unique cellidx -> rank-local index. + std::map gci2lci_; + // Temporary to collect caller's tracer information prior to calling + // end_tracer_declarations(). + typename MetaDataBuilder::Ptr mdb_; + // Constructed in end_tracer_declarations(). 
+ MetaData md_; + BulkData bd_; +}; + +namespace test { +struct Input { + bool unittest, write; + Int ncells, ntracers, tracer_type, nrepeat; + bool pseudorandom, verbose; +}; + +Int run(const Parallel::Ptr& p, const Input& in); +} // namespace test +} // namespace qlt + +// These are the definitions that must be visible in the calling translation +// unit, unless Cuda relocatable device code is enabled. +#include "qlt_inline.hpp" + +#endif diff --git a/qlt/qlt_inline.hpp b/qlt/qlt_inline.hpp new file mode 100644 index 0000000..a16aa6a --- /dev/null +++ b/qlt/qlt_inline.hpp @@ -0,0 +1,392 @@ +#ifndef INCLUDE_QLT_INLINE_HPP +#define INCLUDE_QLT_INLINE_HPP + +#include + +namespace qlt { + +template KOKKOS_INLINE_FUNCTION +void QLT::set_rho (const Int& lclcellidx, const Real& rhom) { + const Int ndps = md_.a_d.prob2bl2r[md_.nprobtypes]; + bd_.l2r_data(ndps*lclcellidx) = rhom; +} + +template KOKKOS_INLINE_FUNCTION +void QLT::set_Q (const Int& lclcellidx, const Int& tracer_idx, + const Real& Qm, + const Real& Qm_min, const Real& Qm_max, + const Real Qm_prev) { + const Int ndps = md_.a_d.prob2bl2r[md_.nprobtypes]; + Real* bd; { + const Int bdi = md_.a_d.trcr2bl2r(tracer_idx); + bd = &bd_.l2r_data(ndps*lclcellidx + bdi); + } + bd[1] = Qm; + { + const Int problem_type = md_.a_d.trcr2prob(tracer_idx); + if (problem_type & ProblemType::shapepreserve) { + bd[0] = Qm_min; + bd[2] = Qm_max; + } else if (problem_type & ProblemType::consistent) { + const Real rhom = bd_.l2r_data(ndps*lclcellidx); + bd[0] = Qm_min / rhom; + bd[2] = Qm_max / rhom; + } else { + Kokkos::abort("set_Q: invalid problem_type."); + } + if (problem_type & ProblemType::conserve) { + if (Qm_prev < 0) Kokkos::abort("Qm_prev was not provided to set_Q."); + bd[3] = Qm_prev; + } + } +} + +template KOKKOS_INLINE_FUNCTION +Real QLT::get_Q (const Int& lclcellidx, const Int& tracer_idx) { + const Int ndps = md_.a_d.prob2br2l[md_.nprobtypes]; + const Int bdi = md_.a_d.trcr2br2l(tracer_idx); + return 
bd_.r2l_data(ndps*lclcellidx + bdi); +} + +namespace impl { +// GPU-friendly replacements for std::min/max. +template KOKKOS_INLINE_FUNCTION +const T& min (const T& a, const T& b) { return a < b ? a : b; } +template KOKKOS_INLINE_FUNCTION +const T& max (const T& a, const T& b) { return a > b ? a : b; } +} + +namespace slv { +KOKKOS_INLINE_FUNCTION +Real get_xbd (const Real* xbd, const Int i, const bool xbds_scalar) +{ return xbds_scalar ? *xbd : xbd[i]; } + +KOKKOS_INLINE_FUNCTION +bool is_inside (const Real xi, const Real* xlo, const Real* xhi, const Int i, + const bool xbds_scalar) { + return (xi > get_xbd(xlo, i, xbds_scalar) && + xi < get_xbd(xhi, i, xbds_scalar)); +} + +KOKKOS_INLINE_FUNCTION +bool is_outside (const Real xi, const Real* xlo, const Real* xhi, const Int i, + const bool xbds_scalar) { + return (xi < get_xbd(xlo, i, xbds_scalar) || + xi > get_xbd(xhi, i, xbds_scalar)); +} + +KOKKOS_INLINE_FUNCTION +Real calc_r_tol (const Real b, const Real* a, const Real* y, const Int n) { + Real ab = std::abs(b); + for (Int i = 0; i < n; ++i) ab = std::max(ab, std::abs(a[i]*y[i])); + return 1e1*std::numeric_limits::epsilon()*std::abs(ab); +} + +KOKKOS_INLINE_FUNCTION +void calc_r (const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, const bool xbds_scalar, + const Real* y, const Real& lambda, Real* x, Real& r, Real& r_lambda) { + r = 0; + r_lambda = 0; + for (Int i = 0; i < n; ++i) { + const Real q = a[i]/w[i]; + const Real x_trial = y[i] + lambda*q; + Real xtmp; + if (x_trial < (xtmp = get_xbd(xlo, i, xbds_scalar))) + x[i] = xtmp; + else if (x_trial > (xtmp = get_xbd(xhi, i, xbds_scalar))) + x[i] = xtmp; + else { + x[i] = x_trial; + r_lambda += a[i]*q; + } + r += a[i]*x[i]; + } + r -= b; +} + +// Solve +// min_x sum_i w(i) (x(i) - y(i))^2 +// st a' x = b +// xlo <= x <= xhi. +// This function assumes w > 0 to save a few operations. 
Return 0 on success and +// x == y, 1 on success and x != y, -1 if infeasible, -2 if max_its hit with no +// solution. See Section 3 of Bochev, Ridzal, Shashkov, Fast optimization-based +// conservative remap of scalar fields through aggregate mass transfer. lambda +// is used in check_1eq_bc_qp_foc. +//todo 2D version of this function that takes advantage of 2D. +KOKKOS_INLINE_FUNCTION +Int solve_1eq_bc_qp (const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, const bool xbds_scalar, + const Real* y, Real* x, const Int max_its = 100) { + const Real r_tol = calc_r_tol(b, a, y, n); + + { // Check for a quick exit. + bool all_in = true; + Real r = 0; + for (Int i = 0; i < n; ++i) { + if (is_outside(x[i], xlo, xhi, i, xbds_scalar)) { + all_in = false; + break; + } + r += a[i]*x[i]; + } + if (all_in) { + r -= b; + if (std::abs(r) <= r_tol) + return 0; + } + } + + { // Eval r at end points to check for feasibility, and also possibly a quick + // exit on a common case. + Real r = -b; + for (Int i = 0; i < n; ++i) { + x[i] = get_xbd(xlo, i, xbds_scalar); + r += a[i]*x[i]; + } + if (std::abs(r) <= r_tol) return 1; + if (r > 0) return -1; + r = -b; + for (Int i = 0; i < n; ++i) { + x[i] = get_xbd(xhi, i, xbds_scalar); + r += a[i]*x[i]; + } + if (std::abs(r) <= r_tol) return 1; + if (r < 0) return -1; + } + + { // Check for a quick exit: the bounds are so tight that the midpoint of the + // box satisfies r_tol. + Real r = -b; + for (Int i = 0; i < n; ++i) { + x[i] = 0.5*(get_xbd(xlo, i, xbds_scalar) + get_xbd(xhi, i, xbds_scalar)); + r += a[i]*x[i]; + } + if (std::abs(r) <= r_tol) return 1; + } + + const Real wall_dist = 1e-3; + + // Get lambda endpoints. 
+ Real lamlo = 0, lamhi = 0; + for (Int i = 0; i < n; ++i) { + const Real rq = w[i]/a[i]; + const Real lamlo_i = rq*(get_xbd(xlo, i, xbds_scalar) - y[i]); + const Real lamhi_i = rq*(get_xbd(xhi, i, xbds_scalar) - y[i]); + if (i == 0) { + lamlo = lamlo_i; + lamhi = lamhi_i; + } else { + lamlo = impl::min(lamlo, lamlo_i); + lamhi = impl::max(lamhi, lamhi_i); + } + } + const Real lamlo_feas = lamlo, lamhi_feas = lamhi; + Real lambda = lamlo <= 0 && lamhi >= 0 ? 0 : lamlo; + + Int info = -2; + + // Bisection-safeguarded Newton iteration for r(lambda) = 0. + bool prev_step_bisect = false; + Int nbisect = 0; + for (Int iteration = 0; iteration < max_its; ++iteration) { + // Compute x, r, r_lambda. + Real r, r_lambda; + calc_r(n, w, a, b, xlo, xhi, xbds_scalar, y, lambda, x, r, r_lambda); + // Is r(lambda) - b sufficiently == 0? + if (std::abs(r) <= r_tol) { + info = 1; + break; + } + // Check if the lambda bounds are too close. + if (nbisect > 64) { + if (lamhi == lamhi_feas || lamlo == lamlo_feas) { + // r isn't small enough and one lambda bound is on the feasibility + // limit. The QP must not be feasible. + info = -1; + break; + } + info = 1; + break; + } + // Adjust lambda bounds. + if (r > 0) + lamhi = lambda; + else + lamlo = lambda; + if (r_lambda != 0) { + // Newton step. + lambda -= r/r_lambda; + } else { + // Force bisection. + lambda = lamlo; + } + // Safeguard. The wall distance check assures progress, but use it only + // every other potential bisection. + const Real D = prev_step_bisect ? 
0 : wall_dist*(lamhi - lamlo); + if (lambda - lamlo < D || lamhi - lambda < D) { + lambda = 0.5*(lamlo + lamhi); + ++nbisect; + prev_step_bisect = true; + } else { + prev_step_bisect = false; + } + } + + return info; +} + +KOKKOS_INLINE_FUNCTION +void r2l_nl_adjust_bounds (Real Qm_bnd[2], const Real rhom[2], Real Qm_extra) { + Real q[2]; + for (Int i = 0; i < 2; ++i) q[i] = Qm_bnd[i] / rhom[i]; + if (Qm_extra < 0) { + Int i0, i1; + if (q[0] >= q[1]) { i0 = 0; i1 = 1; } else { i0 = 1; i1 = 0; } + const Real Qm_gap = (q[i1] - q[i0])*rhom[i0]; + if (Qm_gap <= Qm_extra) { + Qm_bnd[i0] += Qm_extra; + return; + } + } else { + Int i0, i1; + if (q[0] <= q[1]) { i0 = 0; i1 = 1; } else { i0 = 1; i1 = 0; } + const Real Qm_gap = (q[i1] - q[i0])*rhom[i0]; + if (Qm_gap >= Qm_extra) { + Qm_bnd[i0] += Qm_extra; + return; + } + } + { // Have to adjust both. Adjust so that the q bounds are the same. This + // procedure assures that as long as rhom is conservative, then the + // adjustment never pushes q_{min,max} out of the safety bounds. + const Real Qm_tot = Qm_bnd[0] + Qm_bnd[1] + Qm_extra; + const Real rhom_tot = rhom[0] + rhom[1]; + const Real q_tot = Qm_tot / rhom_tot; + for (Int i = 0; i < 2; ++i) + Qm_bnd[i] = q_tot*rhom[i]; + } +} + +KOKKOS_INLINE_FUNCTION +void r2l_l_adjust_bounds (const Int np, Real* q_min, Real* q_max, const Real* rhom, + Real Qm_extra) { + assert(0); // Not used right now, but want to eventually. Need to do some more analysis. + static constexpr int max_np = 16; + Real* const q_bnd = Qm_extra < 0 ? q_min : q_max; + // Try solving a QP that adjusts a q bound. 
+ Real Qm = Qm_extra; + Real w[max_np], q_bnd_min[max_np], q_bnd_max[max_np], q_bnd_orig[max_np]; + q_bnd_min[0] = q_min[0]; + q_bnd_max[0] = q_max[0]; + for (Int i = 0; i < np; ++i) { + const Real rhomi = rhom[i]; + Qm += q_bnd[i]*rhomi; + q_bnd_orig[i] = q_bnd[i]; + w[i] = rhomi; + if (Qm_extra < 0) { + q_bnd_min[0] = impl::min(q_bnd_min[0], q_min[i]); + q_bnd_max[i] = q_max[i]; + } else { + q_bnd_min[i] = q_min[i]; + q_bnd_max[0] = impl::max(q_bnd_max[0], q_max[i]); + } + } + if (Qm_extra < 0) + for (Int i = 1; i < np; ++i) q_bnd_min[i] = q_bnd_min[0]; + else + for (Int i = 1; i < np; ++i) q_bnd_max[i] = q_bnd_max[0]; + // Check for feasibility. + bool feasible; { + Real Qm_lo = 0, Qm_hi = 0; + for (Int i = 0; i < np; ++i) { + Qm_lo += q_bnd_min[i]*w[i]; + Qm_hi += q_bnd_max[i]*w[i]; + } + feasible = Qm_lo <= Qm && Qm <= Qm_hi; + } + if (feasible) { + solve_1eq_bc_qp(np, w, w, Qm, q_bnd_min, q_bnd_max, false, q_bnd_orig, q_bnd); + } else { + // The QP isn't feasible, so set the bound to a constant. + Real rhom_tot = 0, Qm_tot = Qm_extra; + for (Int i = 0; i < np; ++i) { + const Real rhomi = rhom[i]; + rhom_tot += rhomi; + Qm_tot += q_bnd_orig[i]*rhomi; + } + const Real q_tot = Qm_tot / rhom_tot; + for (Int i = 0; i < np; ++i) + q_bnd[i] = q_tot; + //return; + // Assert that this constant is outside of all previous bound values. That's + // why the QP wasn't feasible. + if (Qm_extra < 0) + for (Int i = 0; i < np; ++i) + assert(q_tot <= q_bnd_orig[i]); + else + for (Int i = 0; i < np; ++i) + assert(q_tot >= q_bnd_orig[i]); + } +} + +KOKKOS_INLINE_FUNCTION +void solve_node_problem (const Real& rhom, const Real* pd, const Real& Qm, + const Real& rhom0, const Real* k0d, Real& Qm0, + const Real& rhom1, const Real* k1d, Real& Qm1) { + Real Qm_min_kids [] = {k0d[0], k1d[0]}; + Real Qm_orig_kids[] = {k0d[1], k1d[1]}; + Real Qm_max_kids [] = {k0d[2], k1d[2]}; + { // Set the target values so that mass gets redistributed in a relative sense + // rather than absolute. 
If a kid doesn't have much mass, don't give it too + // much. + const Real Qm_orig = pd[1], Qm_extra = Qm - Qm_orig; + if (Qm_orig != 0) + for (Int i = 0; i < 2; ++i) + Qm_orig_kids[i] += (Qm_orig_kids[i] / Qm_orig) * Qm_extra; + } + { // The ideal problem is not assuredly feasible. Test for feasibility. If not + // feasible, adjust bounds to solve the safety problem, which is assuredly + // feasible if the total density field rho is mass conserving (Q doesn't + // have to be mass conserving, of course; achieving mass conservation is one + // use for QLT). + const Real Qm_min = pd[0], Qm_max = pd[2]; + const bool lo = Qm < Qm_min, hi = Qm > Qm_max; + if (lo || hi) { + const Real rhom_kids[] = {rhom0, rhom1}; + r2l_nl_adjust_bounds(lo ? Qm_min_kids : Qm_max_kids, + rhom_kids, + Qm - (lo ? Qm_min : Qm_max)); + } + } + { // Solve the node's QP. + static const Real ones[] = {1, 1}; + Real Qm_kids[2] = {k0d[1], k1d[1]}; + solve_1eq_bc_qp(2, ones, ones, Qm, Qm_min_kids, Qm_max_kids, false, Qm_orig_kids, + Qm_kids); + Qm0 = Qm_kids[0]; + Qm1 = Qm_kids[1]; + } +} +} // namespace slv + +template KOKKOS_INLINE_FUNCTION +void QLT::solve_node_problem (const Int problem_type, + const Real& rhom, const Real* pd, const Real& Qm, + const Real& rhom0, const Real* k0d, Real& Qm0, + const Real& rhom1, const Real* k1d, Real& Qm1) { + if ( ! 
(problem_type & ProblemType::shapepreserve)) { + Real mpd[3], mk0d[3], mk1d[3]; + mpd[0] = pd [0]*rhom ; mpd [1] = pd[1] ; mpd [2] = pd [2]*rhom ; + mk0d[0] = k0d[0]*rhom0; mk0d[1] = k0d[1]; mk0d[2] = k0d[2]*rhom0; + mk1d[0] = k1d[0]*rhom1; mk1d[1] = k1d[1]; mk1d[2] = k1d[2]*rhom1; + slv::solve_node_problem(rhom, mpd, Qm, rhom0, mk0d, Qm0, rhom1, mk1d, Qm1); + return; + } + slv::solve_node_problem(rhom, pd, Qm, rhom0, k0d, Qm0, rhom1, k1d, Qm1); +} + +} // namespace qlt + +#endif diff --git a/qlt/qlt_kokkos.hpp b/qlt/qlt_kokkos.hpp new file mode 100644 index 0000000..3ac727a --- /dev/null +++ b/qlt/qlt_kokkos.hpp @@ -0,0 +1,43 @@ +#ifndef INCLUDE_QLT_KOKKOS_HPP +#define INCLUDE_QLT_KOKKOS_HPP + +namespace qlt { +namespace impl { +template +using MemoryTraits = Kokkos::MemoryTraits< + MemoryTraitsType::Unmanaged | MemoryTraitsType::RandomAccess | + MemoryTraitsType::Atomic | flag>; + +template +using Unmanaged = Kokkos::View< + typename View::data_type, typename View::array_layout, + typename View::device_type, MemoryTraits >; +template +using Const = Kokkos::View< + typename View::const_data_type, typename View::array_layout, + typename View::device_type, typename View::memory_traits>; +template +using ConstUnmanaged = Const >; + +template +struct DeviceType { + typedef Kokkos::Device type; +}; + +#ifdef KOKKOS_HAVE_CUDA +typedef Kokkos::Device DefaultDeviceType; + +template <> struct DeviceType { + typedef DefaultDeviceType type; +}; +#else +typedef Kokkos::Device DefaultDeviceType; +#endif +} +} + +#endif diff --git a/qlt/qlt_test.cpp b/qlt/qlt_test.cpp new file mode 100644 index 0000000..a465c27 --- /dev/null +++ b/qlt/qlt_test.cpp @@ -0,0 +1,89 @@ +#include "qlt.hpp" + +#include +#include + +#define throw_if(condition, message) do { \ + if (condition) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": The condition:\n" \ + << #condition "\nled to the exception\n" << message << "\n"; \ + throw std::logic_error(_ss_.str()); \ + } \ + } 
while (0) + +inline bool eq (const std::string& a, const char* const b1, const char* const b2 = 0) { + return (a == std::string(b1) || (b2 && a == std::string(b2)) || + a == std::string("-") + std::string(b1)); +} + +struct InputParser { + qlt::test::Input in; + + class ArgAdvancer { + const int argc_; + char const* const* argv_; + int i_; + public: + ArgAdvancer (int argc, char** argv) : argc_(argc), argv_(argv), i_(1) {} + const char* advance () { + if (i_+1 >= argc_) throw_if(true, "Command line is missing an argument."); + return argv_[++i_]; + } + const char* token () const { return argv_[i_]; } + void incr () { ++i_; } + bool more () const { return i_ < argc_; } + }; + + InputParser (int argc, char** argv, const qlt::Parallel::Ptr& p) { + in.unittest = false; + in.write = false; + in.ncells = 0; + in.ntracers = 1; + in.tracer_type = 0; + in.nrepeat = 1; + in.pseudorandom = false; + in.verbose = false; + for (ArgAdvancer aa(argc, argv); aa.more(); aa.incr()) { + const char* token = aa.token(); + if (eq(token, "-t", "--unittest")) in.unittest = true; + else if (eq(token, "-w", "--write")) in.write = true; + else if (eq(token, "-nc", "--ncells")) in.ncells = std::atoi(aa.advance()); + else if (eq(token, "-nt", "--ntracers")) in.ntracers = std::atoi(aa.advance()); + else if (eq(token, "-tt", "--tracertype")) in.tracer_type = std::atoi(aa.advance()); + else if (eq(token, "-nr", "--nrepeat")) in.nrepeat = std::atoi(aa.advance()); + else if (eq(token, "--random")) in.pseudorandom = true; + else if (eq(token, "-v", "--verbose")) in.verbose = true; + else throw_if(true, "Invalid token " << token); + } + throw_if(in.tracer_type < 0 || in.tracer_type >= 4, "Tracer type is out of bounds [0, 3]."); + throw_if(in.ntracers < 1, "Number of tracers is < 1."); + } + + void print (std::ostream& os) const { + os << "ncells " << in.ncells + << " nrepeat " << in.nrepeat; + if (in.pseudorandom) os << " random"; + os << "\n"; + } +}; + +int main (int argc, char** argv) { + int ret = 
0; + MPI_Init(&argc, &argv); + auto p = qlt::make_parallel(MPI_COMM_WORLD); + srand(p->rank()); + Kokkos::initialize(argc, argv); + try { + InputParser inp(argc, argv, p); + if (p->amroot()) inp.print(std::cout); + ret = qlt::test::run(p, inp.in); + if (p->amroot()) std::cout << (ret != 0 ? "FAIL" : "PASS") << "\n"; + } catch (const std::exception& e) { + if (p->amroot()) + std::cerr << e.what(); + } + Kokkos::finalize_all(); + MPI_Finalize(); + return ret; +} diff --git a/qlt/readme.txt b/qlt/readme.txt new file mode 100644 index 0000000..261c49a --- /dev/null +++ b/qlt/readme.txt @@ -0,0 +1,13 @@ +For clarity, suppose your C++ compiler is g++-4.8 in what follows. But it +can be something else. + +1. Get and install the standalone Kokkos TPL: + +$ git clone https://github.com/kokkos/kokkos.git +$ ./kokkos/generate_makefile.bash --with-openmp --ldflags=-fPIC --prefix=/path/to/desired/installation --compiler=g++-4.8 + +2. cp an existing make.inc.* file to one for your machine, say, +make.inc.mymachine. Edit it with machine-specific information. Then + $ ln -s make.inc.machine make.inc + $ make -j8 + $ ./siqk_runtests.py diff --git a/siqk/readme.txt b/siqk/readme.txt index 2ace5d7..261c49a 100644 --- a/siqk/readme.txt +++ b/siqk/readme.txt @@ -1,10 +1,10 @@ -For clarity, suppose your your C++ compiler is g++-4.7 in what follows. But it +For clarity, suppose your C++ compiler is g++-4.8 in what follows. But it can be something else. 1. Get and install the standalone Kokkos TPL: $ git clone https://github.com/kokkos/kokkos.git -$ ./kokkos/generate_makefile.bash --with-openmp --ldflags=-fPIC --prefix=/path/to/desired/installation --compiler=g++-4.7 +$ ./kokkos/generate_makefile.bash --with-openmp --ldflags=-fPIC --prefix=/path/to/desired/installation --compiler=g++-4.8 2. cp an existing make.inc.* file to one for your machine, say, make.inc.mymachine. Edit it with machine-specific information. 
Then From 4b0d769cb2eb24788d2b68974b4e05a821ccdf53 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Tue, 22 Aug 2017 11:51:46 -0600 Subject: [PATCH 18/28] Update QLT readme. --- qlt/readme.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlt/readme.txt b/qlt/readme.txt index 261c49a..339e6b2 100644 --- a/qlt/readme.txt +++ b/qlt/readme.txt @@ -9,5 +9,5 @@ $ ./kokkos/generate_makefile.bash --with-openmp --ldflags=-fPIC --prefix=/path/t 2. cp an existing make.inc.* file to one for your machine, say, make.inc.mymachine. Edit it with machine-specific information. Then $ ln -s make.inc.machine make.inc - $ make -j8 - $ ./siqk_runtests.py + $ make -j2 + $ mpirun -np 4 ./testqlt -t # Look for PASS From 9665a36ed54ed57213c3a36815994090c75a4ec6 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Thu, 28 Dec 2017 19:44:17 -0700 Subject: [PATCH 19/28] CEDR: QLT -> CEDR. CEDR will contain more than just QLT, so generalize the names and files. Add a 1D transport test. Still needs to be cleaned up from an interface perspective, but has the substance of what I want. Add a quick exit to solve_node_problem. Add a 2D specialization of solve_1eq_bc_qp, useful for a binary QLT tree. Still need to put in unit tests. Add a script to make a single-file version of this for near-term Homme integration testing. Modify default weights to match paper. 
--- cedr/Makefile | 25 ++ cedr/cedr.hpp | 12 + qlt/qlt_kokkos.hpp => cedr/cedr_kokkos.hpp | 16 +- cedr/cedr_local.cpp | 16 + cedr/cedr_local.hpp | 30 ++ cedr/cedr_local_inl.hpp | 282 +++++++++++++++ cedr/cedr_mpi.cpp | 40 +++ cedr/cedr_mpi.hpp | 89 +++++ cedr/cedr_mpi_inl.hpp | 4 + qlt/qlt.cpp => cedr/cedr_qlt.cpp | 321 +++++------------ qlt/qlt.hpp => cedr/cedr_qlt.hpp | 78 ++-- cedr/cedr_qlt_inl.hpp | 152 ++++++++ cedr/cedr_test.cpp | 96 +++++ cedr/cedr_test_1d_transport.cpp | 308 ++++++++++++++++ cedr/cedr_util.cpp | 23 ++ cedr/cedr_util.hpp | 90 +++++ {qlt => cedr}/make.inc.ws | 0 cedr/make_qltcpp.sh | 10 + {qlt => cedr}/readme.txt | 0 qlt/Makefile | 23 -- qlt/qlt_inline.hpp | 392 --------------------- qlt/qlt_test.cpp | 89 ----- 22 files changed, 1321 insertions(+), 775 deletions(-) create mode 100644 cedr/Makefile create mode 100644 cedr/cedr.hpp rename qlt/qlt_kokkos.hpp => cedr/cedr_kokkos.hpp (73%) create mode 100644 cedr/cedr_local.cpp create mode 100644 cedr/cedr_local.hpp create mode 100644 cedr/cedr_local_inl.hpp create mode 100644 cedr/cedr_mpi.cpp create mode 100644 cedr/cedr_mpi.hpp create mode 100644 cedr/cedr_mpi_inl.hpp rename qlt/qlt.cpp => cedr/cedr_qlt.cpp (85%) rename qlt/qlt.hpp => cedr/cedr_qlt.hpp (81%) create mode 100644 cedr/cedr_qlt_inl.hpp create mode 100644 cedr/cedr_test.cpp create mode 100644 cedr/cedr_test_1d_transport.cpp create mode 100644 cedr/cedr_util.cpp create mode 100644 cedr/cedr_util.hpp rename {qlt => cedr}/make.inc.ws (100%) create mode 100644 cedr/make_qltcpp.sh rename {qlt => cedr}/readme.txt (100%) delete mode 100644 qlt/Makefile delete mode 100644 qlt/qlt_inline.hpp delete mode 100644 qlt/qlt_test.cpp diff --git a/cedr/Makefile b/cedr/Makefile new file mode 100644 index 0000000..64f28d6 --- /dev/null +++ b/cedr/Makefile @@ -0,0 +1,25 @@ +include make.inc + +CXXFLAGS=$(opt) -Wall -pedantic -fopenmp -std=c++11 -I$(KOKKOS)/include -DQLT_TIME +LDFLAGS=-fopenmp -L$(KOKKOS)/lib -lkokkos -ldl +LINK_LAPACK_BLAS=-llapack 
-lblas + +SOURCES=cedr_qlt.cpp cedr_test.cpp cedr_mpi.cpp cedr_local.cpp cedr_util.cpp \ + cedr_test_1d_transport.cpp + +OBJECTS=$(SOURCES:.cpp=.o) + +.cpp.o: + $(MPICXX) $(CFLAGS) $(CXXFLAGS) -c $< -o $@ + +all: testcedr + +testcedr: cedr_test.o $(OBJECTS) + $(MPICXX) $(OBJECTS) $(LDFLAGS) -o testcedr + +clean: + rm -f *.o testcedr + +cedr_qlt.o: cedr_mpi.hpp cedr_mpi_inl.hpp cedr_local.hpp cedr_local_inl.hpp \ + cedr_qlt.hpp cedr_qlt_inl.hpp cedr_kokkos.hpp cedr_util.hpp +cedr_test.o: cedr_qlt.hpp cedr_util.hpp diff --git a/cedr/cedr.hpp b/cedr/cedr.hpp new file mode 100644 index 0000000..5de4a3b --- /dev/null +++ b/cedr/cedr.hpp @@ -0,0 +1,12 @@ +#ifndef INCLUDE_CEDR_HPP +#define INCLUDE_CEDR_HPP + +#include // Need some source for std::size_t. + +namespace cedr { +typedef int Int; +typedef std::size_t Size; +typedef double Real; +} + +#endif diff --git a/qlt/qlt_kokkos.hpp b/cedr/cedr_kokkos.hpp similarity index 73% rename from qlt/qlt_kokkos.hpp rename to cedr/cedr_kokkos.hpp index 3ac727a..ec25b02 100644 --- a/qlt/qlt_kokkos.hpp +++ b/cedr/cedr_kokkos.hpp @@ -1,7 +1,9 @@ -#ifndef INCLUDE_QLT_KOKKOS_HPP -#define INCLUDE_QLT_KOKKOS_HPP +#ifndef INCLUDE_CEDR_KOKKOS_HPP +#define INCLUDE_CEDR_KOKKOS_HPP -namespace qlt { +#include + +namespace cedr { namespace impl { template using MemoryTraits = Kokkos::MemoryTraits< @@ -37,6 +39,14 @@ template <> struct DeviceType { typedef Kokkos::Device DefaultDeviceType; #endif + +// GPU-friendly replacements for std::*. +template KOKKOS_INLINE_FUNCTION +const T& min (const T& a, const T& b) { return a < b ? a : b; } +template KOKKOS_INLINE_FUNCTION +const T& max (const T& a, const T& b) { return a > b ? 
a : b; } +template KOKKOS_INLINE_FUNCTION +void swap (T& a, T& b) { const T tmp = a; a = b; b = tmp; } } } diff --git a/cedr/cedr_local.cpp b/cedr/cedr_local.cpp new file mode 100644 index 0000000..dec8f00 --- /dev/null +++ b/cedr/cedr_local.cpp @@ -0,0 +1,16 @@ +#include "cedr_local.hpp" +#include "cedr_local_inl.hpp" + +namespace cedr { +namespace local { + +Int test_sort4 () { return 0; } // Stub: no sort4 checks are implemented yet. + +Int unittest () { + Int ne, nerr = 0; + ne = test_sort4(); + if (ne) { std::cerr << ""; nerr += ne; } return nerr; +} + +} +} diff --git a/cedr/cedr_local.hpp b/cedr/cedr_local.hpp new file mode 100644 index 0000000..ef6e924 --- /dev/null +++ b/cedr/cedr_local.hpp @@ -0,0 +1,30 @@ +#ifndef INCLUDE_CEDR_LOCAL_HPP +#define INCLUDE_CEDR_LOCAL_HPP + +#include "cedr.hpp" +#include "cedr_kokkos.hpp" + +namespace cedr { +namespace local { + +// Solve +// min_x sum_i w(i) (x(i) - y(i))^2 +// st a' x = b +// xlo <= x <= xhi, +// a(i), w(i) > 0. Return 0 on success and x == y, 1 on success and x != y, -1 +// if infeasible, -2 if max_its hit with no solution. See Section 3 of Bochev, +// Ridzal, Shashkov, Fast optimization-based conservative remap of scalar fields +// through aggregate mass transfer. lambda is used in check_1eq_bc_qp_foc. 
+KOKKOS_INLINE_FUNCTION +Int solve_1eq_bc_qp(const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, + const Real* y, Real* x, const Int max_its = 100); + +Int unittest(); + +} +} + +#include "cedr_local_inl.hpp" + +#endif diff --git a/cedr/cedr_local_inl.hpp b/cedr/cedr_local_inl.hpp new file mode 100644 index 0000000..9734644 --- /dev/null +++ b/cedr/cedr_local_inl.hpp @@ -0,0 +1,282 @@ +#ifndef INCLUDE_CEDR_LOCAL_INL_HPP +#define INCLUDE_CEDR_LOCAL_INL_HPP + +#include "cedr_util.hpp" + +namespace cedr { +namespace local { + +KOKKOS_INLINE_FUNCTION +bool is_inside (const Real xi, const Real* xlo, const Real* xhi, const Int i) { + return (xi > xlo[i] && xi < xhi[i]); +} + +KOKKOS_INLINE_FUNCTION +bool is_outside (const Real xi, const Real* xlo, const Real* xhi, const Int i) { + return (xi < xlo[i] || xi > xhi[i]); +} + +template +KOKKOS_INLINE_FUNCTION +void sort4 (T* x) { + T buf[4]; + for (Int i = 0; i < 2; ++i) { + const Int j = 2*i; + if (x[j] <= x[j+1]) { buf[j] = x[j]; buf[j+1] = x[j+1]; } + else { buf[j] = x[j+1]; buf[j+1] = x[j]; } + } + Int p0 = 0, p1 = 2; + for (Int i = 0; i < 4; ++i) + x[i] = (p1 >= 4 || (p0 < 2 && buf[p0] <= buf[p1]) ? + buf[p0++] : + buf[p1++]); + cedr_assert(p0 == 2 && p1 == 4); +} + +// 2D special case for efficiency. +KOKKOS_INLINE_FUNCTION +Int solve_1eq_bc_qp_2d (const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, + const Real* y, Real* x) { + cedr_assert(n == 2); + + { // Check if the optimal point ignoring bound constraints is in bounds. + Real q[2], qsum = 0; + for (int i = 0; i < 2; ++i) { + q[i] = a[i]/w[i]; + qsum += q[i]; + } + Real dm = b; + for (int i = 0; i < 2; ++i) + dm -= a[i]*y[i]; + bool good = true; + for (int i = 0; i < 2; ++i) { + x[i] = y[i] + dm*(q[i]/qsum); + if (is_outside(x[i], xlo, xhi, i)) { + good = false; + break; + } + } + if (good) return dm == 0 ? 
0 : 1; + } + + // Solve for intersection of a'x = b, given by the parameterized line + // p(alpha) = x_base + alpha x_dir, + // with a bounding line. + + // Get parameterized line: a'x_base = b and x_dir is orthogonal to a. + Real x_base[2]; + for (int i = 0; i < 2; ++i) + x_base[i] = 0.5*b/a[i]; + Real x_dir[] = {-a[1], a[0]}; + + // Get the 4 alpha values, one per side of the bounding box. + struct Alpha { + Real alpha; + Int side; + bool operator<= (const Alpha& a) const { return alpha <= a.alpha; } + }; + Alpha alphas[4]; + auto set_alpha = [&] (const Real* xbd, const Int& idx, const Int& side) { + alphas[side].alpha = (xbd[idx] - x_base[idx])/x_dir[idx]; + alphas[side].side = side; + }; + set_alpha(xlo, 1, 0); // bottom + set_alpha(xhi, 0, 1); // right + set_alpha(xhi, 1, 2); // top + set_alpha(xlo, 0, 3); // left + + // Sort alphas. The middle two bound the feasible interval. + sort4(alphas); + + // Eval the middle two and record the better of them. + auto eval_xi = [&] (const Real& alpha, const Int& idx) { + return x_base[idx] + alpha*x_dir[idx]; + }; + auto eval_obj = [&] (const Real& alpha) { + Real obj = 0; + for (Int i = 0; i < 2; ++i) { + x[i] = eval_xi(alpha, i); + obj += w[i]*cedr::util::square(y[i] - x[i]); + } + return obj; + }; + const Int ai = eval_obj(alphas[1].alpha) <= eval_obj(alphas[2].alpha) ? 
1 : 2; + + Int info = 1, clipidx = 0; + const auto& aai = alphas[ai]; + switch (aai.side) { + case 0: x[0] = eval_xi(aai.alpha, 0); x[1] = xlo[1]; clipidx = 0; break; + case 1: x[0] = xhi[0]; x[1] = eval_xi(aai.alpha, 1); clipidx = 1; break; + case 2: x[0] = eval_xi(aai.alpha, 0); x[1] = xhi[1]; clipidx = 0; break; + case 3: x[0] = xlo[0]; x[1] = eval_xi(aai.alpha, 1); clipidx = 1; break; + default: cedr_assert(0); info = -2; + } + x[clipidx] = cedr::impl::min(xhi[clipidx], cedr::impl::max(xlo[clipidx], x[clipidx])); + return info; +} + +KOKKOS_INLINE_FUNCTION +Real calc_r_tol (const Real b, const Real* a, const Real* y, const Int n) { + Real ab = std::abs(b); + for (Int i = 0; i < n; ++i) ab = std::max(ab, std::abs(a[i]*y[i])); + return 1e1*std::numeric_limits::epsilon()*std::abs(ab); +} + +KOKKOS_INLINE_FUNCTION +void calc_r (const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, const Real* y, const Real& lambda, + Real* x, Real& r, Real& r_lambda) { + r = 0; + r_lambda = 0; + for (Int i = 0; i < n; ++i) { + const Real q = a[i]/w[i]; + const Real x_trial = y[i] + lambda*q; + Real xtmp; + if (x_trial < (xtmp = xlo[i])) + x[i] = xtmp; + else if (x_trial > (xtmp = xhi[i])) + x[i] = xtmp; + else { + x[i] = x_trial; + r_lambda += a[i]*q; + } + r += a[i]*x[i]; + } + r -= b; +} + +KOKKOS_INLINE_FUNCTION +Int solve_1eq_bc_qp (const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, const Real* y, Real* x, + const Int max_its) { + const Real r_tol = calc_r_tol(b, a, y, n); + Int info; + + { // Check for a quick exit. 
+ bool all_in = true; + Real r = 0; + info = 0; + for (Int i = 0; i < n; ++i) { + if (x[i] != y[i]) { + x[i] = y[i]; + info = 1; + } + if (is_outside(x[i], xlo, xhi, i)) { + all_in = false; + break; + } + r += a[i]*x[i]; + } + if (all_in) { + r -= b; + if (std::abs(r) <= r_tol) + return info; + } + } + + if (n == 2) + return solve_1eq_bc_qp_2d(n, w, a, b, xlo, xhi, y, x); + + { // Eval r at end points to check for feasibility, and also possibly a quick + // exit on a common case. + Real r = -b; + for (Int i = 0; i < n; ++i) { + x[i] = xlo[i]; + r += a[i]*x[i]; + } + if (std::abs(r) <= r_tol) return 1; + if (r > 0) return -1; + r = -b; + for (Int i = 0; i < n; ++i) { + x[i] = xhi[i]; + r += a[i]*x[i]; + } + if (std::abs(r) <= r_tol) return 1; + if (r < 0) return -1; + } + + { // Check for a quick exit: the bounds are so tight that the midpoint of the + // box satisfies r_tol. + Real r = -b; + for (Int i = 0; i < n; ++i) { + x[i] = 0.5*(xlo[i] + xhi[i]); + r += a[i]*x[i]; + } + if (std::abs(r) <= r_tol) return 1; + } + + const Real wall_dist = 1e-3; + + // Get lambda endpoints. + Real lamlo = 0, lamhi = 0; + for (Int i = 0; i < n; ++i) { + const Real rq = w[i]/a[i]; + const Real lamlo_i = rq*(xlo[i] - y[i]); + const Real lamhi_i = rq*(xhi[i] - y[i]); + if (i == 0) { + lamlo = lamlo_i; + lamhi = lamhi_i; + } else { + lamlo = impl::min(lamlo, lamlo_i); + lamhi = impl::max(lamhi, lamhi_i); + } + } + const Real lamlo_feas = lamlo, lamhi_feas = lamhi; + Real lambda = lamlo <= 0 && lamhi >= 0 ? 0 : lamlo; + + // Bisection-safeguarded Newton iteration for r(lambda) = 0. + bool prev_step_bisect = false; + Int nbisect = 0; + info = -2; + for (Int iteration = 0; iteration < max_its; ++iteration) { + // Compute x, r, r_lambda. + Real r, r_lambda; + calc_r(n, w, a, b, xlo, xhi, y, lambda, x, r, r_lambda); + // Is r(lambda) - b sufficiently == 0? + if (std::abs(r) <= r_tol) { + info = 1; + break; + } + // Check if the lambda bounds are too close. 
+ if (nbisect > 64) { + if (lamhi == lamhi_feas || lamlo == lamlo_feas) { + // r isn't small enough and one lambda bound is on the feasibility + // limit. The QP must not be feasible. + info = -1; + break; + } + info = 1; + break; + } + // Adjust lambda bounds. + if (r > 0) + lamhi = lambda; + else + lamlo = lambda; + if (r_lambda != 0) { + // Newton step. + lambda -= r/r_lambda; + } else { + // Force bisection. + lambda = lamlo; + } + // Safeguard. The wall distance check assures progress, but use it only + // every other potential bisection. + const Real D = prev_step_bisect ? 0 : wall_dist*(lamhi - lamlo); + if (lambda - lamlo < D || lamhi - lambda < D) { + lambda = 0.5*(lamlo + lamhi); + ++nbisect; + prev_step_bisect = true; + } else { + prev_step_bisect = false; + } + } + + return info; +} + +} // namespace local +} // namespace cedr + +#endif diff --git a/cedr/cedr_mpi.cpp b/cedr/cedr_mpi.cpp new file mode 100644 index 0000000..e561f30 --- /dev/null +++ b/cedr/cedr_mpi.cpp @@ -0,0 +1,40 @@ +#include "cedr_mpi.hpp" + +namespace cedr { +namespace mpi { + +Parallel::Ptr make_parallel (MPI_Comm comm) { + return std::make_shared(comm); +} + +Int Parallel::size () const { + int sz = 0; + MPI_Comm_size(comm_, &sz); + return sz; +} + +Int Parallel::rank () const { + int pid = 0; + MPI_Comm_rank(comm_, &pid); + return pid; +} + +template <> MPI_Datatype get_type() { return MPI_INT; } +template <> MPI_Datatype get_type() { return MPI_DOUBLE; } + +int waitany (int count, MPI_Request* reqs, int* index, MPI_Status* stats) { + return MPI_Waitany(count, reqs, index, stats ? stats : MPI_STATUS_IGNORE); +} + +int waitall (int count, MPI_Request* reqs, MPI_Status* stats) { + return MPI_Waitall(count, reqs, stats ? 
stats : MPI_STATUS_IGNORE); +} + +bool all_ok (const Parallel& p, bool im_ok) { + int ok = im_ok, msg; + all_reduce(p, &ok, &msg, 1, MPI_LAND); + return static_cast(msg); +} + +} +} diff --git a/cedr/cedr_mpi.hpp b/cedr/cedr_mpi.hpp new file mode 100644 index 0000000..25fd466 --- /dev/null +++ b/cedr/cedr_mpi.hpp @@ -0,0 +1,89 @@ +#ifndef INCLUDE_CEDR_MPI_HPP +#define INCLUDE_CEDR_MPI_HPP + +#include + +#include + +#include "cedr.hpp" + +namespace cedr { +namespace mpi { + +class Parallel { + MPI_Comm comm_; +public: + typedef std::shared_ptr Ptr; + Parallel(MPI_Comm comm) : comm_(comm) {} + MPI_Comm comm () const { return comm_; } + Int size() const; + Int rank() const; + Int root () const { return 0; } + bool amroot () const { return rank() == root(); } +}; + +Parallel::Ptr make_parallel(MPI_Comm comm); + +template MPI_Datatype get_type(); + +template +int reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op, + int root) { + MPI_Datatype dt = get_type(); + return MPI_Reduce(const_cast(sendbuf), rcvbuf, count, dt, op, root, p.comm()); +} + +template +int all_reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op) { + MPI_Datatype dt = get_type(); + return MPI_Allreduce(const_cast(sendbuf), rcvbuf, count, dt, op, p.comm()); +} + +template +int isend (const Parallel& p, const T* buf, int count, int dest, int tag, + MPI_Request* ireq) { + MPI_Datatype dt = get_type(); + MPI_Request ureq; + MPI_Request* req = ireq ? ireq : &ureq; + int ret = MPI_Isend(const_cast(buf), count, dt, dest, tag, p.comm(), req); + if ( ! ireq) MPI_Request_free(req); + return ret; +} + +template +int irecv (const Parallel& p, T* buf, int count, int src, int tag, MPI_Request* ireq) { + MPI_Datatype dt = get_type(); + MPI_Request ureq; + MPI_Request* req = ireq ? ireq : &ureq; + int ret = MPI_Irecv(buf, count, dt, src, tag, p.comm(), req); + if ( ! 
ireq) MPI_Request_free(req); + return ret; +} + +int waitany(int count, MPI_Request* reqs, int* index, MPI_Status* stats = nullptr); + +int waitall(int count, MPI_Request* reqs, MPI_Status* stats = nullptr); + +template +int gather (const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, int recvcount, int root) { + MPI_Datatype dt = get_type(); + return MPI_Gather(sendbuf, sendcount, dt, recvbuf, recvcount, dt, root, p.comm()); +} + +template +int gatherv (const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, const int* recvcounts, const int* displs, int root) { + MPI_Datatype dt = get_type(); + return MPI_Gatherv(sendbuf, sendcount, dt, recvbuf, recvcounts, displs, dt, root, + p.comm()); +} + +bool all_ok(const Parallel& p, bool im_ok); + +} +} + +#include "cedr_mpi_inl.hpp" + +#endif diff --git a/cedr/cedr_mpi_inl.hpp b/cedr/cedr_mpi_inl.hpp new file mode 100644 index 0000000..d63fbf2 --- /dev/null +++ b/cedr/cedr_mpi_inl.hpp @@ -0,0 +1,4 @@ +#ifndef INCLUDE_CEDR_MPI_INL_HPP +#define INCLUDE_CEDR_MPI_INL_HPP + +#endif diff --git a/qlt/qlt.cpp b/cedr/cedr_qlt.cpp similarity index 85% rename from qlt/qlt.cpp rename to cedr/cedr_qlt.cpp index fab38b0..a917e5e 100644 --- a/qlt/qlt.cpp +++ b/cedr/cedr_qlt.cpp @@ -1,4 +1,4 @@ -#include "qlt.hpp" +#include "cedr_qlt.hpp" #include @@ -10,153 +10,8 @@ #include #include +namespace cedr { namespace qlt { -namespace mpi { -template MPI_Datatype get_type(); -template <> MPI_Datatype get_type() { return MPI_INT; } -template <> MPI_Datatype get_type() { return MPI_DOUBLE; } - -template -int reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op, - int root) { - MPI_Datatype dt = get_type(); - return MPI_Reduce(const_cast(sendbuf), rcvbuf, count, dt, op, root, p.comm()); -} - -template -int all_reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op) { - MPI_Datatype dt = get_type(); - return MPI_Allreduce(const_cast(sendbuf), rcvbuf, count, dt, op, p.comm()); -} 
- -template -int isend (const Parallel& p, const T* buf, int count, int dest, int tag, - MPI_Request* ireq) { - MPI_Datatype dt = get_type(); - MPI_Request ureq; - MPI_Request* req = ireq ? ireq : &ureq; - int ret = MPI_Isend(const_cast(buf), count, dt, dest, tag, p.comm(), req); - if ( ! ireq) MPI_Request_free(req); - return ret; -} - -template -int irecv (const Parallel& p, T* buf, int count, int src, int tag, MPI_Request* ireq) { - MPI_Datatype dt = get_type(); - MPI_Request ureq; - MPI_Request* req = ireq ? ireq : &ureq; - int ret = MPI_Irecv(buf, count, dt, src, tag, p.comm(), req); - if ( ! ireq) MPI_Request_free(req); - return ret; -} - -int waitany (int count, MPI_Request* reqs, int* index, MPI_Status* stats = nullptr) { - return MPI_Waitany(count, reqs, index, stats ? stats : MPI_STATUS_IGNORE); -} - -int waitall (int count, MPI_Request* reqs, MPI_Status* stats = nullptr) { - return MPI_Waitall(count, reqs, stats ? stats : MPI_STATUS_IGNORE); -} - -template -int gather (const Parallel& p, const T* sendbuf, int sendcount, - T* recvbuf, int recvcount, int root) { - MPI_Datatype dt = get_type(); - return MPI_Gather(sendbuf, sendcount, dt, recvbuf, recvcount, dt, root, p.comm()); -} - -template -int gatherv (const Parallel& p, const T* sendbuf, int sendcount, - T* recvbuf, const int* recvcounts, const int* displs, int root) { - MPI_Datatype dt = get_type(); - return MPI_Gatherv(sendbuf, sendcount, dt, recvbuf, recvcounts, displs, dt, root, - p.comm()); -} - -bool all_ok (const Parallel& p, bool im_ok) { - int ok = im_ok, msg; - all_reduce(p, &ok, &msg, 1, MPI_LAND); - return static_cast(msg); -} -} // namespace mpi - -Parallel::Ptr make_parallel (MPI_Comm comm) { - return std::make_shared(comm); -} - -Int Parallel::size () const { - int sz = 0; - MPI_Comm_size(comm_, &sz); - return sz; -} - -Int Parallel::rank () const { - int pid = 0; - MPI_Comm_rank(comm_, &pid); - return pid; -} - -namespace impl { -#define pr(m) do { \ - int _pid_ = 0; \ - 
MPI_Comm_rank(MPI_COMM_WORLD, &_pid_); \ - std::stringstream _ss_; \ - _ss_.precision(15); \ - _ss_ << "pid " << _pid_ << " " << m << std::endl; \ - std::cerr << _ss_.str(); \ - } while (0) -#define pr0(m) do { \ - int _pid_; MPI_Comm_rank(MPI_COMM_WORLD, &_pid_); \ - if (_pid_ != 0) break; \ - std::stringstream _ss_; \ - _ss_ << "pid " << _pid_ << " " << m << std::endl; \ - std::cerr << _ss_.str(); \ - } while (0) -#define prc(m) pr(#m << " | " << (m)) -#define pr0c(m) pr0(#m << " | " << (m)) -#define puf(m) "(" << #m << " " << (m) << ")" -#define pu(m) << " " << puf(m) -template -void prarr (const std::string& name, const T* const v, const size_t n) { - std::stringstream ss; - ss.precision(15); - ss << name << " = ["; - for (size_t i = 0; i < n; ++i) ss << " " << v[i]; - ss << "];"; - pr(ss.str()); -} -#define mprarr(m) qlt::impl::prarr(#m, m.data(), m.size()) - -#define qlt_assert(condition) do { \ - if ( ! (condition)) { \ - std::stringstream _ss_; \ - _ss_ << __FILE__ << ":" << __LINE__ << ": FAIL:\n" << #condition \ - << "\n"; \ - throw std::logic_error(_ss_.str()); \ - } \ - } while (0) -#define qlt_throw_if(condition, message) do { \ - if (condition) { \ - std::stringstream _ss_; \ - _ss_ << __FILE__ << ":" << __LINE__ << ": The condition:\n" \ - << #condition "\nled to the exception\n" << message << "\n"; \ - throw std::logic_error(_ss_.str()); \ - } \ - } while (0) -#define qlt_kernel_assert(condition) do { \ - if ( ! 
(condition)) \ - Kokkos::abort(#condition); \ - } while (0) -#define qlt_kernel_throw_if(condition, message) do { \ - if (condition) \ - Kokkos::abort(#condition " led to the exception\n" message); \ - } while (0) - -inline Real reldif (const Real a, const Real b) -{ return std::abs(b - a)/std::max(std::abs(a), std::abs(b)); } - -struct FILECloser { void operator() (FILE* fh) { fclose(fh); } }; -} // namespace impl class Timer { public: @@ -326,16 +181,16 @@ Int init_tree (const tree::Node::Ptr& node, Int& id) { node->reserved = nullptr; Int depth = 0; for (Int i = 0; i < node->nkids; ++i) { - qlt_assert(node.get() == node->kids[i]->parent); + cedr_assert(node.get() == node->kids[i]->parent); depth = std::max(depth, init_tree(node->kids[i], id)); } if (node->nkids) { node->rank = node->kids[0]->rank; node->cellidx = id++; } else { - qlt_throw_if(node->cellidx < 0 || node->cellidx >= id, - "cellidx is " << node->cellidx << " but should be between " << - 0 << " and " << id); + cedr_throw_if(node->cellidx < 0 || node->cellidx >= id, + "cellidx is " << node->cellidx << " but should be between " << + 0 << " and " << id); } return depth + 1; } @@ -344,7 +199,7 @@ void level_schedule_and_collect ( NodeSets& ns, const Int& my_rank, const tree::Node::Ptr& node, Int& level, bool& need_parent_ns_node) { - qlt_assert(node->rank != -1); + cedr_assert(node->rank != -1); level = -1; bool make_ns_node = false; for (Int i = 0; i < node->nkids; ++i) { @@ -360,7 +215,7 @@ void level_schedule_and_collect ( const bool node_is_owned = node->rank == my_rank; need_parent_ns_node = node_is_owned; if (node_is_owned || make_ns_node) { - qlt_assert( ! node->reserved); + cedr_assert( ! node->reserved); NodeSets::Node* ns_node = ns.alloc(); // Levels hold only owned nodes. 
if (node_is_owned) ns.levels[level].nodes.push_back(ns_node); @@ -378,7 +233,7 @@ void level_schedule_and_collect ( NodeSets::Node* ns_kid; kid->reserved = ns_kid = ns.alloc(); ns_node->kids[i] = ns_kid; - qlt_assert(kid->rank != my_rank); + cedr_assert(kid->rank != my_rank); ns_kid->rank = kid->rank; ns_kid->id = kid->cellidx; ns_kid->parent = nullptr; // Not needed. @@ -445,7 +300,7 @@ void init_offsets (const Int my_rank, std::vector& rns, continue; } if (rank != prev_rank) { - qlt_assert(rank > prev_rank); + cedr_assert(rank > prev_rank); prev_rank = rank; mmds.push_back(NodeSets::Level::MPIMetaData()); auto& mmd = mmds.back(); @@ -499,7 +354,7 @@ void init_comm (const Int my_rank, NodeSets& ns) { NodeSets::ConstPtr analyze (const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree) { const auto nodesets = std::make_shared(); - qlt_assert( ! tree->parent); + cedr_assert( ! tree->parent); Int id = ncells; const Int depth = init_tree(tree, id); nodesets->levels.resize(depth); @@ -515,7 +370,7 @@ Int check_comm (const NodeSets::ConstPtr& ns) { std::vector offsets(ns->nslots, 0); for (const auto& lvl : ns->levels) for (const auto& n : lvl.nodes) { - qlt_assert(n->offset < ns->nslots); + cedr_assert(n->offset < ns->nslots); ++offsets[n->offset]; for (Int i = 0; i < n->nkids; ++i) if (n->kids[i]->rank != n->rank) @@ -531,16 +386,16 @@ Int check_comm (const NodeSets::ConstPtr& ns) { Int check_leaf_nodes (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns, const Int ncells) { Int nerr = 0; - qlt_assert( ! ns->levels.empty()); - qlt_assert( ! ns->levels[0].nodes.empty()); + cedr_assert( ! ns->levels.empty()); + cedr_assert( ! ns->levels[0].nodes.empty()); Int my_nleaves = 0; for (const auto& n : ns->levels[0].nodes) { - qlt_assert( ! n->nkids); + cedr_assert( ! 
n->nkids); ++my_nleaves; } for (const auto& n : ns->levels[0].nodes) { - qlt_assert(n->offset < my_nleaves); - qlt_assert(n->id < ncells); + cedr_assert(n->offset < my_nleaves); + cedr_assert(n->id < ncells); } Int glbl_nleaves = 0; mpi::all_reduce(*p, &my_nleaves, &glbl_nleaves, 1, MPI_SUM); @@ -662,7 +517,7 @@ int QLT::MetaData::get_problem_type_idx (const int& mask) { case CPT::cs: case CPT::cst: return 1; case CPT::t: return 2; case CPT::ct: return 3; - default: qlt_kernel_throw_if(true, "Invalid problem type."); return -1; + default: cedr_kernel_throw_if(true, "Invalid problem type."); return -1; } } @@ -730,7 +585,7 @@ void QLT::MetaData::init (const MetaDataBuilder& mdb) { a_d.prob2trcrptr); std::copy(a_h_.prob2bl2r, a_h_.prob2bl2r + nprobtypes + 1, a_d.prob2bl2r); std::copy(a_h_.prob2br2l, a_h_.prob2br2l + nprobtypes + 1, a_d.prob2br2l); - qlt_assert(a_d.prob2trcrptr[nprobtypes] == ntracers); + cedr_assert(a_d.prob2trcrptr[nprobtypes] == ntracers); } template @@ -742,7 +597,8 @@ void QLT::BulkData::init (const MetaData& md, const Int& nslots) { } template -void QLT::init (const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree) { +void QLT::init (const Parallel::Ptr& p, const Int& ncells, + const tree::Node::Ptr& tree) { p_ = p; Timer::start(Timer::analyze); ns_ = impl::analyze(p, ncells, tree); @@ -794,7 +650,7 @@ Int QLT::gci2lci (const Int& gci) const { get_owned_glblcells(gcis); mprarr(gcis); } - qlt_throw_if(it == gci2lci_.end(), "gci " << gci << " not in gci2lci map."); + cedr_throw_if(it == gci2lci_.end(), "gci " << gci << " not in gci2lci map."); return it->second; } @@ -803,7 +659,7 @@ Int QLT::gci2lci (const Int& gci) const { // tracer index in the caller's numbering. template void QLT::declare_tracer (int problem_type) { - qlt_throw_if( ! mdb_, "end_tracer_declarations was already called; " + cedr_throw_if( ! 
mdb_, "end_tracer_declarations was already called; " "it is an error to call declare_tracer now."); // For its exception side effect, and to get canonical problem type, since // some possible problem types map to the same canonical one: @@ -820,8 +676,8 @@ void QLT::end_tracer_declarations () { template int QLT::get_problem_type (const Int& tracer_idx) const { - qlt_throw_if(tracer_idx < 0 || tracer_idx > md_.a_h.trcr2prob.extent_int(0), - "tracer_idx is out of bounds: " << tracer_idx); + cedr_throw_if(tracer_idx < 0 || tracer_idx >= md_.a_h.trcr2prob.extent_int(0), + "tracer_idx is out of bounds: " << tracer_idx); return md_.a_h.trcr2prob[tracer_idx]; } @@ -855,7 +711,7 @@ void QLT::run () { Timer::start(Timer::snp); for (const auto& n : lvl.nodes) { if ( ! n->nkids) continue; - qlt_kernel_assert(n->nkids == 2); + cedr_kernel_assert(n->nkids == 2); // Total density. bd_.l2r_data(n->offset*l2rndps) = (bd_.l2r_data(n->kids[0]->offset*l2rndps) + bd_.l2r_data(n->kids[1]->offset*l2rndps)); @@ -870,9 +726,9 @@ void QLT::run () { Real* const me = &bd_.l2r_data(n->offset*l2rndps + bdi); const Real* const k0 = &bd_.l2r_data(n->kids[0]->offset*l2rndps + bdi); const Real* const k1 = &bd_.l2r_data(n->kids[1]->offset*l2rndps + bdi); - me[0] = sum_only ? k0[0] + k1[0] : impl::min(k0[0], k1[0]); + me[0] = sum_only ? k0[0] + k1[0] : cedr::impl::min(k0[0], k1[0]); me[1] = k0[1] + k1[1] ; - me[2] = sum_only ? k0[2] + k1[2] : impl::max(k0[2], k1[2]); + me[2] = sum_only ? k0[2] + k1[2] : cedr::impl::max(k0[2], k1[2]); if (bsz == 4) me[3] = k0[3] + k1[3] ; } @@ -942,7 +798,7 @@ void QLT::run () { for (Int bi = bis; bi < bie; ++bi) { const Int l2rbdi = md_.a_d.trcr2bl2r(md_.a_d.bidx2trcr(bi)); const Int r2lbdi = md_.a_d.trcr2br2l(md_.a_d.bidx2trcr(bi)); - qlt_assert(n->nkids == 2); + cedr_assert(n->nkids == 2); if ( ! (problem_type & ProblemType::shapepreserve)) { // Pass q_{min,max} info along. l2r data are updated for use in // solve_node_problem. 
r2l data are updated for use in isend. @@ -961,15 +817,15 @@ void QLT::run () { const auto& k1 = n->kids[1]; solve_node_problem( problem_type, - bd_.l2r_data( n->offset*l2rndps), + bd_.l2r_data( n->offset*l2rndps), &bd_.l2r_data( n->offset*l2rndps + l2rbdi), - bd_.r2l_data( n->offset*r2lndps + r2lbdi), - bd_.l2r_data(k0->offset*l2rndps), + bd_.r2l_data( n->offset*r2lndps + r2lbdi), + bd_.l2r_data(k0->offset*l2rndps), &bd_.l2r_data(k0->offset*l2rndps + l2rbdi), - bd_.r2l_data(k0->offset*r2lndps + r2lbdi), - bd_.l2r_data(k1->offset*l2rndps), + bd_.r2l_data(k0->offset*r2lndps + r2lbdi), + bd_.l2r_data(k1->offset*l2rndps), &bd_.l2r_data(k1->offset*l2rndps + l2rbdi), - bd_.r2l_data(k1->offset*r2lndps + r2lbdi)); + bd_.r2l_data(k1->offset*r2lndps + r2lbdi)); } } } @@ -1053,7 +909,7 @@ class TestQLT { // For solution output, if requested. struct Writer { - std::unique_ptr fh; + std::unique_ptr fh; std::vector ngcis; // Number of i'th rank's gcis_ array. std::vector displs; // Cumsum of above. std::vector gcis; // Global cell indices packed by rank's gcis_ vector. 
@@ -1115,9 +971,10 @@ class TestQLT { qlt_.declare_tracer(t.problem_type); } qlt_.end_tracer_declarations(); - qlt_assert(qlt_.get_num_tracers() == static_cast(tracers_.size())); + cedr_assert(qlt_.get_num_tracers() == static_cast(tracers_.size())); for (size_t i = 0; i < tracers_.size(); ++i) - qlt_assert(qlt_.get_problem_type(i) == (tracers_[i].problem_type | PT::consistent)); + cedr_assert(qlt_.get_problem_type(i) == (tracers_[i].problem_type | + PT::consistent)); Timer::stop(Timer::trcrinit); } @@ -1264,7 +1121,7 @@ class TestQLT { void init_writer () { if (p_->amroot()) { w_ = std::make_shared(); - w_->fh = std::unique_ptr(fopen("QLT.py", "w")); + w_->fh = std::unique_ptr(fopen("out_QLT.py", "w")); int n = gcis_.size(); w_->ngcis.resize(p_->size()); mpi::gather(*p_, &n, 1, w_->ngcis.data(), 1, p_->root()); @@ -1272,7 +1129,7 @@ class TestQLT { w_->displs[0] = 0; for (size_t i = 0; i < w_->ngcis.size(); ++i) w_->displs[i+1] = w_->displs[i] + w_->ngcis[i]; - qlt_assert(w_->displs.back() == ncells_); + cedr_assert(w_->displs.back() == ncells_); w_->gcis.resize(ncells_); mpi::gatherv(*p_, gcis_.data(), gcis_.size(), w_->gcis.data(), w_->ngcis.data(), w_->displs.data(), p_->root()); @@ -1351,9 +1208,9 @@ class TestQLT { const Int n = qlt.nlclcells(); std::vector gcis; qlt.get_owned_glblcells(gcis); - qlt_assert(static_cast(gcis.size()) == n); + cedr_assert(static_cast(gcis.size()) == n); for (Int i = 0; i < n; ++i) - qlt_assert(qlt.gci2lci(gcis[i]) == i); + cedr_assert(qlt.gci2lci(gcis[i]) == i); } static Int check (const Parallel& p, const std::vector& ts, const Values& v) { @@ -1365,7 +1222,7 @@ class TestQLT { for (size_t ti = 0; ti < ts.size(); ++ti) { const auto& t = ts[ti]; - qlt_assert(t.safe_should_hold); + cedr_assert(t.safe_should_hold); const bool safe_only = ! 
t.local_should_hold; const Int n = v.ncells(); const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), @@ -1443,14 +1300,14 @@ class TestQLT { for (size_t ti = 0; ti < ts.size(); ++ti) { // Check mass conservation. const Real desired_mass = glbl_mass[2*ti], actual_mass = glbl_mass[2*ti+1], - rd = reldif(desired_mass, actual_mass); + rd = cedr::util::reldif(desired_mass, actual_mass); const bool mass_failed = rd > tol; if (mass_failed) { ++nerr; t_ok_gbl[ti] = false; } if ( ! t_ok_gbl[ti]) { - std::cout << "FAIL " << ts[ti].str(); + std::cout << "FAIL " << ts[ti].str(); if (mass_failed) std::cout << " mass re " << rd; std::cout << "\n"; } @@ -1479,7 +1336,7 @@ class TestQLT { { Real* rhom = v.rhom(); for (Int i = 0; i < nlclcells; ++i) - qlt_.set_rho(i2lci_[i], rhom[i]); + qlt_.set_rhom(i2lci_[i], rhom[i]); } for (Int ti = 0; ti < nt; ++ti) { generate_Q(tracers_[ti], v); @@ -1492,7 +1349,7 @@ class TestQLT { Real* Qm_min = v.Qm_min(ti), * Qm = v.Qm(ti), * Qm_max = v.Qm_max(ti), * Qm_prev = v.Qm_prev(ti); for (Int i = 0; i < nlclcells; ++i) - qlt_.set_Q(i2lci_[i], ti, Qm[i], Qm_min[i], Qm_max[i], Qm_prev[i]); + qlt_.set_Qm(i2lci_[i], ti, Qm[i], Qm_min[i], Qm_max[i], Qm_prev[i]); } MPI_Barrier(p_->comm()); Timer::start(Timer::qltrun); @@ -1512,7 +1369,7 @@ class TestQLT { for (Int ti = 0; ti < nt; ++ti) { Real* Qm = v.Qm(ti); for (Int i = 0; i < nlclcells; ++i) - Qm[i] = qlt_.get_Q(i2lci_[i], ti); + Qm[i] = qlt_.get_Qm(i2lci_[i], ti); if (write) write_post(tracers_[ti], v); } nerr += check(*p_, tracers_, v); @@ -1559,7 +1416,7 @@ struct Mesh { nranks_ = p->size(); p_ = p; pd_ = parallel_decomp; - qlt_assert(nranks_ <= nc_); + cedr_assert(nranks_ <= nc_); } Int ncell () const { return nc_; } @@ -1598,8 +1455,12 @@ struct Mesh { }; tree::Node::Ptr make_tree (const Mesh& m, const Int cs, const Int ce, - const tree::Node* parent) { - const Int cn = ce - cs, cn0 = cn/2; + const tree::Node* parent, const bool imbalanced) { + const Int + cn = ce - cs, + 
cn0 = ( imbalanced && cn > 2 ? + cn/3 : + cn/2 ); tree::Node::Ptr n = std::make_shared(); n->parent = parent; if (cn == 1) { @@ -1609,18 +1470,19 @@ tree::Node::Ptr make_tree (const Mesh& m, const Int cs, const Int ce, return n; } n->nkids = 2; - n->kids[0] = make_tree(m, cs, cs + cn0, n.get()); - n->kids[1] = make_tree(m, cs + cn0, ce, n.get()); + n->kids[0] = make_tree(m, cs, cs + cn0, n.get(), imbalanced); + n->kids[1] = make_tree(m, cs + cn0, ce, n.get(), imbalanced); return n; } -tree::Node::Ptr make_tree (const Mesh& m) { - return make_tree(m, 0, m.ncell(), nullptr); +tree::Node::Ptr make_tree (const Mesh& m, const bool imbalanced) { + return make_tree(m, 0, m.ncell(), nullptr, imbalanced); } -tree::Node::Ptr make_tree (const Parallel::Ptr& p, const Int& ncells) { +tree::Node::Ptr make_tree (const Parallel::Ptr& p, const Int& ncells, + const bool imbalanced) { Mesh m(ncells, p); - return make_tree(m); + return make_tree(m, imbalanced); } namespace test { @@ -1637,19 +1499,25 @@ Int unittest (const Parallel::Ptr& p) { const Mesh::ParallelDecomp::Enum dists[] = { Mesh::ParallelDecomp::pseudorandom, Mesh::ParallelDecomp::contiguous }; Int ne = 0; - for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id) { - Mesh m(std::max(42, 3*p->size()), p, Mesh::ParallelDecomp::pseudorandom); - tree::Node::Ptr tree = make_tree(m); - std::vector cells(m.ncell(), 0); - mark_cells(tree, cells); - for (Int i = 0; i < m.ncell(); ++i) - if (cells[i] != 1) ++ne; - } + for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id) + for (bool imbalanced: {false, true}) { + Mesh m(std::max(42, 3*p->size()), p, Mesh::ParallelDecomp::pseudorandom); + tree::Node::Ptr tree = make_tree(m, imbalanced); + std::vector cells(m.ncell(), 0); + mark_cells(tree, cells); + for (Int i = 0; i < m.ncell(); ++i) + if (cells[i] != 1) ++ne; + } return ne; } } // namespace test } // namespace oned +tree::Node::Ptr tree::make_tree_over_1d_mesh (const Parallel::Ptr& p, const Int& ncells, + const bool 
imbalanced) { + return oned::make_tree(oned::Mesh(ncells, p), imbalanced); +} + namespace test { Int unittest_NodeSets (const Parallel::Ptr& p) { using Mesh = oned::Mesh; @@ -1658,13 +1526,14 @@ Int unittest_NodeSets (const Parallel::Ptr& p) { Mesh::ParallelDecomp::contiguous }; Int nerr = 0; for (size_t is = 0; is < sizeof(szs)/sizeof(*szs); ++is) - for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id) { - Mesh m(szs[is], p, dists[id]); - tree::Node::Ptr tree = make_tree(m); - impl::NodeSets::ConstPtr nodesets = impl::analyze(p, m.ncell(), tree); - tree = nullptr; - nerr += impl::unittest(p, nodesets, m.ncell()); - } + for (size_t id = 0; id < sizeof(dists)/sizeof(*dists); ++id) + for (bool imbalanced: {false, true}) { + Mesh m(szs[is], p, dists[id]); + tree::Node::Ptr tree = make_tree(m, imbalanced); + impl::NodeSets::ConstPtr nodesets = impl::analyze(p, m.ncell(), tree); + tree = nullptr; + nerr += impl::unittest(p, nodesets, m.ncell()); + } return nerr; } @@ -1675,13 +1544,14 @@ Int unittest_QLT (const Parallel::Ptr& p, const bool write_requested=false) { Mesh::ParallelDecomp::pseudorandom }; Int nerr = 0; for (size_t is = 0, islim = sizeof(szs)/sizeof(*szs); is < islim; ++is) - for (size_t id = 0, idlim = sizeof(dists)/sizeof(*dists); id < idlim; ++id) { + for (size_t id = 0, idlim = sizeof(dists)/sizeof(*dists); id < idlim; ++id) + for (bool imbalanced: {false, true}) { if (p->amroot()) { - std::cout << " (" << szs[is] << ", " << id << ")"; + std::cout << " (" << szs[is] << ", " << id << ", " << imbalanced << ")"; std::cout.flush(); } Mesh m(szs[is], p, dists[id]); - tree::Node::Ptr tree = make_tree(m); + tree::Node::Ptr tree = make_tree(m, imbalanced); const bool write = (write_requested && m.ncell() < 3000 && is == islim-1 && id == idlim-1); nerr += test::test_qlt(p, tree, m.ncell(), 1, write); @@ -1689,7 +1559,7 @@ Int unittest_QLT (const Parallel::Ptr& p, const bool write_requested=false) { return nerr; } -Int run (const Parallel::Ptr& p, const 
Input& in) { +Int run_unit_and_randomized_tests (const Parallel::Ptr& p, const Input& in) { Int nerr = 0; if (in.unittest) { Int ne; @@ -1710,14 +1580,14 @@ Int run (const Parallel::Ptr& p, const Input& in) { if (nerr) return nerr; // Performance test. - if (in.ncells > 0) { + if (in.perftest && in.ncells > 0) { oned::Mesh m(in.ncells, p, (in.pseudorandom ? oned::Mesh::ParallelDecomp::pseudorandom : oned::Mesh::ParallelDecomp::contiguous)); Timer::init(); Timer::start(Timer::total); Timer::start(Timer::tree); - tree::Node::Ptr tree = make_tree(m); + tree::Node::Ptr tree = make_tree(m, false); Timer::stop(Timer::tree); test::test_qlt(p, tree, in.ncells, in.nrepeat, false, in.verbose); Timer::stop(Timer::total); @@ -1728,13 +1598,14 @@ Int run (const Parallel::Ptr& p, const Input& in) { } // namespace test } // namespace qlt +} // namespace cedr #ifdef KOKKOS_HAVE_SERIAL -template class qlt::QLT; +template class cedr::qlt::QLT; #endif #ifdef KOKKOS_HAVE_OPENMP -template class qlt::QLT; +template class cedr::qlt::QLT; #endif #ifdef KOKKOS_HAVE_CUDA -template class qlt::QLT; +template class cedr::qlt::QLT; #endif diff --git a/qlt/qlt.hpp b/cedr/cedr_qlt.hpp similarity index 81% rename from qlt/qlt.hpp rename to cedr/cedr_qlt.hpp index c59f37a..4ec8128 100644 --- a/qlt/qlt.hpp +++ b/cedr/cedr_qlt.hpp @@ -1,5 +1,5 @@ -#ifndef INCLUDE_QLT_HPP -#define INCLUDE_QLT_HPP +#ifndef INCLUDE_CEDR_QLT_HPP +#define INCLUDE_CEDR_QLT_HPP #include @@ -9,32 +9,18 @@ #include #include -#include -#include "qlt_kokkos.hpp" +#include "cedr.hpp" +#include "cedr_kokkos.hpp" +#include "cedr_mpi.hpp" +namespace cedr { // QLT: Quasi-local tree-based non-iterative tracer density reconstructor for // mass conservation, shape preservation, and tracer consistency. 
namespace qlt { -typedef int Int; -typedef size_t Size; -typedef double Real; +using cedr::mpi::Parallel; namespace impl { class NodeSets; } -class Parallel { - MPI_Comm comm_; -public: - typedef std::shared_ptr Ptr; - Parallel(MPI_Comm comm) : comm_(comm) {} - MPI_Comm comm () const { return comm_; } - Int size() const; - Int rank() const; - Int root () const { return 0; } - bool amroot () const { return rank() == root(); } -}; - -Parallel::Ptr make_parallel(MPI_Comm comm); - namespace tree { // The caller builds a tree of these nodes to pass to QLT. struct Node { @@ -47,12 +33,17 @@ struct Node { void* reserved; // For internal use. Node () : parent(nullptr), rank(-1), cellidx(-1), nkids(0), reserved(nullptr) {} }; + +// Utility to make a tree over a 1D mesh. For testing, it can be useful to +// create an imbalanced tree. +Node::Ptr make_tree_over_1d_mesh(const Parallel::Ptr& p, const Int& ncells, + const bool imbalanced = false); } // namespace tree -template +template class QLT { public: - typedef typename impl::DeviceType::type Device; + typedef typename cedr::impl::DeviceType::type Device; typedef QLT Me; typedef std::shared_ptr Ptr; @@ -90,8 +81,8 @@ class QLT { Int get_num_tracers() const; - // set_{rho,Q}: Set cell values prior to running the QLT algorithm. - // set_rho must be called before set_Q. + // set_{rhom,Qm}: Set cell values prior to running the QLT algorithm. + // set_rhom must be called before set_Qm. // lclcellidx is gci2lci(cellidx). // Notation: // rho: Total density. @@ -100,31 +91,31 @@ class QLT { // *m: Mass corresponding to the density; results from an integral over a // region, such as a cell. KOKKOS_INLINE_FUNCTION - void set_rho(const Int& lclcellidx, - // Current total mass in this cell. - const Real& rhom); + void set_rhom(const Int& lclcellidx, + // Current total mass in this cell. + const Real& rhom); KOKKOS_INLINE_FUNCTION - void set_Q(const Int& lclcellidx, const Int& tracer_idx, - // Current tracer mass in this cell. 
- const Real& Qm, - // Minimum and maximum permitted tracer mass in this cell. - const Real& Qm_min, const Real& Qm_max, - // If mass conservation is requested, provide the previous Qm, - // which will be summed to give the desired global mass. - const Real Qm_prev = -1); + void set_Qm(const Int& lclcellidx, const Int& tracer_idx, + // Current tracer mass in this cell. + const Real& Qm, + // Minimum and maximum permitted tracer mass in this cell. + const Real& Qm_min, const Real& Qm_max, + // If mass conservation is requested, provide the previous Qm, + // which will be summed to give the desired global mass. + const Real Qm_prev = -1); // Run the QLT algorithm with the values set by set_{rho,Q}. void run(); // Get a cell's tracer mass Qm after the QLT algorithm has run. KOKKOS_INLINE_FUNCTION - Real get_Q(const Int& lclcellidx, const Int& tracer_idx); + Real get_Qm(const Int& lclcellidx, const Int& tracer_idx); private: typedef Kokkos::View IntList; - typedef impl::Const ConstIntList; - typedef impl::ConstUnmanaged ConstUnmanagedIntList; + typedef cedr::impl::Const ConstIntList; + typedef cedr::impl::ConstUnmanaged ConstUnmanagedIntList; static void init(const std::string& name, IntList& d, typename IntList::HostMirror& h, size_t n); @@ -205,7 +196,7 @@ class QLT { struct BulkData { typedef Kokkos::View RealList; - typedef impl::Unmanaged UnmanagedRealList; + typedef cedr::impl::Unmanaged UnmanagedRealList; UnmanagedRealList l2r_data, r2l_data; @@ -242,17 +233,18 @@ class QLT { namespace test { struct Input { - bool unittest, write; + bool unittest, perftest, write; Int ncells, ntracers, tracer_type, nrepeat; bool pseudorandom, verbose; }; -Int run(const Parallel::Ptr& p, const Input& in); +Int run_unit_and_randomized_tests(const Parallel::Ptr& p, const Input& in); } // namespace test } // namespace qlt +} // namespace cedr // These are the definitions that must be visible in the calling translation // unit, unless Cuda relocatable device code is enabled. 
-#include "qlt_inline.hpp" +#include "cedr_qlt_inl.hpp" #endif diff --git a/cedr/cedr_qlt_inl.hpp b/cedr/cedr_qlt_inl.hpp new file mode 100644 index 0000000..7e8baf5 --- /dev/null +++ b/cedr/cedr_qlt_inl.hpp @@ -0,0 +1,152 @@ +#ifndef INCLUDE_CEDR_QLT_INL_HPP +#define INCLUDE_CEDR_QLT_INL_HPP + +#include + +#include "cedr_local.hpp" + +namespace cedr { +namespace qlt { + +template KOKKOS_INLINE_FUNCTION +void QLT::set_rhom (const Int& lclcellidx, const Real& rhom) { + const Int ndps = md_.a_d.prob2bl2r[md_.nprobtypes]; + bd_.l2r_data(ndps*lclcellidx) = rhom; +} + +template KOKKOS_INLINE_FUNCTION +void QLT::set_Qm (const Int& lclcellidx, const Int& tracer_idx, + const Real& Qm, + const Real& Qm_min, const Real& Qm_max, + const Real Qm_prev) { + const Int ndps = md_.a_d.prob2bl2r[md_.nprobtypes]; + Real* bd; { + const Int bdi = md_.a_d.trcr2bl2r(tracer_idx); + bd = &bd_.l2r_data(ndps*lclcellidx + bdi); + } + bd[1] = Qm; + { + const Int problem_type = md_.a_d.trcr2prob(tracer_idx); + if (problem_type & ProblemType::shapepreserve) { + bd[0] = Qm_min; + bd[2] = Qm_max; + } else if (problem_type & ProblemType::consistent) { + const Real rhom = bd_.l2r_data(ndps*lclcellidx); + bd[0] = Qm_min / rhom; + bd[2] = Qm_max / rhom; + } else { + cedr_kernel_throw_if(true, "set_Q: invalid problem_type."); + } + if (problem_type & ProblemType::conserve) { + cedr_kernel_throw_if(Qm_prev < 0, "Qm_prev was not provided to set_Q."); + bd[3] = Qm_prev; + } + } +} + +template KOKKOS_INLINE_FUNCTION +Real QLT::get_Qm (const Int& lclcellidx, const Int& tracer_idx) { + const Int ndps = md_.a_d.prob2br2l[md_.nprobtypes]; + const Int bdi = md_.a_d.trcr2br2l(tracer_idx); + return bd_.r2l_data(ndps*lclcellidx + bdi); +} + +//todo Replace this and the calling code with ReconstructSafely. 
+KOKKOS_INLINE_FUNCTION +void r2l_nl_adjust_bounds (Real Qm_bnd[2], const Real rhom[2], Real Qm_extra) { + Real q[2]; + for (Int i = 0; i < 2; ++i) q[i] = Qm_bnd[i] / rhom[i]; + if (Qm_extra < 0) { + Int i0, i1; + if (q[0] >= q[1]) { i0 = 0; i1 = 1; } else { i0 = 1; i1 = 0; } + const Real Qm_gap = (q[i1] - q[i0])*rhom[i0]; + if (Qm_gap <= Qm_extra) { + Qm_bnd[i0] += Qm_extra; + return; + } + } else { + Int i0, i1; + if (q[0] <= q[1]) { i0 = 0; i1 = 1; } else { i0 = 1; i1 = 0; } + const Real Qm_gap = (q[i1] - q[i0])*rhom[i0]; + if (Qm_gap >= Qm_extra) { + Qm_bnd[i0] += Qm_extra; + return; + } + } + { // Have to adjust both. Adjust so that the q bounds are the same. This + // procedure assures that as long as rhom is conservative, then the + // adjustment never pushes q_{min,max} out of the safety bounds. + const Real Qm_tot = Qm_bnd[0] + Qm_bnd[1] + Qm_extra; + const Real rhom_tot = rhom[0] + rhom[1]; + const Real q_tot = Qm_tot / rhom_tot; + for (Int i = 0; i < 2; ++i) + Qm_bnd[i] = q_tot*rhom[i]; + } +} + +namespace impl { +KOKKOS_INLINE_FUNCTION +void solve_node_problem (const Real& rhom, const Real* pd, const Real& Qm, + const Real& rhom0, const Real* k0d, Real& Qm0, + const Real& rhom1, const Real* k1d, Real& Qm1) { + Real Qm_min_kids [] = {k0d[0], k1d[0]}; + Real Qm_orig_kids[] = {k0d[1], k1d[1]}; + Real Qm_max_kids [] = {k0d[2], k1d[2]}; + { // The ideal problem is not assuredly feasible. Test for feasibility. If not + // feasible, adjust bounds to solve the safety problem, which is assuredly + // feasible if the total density field rho is mass conserving (Q doesn't + // have to be mass conserving, of course; achieving mass conservation is one + // use for QLT). + const Real Qm_min = pd[0], Qm_max = pd[2]; + const bool lo = Qm < Qm_min, hi = Qm > Qm_max; + if (lo || hi) { + const Real rhom_kids[] = {rhom0, rhom1}; + r2l_nl_adjust_bounds(lo ? Qm_min_kids : Qm_max_kids, + rhom_kids, + Qm - (lo ? 
Qm_min : Qm_max)); + } else { + // Quick exit if everything is OK as is. + if (Qm == pd[1] && // Was our total tracer mass wasn't adjusted? + // Are the kids' problems feasible? + Qm_orig_kids[0] >= Qm_min_kids[0] && Qm_orig_kids[0] <= Qm_max_kids[0] && + Qm_orig_kids[1] >= Qm_min_kids[1] && Qm_orig_kids[1] <= Qm_max_kids[1]) { + // Don't need to do anything, so skip even the math-based quick exits in + // solve_node_problem. + Qm0 = Qm_orig_kids[0]; + Qm1 = Qm_orig_kids[1]; + return; + } + } + } + { // Solve the node's QP. + static const Real ones[] = {1, 1}; + const Real w[] = {1/rhom0, 1/rhom1}; + Real Qm_kids[2] = {k0d[1], k1d[1]}; + local::solve_1eq_bc_qp(2, w, ones, Qm, Qm_min_kids, Qm_max_kids, + Qm_orig_kids, Qm_kids); + Qm0 = Qm_kids[0]; + Qm1 = Qm_kids[1]; + } +} +} // namespace impl + +template KOKKOS_INLINE_FUNCTION +void QLT::solve_node_problem (const Int problem_type, + const Real& rhom, const Real* pd, const Real& Qm, + const Real& rhom0, const Real* k0d, Real& Qm0, + const Real& rhom1, const Real* k1d, Real& Qm1) { + if ( ! 
(problem_type & ProblemType::shapepreserve)) { + Real mpd[3], mk0d[3], mk1d[3]; + mpd[0] = pd [0]*rhom ; mpd [1] = pd[1] ; mpd [2] = pd [2]*rhom ; + mk0d[0] = k0d[0]*rhom0; mk0d[1] = k0d[1]; mk0d[2] = k0d[2]*rhom0; + mk1d[0] = k1d[0]*rhom1; mk1d[1] = k1d[1]; mk1d[2] = k1d[2]*rhom1; + impl::solve_node_problem(rhom, mpd, Qm, rhom0, mk0d, Qm0, rhom1, mk1d, Qm1); + return; + } + impl::solve_node_problem(rhom, pd, Qm, rhom0, k0d, Qm0, rhom1, k1d, Qm1); +} + +} // namespace qlt +} // namespace cedr + +#endif diff --git a/cedr/cedr_test.cpp b/cedr/cedr_test.cpp new file mode 100644 index 0000000..9e6aa41 --- /dev/null +++ b/cedr/cedr_test.cpp @@ -0,0 +1,96 @@ +#include "cedr_qlt.hpp" +#include "cedr_mpi.hpp" +#include "cedr_util.hpp" +#include "cedr_test.hpp" + +#include +#include + +namespace cedr { +struct InputParser { + qlt::test::Input qin; + test::transport1d::Input tin; + + class ArgAdvancer { + const int argc_; + char const* const* argv_; + int i_; + public: + ArgAdvancer (int argc, char** argv) : argc_(argc), argv_(argv), i_(1) {} + const char* advance () { + if (i_+1 >= argc_) cedr_throw_if(true, "Command line is missing an argument."); + return argv_[++i_]; + } + const char* token () const { return argv_[i_]; } + void incr () { ++i_; } + bool more () const { return i_ < argc_; } + }; + + InputParser (int argc, char** argv, const qlt::Parallel::Ptr& p) { + using util::eq; + qin.unittest = false; + qin.perftest = false; + qin.write = false; + qin.ncells = 0; + qin.ntracers = 1; + qin.tracer_type = 0; + qin.nrepeat = 1; + qin.pseudorandom = false; + qin.verbose = false; + tin.ncells = 0; + for (ArgAdvancer aa(argc, argv); aa.more(); aa.incr()) { + const char* token = aa.token(); + if (eq(token, "-t", "--unittest")) qin.unittest = true; + else if (eq(token, "-pt", "--perftest")) qin.perftest = true; + else if (eq(token, "-w", "--write")) qin.write = true; + else if (eq(token, "-nc", "--ncells")) qin.ncells = std::atoi(aa.advance()); + else if (eq(token, "-nt", 
"--ntracers")) qin.ntracers = std::atoi(aa.advance()); + else if (eq(token, "-tt", "--tracertype")) qin.tracer_type = std::atoi(aa.advance()); + else if (eq(token, "-nr", "--nrepeat")) qin.nrepeat = std::atoi(aa.advance()); + else if (eq(token, "--proc-random")) qin.pseudorandom = true; + else if (eq(token, "-v", "--verbose")) qin.verbose = true; + else if (eq(token, "-t1d", "--transport1dtest")) tin.ncells = 1; + else cedr_throw_if(true, "Invalid token " << token); + } + + if (tin.ncells) { + tin.ncells = qin.ncells; + tin.verbose = qin.verbose; + } + + cedr_throw_if(qin.tracer_type < 0 || qin.tracer_type >= 4, + "Tracer type is out of bounds [0, 3]."); + cedr_throw_if(qin.ntracers < 1, "Number of tracers is < 1."); + } + + void print (std::ostream& os) const { + os << "ncells " << qin.ncells + << " nrepeat " << qin.nrepeat; + if (qin.pseudorandom) os << " random"; + os << "\n"; + } +}; +} // namespace cedr + +int main (int argc, char** argv) { + int ret = 0; + MPI_Init(&argc, &argv); + auto p = cedr::mpi::make_parallel(MPI_COMM_WORLD); + srand(p->rank()); + Kokkos::initialize(argc, argv); + try { + cedr::InputParser inp(argc, argv, p); + if (p->amroot()) inp.print(std::cout); + if (inp.qin.unittest || inp.qin.perftest) + ret += cedr::qlt::test::run_unit_and_randomized_tests(p, inp.qin); + if (inp.tin.ncells > 0) + ret += cedr::test::transport1d::run(p, inp.tin); + if (p->amroot()) std::cout << (ret != 0 ? 
"FAIL" : "PASS") << "\n"; + } catch (const std::exception& e) { + if (p->amroot()) + std::cerr << e.what(); + } + Kokkos::finalize_all(); + MPI_Finalize(); + return ret; +} diff --git a/cedr/cedr_test_1d_transport.cpp b/cedr/cedr_test_1d_transport.cpp new file mode 100644 index 0000000..e1dc92f --- /dev/null +++ b/cedr/cedr_test_1d_transport.cpp @@ -0,0 +1,308 @@ +#include "cedr_test.hpp" +#include "cedr_qlt.hpp" + +#include + +namespace cedr { +namespace test { +namespace transport1d { + +namespace interp { +inline Real to_periodic_core (const Real& xl, const Real& xr, const Real& x) { + if (x >= xl && x <= xr) return x; + const Real w = xr - xl, xmxl = x - xl; + return x - w*std::floor(xmxl / w); +} + +inline Real get_slope (const Real x[2], const Real y[2]) { + return (y[1] - y[0]) / (x[1] - x[0]); +} + +inline void +get_cubic (Real dx, Real v1, Real s1, Real v2, Real s2, Real c[4]) { + Real dx2 = dx*dx; + Real dx3 = dx2*dx; + Real den = -dx3; + Real b1, b2; + c[2] = s1; + c[3] = v1; + b1 = v2 - dx*c[2] - c[3]; + b2 = s2 - c[2]; + c[0] = (2.0*b1 - dx*b2) / den; + c[1] = (-3.0*dx*b1 + dx2*b2) / den; +} + +void cubic_interp_periodic ( + const Real* const x, const Int nx, const Real* const y, + const Real* const xi, const Int nxi, Real* const yi, + Int* const dod) +{ + const int nc = nx - 1; +# pragma omp parallel for + for (Int j = 0; j < nxi; ++j) { + const Real xi_per = to_periodic_core(x[0], x[nc], xi[j]); + Int ip1 = std::upper_bound(x, x + nx, xi_per) - x; + // Handle numerical issues at boundaries. + if (ip1 == 0) ++ip1; + else if (ip1 == nx) --ip1; + const Int i = ip1 - 1; + // Domain of dependence. + Int* dodj = dod + 4*j; + for (Int k = 0; k < 4; ++k) + dodj[k] = (i - 1 + k + nc) % nc; + // Slopes. 
+ const bool at_start = i == 0, at_end = i == nc - 1; + const Real smid = get_slope(x+i, y+i); + Real s1, s2; + if (at_start) { + const Real a = (x[nc] - x[nc-1]) / ((x[1] - x[0]) + (x[nc] - x[nc-1])); + s1 = (1 - a)*get_slope(x+nc-1, y+nc-1) + a*smid; + } else { + const Real a = (x[i] - x[i-1]) / (x[ip1] - x[i-1]); + s1 = (1 - a)*get_slope(x+i-1, y+i-1) + a*smid; + } + if (at_end) { + const Real a = (x[ip1] - x[i]) / ((x[ip1] - x[i]) + (x[1] - x[0])); + s2 = (1 - a)*smid + a*get_slope(x, y); + } else { + const Real a = (x[ip1] - x[i]) / (x[i+2] - x[i]); + s2 = (1 - a)*smid + a*get_slope(x+ip1, y+ip1); + } + // Interp. + Real c[4]; + get_cubic(x[ip1] - x[i], y[i], s1, y[ip1], s2, c); + const Real xij = xi_per - x[i]; + yi[j] = (((c[0]*xij + c[1])*xij) + c[2])*xij + c[3]; + } +} +} // namespace interp + +class PyWriter { + typedef std::unique_ptr FilePtr; + FilePtr fh_; +public: + PyWriter(const std::string& filename); + void write(const std::string& field_name, const std::vector& v) const; +}; + +PyWriter::PyWriter (const std::string& filename) { + fh_ = FilePtr(fopen((filename + ".py").c_str(), "w")); + fprintf(fh_.get(), "s = {};\n"); +} + +void PyWriter::write (const std::string& field_name, const std::vector& v) const { + fprintf(fh_.get(), "s['%s'] = [", field_name.c_str()); + for (const auto& e: v) + fprintf(fh_.get(), " %1.15e,", e); + fprintf(fh_.get(), "]\n"); +} + +struct InitialCondition { + enum Enum { sin, bell, rect, uniform }; + static std::string convert (const Enum& e) { + switch (e) { + case Enum::sin: return "sin"; + case Enum::bell: return "bell"; + case Enum::rect: return "rect"; + case Enum::uniform: return "uniform"; + } + cedr_throw_if(true, "InitialCondition::convert can't convert " << e); + } + static Enum convert (const std::string& s) { + using util::eq; + if (eq(s, "sin")) return Enum::sin; + if (eq(s, "bell")) return Enum::bell; + if (eq(s, "rect")) return Enum::rect; + if (eq(s, "uniform")) return Enum::uniform; + cedr_throw_if(true, 
"InitialCondition::convert can't convert " << s); + } + static Real eval (const Enum& ic, const Real x) { + switch (ic) { + case Enum::sin: return 0.1 + 0.8*0.5*(1 + std::sin(6*M_PI*x)); + case Enum::bell: return x < 0.5 ? std::sin(2*M_PI*x) : 0; + case Enum::rect: return x > 0.66 || x < 0.33 ? 0 : 1; + case Enum::uniform: return 0.42; + } + cedr_throw_if(true, "InitialCondition::eval can't convert " << ic); + } +}; + +class Problem1D { + std::vector xb_, xcp_, rwrk_; + std::vector iwrk_; + + void init_mesh (const Int ncells, const bool nonuniform_mesh) { + xb_.resize(ncells+1); + xcp_.resize(ncells+1); + xb_[0] = 0; + if (nonuniform_mesh) { + // Large-scale, continuous variation in cell size, plus a huge jump at the + // periodic boundary. + for (Int i = 1; i <= ncells; ++i) { + const Real x = cedr::util::square(Real(i) / ncells); + xb_[i] = 0.01 + sin(0.5*M_PI*x*x*x*x); + } + // Random local cell sizes. + for (Int i = 1; i <= ncells; ++i) + xb_[i] *= 0.3 + cedr::util::urand(); + // Cumsum. + for (Int i = 1; i <= ncells; ++i) + xb_[i] += xb_[i-1]; + // Normalize. 
+ for (Int i = 1; i <= ncells; ++i) + xb_[i] /= xb_[ncells]; + } else { + xb_.back() = 1; + for (Int i = 1; i < ncells; ++i) + xb_[i] = Real(i) / ncells; + } + for (Int i = 0; i < ncells; ++i) + xcp_[i] = 0.5*(xb_[i] + xb_[i+1]); + xcp_.back() = 1 + xcp_[0]; + } + + static void run_qlt (const Problem1D& p, + qlt::QLT& qlt, + const Real* yp, Real* y, const Int* dods) { + const Int n = p.ncells(); + for (Int i = 0; i < n; ++i) { + const Int* dod = dods + 4*i; + Real min = yp[dod[0]], max = min; + for (Int j = 1; j < 4; ++j) { + const Real v = yp[dod[j]]; + min = std::min(min, v); + max = std::max(max, v); + } + const Real area_i = p.area(i); + qlt.set_Qm(i, 0, y[i]*area_i, min*area_i, max*area_i, yp[i]*area_i); + } + qlt.run(); + for (Int i = 0; i < n; ++i) + y[i] = qlt.get_Qm(i, 0) / p.area(i); + y[n] = y[0]; + } + + static void run_caas (const Problem1D& p, const Real* yp, Real* y, const Int* dods) { + const Int n = p.ncells(); + std::vector lo(n), up(n), w(n); + Real m = 0; + for (Int i = 0; i < n; ++i) { + const Int* dod = dods + 4*i; + Real min = yp[dod[0]], max = min; + for (Int j = 1; j < 4; ++j) { + const Real v = yp[dod[j]]; + min = std::min(min, v); + max = std::max(max, v); + } + const Real area_i = p.area(i); + lo[i] = min*area_i; + up[i] = max*area_i; + y[i] = std::max(min, std::min(max, y[i])); + m += (yp[i] - y[i])*area_i; + } + Real wsum = 0; + for (Int i = 0; i < n; ++i) { + w[i] = m >= 0 ? 
up[i] - y[i]*p.area(i) : y[i]*p.area(i) - lo[i]; + wsum += w[i]; + } + for (Int i = 0; i < n; ++i) + y[i] += (m/(wsum*p.area(i)))*w[i]; + } + +public: + Problem1D (const Int ncells, const bool nonuniform_mesh = false) { + init_mesh(ncells, nonuniform_mesh); + } + + Int ncells () const { return xb_.size() - 1; } + Real xb (const Int& i) const { return xb_[i]; } + Real xcp (const Int& i) const { return xcp_[i]; } + Real area (const Int& i) const { return xb_[i+1] - xb_[i]; } + + const std::vector get_xb () const { return xb_; } + const std::vector get_xcp () const { return xcp_; } + + void cycle (const Int& nsteps, const Real* y0, Real* yf, + qlt::QLT* qlt = nullptr) { + const Int n = xcp_.size(); + rwrk_.resize(2*n); + iwrk_.resize(4*n); + Real* xcpi = rwrk_.data(); + Int* dod = iwrk_.data(); + + const Real xos = -1.0 / nsteps; + for (Int i = 0; i < n; ++i) + xcpi[i] = xcp_[i] + xos; + + Real* ys[] = {xcpi + n, yf}; + std::copy(y0, y0 + n, ys[0]); + for (Int ti = 0; ti < nsteps; ++ti) { + interp::cubic_interp_periodic(xcp_.data(), n, ys[0], + xcpi, n, ys[1], dod); + if (qlt) + run_qlt(*this, *qlt, ys[0], ys[1], dod); + else + run_caas(*this, ys[0], ys[1], dod); + std::swap(ys[0], ys[1]); + } + std::copy(ys[0], ys[0] + n, yf); + } +}; + +//todo Clean this up. Right now everything is hardcoded and kludgy. +// - optional write +// - some sort of brief quantitative output +// - better, more canonical IC +// - optional tree imbalance +// - optional mesh nonuniformity +// - choice of preservation methods +// - parallel? 
+Int run (const mpi::Parallel::Ptr& parallel, const Input& in) { + cedr_throw_if(parallel->size() > 1, "run_1d_transport_test runs in serial only."); + Int nerr = 0; + + Problem1D p(in.ncells, true /* nonuniform_mesh */ ); + + auto tree = qlt::tree::make_tree_over_1d_mesh(parallel, in.ncells, + true /* imbalanced */); + typedef qlt::QLT QLTT; + QLTT qlt(parallel, in.ncells, tree); + qlt.declare_tracer(QLTT::ProblemType::conserve | + QLTT::ProblemType::shapepreserve); + qlt.end_tracer_declarations(); + for (Int i = 0; i < in.ncells; ++i) + qlt.set_rhom(i, p.area(i)); + qlt.print(std::cout); + + std::vector y0(in.ncells+1); + for (Int i = 0, nc = p.ncells(); i < nc; ++i) + y0[i] = (p.xcp(i) < 0.4 || p.xcp(i) > 0.9 ? + InitialCondition::eval(InitialCondition::sin, p.xcp(i)) : + InitialCondition::eval(InitialCondition::rect, p.xcp(i))); + y0.back() = y0[0]; + + PyWriter w("out_transport1d"); + w.write("xb", p.get_xb()); + w.write("xcp", p.get_xcp()); + w.write("y0", y0); + + std::vector yf(in.ncells+1); + const Int nsteps = Int(3.17*in.ncells); + const Int ncycles = 1; + + std::copy(y0.begin(), y0.end(), yf.begin()); + for (Int i = 0; i < ncycles; ++i) + p.cycle(nsteps, yf.data(), yf.data(), &qlt); + w.write("yqlt", yf); + + std::copy(y0.begin(), y0.end(), yf.begin()); + for (Int i = 0; i < ncycles; ++i) + p.cycle(nsteps, yf.data(), yf.data()); + w.write("ycaas", yf); + + return nerr; +} + +} // namespace transport1d +} // namespace test +} // namespace cedr diff --git a/cedr/cedr_util.cpp b/cedr/cedr_util.cpp new file mode 100644 index 0000000..3854888 --- /dev/null +++ b/cedr/cedr_util.cpp @@ -0,0 +1,23 @@ +#include "cedr_util.hpp" + +namespace cedr { +namespace util { + +bool eq (const std::string& a, const char* const b1, const char* const b2) { + return (a == std::string(b1) || (b2 && a == std::string(b2)) || + a == std::string("-") + std::string(b1)); +} + +Real urand () { return std::rand() / ((Real) RAND_MAX + 1.0); } + +Real reldif (const Real* a, const Real* 
b, const Int n) { + Real num = 0, den = 0; + for (Int i = 0; i < n; ++i) { + num += std::abs(a[i] - b[i]); + den += std::abs(a[i]); + } + return num/den; +} + +} +} diff --git a/cedr/cedr_util.hpp b/cedr/cedr_util.hpp new file mode 100644 index 0000000..87f5e2b --- /dev/null +++ b/cedr/cedr_util.hpp @@ -0,0 +1,90 @@ +#ifndef INCLUDE_CEDR_UTIL_HPP +#define INCLUDE_CEDR_UTIL_HPP + +#include + +#include "cedr_kokkos.hpp" +#include "cedr_mpi.hpp" + +namespace cedr { +namespace util { + +template KOKKOS_INLINE_FUNCTION constexpr +T square (const T& x) { return x*x; } + +bool eq(const std::string& a, const char* const b1, const char* const b2 = 0); + +// Uniform rand in [0, 1). +Real urand(); + +#define pr(m) do { \ + int _pid_ = 0; \ + MPI_Comm_rank(MPI_COMM_WORLD, &_pid_); \ + std::stringstream _ss_; \ + _ss_.precision(15); \ + _ss_ << "pid " << _pid_ << " " << m << std::endl; \ + std::cerr << _ss_.str(); \ + } while (0) +#define pr0(m) do { \ + int _pid_; MPI_Comm_rank(MPI_COMM_WORLD, &_pid_); \ + if (_pid_ != 0) break; \ + std::stringstream _ss_; \ + _ss_ << "pid " << _pid_ << " " << m << std::endl; \ + std::cerr << _ss_.str(); \ + } while (0) +#define prc(m) pr(#m << " | " << (m)) +#define pr0c(m) pr0(#m << " | " << (m)) +#define puf(m) "(" << #m << " " << (m) << ")" +#define pu(m) << " " << puf(m) +template +void prarr (const std::string& name, const T* const v, const size_t n) { + std::stringstream ss; + ss.precision(15); + ss << name << " = ["; + for (size_t i = 0; i < n; ++i) ss << " " << v[i]; + ss << "];"; + pr(ss.str()); +} +#define mprarr(m) cedr::util::prarr(#m, m.data(), m.size()) + +#ifndef NDEBUG +# define cedr_assert(condition) do { \ + if ( ! (condition)) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": FAIL:\n" << #condition \ + << "\n"; \ + throw std::logic_error(_ss_.str()); \ + } \ + } while (0) +# define cedr_kernel_assert(condition) do { \ + if ( ! 
(condition)) \ + Kokkos::abort(#condition); \ + } while (0) +#else +# define cedr_assert(condition) +# define cedr_kernel_assert(condition) +#endif +#define cedr_throw_if(condition, message) do { \ + if (condition) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": The condition:\n" \ + << #condition "\nled to the exception\n" << message << "\n"; \ + throw std::logic_error(_ss_.str()); \ + } \ + } while (0) +#define cedr_kernel_throw_if(condition, message) do { \ + if (condition) \ + Kokkos::abort(#condition " led to the exception\n" message); \ + } while (0) + +inline Real reldif (const Real a, const Real b) +{ return std::abs(b - a)/std::max(std::abs(a), std::abs(b)); } + +Real reldif(const Real* a, const Real* b, const Int n); + +struct FILECloser { void operator() (FILE* fh) { fclose(fh); } }; + +} +} + +#endif diff --git a/qlt/make.inc.ws b/cedr/make.inc.ws similarity index 100% rename from qlt/make.inc.ws rename to cedr/make.inc.ws diff --git a/cedr/make_qltcpp.sh b/cedr/make_qltcpp.sh new file mode 100644 index 0000000..5362ec9 --- /dev/null +++ b/cedr/make_qltcpp.sh @@ -0,0 +1,10 @@ +# bash make_qltcpp.sh +# mpicxx -Wall -pedantic -fopenmp -std=c++11 -I/home/ambradl/lib/kokkos/cpu/include qlt.cpp -L/home/ambradl/lib/kokkos/cpu/lib -lkokkos -ldl +# OMP_PROC_BIND=false OMP_NUM_THREADS=2 mpirun -np 14 ./a.out -t + +(for f in cedr.hpp cedr_kokkos.hpp cedr_mpi.hpp cedr_util.hpp cedr_qlt.hpp cedr_local.hpp cedr_mpi_inl.hpp cedr_local_inl.hpp cedr_qlt_inl.hpp cedr_test.hpp cedr_util.cpp cedr_local.cpp cedr_mpi.cpp cedr_qlt.cpp cedr_test_1d_transport.cpp cedr_test.cpp; do + echo "//>> $f" + cat $f + echo "" +done) > qlt.cpp +sed sV'#include "cedr'V'//#include "cedr'V -i qlt.cpp diff --git a/qlt/readme.txt b/cedr/readme.txt similarity index 100% rename from qlt/readme.txt rename to cedr/readme.txt diff --git a/qlt/Makefile b/qlt/Makefile deleted file mode 100644 index fcd2b0e..0000000 --- a/qlt/Makefile +++ /dev/null @@ -1,23 +0,0 @@ 
-include make.inc - -CXXFLAGS=$(opt) -Wall -pedantic -fopenmp -std=c++11 -I$(KOKKOS)/include -DQLT_TIME -LDFLAGS=-fopenmp -L$(KOKKOS)/lib -lkokkos -ldl -LINK_LAPACK_BLAS=-llapack -lblas - -SOURCES=qlt.cpp qlt_test.cpp - -OBJECTS=$(SOURCES:.cpp=.o) - -.cpp.o: - $(MPICXX) $(CFLAGS) $(CXXFLAGS) -c $< -o $@ - -all: testqlt - -testqlt: qlt_test.o qlt.o - $(MPICXX) qlt_test.o qlt.o $(LDFLAGS) -o testqlt - -clean: - rm -f *.o testqlt - -qlt.o: qlt.hpp qlt_inline.hpp qlt_kokkos.hpp -qlt_test.o: qlt.hpp diff --git a/qlt/qlt_inline.hpp b/qlt/qlt_inline.hpp deleted file mode 100644 index a16aa6a..0000000 --- a/qlt/qlt_inline.hpp +++ /dev/null @@ -1,392 +0,0 @@ -#ifndef INCLUDE_QLT_INLINE_HPP -#define INCLUDE_QLT_INLINE_HPP - -#include - -namespace qlt { - -template KOKKOS_INLINE_FUNCTION -void QLT::set_rho (const Int& lclcellidx, const Real& rhom) { - const Int ndps = md_.a_d.prob2bl2r[md_.nprobtypes]; - bd_.l2r_data(ndps*lclcellidx) = rhom; -} - -template KOKKOS_INLINE_FUNCTION -void QLT::set_Q (const Int& lclcellidx, const Int& tracer_idx, - const Real& Qm, - const Real& Qm_min, const Real& Qm_max, - const Real Qm_prev) { - const Int ndps = md_.a_d.prob2bl2r[md_.nprobtypes]; - Real* bd; { - const Int bdi = md_.a_d.trcr2bl2r(tracer_idx); - bd = &bd_.l2r_data(ndps*lclcellidx + bdi); - } - bd[1] = Qm; - { - const Int problem_type = md_.a_d.trcr2prob(tracer_idx); - if (problem_type & ProblemType::shapepreserve) { - bd[0] = Qm_min; - bd[2] = Qm_max; - } else if (problem_type & ProblemType::consistent) { - const Real rhom = bd_.l2r_data(ndps*lclcellidx); - bd[0] = Qm_min / rhom; - bd[2] = Qm_max / rhom; - } else { - Kokkos::abort("set_Q: invalid problem_type."); - } - if (problem_type & ProblemType::conserve) { - if (Qm_prev < 0) Kokkos::abort("Qm_prev was not provided to set_Q."); - bd[3] = Qm_prev; - } - } -} - -template KOKKOS_INLINE_FUNCTION -Real QLT::get_Q (const Int& lclcellidx, const Int& tracer_idx) { - const Int ndps = md_.a_d.prob2br2l[md_.nprobtypes]; - const Int bdi 
= md_.a_d.trcr2br2l(tracer_idx); - return bd_.r2l_data(ndps*lclcellidx + bdi); -} - -namespace impl { -// GPU-friendly replacements for std::min/max. -template KOKKOS_INLINE_FUNCTION -const T& min (const T& a, const T& b) { return a < b ? a : b; } -template KOKKOS_INLINE_FUNCTION -const T& max (const T& a, const T& b) { return a > b ? a : b; } -} - -namespace slv { -KOKKOS_INLINE_FUNCTION -Real get_xbd (const Real* xbd, const Int i, const bool xbds_scalar) -{ return xbds_scalar ? *xbd : xbd[i]; } - -KOKKOS_INLINE_FUNCTION -bool is_inside (const Real xi, const Real* xlo, const Real* xhi, const Int i, - const bool xbds_scalar) { - return (xi > get_xbd(xlo, i, xbds_scalar) && - xi < get_xbd(xhi, i, xbds_scalar)); -} - -KOKKOS_INLINE_FUNCTION -bool is_outside (const Real xi, const Real* xlo, const Real* xhi, const Int i, - const bool xbds_scalar) { - return (xi < get_xbd(xlo, i, xbds_scalar) || - xi > get_xbd(xhi, i, xbds_scalar)); -} - -KOKKOS_INLINE_FUNCTION -Real calc_r_tol (const Real b, const Real* a, const Real* y, const Int n) { - Real ab = std::abs(b); - for (Int i = 0; i < n; ++i) ab = std::max(ab, std::abs(a[i]*y[i])); - return 1e1*std::numeric_limits::epsilon()*std::abs(ab); -} - -KOKKOS_INLINE_FUNCTION -void calc_r (const Int n, const Real* w, const Real* a, const Real b, - const Real* xlo, const Real* xhi, const bool xbds_scalar, - const Real* y, const Real& lambda, Real* x, Real& r, Real& r_lambda) { - r = 0; - r_lambda = 0; - for (Int i = 0; i < n; ++i) { - const Real q = a[i]/w[i]; - const Real x_trial = y[i] + lambda*q; - Real xtmp; - if (x_trial < (xtmp = get_xbd(xlo, i, xbds_scalar))) - x[i] = xtmp; - else if (x_trial > (xtmp = get_xbd(xhi, i, xbds_scalar))) - x[i] = xtmp; - else { - x[i] = x_trial; - r_lambda += a[i]*q; - } - r += a[i]*x[i]; - } - r -= b; -} - -// Solve -// min_x sum_i w(i) (x(i) - y(i))^2 -// st a' x = b -// xlo <= x <= xhi. -// This function assumes w > 0 to save a few operations. 
Return 0 on success and -// x == y, 1 on success and x != y, -1 if infeasible, -2 if max_its hit with no -// solution. See Section 3 of Bochev, Ridzal, Shashkov, Fast optimization-based -// conservative remap of scalar fields through aggregate mass transfer. lambda -// is used in check_1eq_bc_qp_foc. -//todo 2D version of this function that takes advantage of 2D. -KOKKOS_INLINE_FUNCTION -Int solve_1eq_bc_qp (const Int n, const Real* w, const Real* a, const Real b, - const Real* xlo, const Real* xhi, const bool xbds_scalar, - const Real* y, Real* x, const Int max_its = 100) { - const Real r_tol = calc_r_tol(b, a, y, n); - - { // Check for a quick exit. - bool all_in = true; - Real r = 0; - for (Int i = 0; i < n; ++i) { - if (is_outside(x[i], xlo, xhi, i, xbds_scalar)) { - all_in = false; - break; - } - r += a[i]*x[i]; - } - if (all_in) { - r -= b; - if (std::abs(r) <= r_tol) - return 0; - } - } - - { // Eval r at end points to check for feasibility, and also possibly a quick - // exit on a common case. - Real r = -b; - for (Int i = 0; i < n; ++i) { - x[i] = get_xbd(xlo, i, xbds_scalar); - r += a[i]*x[i]; - } - if (std::abs(r) <= r_tol) return 1; - if (r > 0) return -1; - r = -b; - for (Int i = 0; i < n; ++i) { - x[i] = get_xbd(xhi, i, xbds_scalar); - r += a[i]*x[i]; - } - if (std::abs(r) <= r_tol) return 1; - if (r < 0) return -1; - } - - { // Check for a quick exit: the bounds are so tight that the midpoint of the - // box satisfies r_tol. - Real r = -b; - for (Int i = 0; i < n; ++i) { - x[i] = 0.5*(get_xbd(xlo, i, xbds_scalar) + get_xbd(xhi, i, xbds_scalar)); - r += a[i]*x[i]; - } - if (std::abs(r) <= r_tol) return 1; - } - - const Real wall_dist = 1e-3; - - // Get lambda endpoints. 
- Real lamlo = 0, lamhi = 0; - for (Int i = 0; i < n; ++i) { - const Real rq = w[i]/a[i]; - const Real lamlo_i = rq*(get_xbd(xlo, i, xbds_scalar) - y[i]); - const Real lamhi_i = rq*(get_xbd(xhi, i, xbds_scalar) - y[i]); - if (i == 0) { - lamlo = lamlo_i; - lamhi = lamhi_i; - } else { - lamlo = impl::min(lamlo, lamlo_i); - lamhi = impl::max(lamhi, lamhi_i); - } - } - const Real lamlo_feas = lamlo, lamhi_feas = lamhi; - Real lambda = lamlo <= 0 && lamhi >= 0 ? 0 : lamlo; - - Int info = -2; - - // Bisection-safeguarded Newton iteration for r(lambda) = 0. - bool prev_step_bisect = false; - Int nbisect = 0; - for (Int iteration = 0; iteration < max_its; ++iteration) { - // Compute x, r, r_lambda. - Real r, r_lambda; - calc_r(n, w, a, b, xlo, xhi, xbds_scalar, y, lambda, x, r, r_lambda); - // Is r(lambda) - b sufficiently == 0? - if (std::abs(r) <= r_tol) { - info = 1; - break; - } - // Check if the lambda bounds are too close. - if (nbisect > 64) { - if (lamhi == lamhi_feas || lamlo == lamlo_feas) { - // r isn't small enough and one lambda bound is on the feasibility - // limit. The QP must not be feasible. - info = -1; - break; - } - info = 1; - break; - } - // Adjust lambda bounds. - if (r > 0) - lamhi = lambda; - else - lamlo = lambda; - if (r_lambda != 0) { - // Newton step. - lambda -= r/r_lambda; - } else { - // Force bisection. - lambda = lamlo; - } - // Safeguard. The wall distance check assures progress, but use it only - // every other potential bisection. - const Real D = prev_step_bisect ? 
0 : wall_dist*(lamhi - lamlo); - if (lambda - lamlo < D || lamhi - lambda < D) { - lambda = 0.5*(lamlo + lamhi); - ++nbisect; - prev_step_bisect = true; - } else { - prev_step_bisect = false; - } - } - - return info; -} - -KOKKOS_INLINE_FUNCTION -void r2l_nl_adjust_bounds (Real Qm_bnd[2], const Real rhom[2], Real Qm_extra) { - Real q[2]; - for (Int i = 0; i < 2; ++i) q[i] = Qm_bnd[i] / rhom[i]; - if (Qm_extra < 0) { - Int i0, i1; - if (q[0] >= q[1]) { i0 = 0; i1 = 1; } else { i0 = 1; i1 = 0; } - const Real Qm_gap = (q[i1] - q[i0])*rhom[i0]; - if (Qm_gap <= Qm_extra) { - Qm_bnd[i0] += Qm_extra; - return; - } - } else { - Int i0, i1; - if (q[0] <= q[1]) { i0 = 0; i1 = 1; } else { i0 = 1; i1 = 0; } - const Real Qm_gap = (q[i1] - q[i0])*rhom[i0]; - if (Qm_gap >= Qm_extra) { - Qm_bnd[i0] += Qm_extra; - return; - } - } - { // Have to adjust both. Adjust so that the q bounds are the same. This - // procedure assures that as long as rhom is conservative, then the - // adjustment never pushes q_{min,max} out of the safety bounds. - const Real Qm_tot = Qm_bnd[0] + Qm_bnd[1] + Qm_extra; - const Real rhom_tot = rhom[0] + rhom[1]; - const Real q_tot = Qm_tot / rhom_tot; - for (Int i = 0; i < 2; ++i) - Qm_bnd[i] = q_tot*rhom[i]; - } -} - -KOKKOS_INLINE_FUNCTION -void r2l_l_adjust_bounds (const Int np, Real* q_min, Real* q_max, const Real* rhom, - Real Qm_extra) { - assert(0); // Not used right now, but want to eventually. Need to do some more analysis. - static constexpr int max_np = 16; - Real* const q_bnd = Qm_extra < 0 ? q_min : q_max; - // Try solving a QP that adjusts a q bound. 
- Real Qm = Qm_extra; - Real w[max_np], q_bnd_min[max_np], q_bnd_max[max_np], q_bnd_orig[max_np]; - q_bnd_min[0] = q_min[0]; - q_bnd_max[0] = q_max[0]; - for (Int i = 0; i < np; ++i) { - const Real rhomi = rhom[i]; - Qm += q_bnd[i]*rhomi; - q_bnd_orig[i] = q_bnd[i]; - w[i] = rhomi; - if (Qm_extra < 0) { - q_bnd_min[0] = impl::min(q_bnd_min[0], q_min[i]); - q_bnd_max[i] = q_max[i]; - } else { - q_bnd_min[i] = q_min[i]; - q_bnd_max[0] = impl::max(q_bnd_max[0], q_max[i]); - } - } - if (Qm_extra < 0) - for (Int i = 1; i < np; ++i) q_bnd_min[i] = q_bnd_min[0]; - else - for (Int i = 1; i < np; ++i) q_bnd_max[i] = q_bnd_max[0]; - // Check for feasibility. - bool feasible; { - Real Qm_lo = 0, Qm_hi = 0; - for (Int i = 0; i < np; ++i) { - Qm_lo += q_bnd_min[i]*w[i]; - Qm_hi += q_bnd_max[i]*w[i]; - } - feasible = Qm_lo <= Qm && Qm <= Qm_hi; - } - if (feasible) { - solve_1eq_bc_qp(np, w, w, Qm, q_bnd_min, q_bnd_max, false, q_bnd_orig, q_bnd); - } else { - // The QP isn't feasible, so set the bound to a constant. - Real rhom_tot = 0, Qm_tot = Qm_extra; - for (Int i = 0; i < np; ++i) { - const Real rhomi = rhom[i]; - rhom_tot += rhomi; - Qm_tot += q_bnd_orig[i]*rhomi; - } - const Real q_tot = Qm_tot / rhom_tot; - for (Int i = 0; i < np; ++i) - q_bnd[i] = q_tot; - //return; - // Assert that this constant is outside of all previous bound values. That's - // why the QP wasn't feasible. - if (Qm_extra < 0) - for (Int i = 0; i < np; ++i) - assert(q_tot <= q_bnd_orig[i]); - else - for (Int i = 0; i < np; ++i) - assert(q_tot >= q_bnd_orig[i]); - } -} - -KOKKOS_INLINE_FUNCTION -void solve_node_problem (const Real& rhom, const Real* pd, const Real& Qm, - const Real& rhom0, const Real* k0d, Real& Qm0, - const Real& rhom1, const Real* k1d, Real& Qm1) { - Real Qm_min_kids [] = {k0d[0], k1d[0]}; - Real Qm_orig_kids[] = {k0d[1], k1d[1]}; - Real Qm_max_kids [] = {k0d[2], k1d[2]}; - { // Set the target values so that mass gets redistributed in a relative sense - // rather than absolute. 
If a kid doesn't have much mass, don't give it too - // much. - const Real Qm_orig = pd[1], Qm_extra = Qm - Qm_orig; - if (Qm_orig != 0) - for (Int i = 0; i < 2; ++i) - Qm_orig_kids[i] += (Qm_orig_kids[i] / Qm_orig) * Qm_extra; - } - { // The ideal problem is not assuredly feasible. Test for feasibility. If not - // feasible, adjust bounds to solve the safety problem, which is assuredly - // feasible if the total density field rho is mass conserving (Q doesn't - // have to be mass conserving, of course; achieving mass conservation is one - // use for QLT). - const Real Qm_min = pd[0], Qm_max = pd[2]; - const bool lo = Qm < Qm_min, hi = Qm > Qm_max; - if (lo || hi) { - const Real rhom_kids[] = {rhom0, rhom1}; - r2l_nl_adjust_bounds(lo ? Qm_min_kids : Qm_max_kids, - rhom_kids, - Qm - (lo ? Qm_min : Qm_max)); - } - } - { // Solve the node's QP. - static const Real ones[] = {1, 1}; - Real Qm_kids[2] = {k0d[1], k1d[1]}; - solve_1eq_bc_qp(2, ones, ones, Qm, Qm_min_kids, Qm_max_kids, false, Qm_orig_kids, - Qm_kids); - Qm0 = Qm_kids[0]; - Qm1 = Qm_kids[1]; - } -} -} // namespace slv - -template KOKKOS_INLINE_FUNCTION -void QLT::solve_node_problem (const Int problem_type, - const Real& rhom, const Real* pd, const Real& Qm, - const Real& rhom0, const Real* k0d, Real& Qm0, - const Real& rhom1, const Real* k1d, Real& Qm1) { - if ( ! 
(problem_type & ProblemType::shapepreserve)) { - Real mpd[3], mk0d[3], mk1d[3]; - mpd[0] = pd [0]*rhom ; mpd [1] = pd[1] ; mpd [2] = pd [2]*rhom ; - mk0d[0] = k0d[0]*rhom0; mk0d[1] = k0d[1]; mk0d[2] = k0d[2]*rhom0; - mk1d[0] = k1d[0]*rhom1; mk1d[1] = k1d[1]; mk1d[2] = k1d[2]*rhom1; - slv::solve_node_problem(rhom, mpd, Qm, rhom0, mk0d, Qm0, rhom1, mk1d, Qm1); - return; - } - slv::solve_node_problem(rhom, pd, Qm, rhom0, k0d, Qm0, rhom1, k1d, Qm1); -} - -} // namespace qlt - -#endif diff --git a/qlt/qlt_test.cpp b/qlt/qlt_test.cpp deleted file mode 100644 index a465c27..0000000 --- a/qlt/qlt_test.cpp +++ /dev/null @@ -1,89 +0,0 @@ -#include "qlt.hpp" - -#include -#include - -#define throw_if(condition, message) do { \ - if (condition) { \ - std::stringstream _ss_; \ - _ss_ << __FILE__ << ":" << __LINE__ << ": The condition:\n" \ - << #condition "\nled to the exception\n" << message << "\n"; \ - throw std::logic_error(_ss_.str()); \ - } \ - } while (0) - -inline bool eq (const std::string& a, const char* const b1, const char* const b2 = 0) { - return (a == std::string(b1) || (b2 && a == std::string(b2)) || - a == std::string("-") + std::string(b1)); -} - -struct InputParser { - qlt::test::Input in; - - class ArgAdvancer { - const int argc_; - char const* const* argv_; - int i_; - public: - ArgAdvancer (int argc, char** argv) : argc_(argc), argv_(argv), i_(1) {} - const char* advance () { - if (i_+1 >= argc_) throw_if(true, "Command line is missing an argument."); - return argv_[++i_]; - } - const char* token () const { return argv_[i_]; } - void incr () { ++i_; } - bool more () const { return i_ < argc_; } - }; - - InputParser (int argc, char** argv, const qlt::Parallel::Ptr& p) { - in.unittest = false; - in.write = false; - in.ncells = 0; - in.ntracers = 1; - in.tracer_type = 0; - in.nrepeat = 1; - in.pseudorandom = false; - in.verbose = false; - for (ArgAdvancer aa(argc, argv); aa.more(); aa.incr()) { - const char* token = aa.token(); - if (eq(token, "-t", 
"--unittest")) in.unittest = true; - else if (eq(token, "-w", "--write")) in.write = true; - else if (eq(token, "-nc", "--ncells")) in.ncells = std::atoi(aa.advance()); - else if (eq(token, "-nt", "--ntracers")) in.ntracers = std::atoi(aa.advance()); - else if (eq(token, "-tt", "--tracertype")) in.tracer_type = std::atoi(aa.advance()); - else if (eq(token, "-nr", "--nrepeat")) in.nrepeat = std::atoi(aa.advance()); - else if (eq(token, "--random")) in.pseudorandom = true; - else if (eq(token, "-v", "--verbose")) in.verbose = true; - else throw_if(true, "Invalid token " << token); - } - throw_if(in.tracer_type < 0 || in.tracer_type >= 4, "Tracer type is out of bounds [0, 3]."); - throw_if(in.ntracers < 1, "Number of tracers is < 1."); - } - - void print (std::ostream& os) const { - os << "ncells " << in.ncells - << " nrepeat " << in.nrepeat; - if (in.pseudorandom) os << " random"; - os << "\n"; - } -}; - -int main (int argc, char** argv) { - int ret = 0; - MPI_Init(&argc, &argv); - auto p = qlt::make_parallel(MPI_COMM_WORLD); - srand(p->rank()); - Kokkos::initialize(argc, argv); - try { - InputParser inp(argc, argv, p); - if (p->amroot()) inp.print(std::cout); - ret = qlt::test::run(p, inp.in); - if (p->amroot()) std::cout << (ret != 0 ? "FAIL" : "PASS") << "\n"; - } catch (const std::exception& e) { - if (p->amroot()) - std::cerr << e.what(); - } - Kokkos::finalize_all(); - MPI_Finalize(); - return ret; -} From 242cf8acb0bb6f0520df006c62d6918357b69b7f Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Sun, 31 Dec 2017 18:37:03 -0700 Subject: [PATCH 20/28] CEDR: Add unit tests for the QP solvers, plus do more speedups on these. 
--- cedr/cedr_local.cpp | 197 ++++++++++++++++++++++++++++++- cedr/cedr_local.hpp | 5 + cedr/cedr_local_inl.hpp | 255 +++++++++++++++++++--------------------- cedr/cedr_qlt.cpp | 17 ++- cedr/cedr_qlt_inl.hpp | 11 +- cedr/cedr_test.cpp | 2 + 6 files changed, 336 insertions(+), 151 deletions(-) diff --git a/cedr/cedr_local.cpp b/cedr/cedr_local.cpp index dec8f00..1b5a017 100644 --- a/cedr/cedr_local.cpp +++ b/cedr/cedr_local.cpp @@ -4,12 +4,201 @@ namespace cedr { namespace local { -Int test_sort4 () {} +// Check the first-order optimality conditions. Return true if OK, false +// otherwise. If quiet, don't print anything. +bool check_1eq_bc_qp_foc ( + const char* label, const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, const Real* y, const Real* x, const bool verbose) +{ + auto& os = std::cout; + bool ok = true; + Real xtmp; + // Check the bound constraints. + for (Int i = 0; i < n; ++i) + if (x[i] < (xtmp = xlo[i])) { + if (verbose) + os << "x[" << i << "] = " << x[i] + << " but x[i] - xlo[i] = " << (x[i] - xtmp) << "\n"; + ok = false; + } + for (Int i = 0; i < n; ++i) + if (x[i] > (xtmp = xhi[i])) { + if (verbose) + os << "x[" << i << "] = " << x[i] + << " but xhi[i] - x[i] = " << (xtmp - x[i]) << "\n"; + ok = false; + } + // Check the equality constraint. + Real r = 0; + for (Int i = 0; i < n; ++i) + r += a[i]*x[i]; + r -= b; + if (std::abs(r) > impl::calc_r_tol(b, a, y, n)) { + if (verbose) + os << "r = " << r << "\n"; + ok = false; + } + // Check the gradient is 0 when projected into the constraints. Compute + // g = W (x - y) + // g_reduced = g - C ((C'C) \ (C'g)) + // where + // IA = I(:,A) + // C = [IA a], + // and A is the active set. 
+ const Real padtol = 1e5*std::numeric_limits::epsilon(); + Real lambda = 0, den = 0; + for (Int i = 0; i < n; ++i) { + const Real pad = padtol*(xhi[i] - xlo[i]); + if (xlo[i] + pad <= x[i] && x[i] <= xhi[i] - pad) { + const Real gi = w[i]*(x[i] - y[i]); + lambda += a[i]*gi; + den += a[i]*a[i]; + } + } + lambda /= den; + Real normg = 0, normy = 0; + for (Int i = 0; i < n; ++i) { + normy += cedr::util::square(y[i]); + const Real pad = padtol*(xhi[i] - xlo[i]); + if (xlo[i] + pad <= x[i] && x[i] <= xhi[i] - pad) + normg += cedr::util::square(w[i]*(x[i] - y[i]) - a[i]*lambda); + } + normy = std::sqrt(normy); + normg = std::sqrt(normg); + const Real gtol = 1e2*std::numeric_limits::epsilon()*normy; + if (normg > gtol) { + if (verbose) + os << "norm(g) = " << normg << " gtol = " << gtol << "\n"; + ok = false; + } + // Check the gradient at the active boundaries. + for (Int i = 0; i < n; ++i) { + const bool onlo = x[i] == xlo[i]; + const bool onhi = onlo ? false : x[i] == xhi[i]; + if (onlo || onhi) { + const Real rg = w[i]*(x[i] - y[i]) - a[i]*lambda; + if (onlo && rg < -gtol) { + if (verbose) + os << "onlo but rg = " << rg << "\n"; + ok = false; + } else if (onhi && rg > gtol) { + if (verbose) + os << "onhi but rg = " << rg << "\n"; + ok = false; + } + } + } + if ( ! ok && verbose) + os << "label: " << label << "\n"; + return ok; +} Int unittest () { - Int ne, nerr = 0; - ne = test_sort4(); - if (ne) std::cerr << ""; + bool verbose = true; + Int nerr = 0; + + Int n; + static const Int N = 16; + Real w[N], a[N], b, xlo[N], xhi[N], y[N], x[N], al, au; + + auto run = [&] () { + const Int info = solve_1eq_bc_qp(n, w, a, b, xlo, xhi, y, x); + if (n == 2) { + // This version never returns 0. 
+ Real x2[2]; + const Int info2 = solve_1eq_bc_qp_2d(w, a, b, xlo, xhi, y, x2); + if (info2 != 1 && (info == 0 || info == 1)) { + if (verbose) pr(puf(info) pu(info2)); + ++nerr; + } + const Real rd = cedr::util::reldif(x, x2, 2); + if (rd > 10*std::numeric_limits::epsilon()) { + if (verbose) + printf("%1.1e | %1.15e %1.15e | %1.15e %1.15e | %1.15e %1.15e\n", + rd, y[0], y[1], x[0], x[1], x2[0], x2[1]); + ++nerr; + } + } + const bool ok = check_1eq_bc_qp_foc("unittest", n, w, a, b, xlo, xhi, y, x, + verbose); + if ( ! ok) ++nerr; + }; + + auto gena = [&] () { + for (Int i = 0; i < n; ++i) + a[i] = 1;//0.1 + cedr::util::urand(); + }; + auto genw = [&] () { + for (Int i = 0; i < n; ++i) + w[i] = 1;//0.1 + cedr::util::urand(); + }; + auto genbnds = [&] () { + al = au = 0; + for (Int i = 0; i < n; ++i) { + xlo[i] = cedr::util::urand() - 0.5; + al += a[i]*xlo[i]; + xhi[i] = xlo[i] + cedr::util::urand(); + au += a[i]*xhi[i]; + } + }; + auto genb = [&] (const bool in) { + if (in) { + const Real alpha = cedr::util::urand(); + b = alpha*al + (1 - alpha)*au; + } else { + if (cedr::util::urand() > 0.5) + b = au + 0.01 + cedr::util::urand(); + else + b = al - 0.01 - cedr::util::urand(); + } + }; + auto geny = [&] (const bool in) { + if (in) { + for (Int i = 0; i < n; ++i) { + const Real alpha = cedr::util::urand(); + y[i] = alpha*xlo[i] + (1 - alpha)*xhi[i]; + } + } else if (cedr::util::urand() > 0.2) { + for (Int i = 1; i < n; i += 2) { + const Real alpha = cedr::util::urand(); + y[i] = alpha*xlo[i] + (1 - alpha)*xhi[i]; + cedr_assert(y[i] >= xlo[i] && y[i] <= xhi[i]); + } + for (Int i = 0; i < n; i += 4) + y[i] = xlo[i] - cedr::util::urand(); + for (Int i = 2; i < n; i += 4) + y[i] = xhi[i] + cedr::util::urand(); + } else { + for (Int i = 0; i < n; i += 2) + y[i] = xlo[i] - cedr::util::urand(); + for (Int i = 1; i < n; i += 2) + y[i] = xhi[i] + cedr::util::urand(); + } + }; + auto b4y = [&] () { + b = 0; + for (Int i = 0; i < n; ++i) + b += a[i]*y[i]; + }; + + for (n = 2; n 
<= 16; ++n) { + const Int count = n == 2 ? 100 : 10; + for (Int i = 0; i < count; ++i) { + gena(); + genw(); + genbnds(); + genb(true); + geny(true); + run(); + b4y(); + run(); + genb(true); + geny(false); + run(); + } + } + + return nerr; } } diff --git a/cedr/cedr_local.hpp b/cedr/cedr_local.hpp index ef6e924..7378ed8 100644 --- a/cedr/cedr_local.hpp +++ b/cedr/cedr_local.hpp @@ -20,6 +20,11 @@ Int solve_1eq_bc_qp(const Int n, const Real* w, const Real* a, const Real b, const Real* xlo, const Real* xhi, const Real* y, Real* x, const Int max_its = 100); +KOKKOS_INLINE_FUNCTION +Int solve_1eq_bc_qp_2d(const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, + const Real* y, Real* x); + Int unittest(); } diff --git a/cedr/cedr_local_inl.hpp b/cedr/cedr_local_inl.hpp index 9734644..2ad940f 100644 --- a/cedr/cedr_local_inl.hpp +++ b/cedr/cedr_local_inl.hpp @@ -6,58 +6,88 @@ namespace cedr { namespace local { +namespace impl { KOKKOS_INLINE_FUNCTION -bool is_inside (const Real xi, const Real* xlo, const Real* xhi, const Int i) { - return (xi > xlo[i] && xi < xhi[i]); +Real calc_r_tol (const Real b, const Real* a, const Real* y, const Int n) { + Real ab = std::abs(b); + for (Int i = 0; i < n; ++i) ab = std::max(ab, std::abs(a[i]*y[i])); + return 1e1*std::numeric_limits::epsilon()*std::abs(ab); } +// Eval r at end points to check for feasibility, and also possibly a quick exit +// on a common case. Return -1 if infeasible, 1 if a corner is a solution, 0 if +// feasible and a corner is not. 
KOKKOS_INLINE_FUNCTION -bool is_outside (const Real xi, const Real* xlo, const Real* xhi, const Int i) { - return (xi < xlo[i] || xi > xhi[i]); +Int check_lu (const Int n, const Real* a, const Real& b, + const Real* xlo, const Real* xhi, const Real* y, const Real& r_tol, + Real* x) { + Real r = -b; + for (Int i = 0; i < n; ++i) { + x[i] = xlo[i]; + r += a[i]*x[i]; + } + if (std::abs(r) <= r_tol) return 1; + if (r > 0) return -1; + r = -b; + for (Int i = 0; i < n; ++i) { + x[i] = xhi[i]; + r += a[i]*x[i]; + } + if (std::abs(r) <= r_tol) return 1; + if (r < 0) return -1; + return 0; } -template KOKKOS_INLINE_FUNCTION -void sort4 (T* x) { - T buf[4]; - for (Int i = 0; i < 2; ++i) { - const Int j = 2*i; - if (x[j] <= x[j+1]) { buf[j] = x[j]; buf[j+1] = x[j+1]; } - else { buf[j] = x[j+1]; buf[j+1] = x[j]; } +void calc_r (const Int n, const Real* w, const Real* a, const Real b, + const Real* xlo, const Real* xhi, const Real* y, const Real& lambda, + Real* x, Real& r, Real& r_lambda) { + r = 0; + r_lambda = 0; + for (Int i = 0; i < n; ++i) { + const Real q = a[i]/w[i]; + const Real x_trial = y[i] + lambda*q; + Real xtmp; + if (x_trial < (xtmp = xlo[i])) + x[i] = xtmp; + else if (x_trial > (xtmp = xhi[i])) + x[i] = xtmp; + else { + x[i] = x_trial; + r_lambda += a[i]*q; + } + r += a[i]*x[i]; } - Int p0 = 0, p1 = 2; - for (Int i = 0; i < 4; ++i) - x[i] = (p1 >= 4 || (p0 < 2 && buf[p0] <= buf[p1]) ? - buf[p0++] : - buf[p1++]); - cedr_assert(p0 == 2 && p1 == 4); + r -= b; } +} // namespace impl // 2D special case for efficiency. 
KOKKOS_INLINE_FUNCTION -Int solve_1eq_bc_qp_2d (const Int n, const Real* w, const Real* a, const Real b, +Int solve_1eq_bc_qp_2d (const Real* w, const Real* a, const Real b, const Real* xlo, const Real* xhi, const Real* y, Real* x) { - cedr_assert(n == 2); + const Real r_tol = impl::calc_r_tol(b, a, y, 2); + Int info = impl::check_lu(2, a, b, xlo, xhi, y, r_tol, x); + if (info != 0) return info; { // Check if the optimal point ignoring bound constraints is in bounds. - Real q[2], qsum = 0; + Real qsum = 0, dm = b; for (int i = 0; i < 2; ++i) { - q[i] = a[i]/w[i]; - qsum += q[i]; - } - Real dm = b; - for (int i = 0; i < 2; ++i) + qsum += a[i]/w[i]; dm -= a[i]*y[i]; - bool good = true; + } + const Real fac = dm/qsum; + bool ok = true; for (int i = 0; i < 2; ++i) { - x[i] = y[i] + dm*(q[i]/qsum); - if (is_outside(x[i], xlo, xhi, i)) { - good = false; + x[i] = y[i] + fac*(a[i]/w[i]); + if (x[i] < xlo[i] || x[i] > xhi[i]) { + // Could be out due to numerics. + ok = false; break; } } - if (good) return dm == 0 ? 0 : 1; + if (ok) return 1; } // Solve for intersection of a'x = b, given by the parameterized line @@ -71,98 +101,85 @@ Int solve_1eq_bc_qp_2d (const Int n, const Real* w, const Real* a, const Real b, Real x_dir[] = {-a[1], a[0]}; // Get the 4 alpha values. - struct Alpha { - Real alpha; - Int side; - bool operator<= (const Alpha& a) const { return alpha <= a.alpha; } - }; - Alpha alphas[4]; - auto set_alpha = [&] (const Real* xbd, const Int& idx, const Int& side) { - alphas[side].alpha = (xbd[idx] - x_base[idx])/x_dir[idx]; - alphas[side].side = side; - }; - set_alpha(xlo, 1, 0); // bottom - set_alpha(xhi, 0, 1); // right - set_alpha(xhi, 1, 2); // top - set_alpha(xlo, 0, 3); // left + Real alphas[4]; + alphas[0] = (xlo[1] - x_base[1])/x_dir[1]; // bottom + alphas[1] = (xhi[0] - x_base[0])/x_dir[0]; // right + alphas[2] = (xhi[1] - x_base[1])/x_dir[1]; // top + alphas[3] = (xlo[0] - x_base[0])/x_dir[0]; // left - // Sort alphas. 
The middle two bound the feasible interval. - sort4(alphas); + // Find the middle two in the sorted alphas. + Real min = alphas[0], max = min; + Int imin = 0, imax = 0; + for (Int i = 1; i < 4; ++i) { + const Real alpha = alphas[i]; + if (alpha < min) { min = alpha; imin = i; } + if (alpha > max) { max = alpha; imax = i; } + } + Int ais[2]; + Int cnt = 0; + for (Int i = 0; i < 4; ++i) + if (i != imin && i != imax) { + ais[cnt++] = i; + if (cnt == 2) break; + } - // Eval the middle two and record the better of the them. - auto eval_xi = [&] (const Real& alpha, const Int& idx) { - return x_base[idx] + alpha*x_dir[idx]; - }; - auto eval_obj = [&] (const Real& alpha) { + Real objs[2]; + Real alpha_mid = 0; + for (Int j = 0; j < 2; ++j) { + const Real alpha = alphas[ais[j]]; + alpha_mid += alpha; Real obj = 0; for (Int i = 0; i < 2; ++i) { - x[i] = eval_xi(alpha, i); + x[i] = x_base[i] + alpha*x_dir[i]; obj += w[i]*cedr::util::square(y[i] - x[i]); } - return obj; - }; - const Int ai = eval_obj(alphas[1].alpha) <= eval_obj(alphas[2].alpha) ? 1 : 2; + objs[j] = obj; + } - Int info = 1, clipidx = 0; - const auto& aai = alphas[ai]; - switch (aai.side) { - case 0: x[0] = eval_xi(aai.alpha, 0); x[1] = xlo[1]; clipidx = 0; break; - case 1: x[0] = xhi[0]; x[1] = eval_xi(aai.alpha, 1); clipidx = 1; break; - case 2: x[0] = eval_xi(aai.alpha, 0); x[1] = xhi[1]; clipidx = 0; break; - case 3: x[0] = xlo[0]; x[1] = eval_xi(aai.alpha, 1); clipidx = 1; break; + const Int ai = ais[objs[0] <= objs[1] ? 0 : 1]; + + info = 1; + Int clipidx = 0; + const Real alpha = alphas[ai]; + switch (ai) { + case 0: case 2: + x[0] = x_base[0] + alpha*x_dir[0]; + x[1] = ai == 0 ? xlo[1] : xhi[1]; + clipidx = 0; + break; + case 1: case 3: + x[0] = ai == 1 ? 
xhi[0] : xlo[0]; + x[1] = x_base[1] + alpha*x_dir[1]; + clipidx = 1; + break; default: cedr_assert(0); info = -2; } x[clipidx] = cedr::impl::min(xhi[clipidx], cedr::impl::max(xlo[clipidx], x[clipidx])); return info; } -KOKKOS_INLINE_FUNCTION -Real calc_r_tol (const Real b, const Real* a, const Real* y, const Int n) { - Real ab = std::abs(b); - for (Int i = 0; i < n; ++i) ab = std::max(ab, std::abs(a[i]*y[i])); - return 1e1*std::numeric_limits::epsilon()*std::abs(ab); -} - -KOKKOS_INLINE_FUNCTION -void calc_r (const Int n, const Real* w, const Real* a, const Real b, - const Real* xlo, const Real* xhi, const Real* y, const Real& lambda, - Real* x, Real& r, Real& r_lambda) { - r = 0; - r_lambda = 0; - for (Int i = 0; i < n; ++i) { - const Real q = a[i]/w[i]; - const Real x_trial = y[i] + lambda*q; - Real xtmp; - if (x_trial < (xtmp = xlo[i])) - x[i] = xtmp; - else if (x_trial > (xtmp = xhi[i])) - x[i] = xtmp; - else { - x[i] = x_trial; - r_lambda += a[i]*q; - } - r += a[i]*x[i]; - } - r -= b; -} - KOKKOS_INLINE_FUNCTION Int solve_1eq_bc_qp (const Int n, const Real* w, const Real* a, const Real b, const Real* xlo, const Real* xhi, const Real* y, Real* x, const Int max_its) { - const Real r_tol = calc_r_tol(b, a, y, n); - Int info; + const Real r_tol = impl::calc_r_tol(b, a, y, n); + Int info = impl::check_lu(n, a, b, xlo, xhi, y, r_tol, x); + if (info != 0) return info; + + for (int i = 0; i < n; ++i) + if (x[i] != y[i]) { + info = 1; + x[i] = y[i]; + } + // In our use case, the caller has already checked (more cheaply) for a quick + // exit. +#if 0 { // Check for a quick exit. 
bool all_in = true; Real r = 0; - info = 0; for (Int i = 0; i < n; ++i) { - if (x[i] != y[i]) { - x[i] = y[i]; - info = 1; - } - if (is_outside(x[i], xlo, xhi, i)) { + if (x[i] < xlo[i] || x[i] > xhi[i]) { all_in = false; break; } @@ -174,37 +191,7 @@ Int solve_1eq_bc_qp (const Int n, const Real* w, const Real* a, const Real b, return info; } } - - if (n == 2) - return solve_1eq_bc_qp_2d(n, w, a, b, xlo, xhi, y, x); - - { // Eval r at end points to check for feasibility, and also possibly a quick - // exit on a common case. - Real r = -b; - for (Int i = 0; i < n; ++i) { - x[i] = xlo[i]; - r += a[i]*x[i]; - } - if (std::abs(r) <= r_tol) return 1; - if (r > 0) return -1; - r = -b; - for (Int i = 0; i < n; ++i) { - x[i] = xhi[i]; - r += a[i]*x[i]; - } - if (std::abs(r) <= r_tol) return 1; - if (r < 0) return -1; - } - - { // Check for a quick exit: the bounds are so tight that the midpoint of the - // box satisfies r_tol. - Real r = -b; - for (Int i = 0; i < n; ++i) { - x[i] = 0.5*(xlo[i] + xhi[i]); - r += a[i]*x[i]; - } - if (std::abs(r) <= r_tol) return 1; - } +#endif const Real wall_dist = 1e-3; @@ -218,8 +205,8 @@ Int solve_1eq_bc_qp (const Int n, const Real* w, const Real* a, const Real b, lamlo = lamlo_i; lamhi = lamhi_i; } else { - lamlo = impl::min(lamlo, lamlo_i); - lamhi = impl::max(lamhi, lamhi_i); + lamlo = cedr::impl::min(lamlo, lamlo_i); + lamhi = cedr::impl::max(lamhi, lamhi_i); } } const Real lamlo_feas = lamlo, lamhi_feas = lamhi; @@ -232,7 +219,7 @@ Int solve_1eq_bc_qp (const Int n, const Real* w, const Real* a, const Real b, for (Int iteration = 0; iteration < max_its; ++iteration) { // Compute x, r, r_lambda. Real r, r_lambda; - calc_r(n, w, a, b, xlo, xhi, y, lambda, x, r, r_lambda); + impl::calc_r(n, w, a, b, xlo, xhi, y, lambda, x, r, r_lambda); // Is r(lambda) - b sufficiently == 0? 
if (std::abs(r) <= r_tol) { info = 1; diff --git a/cedr/cedr_qlt.cpp b/cedr/cedr_qlt.cpp index a917e5e..b677692 100644 --- a/cedr/cedr_qlt.cpp +++ b/cedr/cedr_qlt.cpp @@ -708,7 +708,6 @@ void QLT::run () { Timer::stop(Timer::waitall); // Combine kids' data. //todo Kernelize, interacting with waitany todo above. - Timer::start(Timer::snp); for (const auto& n : lvl.nodes) { if ( ! n->nkids) continue; cedr_kernel_assert(n->nkids == 2); @@ -734,7 +733,6 @@ void QLT::run () { } } } - Timer::stop(Timer::snp); // Send to parents. for (size_t i = 0; i < lvl.me.size(); ++i) { const auto& mmd = lvl.me[i]; @@ -997,9 +995,9 @@ class TestQLT { q_max = std::min(1, q_min + (0.9 - q_min)*urand()), q = q_min + (q_max - q_min)*urand(); // Check correctness up to FP. - assert(q_min >= 0 && - q_max <= 1 + 10*std::numeric_limits::epsilon() && - q_min <= q && q <= q_max); + cedr_assert(q_min >= 0 && + q_max <= 1 + 10*std::numeric_limits::epsilon() && + q_min <= q && q <= q_max); Qm_min[i] = q_min*rhom[i]; Qm_max[i] = q_max*rhom[i]; // Protect against FP error. @@ -1214,7 +1212,7 @@ class TestQLT { } static Int check (const Parallel& p, const std::vector& ts, const Values& v) { - static const bool details = false; + static const bool details = true; static const Real ulp2 = 2*std::numeric_limits::epsilon(); Int nerr = 0; std::vector lcl_mass(2*ts.size()), q_min_lcl(ts.size()), q_max_lcl(ts.size()); @@ -1231,7 +1229,10 @@ class TestQLT { q_min_lcl[ti] = 1; q_max_lcl[ti] = 0; for (Int i = 0; i < n; ++i) { - const bool lv = Qm[i] < Qm_min[i] || Qm[i] > Qm_max[i]; + // I believe this should hold exactly, but at least once I saw a single + // bit difference. Relax to 2 ulp and think more. + const bool lv = (Qm[i] < Qm_min[i]*(1 - ulp2) || + Qm[i] > Qm_max[i]*(1 + ulp2)); if (lv) local_violated[ti] = 1; if ( ! 
safe_only && lv) { if (details) @@ -1577,8 +1578,6 @@ Int run_unit_and_randomized_tests (const Parallel::Ptr& p, const Input& in) { nerr += ne; if (p->amroot()) std::cout << "\n"; } - if (nerr) - return nerr; // Performance test. if (in.perftest && in.ncells > 0) { oned::Mesh m(in.ncells, p, diff --git a/cedr/cedr_qlt_inl.hpp b/cedr/cedr_qlt_inl.hpp index 7e8baf5..645352a 100644 --- a/cedr/cedr_qlt_inl.hpp +++ b/cedr/cedr_qlt_inl.hpp @@ -105,8 +105,11 @@ void solve_node_problem (const Real& rhom, const Real* pd, const Real& Qm, rhom_kids, Qm - (lo ? Qm_min : Qm_max)); } else { - // Quick exit if everything is OK as is. - if (Qm == pd[1] && // Was our total tracer mass wasn't adjusted? + // Quick exit if everything is OK as is. This is a speedup, and it also + // lets the subnode solver make ~1 ulp changes instead of having to keep x + // = y if y satisfies the conditions. Without this block, the + // no_change_should_hold tests can fail. + if (Qm == pd[1] && // Was our total tracer mass adjusted? // Are the kids' problems feasible? 
Qm_orig_kids[0] >= Qm_min_kids[0] && Qm_orig_kids[0] <= Qm_max_kids[0] && Qm_orig_kids[1] >= Qm_min_kids[1] && Qm_orig_kids[1] <= Qm_max_kids[1]) { @@ -122,8 +125,8 @@ void solve_node_problem (const Real& rhom, const Real* pd, const Real& Qm, static const Real ones[] = {1, 1}; const Real w[] = {1/rhom0, 1/rhom1}; Real Qm_kids[2] = {k0d[1], k1d[1]}; - local::solve_1eq_bc_qp(2, w, ones, Qm, Qm_min_kids, Qm_max_kids, - Qm_orig_kids, Qm_kids); + local::solve_1eq_bc_qp_2d(w, ones, Qm, Qm_min_kids, Qm_max_kids, + Qm_orig_kids, Qm_kids); Qm0 = Qm_kids[0]; Qm1 = Qm_kids[1]; } diff --git a/cedr/cedr_test.cpp b/cedr/cedr_test.cpp index 9e6aa41..1b9e217 100644 --- a/cedr/cedr_test.cpp +++ b/cedr/cedr_test.cpp @@ -81,6 +81,8 @@ int main (int argc, char** argv) { try { cedr::InputParser inp(argc, argv, p); if (p->amroot()) inp.print(std::cout); + if (inp.qin.unittest) + ret += cedr::local::unittest(); if (inp.qin.unittest || inp.qin.perftest) ret += cedr::qlt::test::run_unit_and_randomized_tests(p, inp.qin); if (inp.tin.ncells > 0) From e10c84560e545aa42ce1b0f595c1ca4e88c7a53b Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 1 Jan 2018 15:59:59 -0700 Subject: [PATCH 21/28] CEDR: Add local ClipAndAssuredSum and unit test. 
--- cedr/Makefile | 1 + cedr/cedr_local.cpp | 38 ++++++++++++++++++++-------- cedr/cedr_local.hpp | 6 +++++ cedr/cedr_local_inl.hpp | 44 +++++++++++++++++++++++++++++---- cedr/cedr_test.cpp | 13 +++++----- cedr/cedr_test_1d_transport.cpp | 2 ++ 6 files changed, 83 insertions(+), 21 deletions(-) diff --git a/cedr/Makefile b/cedr/Makefile index 64f28d6..f621243 100644 --- a/cedr/Makefile +++ b/cedr/Makefile @@ -23,3 +23,4 @@ clean: cedr_qlt.o: cedr_mpi.hpp cedr_mpi_inl.hpp cedr_local.hpp cedr_local_inl.hpp \ cedr_qlt.hpp cedr_qlt_inl.hpp cedr_kokkos.hpp cedr_util.hpp cedr_test.o: cedr_qlt.hpp cedr_util.hpp +cedr_local.o: cedr_local_inl.hpp diff --git a/cedr/cedr_local.cpp b/cedr/cedr_local.cpp index 1b5a017..d677f8c 100644 --- a/cedr/cedr_local.cpp +++ b/cedr/cedr_local.cpp @@ -3,7 +3,7 @@ namespace cedr { namespace local { - +namespace test { // Check the first-order optimality conditions. Return true if OK, false // otherwise. If quiet, don't print anything. bool check_1eq_bc_qp_foc ( @@ -65,7 +65,7 @@ bool check_1eq_bc_qp_foc ( } normy = std::sqrt(normy); normg = std::sqrt(normg); - const Real gtol = 1e2*std::numeric_limits::epsilon()*normy; + const Real gtol = 1e4*std::numeric_limits::epsilon()*normy; if (normg > gtol) { if (verbose) os << "norm(g) = " << normg << " gtol = " << gtol << "\n"; @@ -92,6 +92,7 @@ bool check_1eq_bc_qp_foc ( os << "label: " << label << "\n"; return ok; } +} // namespace test Int unittest () { bool verbose = true; @@ -103,6 +104,10 @@ Int unittest () { auto run = [&] () { const Int info = solve_1eq_bc_qp(n, w, a, b, xlo, xhi, y, x); + const bool ok = test::check_1eq_bc_qp_foc( + "unittest", n, w, a, b, xlo, xhi, y, x, verbose); + if ( ! ok) ++nerr; + if (n == 2) { // This version never returns 0. 
Real x2[2]; @@ -112,25 +117,38 @@ Int unittest () { ++nerr; } const Real rd = cedr::util::reldif(x, x2, 2); - if (rd > 10*std::numeric_limits::epsilon()) { + if (rd > 1e2*std::numeric_limits::epsilon()) { if (verbose) - printf("%1.1e | %1.15e %1.15e | %1.15e %1.15e | %1.15e %1.15e\n", - rd, y[0], y[1], x[0], x[1], x2[0], x2[1]); + printf("%1.1e | y %1.15e %1.15e | x %1.15e %1.15e | " + "x2 %1.15e %1.15e | l %1.15e %1.15e | u %1.15e %1.15e\n", + rd, y[0], y[1], x[0], x[1], x2[0], x2[1], + xlo[0], xlo[1], xhi[0], xhi[1]); ++nerr; } } - const bool ok = check_1eq_bc_qp_foc("unittest", n, w, a, b, xlo, xhi, y, x, - verbose); - if ( ! ok) ++nerr; + + caas(n, a, b, xlo, xhi, y, x); + Real m = 0, den = 0; + for (Int i = 0; i < n; ++i) { + m += a[i]*x[i]; + den += std::abs(a[i]*x[i]); + if (x[i] < xlo[i]) ++nerr; + else if (x[i] > xhi[i]) ++nerr; + } + const Real rd = std::abs(b - m)/den; + if (rd > 1e3*std::numeric_limits::epsilon()) { + if (verbose) pr(puf(rd) pu(n) pu(b) pu(m)); + ++nerr; + } }; auto gena = [&] () { for (Int i = 0; i < n; ++i) - a[i] = 1;//0.1 + cedr::util::urand(); + a[i] = 0.1 + cedr::util::urand(); }; auto genw = [&] () { for (Int i = 0; i < n; ++i) - w[i] = 1;//0.1 + cedr::util::urand(); + w[i] = 0.1 + cedr::util::urand(); }; auto genbnds = [&] () { al = au = 0; diff --git a/cedr/cedr_local.hpp b/cedr/cedr_local.hpp index 7378ed8..e6a6ba8 100644 --- a/cedr/cedr_local.hpp +++ b/cedr/cedr_local.hpp @@ -25,6 +25,12 @@ Int solve_1eq_bc_qp_2d(const Real* w, const Real* a, const Real b, const Real* xlo, const Real* xhi, const Real* y, Real* x); +// ClipAndAssuredSum. Does not check for feasibility. 
+KOKKOS_INLINE_FUNCTION +void caas(const Int n, const Real* a, const Real b, + const Real* xlo, const Real* xhi, + const Real* y, Real* x); + Int unittest(); } diff --git a/cedr/cedr_local_inl.hpp b/cedr/cedr_local_inl.hpp index 2ad940f..5c3c867 100644 --- a/cedr/cedr_local_inl.hpp +++ b/cedr/cedr_local_inl.hpp @@ -72,17 +72,17 @@ Int solve_1eq_bc_qp_2d (const Real* w, const Real* a, const Real b, if (info != 0) return info; { // Check if the optimal point ignoring bound constraints is in bounds. - Real qsum = 0, dm = b; + Real qmass = 0, dm = b; for (int i = 0; i < 2; ++i) { - qsum += a[i]/w[i]; + const Real qi = a[i]/w[i]; + qmass += a[i]*qi; dm -= a[i]*y[i]; } - const Real fac = dm/qsum; + const Real lambda = dm/qmass; bool ok = true; for (int i = 0; i < 2; ++i) { - x[i] = y[i] + fac*(a[i]/w[i]); + x[i] = y[i] + lambda*(a[i]/w[i]); if (x[i] < xlo[i] || x[i] > xhi[i]) { - // Could be out due to numerics. ok = false; break; } @@ -263,6 +263,40 @@ Int solve_1eq_bc_qp (const Int n, const Real* w, const Real* a, const Real b, return info; } +KOKKOS_INLINE_FUNCTION +void caas (const Int n, const Real* a, const Real b, + const Real* xlo, const Real* xhi, + const Real* y, Real* x) { + Real dm = b; + for (Int i = 0; i < n; ++i) { + x[i] = cedr::impl::max(xlo[i], cedr::impl::min(xhi[i], y[i])); + dm -= a[i]*x[i]; + } + if (dm == 0) return; + if (dm > 0) { + Real fac = 0; + for (Int i = 0; i < n; ++i) + fac += a[i]*(xhi[i] - x[i]); + if (fac > 0) { + fac = dm/fac; + for (Int i = 0; i < n; ++i) + x[i] += fac*(xhi[i] - x[i]); + } + } else if (dm < 0) { + Real fac = 0; + for (Int i = 0; i < n; ++i) + fac += a[i]*(x[i] - xlo[i]); + if (fac > 0) { + fac = dm/fac; + for (Int i = 0; i < n; ++i) + x[i] += fac*(x[i] - xlo[i]); + } + } + // Clip again for numerics. 
+ for (Int i = 0; i < n; ++i) + x[i] = cedr::impl::max(xlo[i], cedr::impl::min(xhi[i], x[i])); +} + } // namespace local } // namespace cedr diff --git a/cedr/cedr_test.cpp b/cedr/cedr_test.cpp index 1b9e217..91faecd 100644 --- a/cedr/cedr_test.cpp +++ b/cedr/cedr_test.cpp @@ -73,7 +73,7 @@ struct InputParser { } // namespace cedr int main (int argc, char** argv) { - int ret = 0; + int nerr = 0; MPI_Init(&argc, &argv); auto p = cedr::mpi::make_parallel(MPI_COMM_WORLD); srand(p->rank()); @@ -82,17 +82,18 @@ int main (int argc, char** argv) { cedr::InputParser inp(argc, argv, p); if (p->amroot()) inp.print(std::cout); if (inp.qin.unittest) - ret += cedr::local::unittest(); + nerr += cedr::local::unittest(); if (inp.qin.unittest || inp.qin.perftest) - ret += cedr::qlt::test::run_unit_and_randomized_tests(p, inp.qin); + nerr += cedr::qlt::test::run_unit_and_randomized_tests(p, inp.qin); if (inp.tin.ncells > 0) - ret += cedr::test::transport1d::run(p, inp.tin); - if (p->amroot()) std::cout << (ret != 0 ? "FAIL" : "PASS") << "\n"; + nerr += cedr::test::transport1d::run(p, inp.tin); + if (p->amroot()) std::cout << (nerr != 0 ? "FAIL" : "PASS") << "\n"; } catch (const std::exception& e) { if (p->amroot()) std::cerr << e.what(); } Kokkos::finalize_all(); + if (nerr) prc(nerr); MPI_Finalize(); - return ret; + return 0; } diff --git a/cedr/cedr_test_1d_transport.cpp b/cedr/cedr_test_1d_transport.cpp index e1dc92f..119d5ce 100644 --- a/cedr/cedr_test_1d_transport.cpp +++ b/cedr/cedr_test_1d_transport.cpp @@ -38,7 +38,9 @@ void cubic_interp_periodic ( Int* const dod) { const int nc = nx - 1; +#ifdef _OPENMP # pragma omp parallel for +#endif for (Int j = 0; j < nxi; ++j) { const Real xi_per = to_periodic_core(x[0], x[nc], xi[j]); Int ip1 = std::upper_bound(x, x + nx, xi_per) - x; From ec408c4c2f231cbd714fdd9a3131ab0b0bab6b9e Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 1 Jan 2018 19:58:43 -0700 Subject: [PATCH 22/28] CEDR: Improve testing. 
Use reduction for final PASS/FAIL message. Resolve the roundoff-error issue that was puzzling me. There's a quantity I expect to be exact, but it was off by a few ULPs sometimes. Found and fixed that. --- cedr/cedr_local.cpp | 2 +- cedr/cedr_qlt.cpp | 13 ++++++------- cedr/cedr_qlt_inl.hpp | 13 +++++++++---- cedr/cedr_test.cpp | 7 ++++++- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/cedr/cedr_local.cpp b/cedr/cedr_local.cpp index d677f8c..15167e5 100644 --- a/cedr/cedr_local.cpp +++ b/cedr/cedr_local.cpp @@ -117,7 +117,7 @@ Int unittest () { ++nerr; } const Real rd = cedr::util::reldif(x, x2, 2); - if (rd > 1e2*std::numeric_limits::epsilon()) { + if (rd > 1e4*std::numeric_limits::epsilon()) { if (verbose) printf("%1.1e | y %1.15e %1.15e | x %1.15e %1.15e | " "x2 %1.15e %1.15e | l %1.15e %1.15e | u %1.15e %1.15e\n", diff --git a/cedr/cedr_qlt.cpp b/cedr/cedr_qlt.cpp index b677692..7ad1f2d 100644 --- a/cedr/cedr_qlt.cpp +++ b/cedr/cedr_qlt.cpp @@ -1213,7 +1213,7 @@ class TestQLT { static Int check (const Parallel& p, const std::vector& ts, const Values& v) { static const bool details = true; - static const Real ulp2 = 2*std::numeric_limits::epsilon(); + static const Real ulp3 = 3*std::numeric_limits::epsilon(); Int nerr = 0; std::vector lcl_mass(2*ts.size()), q_min_lcl(ts.size()), q_max_lcl(ts.size()); std::vector t_ok(ts.size(), 1), local_violated(ts.size(), 0); @@ -1229,12 +1229,11 @@ class TestQLT { q_min_lcl[ti] = 1; q_max_lcl[ti] = 0; for (Int i = 0; i < n; ++i) { - // I believe this should hold exactly, but at least once I saw a single - // bit difference. Relax to 2 ulp and think more. - const bool lv = (Qm[i] < Qm_min[i]*(1 - ulp2) || - Qm[i] > Qm_max[i]*(1 + ulp2)); + const bool lv = (Qm[i] < Qm_min[i] || Qm[i] > Qm_max[i]); if (lv) local_violated[ti] = 1; if ( ! safe_only && lv) { + // If this fails at ~ machine eps, check r2l_nl_adjust_bounds code in + // solve_node_problem. 
if (details) pr("check q " << t.str() << ": " << Qm[i] << " " << (Qm[i] < Qm_min[i] ? Qm[i] - Qm_min[i] : Qm[i] - Qm_max[i])); @@ -1271,8 +1270,8 @@ class TestQLT { * Qm_max = v.Qm_max(t.idx); const Real q_min = q_min_gbl[ti], q_max = q_max_gbl[ti]; for (Int i = 0; i < n; ++i) { - if (Qm[i] < q_min*rhom[i]*(1 - ulp2) || - Qm[i] > q_max*rhom[i]*(1 + ulp2)) { + if (Qm[i] < q_min*rhom[i]*(1 - ulp3) || + Qm[i] > q_max*rhom[i]*(1 + ulp3)) { if (details) pr("check q " << t.str() << ": " << q_min*rhom[i] << " " << Qm_min[i] << " " << Qm[i] << " " << Qm_max[i] << " " << q_max*rhom[i] << " | " << diff --git a/cedr/cedr_qlt_inl.hpp b/cedr/cedr_qlt_inl.hpp index 645352a..0c1ff4a 100644 --- a/cedr/cedr_qlt_inl.hpp +++ b/cedr/cedr_qlt_inl.hpp @@ -100,10 +100,15 @@ void solve_node_problem (const Real& rhom, const Real* pd, const Real& Qm, const Real Qm_min = pd[0], Qm_max = pd[2]; const bool lo = Qm < Qm_min, hi = Qm > Qm_max; if (lo || hi) { - const Real rhom_kids[] = {rhom0, rhom1}; - r2l_nl_adjust_bounds(lo ? Qm_min_kids : Qm_max_kids, - rhom_kids, - Qm - (lo ? Qm_min : Qm_max)); + // If the discrepancy is numerical noise, don't act on it. + const Real tol = 10*std::numeric_limits::epsilon(); + const Real discrepancy = lo ? Qm_min - Qm : Qm - Qm_max; + if (discrepancy > tol*Qm_max) { + const Real rhom_kids[] = {rhom0, rhom1}; + r2l_nl_adjust_bounds(lo ? Qm_min_kids : Qm_max_kids, + rhom_kids, + Qm - (lo ? Qm_min : Qm_max)); + } } else { // Quick exit if everything is OK as is. This is a speedup, and it also // lets the subnode solver make ~1 ulp changes instead of having to keep x diff --git a/cedr/cedr_test.cpp b/cedr/cedr_test.cpp index 91faecd..94010c5 100644 --- a/cedr/cedr_test.cpp +++ b/cedr/cedr_test.cpp @@ -87,7 +87,12 @@ int main (int argc, char** argv) { nerr += cedr::qlt::test::run_unit_and_randomized_tests(p, inp.qin); if (inp.tin.ncells > 0) nerr += cedr::test::transport1d::run(p, inp.tin); - if (p->amroot()) std::cout << (nerr != 0 ? 
"FAIL" : "PASS") << "\n"; + { + int gnerr; + cedr::mpi::reduce(*p, &nerr, &gnerr, 1, MPI_SUM, p->root()); + if (p->amroot()) + std::cout << (gnerr != 0 ? "FAIL" : "PASS") << "\n"; + } } catch (const std::exception& e) { if (p->amroot()) std::cerr << e.what(); From 87414aa76c8ed61e93620d5299ca52538846c462 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Sun, 14 Jan 2018 14:55:12 -0700 Subject: [PATCH 23/28] Start implementation of global ClipAndAssuredSum. Refactor randomized tester so I can use it for multiple CDRs. Add an abstract CDR interface. Start CAAS impl, but with immediate WIP throw message on construction. --- cedr/Makefile | 11 +- cedr/cedr.hpp | 20 +- cedr/cedr_caas.cpp | 83 ++++++ cedr/cedr_caas.hpp | 62 ++++ cedr/cedr_caas_inl.hpp | 31 ++ cedr/cedr_cdr.hpp | 57 ++++ cedr/cedr_mpi.hpp | 66 ++--- cedr/cedr_mpi_inl.hpp | 55 ++++ cedr/cedr_qlt.cpp | 491 ++------------------------------ cedr/cedr_qlt.hpp | 57 ++-- cedr/cedr_test.hpp | 22 ++ cedr/cedr_test_1d_transport.cpp | 4 +- cedr/cedr_test_randomized.cpp | 413 +++++++++++++++++++++++++++ cedr/cedr_test_randomized.hpp | 119 ++++++++ cedr/make_qltcpp.sh | 2 +- 15 files changed, 942 insertions(+), 551 deletions(-) create mode 100644 cedr/cedr_caas.cpp create mode 100644 cedr/cedr_caas.hpp create mode 100644 cedr/cedr_caas_inl.hpp create mode 100644 cedr/cedr_cdr.hpp create mode 100644 cedr/cedr_test.hpp create mode 100644 cedr/cedr_test_randomized.cpp create mode 100644 cedr/cedr_test_randomized.hpp diff --git a/cedr/Makefile b/cedr/Makefile index f621243..01bca9f 100644 --- a/cedr/Makefile +++ b/cedr/Makefile @@ -4,8 +4,8 @@ CXXFLAGS=$(opt) -Wall -pedantic -fopenmp -std=c++11 -I$(KOKKOS)/include -DQLT_TI LDFLAGS=-fopenmp -L$(KOKKOS)/lib -lkokkos -ldl LINK_LAPACK_BLAS=-llapack -lblas -SOURCES=cedr_qlt.cpp cedr_test.cpp cedr_mpi.cpp cedr_local.cpp cedr_util.cpp \ - cedr_test_1d_transport.cpp +SOURCES=cedr_mpi.cpp cedr_util.cpp cedr_qlt.cpp cedr_caas.cpp cedr_local.cpp \ + cedr_test.cpp 
cedr_test_randomized.cpp cedr_test_1d_transport.cpp OBJECTS=$(SOURCES:.cpp=.o) @@ -21,6 +21,11 @@ clean: rm -f *.o testcedr cedr_qlt.o: cedr_mpi.hpp cedr_mpi_inl.hpp cedr_local.hpp cedr_local_inl.hpp \ - cedr_qlt.hpp cedr_qlt_inl.hpp cedr_kokkos.hpp cedr_util.hpp + cedr_qlt.hpp cedr_qlt_inl.hpp cedr_kokkos.hpp cedr_util.hpp \ + cedr_test_randomized.hpp +cedr_caas.o: cedr_mpi.hpp cedr_mpi_inl.hpp cedr_local.hpp cedr_local_inl.hpp \ + cedr_caas.hpp cedr_caas_inl.hpp cedr_kokkos.hpp cedr_util.hpp \ + cedr_test_randomized.hpp cedr_test.o: cedr_qlt.hpp cedr_util.hpp +cedr_test_1d_transport.o: cedr_qlt.hpp cedr_util.hpp cedr_local.o: cedr_local_inl.hpp diff --git a/cedr/cedr.hpp b/cedr/cedr.hpp index 5de4a3b..8dced2e 100644 --- a/cedr/cedr.hpp +++ b/cedr/cedr.hpp @@ -1,12 +1,30 @@ #ifndef INCLUDE_CEDR_HPP #define INCLUDE_CEDR_HPP -#include // Need some source for std::size_t. +#include "cedr_kokkos.hpp" +// Communication-Efficient Constrained Density Reconstructors namespace cedr { typedef int Int; typedef std::size_t Size; typedef double Real; + +// CDRs in general implement +// * tracer mass, Qm, conservation; +// * mixing ratio, q, shape preservation, either local bound preservation or +// dynamic range preservation; and +// * tracer consistency, which follows from dynamic range preservation or +// stronger (including local bound preservation) with rhom coming from the +// dynamics. +// +// One can solve a subset of these. +// If !conserve, then the CDR does not alter the tracer mass, but it does not +// correct for any failure in mass conservation in the field given to it. +// If consistent but !shapepreserve, then the CDR solves the dynamic range +// preservation problem rather than the local bound preservation problem.
+struct ProblemType { + enum : Int { conserve = 1, shapepreserve = 1 << 1, consistent = 1 << 2 }; +}; } #endif diff --git a/cedr/cedr_caas.cpp b/cedr/cedr_caas.cpp new file mode 100644 index 0000000..0c7ca1d --- /dev/null +++ b/cedr/cedr_caas.cpp @@ -0,0 +1,83 @@ +#include "cedr_caas.hpp" +#include "cedr_util.hpp" + +namespace cedr { +namespace caas { + +struct OpData { int nsum, nmin, nmax; }; +static OpData g_op_data; +static void all_reduce_op (Real* in, Real* inout, int* len, + MPI_Datatype* /*datatype*/) { + const int n = g_op_data.nsum + g_op_data.nmin + g_op_data.nmax; + for (int i = 0; i < *len; ++i) { + int k = 0; + for ( ; k < g_op_data.nsum; ++k) + inout[k] += in[k]; + for ( ; k < g_op_data.nmin; ++k) + inout[k] = std::min(inout[k], in[k]); + for ( ; k < g_op_data.nmax; ++k) + inout[k] = std::max(inout[k], in[k]); + in += n; + inout += n; + } +} + +template +CAAS::CAAS (const mpi::Parallel::Ptr& p, const Int nlclcells) + : p_(p), nlclcells_(nlclcells), ntracers_(0), op_(all_reduce_op, true) +{ + cedr_throw_if(true, "WIP: Can't call yet."); +} + +template +void CAAS::declare_tracer (int problem_type) { + cedr_throw_if( ! 
(problem_type & ProblemType::shapepreserve) || + (problem_type & ProblemType::conserve), + "CAAS is a WIP; only shapepreserve (=> consistent) is " + "supported right now."); + ++ntracers_; +} + +template +void CAAS::end_tracer_declarations () { + d_ = RealList("CAAS data", nlclcells_ * (3*ntracers_ + 1)); +} + +template +int CAAS::get_problem_type (const Int& tracer_idx) const { + return ProblemType::shapepreserve | ProblemType::consistent; +} + +template +Int CAAS::get_num_tracers () const { + return ntracers_; +} + +template +void CAAS::reduce_locally () { +} + +template +void CAAS::reduce_globally () { + MPI_Type_contiguous(1 + 3*ntracers_, MPI_DOUBLE, &datatype_); + MPI_Type_commit(&datatype_); + g_op_data.nsum = 1 + ntracers_; + g_op_data.nmin = ntracers_; + g_op_data.nmax = ntracers_; + int err = MPI_Allreduce(send_.data(), recv_.data(), nlclcells_, datatype_, + op_.get(), p_->comm()); +} + +template +void CAAS::caas () { +} + +template +void CAAS::run () { + reduce_locally(); + reduce_globally(); + caas(); +} + +} // namespace caas +} // namespace cedr diff --git a/cedr/cedr_caas.hpp b/cedr/cedr_caas.hpp new file mode 100644 index 0000000..09f53ff --- /dev/null +++ b/cedr/cedr_caas.hpp @@ -0,0 +1,62 @@ +#ifndef INCLUDE_CEDR_CAAS_HPP +#define INCLUDE_CEDR_CAAS_HPP + +#include "cedr_cdr.hpp" + +namespace cedr { +// ClipAndAssuredSum. +namespace caas { + +template +class CAAS : public CDR { +public: + typedef typename cedr::impl::DeviceType::type Device; + typedef CAAS Me; + typedef std::shared_ptr Ptr; + +public: + CAAS(const mpi::Parallel::Ptr& p, const Int nlclcells); + + void declare_tracer(int problem_type) override; + + void end_tracer_declarations() override; + + int get_problem_type(const Int& tracer_idx) const override; + + Int get_num_tracers() const override; + + // lclcellidx is trivial, the user's index for the cell. 
+ KOKKOS_INLINE_FUNCTION + void set_rhom(const Int& lclcellidx, const Real& rhom) override; + + KOKKOS_INLINE_FUNCTION + void set_Qm(const Int& lclcellidx, const Int& tracer_idx, + const Real& Qm, const Real& Qm_min, const Real& Qm_max, + const Real Qm_prev = -1) override; + + void run() override; + + KOKKOS_INLINE_FUNCTION + Real get_Qm(const Int& lclcellidx, const Int& tracer_idx) override; + +private: + typedef Kokkos::View RealList; + typedef cedr::impl::Unmanaged UnmanagedRealList; + + mpi::Parallel::Ptr p_; + Int nlclcells_, ntracers_; + MPI_Datatype datatype_; + mpi::Op op_; + RealList d_, send_, recv_; + + void reduce_locally(); + void reduce_globally(); + void caas(); +}; + +} // namespace caas +} // namespace cedr + +#include "cedr_caas_inl.hpp" + +#endif diff --git a/cedr/cedr_caas_inl.hpp b/cedr/cedr_caas_inl.hpp new file mode 100644 index 0000000..e4c4715 --- /dev/null +++ b/cedr/cedr_caas_inl.hpp @@ -0,0 +1,31 @@ +#ifndef INCLUDE_CEDR_CAAS_INL_HPP +#define INCLUDE_CEDR_CAAS_INL_HPP + +namespace cedr { +// ClipAndAssuredSum. 
+namespace caas { + +template KOKKOS_INLINE_FUNCTION +void CAAS::set_rhom (const Int& lclcellidx, const Real& rhom) { + d_(lclcellidx) = rhom; +} + +template KOKKOS_INLINE_FUNCTION +void CAAS +::set_Qm (const Int& lclcellidx, const Int& tracer_idx, + const Real& Qm, const Real& Qm_min, const Real& Qm_max, + const Real Qm_prev) { + d_((1 + tracer_idx)*nlclcells_ + lclcellidx) = Qm; + d_((1 + ntracers_ + tracer_idx)*nlclcells_ + lclcellidx) = Qm_min; + d_((1 + 2*ntracers_ + tracer_idx)*nlclcells_ + lclcellidx) = Qm_max; +} + +template KOKKOS_INLINE_FUNCTION +Real CAAS::get_Qm (const Int& lclcellidx, const Int& tracer_idx) { + return d_((1 + tracer_idx)*nlclcells_ + lclcellidx); +} + +} // namespace caas +} // namespace cedr + +#endif diff --git a/cedr/cedr_cdr.hpp b/cedr/cedr_cdr.hpp new file mode 100644 index 0000000..e0f4300 --- /dev/null +++ b/cedr/cedr_cdr.hpp @@ -0,0 +1,57 @@ +#ifndef INCLUDE_CEDR_CDR_HPP +#define INCLUDE_CEDR_CDR_HPP + +#include "cedr_mpi.hpp" + +namespace cedr { +// Constrained Density Reconstructor interface. +struct CDR { + // Set up QLT tracer metadata. Once end_tracer_declarations is called, it is + // an error to call declare_tracer again. Call declare_tracer in order of the + // tracer index in the caller's numbering. It is an error to call this + // function from a parallel region. + virtual void declare_tracer(int problem_type) = 0; + + // It is an error to call this function from a parallel region. + virtual void end_tracer_declarations() = 0; + + virtual int get_problem_type(const Int& tracer_idx) const = 0; + + virtual Int get_num_tracers() const = 0; + + // set_{rhom,Qm}: Set cell values prior to running the QLT algorithm. + // set_rhom must be called before set_Qm. + // Notation: + // rho: Total density. + // Q: Tracer density. + // q: Tracer mixing ratio = Q/rho. + // *m: Mass corresponding to the density; results from an integral over a + // region, such as a cell. 
+ // Some CDRs have a nontrivial local <-> global cell index map. For these + // CDRs, lclcellidx may be nontrivial. For others, the caller should provide + // the index into the local cell. + virtual void set_rhom( + const Int& lclcellidx, + // Current total mass in this cell. + const Real& rhom) = 0; + + virtual void set_Qm( + const Int& lclcellidx, const Int& tracer_idx, + // Current tracer mass in this cell. + const Real& Qm, + // Minimum and maximum permitted tracer mass in this cell. + const Real& Qm_min, const Real& Qm_max, + // If mass conservation is requested, provide the previous Qm, which will be + // summed to give the desired global mass. + const Real Qm_prev = -1) = 0; + + // Run the QLT algorithm with the values set by set_{rho,Q}. It is an error to + // call this function from a parallel region. + virtual void run() = 0; + + // Get a cell's tracer mass Qm after the QLT algorithm has run. + virtual Real get_Qm(const Int& lclcellidx, const Int& tracer_idx) = 0; +}; +} // namespace cedr + +#endif diff --git a/cedr/cedr_mpi.hpp b/cedr/cedr_mpi.hpp index 25fd466..c2b8750 100644 --- a/cedr/cedr_mpi.hpp +++ b/cedr/cedr_mpi.hpp @@ -27,62 +27,50 @@ Parallel::Ptr make_parallel(MPI_Comm comm); template MPI_Datatype get_type(); template -int reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op, - int root) { - MPI_Datatype dt = get_type(); - return MPI_Reduce(const_cast(sendbuf), rcvbuf, count, dt, op, root, p.comm()); -} +int reduce(const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op, + int root); template -int all_reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op) { - MPI_Datatype dt = get_type(); - return MPI_Allreduce(const_cast(sendbuf), rcvbuf, count, dt, op, p.comm()); -} +int all_reduce(const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op); template -int isend (const Parallel& p, const T* buf, int count, int dest, int tag, - MPI_Request* ireq) { - MPI_Datatype dt = 
get_type(); - MPI_Request ureq; - MPI_Request* req = ireq ? ireq : &ureq; - int ret = MPI_Isend(const_cast(buf), count, dt, dest, tag, p.comm(), req); - if ( ! ireq) MPI_Request_free(req); - return ret; -} +int isend(const Parallel& p, const T* buf, int count, int dest, int tag, + MPI_Request* ireq); template -int irecv (const Parallel& p, T* buf, int count, int src, int tag, MPI_Request* ireq) { - MPI_Datatype dt = get_type(); - MPI_Request ureq; - MPI_Request* req = ireq ? ireq : &ureq; - int ret = MPI_Irecv(buf, count, dt, src, tag, p.comm(), req); - if ( ! ireq) MPI_Request_free(req); - return ret; -} +int irecv(const Parallel& p, T* buf, int count, int src, int tag, MPI_Request* ireq); int waitany(int count, MPI_Request* reqs, int* index, MPI_Status* stats = nullptr); int waitall(int count, MPI_Request* reqs, MPI_Status* stats = nullptr); template -int gather (const Parallel& p, const T* sendbuf, int sendcount, - T* recvbuf, int recvcount, int root) { - MPI_Datatype dt = get_type(); - return MPI_Gather(sendbuf, sendcount, dt, recvbuf, recvcount, dt, root, p.comm()); -} +int gather(const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, int recvcount, int root); template -int gatherv (const Parallel& p, const T* sendbuf, int sendcount, - T* recvbuf, const int* recvcounts, const int* displs, int root) { - MPI_Datatype dt = get_type(); - return MPI_Gatherv(sendbuf, sendcount, dt, recvbuf, recvcounts, displs, dt, root, - p.comm()); -} +int gatherv(const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, const int* recvcounts, const int* displs, int root); bool all_ok(const Parallel& p, bool im_ok); -} -} +struct Op { + typedef std::shared_ptr Ptr; + + Op (MPI_User_function* function, bool commute) { + MPI_Op_create(function, static_cast(commute), op_); + } + + ~Op () { MPI_Op_free(op_); } + + MPI_Op* get () const { return op_; } + +private: + MPI_Op* op_; +}; + +} // namespace mpi +} // namespace cedr #include "cedr_mpi_inl.hpp" diff --git 
a/cedr/cedr_mpi_inl.hpp b/cedr/cedr_mpi_inl.hpp index d63fbf2..e4f28db 100644 --- a/cedr/cedr_mpi_inl.hpp +++ b/cedr/cedr_mpi_inl.hpp @@ -1,4 +1,59 @@ #ifndef INCLUDE_CEDR_MPI_INL_HPP #define INCLUDE_CEDR_MPI_INL_HPP +namespace cedr { +namespace mpi { + +template +int reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op, + int root) { + MPI_Datatype dt = get_type(); + return MPI_Reduce(const_cast(sendbuf), rcvbuf, count, dt, op, root, p.comm()); +} + +template +int all_reduce (const Parallel& p, const T* sendbuf, T* rcvbuf, int count, MPI_Op op) { + MPI_Datatype dt = get_type(); + return MPI_Allreduce(const_cast(sendbuf), rcvbuf, count, dt, op, p.comm()); +} + +template +int isend (const Parallel& p, const T* buf, int count, int dest, int tag, + MPI_Request* ireq) { + MPI_Datatype dt = get_type(); + MPI_Request ureq; + MPI_Request* req = ireq ? ireq : &ureq; + int ret = MPI_Isend(const_cast(buf), count, dt, dest, tag, p.comm(), req); + if ( ! ireq) MPI_Request_free(req); + return ret; +} + +template +int irecv (const Parallel& p, T* buf, int count, int src, int tag, MPI_Request* ireq) { + MPI_Datatype dt = get_type(); + MPI_Request ureq; + MPI_Request* req = ireq ? ireq : &ureq; + int ret = MPI_Irecv(buf, count, dt, src, tag, p.comm(), req); + if ( ! 
ireq) MPI_Request_free(req); + return ret; +} + +template +int gather (const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, int recvcount, int root) { + MPI_Datatype dt = get_type(); + return MPI_Gather(sendbuf, sendcount, dt, recvbuf, recvcount, dt, root, p.comm()); +} + +template +int gatherv (const Parallel& p, const T* sendbuf, int sendcount, + T* recvbuf, const int* recvcounts, const int* displs, int root) { + MPI_Datatype dt = get_type(); + return MPI_Gatherv(sendbuf, sendcount, dt, recvbuf, recvcounts, displs, dt, root, + p.comm()); +} + +} // namespace mpi +} // namespace cedr + #endif diff --git a/cedr/cedr_qlt.cpp b/cedr/cedr_qlt.cpp index 7ad1f2d..68e5f5b 100644 --- a/cedr/cedr_qlt.cpp +++ b/cedr/cedr_qlt.cpp @@ -1,4 +1,5 @@ #include "cedr_qlt.hpp" +#include "cedr_test_randomized.hpp" #include @@ -15,8 +16,7 @@ namespace qlt { class Timer { public: - enum Op { tree, analyze, trcrinit, trcrgen, trcrcheck, - qltrun, qltrunl2r, qltrunr2l, snp, waitall, + enum Op { tree, analyze, qltrun, qltrunl2r, qltrunr2l, snp, waitall, total, NTIMERS }; static inline void init () { #ifdef QLT_TIME @@ -55,7 +55,6 @@ class Timer { #ifdef QLT_TIME const double tot = et_[total]; tpr(tree); tpr(analyze); - tpr(trcrinit); tpr(trcrgen); tpr(trcrcheck); tpr(qltrun); tpr(qltrunl2r); tpr(qltrunr2l); tpr(snp); tpr(waitall); printf("%-20s %10.3e %10.1f\n", "total", tot, 100.0); #endif @@ -851,85 +850,29 @@ constexpr Int QLT::MetaData::problem_type_[]; namespace test { using namespace impl; -class TestQLT { +class TestQLT : public cedr::test::TestRandomized { +public: typedef QLT QLTT; - typedef Kokkos::View R2D; - - struct Tracer { - typedef QLTT::ProblemType PT; - - Int idx; - Int problem_type; - Int perturbation_type; - bool no_change_should_hold, safe_should_hold, local_should_hold; - bool write; - - std::string str () const { - std::stringstream ss; - ss << "(ti " << idx; - if (problem_type & PT::conserve) ss << " c"; - if (problem_type & PT::shapepreserve) ss << " 
s"; - if (problem_type & PT::consistent) ss << " t"; - ss << " pt " << perturbation_type << " ssh " << safe_should_hold - << " lsh " << local_should_hold << ")"; - return ss.str(); - } - - Tracer () - : idx(-1), problem_type(-1), perturbation_type(-1), no_change_should_hold(false), - safe_should_hold(true), local_should_hold(true), write(false) - {} - }; - struct Values { - Values (const Int ntracers, const Int ncells) - : ncells_(ncells), v_((4*ntracers + 1)*ncells) - {} - Int ncells () const { return ncells_; } - Real* rhom () { return v_.data(); } - Real* Qm_min (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti ); } - Real* Qm (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 1); } - Real* Qm_max (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 2); } - Real* Qm_prev (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 3); } - const Real* rhom () const { return const_cast(this)->rhom(); } - const Real* Qm_min (const Int& ti) const - { return const_cast(this)->Qm_min (ti); } - const Real* Qm (const Int& ti) const - { return const_cast(this)->Qm (ti); } - const Real* Qm_max (const Int& ti) const - { return const_cast(this)->Qm_max (ti); } - const Real* Qm_prev (const Int& ti) const - { return const_cast(this)->Qm_prev(ti); } - private: - Int ncells_; - std::vector v_; - }; - - // For solution output, if requested. - struct Writer { - std::unique_ptr fh; - std::vector ngcis; // Number of i'th rank's gcis_ array. - std::vector displs; // Cumsum of above. - std::vector gcis; // Global cell indices packed by rank's gcis_ vector. - ~Writer () { - if ( ! 
fh) return; - fprintf(fh.get(), " return s\n"); - } - }; + TestQLT (const Parallel::Ptr& p, const tree::Node::Ptr& tree, + const Int& ncells, const bool verbose=false) + : TestRandomized(p, ncells, verbose), qlt_(p, ncells, tree), tree_(tree) + { + if (verbose) qlt_.print(std::cout); + init(); + } private: - const Parallel::Ptr p_; - const Int ncells_; QLTT qlt_; - // Caller index (local cell index in the app code) -> QLT lclcellidx. - std::vector gcis_, i2lci_; - std::vector tracers_; - // For optional output. - bool write_inited_; - std::shared_ptr w_; // Only on root. + tree::Node::Ptr tree_; + std::vector i2lci_; + + void init_numbering () override { + init_numbering(tree_); + } -private: void init_numbering (const tree::Node::Ptr& node) { + check(qlt_); // TestQLT doesn't actually care about a particular ordering, as there is no // geometry to the test problem. However, use *some* ordering to model what // a real problem must do. @@ -944,264 +887,6 @@ class TestQLT { init_numbering(node->kids[i]); } - void init_tracers () { - Timer::start(Timer::trcrinit); - typedef Tracer::PT PT; - static const Int pts[] = { - PT::conserve | PT::shapepreserve | PT::consistent, - PT::shapepreserve, // Test a noncanonical problem type. 
- PT::conserve | PT::consistent, - PT::consistent - }; - Int tracer_idx = 0; - for (Int perturb = 0; perturb < 6; ++perturb) - for (Int ti = 0; ti < 4; ++ti) { - Tracer t; - t.problem_type = pts[ti]; - const bool shapepreserve = t.problem_type & PT::shapepreserve; - t.idx = tracer_idx++; - t.perturbation_type = perturb; - t.safe_should_hold = true; - t.no_change_should_hold = perturb == 0; - t.local_should_hold = perturb < 4 && shapepreserve; - t.write = perturb == 2 && ti == 2; - tracers_.push_back(t); - qlt_.declare_tracer(t.problem_type); - } - qlt_.end_tracer_declarations(); - cedr_assert(qlt_.get_num_tracers() == static_cast(tracers_.size())); - for (size_t i = 0; i < tracers_.size(); ++i) - cedr_assert(qlt_.get_problem_type(i) == (tracers_[i].problem_type | - PT::consistent)); - Timer::stop(Timer::trcrinit); - } - - static Real urand () { return rand() / ((Real) RAND_MAX + 1.0); } - - static void generate_rho (Values& v) { - auto r = v.rhom(); - const Int n = v.ncells(); - for (Int i = 0; i < n; ++i) - r[i] = 0.5 + 1.5*urand(); - } - - static void generate_Q (const Tracer& t, Values& v) { - Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), - * Qm_max = v.Qm_max(t.idx), * Qm_prev = v.Qm_prev(t.idx); - const Int n = v.ncells(); - for (Int i = 0; i < n; ++i) { - const Real - q_min = 0.1 + 0.8*urand(), - q_max = std::min(1, q_min + (0.9 - q_min)*urand()), - q = q_min + (q_max - q_min)*urand(); - // Check correctness up to FP. - cedr_assert(q_min >= 0 && - q_max <= 1 + 10*std::numeric_limits::epsilon() && - q_min <= q && q <= q_max); - Qm_min[i] = q_min*rhom[i]; - Qm_max[i] = q_max*rhom[i]; - // Protect against FP error. - Qm[i] = std::max(Qm_min[i], std::min(Qm_max[i], q*rhom[i])); - // Set previous Qm to the current unperturbed value. 
- Qm_prev[i] = Qm[i]; - } - } - - static void gen_rand_perm (const size_t n, std::vector& p) { - p.resize(n); - for (size_t i = 0; i < n; ++i) - p[i] = i; - for (size_t i = 0; i < n; ++i) { - const int j = urand()*n, k = urand()*n; - std::swap(p[j], p[k]); - } - } - - // Permuting the Qm array, even just on a rank as long as there is > 1 cell, - // produces a problem likely requiring considerable reconstruction, which - // reconstruction assuredly satisfies the properties. But because this is a - // local operation only, it doesn't test the 1 cell/rank case. - static void permute_Q (const Tracer& t, Values& v) { - Real* const Qm = v.Qm(t.idx); - const Int N = v.ncells(); - std::vector p; - gen_rand_perm(N, p); - std::vector Qm_orig(N); - std::copy(Qm, Qm + N, Qm_orig.begin()); - for (Int i = 0; i < N; ++i) - Qm[i] = Qm_orig[p[i]]; - } - - void add_const_to_Q (const Tracer& t, Values& v, - // Move 0 < alpha <= 1 of the way to the QLT or safety - // feasibility bound. - const Real& alpha, - // Whether the modification should be done in a - // mass-conserving way. - const bool conserve_mass, - // Only safety problem is feasible. - const bool safety_problem) { - // Some of these reductions aren't used at present. Might add more test - // options later that use them. 
- Real rhom, Qm, Qm_max; { - Real Qm_sum_lcl[3] = {0}; - for (Int i = 0; i < v.ncells(); ++i) { - Qm_sum_lcl[0] += v.rhom()[i]; - Qm_sum_lcl[1] += v.Qm(t.idx)[i]; - Qm_sum_lcl[2] += v.Qm_max(t.idx)[i]; - } - Real Qm_sum_gbl[3] = {0}; - mpi::all_reduce(*p_, Qm_sum_lcl, Qm_sum_gbl, 3, MPI_SUM); - rhom = Qm_sum_gbl[0]; Qm = Qm_sum_gbl[1]; Qm_max = Qm_sum_gbl[2]; - } - Real Qm_max_safety = 0; - if (safety_problem) { - Real q_safety_lcl = v.Qm_max(t.idx)[0] / v.rhom()[0]; - for (Int i = 1; i < v.ncells(); ++i) - q_safety_lcl = std::max(q_safety_lcl, v.Qm_max(t.idx)[i] / v.rhom()[i]); - Real q_safety_gbl = 0; - mpi::all_reduce(*p_, &q_safety_lcl, &q_safety_gbl, 1, MPI_MAX); - Qm_max_safety = q_safety_gbl*rhom; - } - const Real dQm = safety_problem ? - ((Qm_max - Qm) + alpha * (Qm_max_safety - Qm_max)) / ncells_ : - alpha * (Qm_max - Qm) / ncells_; - for (Int i = 0; i < v.ncells(); ++i) - v.Qm(t.idx)[i] += dQm; - // Now permute Qm so that it's a little more interesting. - permute_Q(t, v); - // Adjust Qm_prev. Qm_prev is used to test the PT::conserve case, and also - // simply to record the correct total mass. The modification above modified - // Q's total mass. If conserve_mass, then Qm_prev needs to be made to sum to - // the same new mass. If ! conserve_mass, we want Qm_prev to be modified in - // an interesting way, so that PT::conserve doesn't trivially undo the mod - // that was made above when the root fixes the mass discrepancy. - const Real - relax = 0.9, - dQm_prev = (conserve_mass ? dQm : - (safety_problem ? - ((Qm_max - Qm) + relax*alpha * (Qm_max_safety - Qm_max)) / ncells_ : - relax*alpha * (Qm_max - Qm) / ncells_)); - for (Int i = 0; i < v.ncells(); ++i) - v.Qm_prev(t.idx)[i] += dQm_prev; - } - - void perturb_Q (const Tracer& t, Values& v) { - // QLT is naturally mass conserving. But if QLT isn't being asked to impose - // mass conservation, then the caller better have a conservative - // method. 
Here, we model that by saying that Qm_prev and Qm should sum to - // the same mass. - const bool cm = ! (t.problem_type & Tracer::PT::conserve); - // For the edge cases, we cannot be exactly on the edge and still expect the - // q-limit checks to pass to machine precision. Thus, back away from the - // edge by an amount that bounds the error in the global mass due to FP, - // assuming each cell's mass is O(1). - const Real edg = 1 - ncells_*std::numeric_limits::epsilon(); - switch (t.perturbation_type) { - case 0: - // Do nothing, to test that QLT doesn't make any changes if none is - // needed. - break; - case 1: permute_Q(t, v); break; - case 2: add_const_to_Q(t, v, 0.5, cm, false); break; - case 3: add_const_to_Q(t, v, edg, cm, false); break; - case 4: add_const_to_Q(t, v, 0.5, cm, true ); break; - case 5: add_const_to_Q(t, v, edg, cm, true ); break; - } - } - - static std::string get_tracer_name (const Tracer& t) { - std::stringstream ss; - ss << "t" << t.idx; - return ss.str(); - } - - void init_writer () { - if (p_->amroot()) { - w_ = std::make_shared(); - w_->fh = std::unique_ptr(fopen("out_QLT.py", "w")); - int n = gcis_.size(); - w_->ngcis.resize(p_->size()); - mpi::gather(*p_, &n, 1, w_->ngcis.data(), 1, p_->root()); - w_->displs.resize(p_->size() + 1); - w_->displs[0] = 0; - for (size_t i = 0; i < w_->ngcis.size(); ++i) - w_->displs[i+1] = w_->displs[i] + w_->ngcis[i]; - cedr_assert(w_->displs.back() == ncells_); - w_->gcis.resize(ncells_); - mpi::gatherv(*p_, gcis_.data(), gcis_.size(), w_->gcis.data(), w_->ngcis.data(), - w_->displs.data(), p_->root()); - } else { - int n = gcis_.size(); - mpi::gather(*p_, &n, 1, static_cast(nullptr), 0, p_->root()); - Int* Inull = nullptr; - const int* inull = nullptr; - mpi::gatherv(*p_, gcis_.data(), gcis_.size(), Inull, inull, inull, p_->root()); - } - write_inited_ = true; - } - - void gather_field (const Real* Qm_lcl, std::vector& Qm_gbl, - std::vector& wrk) { - if (p_->amroot()) { - Qm_gbl.resize(ncells_); - 
wrk.resize(ncells_); - mpi::gatherv(*p_, Qm_lcl, gcis_.size(), wrk.data(), w_->ngcis.data(), - w_->displs.data(), p_->root()); - for (Int i = 0; i < ncells_; ++i) - Qm_gbl[w_->gcis[i]] = wrk[i]; - } else { - Real* rnull = nullptr; - const int* inull = nullptr; - mpi::gatherv(*p_, Qm_lcl, gcis_.size(), rnull, inull, inull, p_->root()); - } - } - - void write_field (const std::string& tracer_name, const std::string& field_name, - const std::vector& Qm) { - if ( ! p_->amroot()) return; - fprintf(w_->fh.get(), " s.%s.%s = [", tracer_name.c_str(), field_name.c_str()); - for (const auto& e : Qm) - fprintf(w_->fh.get(), "%1.15e, ", e); - fprintf(w_->fh.get(), "]\n"); - } - - void write_pre (const Tracer& t, Values& v) { - if ( ! t.write) return; - std::vector f, wrk; - if ( ! write_inited_) { - init_writer(); - if (w_) - fprintf(w_->fh.get(), - "def getsolns():\n" - " class Struct:\n" - " pass\n" - " s = Struct()\n" - " s.all = Struct()\n"); - gather_field(v.rhom(), f, wrk); - write_field("all", "rhom", f); - } - const auto name = get_tracer_name(t); - if (w_) - fprintf(w_->fh.get(), " s.%s = Struct()\n", name.c_str()); - gather_field(v.Qm_min(t.idx), f, wrk); - write_field(name, "Qm_min", f); - gather_field(v.Qm_prev(t.idx), f, wrk); - write_field(name, "Qm_orig", f); - gather_field(v.Qm(t.idx), f, wrk); - write_field(name, "Qm_pre", f); - gather_field(v.Qm_max(t.idx), f, wrk); - write_field(name, "Qm_max", f); - } - - void write_post (const Tracer& t, Values& v) { - if ( ! 
t.write) return; - const auto name = get_tracer_name(t); - std::vector Qm, wrk; - gather_field(v.Qm(t.idx), Qm, wrk); - write_field(name, "Qm_qlt", Qm); - } - static void check (const QLTT& qlt) { const Int n = qlt.nlclcells(); std::vector gcis; @@ -1211,139 +896,23 @@ class TestQLT { cedr_assert(qlt.gci2lci(gcis[i]) == i); } - static Int check (const Parallel& p, const std::vector& ts, const Values& v) { - static const bool details = true; - static const Real ulp3 = 3*std::numeric_limits::epsilon(); - Int nerr = 0; - std::vector lcl_mass(2*ts.size()), q_min_lcl(ts.size()), q_max_lcl(ts.size()); - std::vector t_ok(ts.size(), 1), local_violated(ts.size(), 0); - for (size_t ti = 0; ti < ts.size(); ++ti) { - const auto& t = ts[ti]; - - cedr_assert(t.safe_should_hold); - const bool safe_only = ! t.local_should_hold; - const Int n = v.ncells(); - const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), - * Qm_max = v.Qm_max(t.idx), * Qm_prev = v.Qm_prev(t.idx); - - q_min_lcl[ti] = 1; - q_max_lcl[ti] = 0; - for (Int i = 0; i < n; ++i) { - const bool lv = (Qm[i] < Qm_min[i] || Qm[i] > Qm_max[i]); - if (lv) local_violated[ti] = 1; - if ( ! safe_only && lv) { - // If this fails at ~ machine eps, check r2l_nl_adjust_bounds code in - // solve_node_problem. - if (details) - pr("check q " << t.str() << ": " << Qm[i] << " " << - (Qm[i] < Qm_min[i] ? 
Qm[i] - Qm_min[i] : Qm[i] - Qm_max[i])); - t_ok[ti] = false; - ++nerr; - } - if (t.no_change_should_hold && Qm[i] != Qm_prev[i]) { - if (details) - pr("Q should be unchanged but is not: " << Qm_prev[i] << " changed to " << - Qm[i] << " in " << t.str()); - t_ok[ti] = false; - ++nerr; - } - lcl_mass[2*ti ] += Qm_prev[i]; - lcl_mass[2*ti + 1] += Qm[i]; - q_min_lcl[ti] = std::min(q_min_lcl[ti], Qm_min[i]/rhom[i]); - q_max_lcl[ti] = std::max(q_max_lcl[ti], Qm_max[i]/rhom[i]); - } - } - - std::vector q_min_gbl(ts.size(), 0), q_max_gbl(ts.size(), 0); - mpi::all_reduce(p, q_min_lcl.data(), q_min_gbl.data(), q_min_lcl.size(), MPI_MIN); - mpi::all_reduce(p, q_max_lcl.data(), q_max_gbl.data(), q_max_lcl.size(), MPI_MAX); - - for (size_t ti = 0; ti < ts.size(); ++ti) { - // Check safety problem. If local_should_hold and it does, then the safety - // problem is by construction also solved (since it's a relaxation of the - // local problem). - const auto& t = ts[ti]; - const bool safe_only = ! t.local_should_hold; - if (safe_only) { - const Int n = v.ncells(); - const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), - * Qm_max = v.Qm_max(t.idx); - const Real q_min = q_min_gbl[ti], q_max = q_max_gbl[ti]; - for (Int i = 0; i < n; ++i) { - if (Qm[i] < q_min*rhom[i]*(1 - ulp3) || - Qm[i] > q_max*rhom[i]*(1 + ulp3)) { - if (details) - pr("check q " << t.str() << ": " << q_min*rhom[i] << " " << Qm_min[i] << - " " << Qm[i] << " " << Qm_max[i] << " " << q_max*rhom[i] << " | " << - (Qm[i] < q_min*rhom[i] ? 
- Qm[i] - q_min*rhom[i] : - Qm[i] - q_max*rhom[i])); - t_ok[ti] = false; - ++nerr; - } - } - } - } - - std::vector glbl_mass(2*ts.size(), 0); - mpi::reduce(p, lcl_mass.data(), glbl_mass.data(), lcl_mass.size(), MPI_SUM, - p.root()); - std::vector t_ok_gbl(ts.size(), 0); - mpi::reduce(p, t_ok.data(), t_ok_gbl.data(), t_ok.size(), MPI_MIN, p.root()); - // Right now we're not using these: - std::vector local_violated_gbl(ts.size(), 0); - mpi::reduce(p, local_violated.data(), local_violated_gbl.data(), - local_violated.size(), MPI_MAX, p.root()); - - if (p.amroot()) { - const Real tol = 1e3*std::numeric_limits::epsilon(); - for (size_t ti = 0; ti < ts.size(); ++ti) { - // Check mass conservation. - const Real desired_mass = glbl_mass[2*ti], actual_mass = glbl_mass[2*ti+1], - rd = cedr::util::reldif(desired_mass, actual_mass); - const bool mass_failed = rd > tol; - if (mass_failed) { - ++nerr; - t_ok_gbl[ti] = false; - } - if ( ! t_ok_gbl[ti]) { - std::cout << "FAIL " << ts[ti].str(); - if (mass_failed) std::cout << " mass re " << rd; - std::cout << "\n"; - } - } - } - - return nerr; + void init_tracers () override { + for (const auto& t : tracers_) + qlt_.declare_tracer(t.problem_type); + qlt_.end_tracer_declarations(); + cedr_assert(qlt_.get_num_tracers() == static_cast(tracers_.size())); + for (size_t i = 0; i < tracers_.size(); ++i) + cedr_assert(qlt_.get_problem_type(i) == (tracers_[i].problem_type | + ProblemType::consistent)); } -public: - TestQLT (const Parallel::Ptr& p, const tree::Node::Ptr& tree, - const Int& ncells, const bool verbose = false) - : p_(p), ncells_(ncells), qlt_(p_, ncells, tree), write_inited_(false) - { - check(qlt_); - init_numbering(tree); - init_tracers(); - if (verbose) qlt_.print(std::cout); - } - - Int run (const Int nrepeat = 1, const bool write=false) { - Timer::start(Timer::trcrgen); + void run_impl (Values& v, const Int nrepeat, const bool write) override { const Int nt = qlt_.get_num_tracers(), nlclcells = qlt_.nlclcells(); - 
Values v(nt, nlclcells); - generate_rho(v); { Real* rhom = v.rhom(); for (Int i = 0; i < nlclcells; ++i) qlt_.set_rhom(i2lci_[i], rhom[i]); } - for (Int ti = 0; ti < nt; ++ti) { - generate_Q(tracers_[ti], v); - perturb_Q(tracers_[ti], v); - if (write) write_pre(tracers_[ti], v); - } - Timer::stop(Timer::trcrgen); for (Int trial = 0; trial <= nrepeat; ++trial) { for (Int ti = 0; ti < nt; ++ti) { Real* Qm_min = v.Qm_min(ti), * Qm = v.Qm(ti), * Qm_max = v.Qm_max(ti), @@ -1364,17 +933,11 @@ class TestQLT { Timer::reset(Timer::snp); } } - Timer::start(Timer::trcrcheck); - Int nerr = 0; for (Int ti = 0; ti < nt; ++ti) { Real* Qm = v.Qm(ti); for (Int i = 0; i < nlclcells; ++i) Qm[i] = qlt_.get_Qm(i2lci_[i], ti); - if (write) write_post(tracers_[ti], v); } - nerr += check(*p_, tracers_, v); - Timer::stop(Timer::trcrcheck); - return nerr; } }; diff --git a/cedr/cedr_qlt.hpp b/cedr/cedr_qlt.hpp index 4ec8128..2e85cf4 100644 --- a/cedr/cedr_qlt.hpp +++ b/cedr/cedr_qlt.hpp @@ -9,9 +9,7 @@ #include #include -#include "cedr.hpp" -#include "cedr_kokkos.hpp" -#include "cedr_mpi.hpp" +#include "cedr_cdr.hpp" namespace cedr { // QLT: Quasi-local tree-based non-iterative tracer density reconstructor for @@ -41,16 +39,12 @@ Node::Ptr make_tree_over_1d_mesh(const Parallel::Ptr& p, const Int& ncells, } // namespace tree template -class QLT { +class QLT : public cedr::CDR { public: typedef typename cedr::impl::DeviceType::type Device; typedef QLT Me; typedef std::shared_ptr Ptr; - struct ProblemType { - enum : Int { conserve = 1, shapepreserve = 1 << 1, consistent = 1 << 2 }; - }; - // Set up QLT topology and communication data structures based on a tree. QLT(const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree); @@ -70,47 +64,28 @@ class QLT { // it. This is not an efficient operation. Int gci2lci(const Int& gci) const; - // Set up QLT tracer metadata. Once end_tracer_declarations is called, it is - // an error to call declare_tracer again. 
Call declare_tracer in order of the - // tracer index in the caller's numbering. - void declare_tracer(int problem_type); + void declare_tracer(int problem_type) override; - void end_tracer_declarations(); + void end_tracer_declarations() override; - int get_problem_type(const Int& tracer_idx) const; + int get_problem_type(const Int& tracer_idx) const override; - Int get_num_tracers() const; + Int get_num_tracers() const override; - // set_{rhom,Qm}: Set cell values prior to running the QLT algorithm. - // set_rhom must be called before set_Qm. - // lclcellidx is gci2lci(cellidx). - // Notation: - // rho: Total density. - // Q: Tracer density. - // q: Tracer mixing ratio = Q/rho. - // *m: Mass corresponding to the density; results from an integral over a - // region, such as a cell. + // lclcellidx is gci2lci(cellidx). KOKKOS_INLINE_FUNCTION - void set_rhom(const Int& lclcellidx, - // Current total mass in this cell. - const Real& rhom); + void set_rhom(const Int& lclcellidx, const Real& rhom) override; + // lclcellidx is gci2lci(cellidx). KOKKOS_INLINE_FUNCTION void set_Qm(const Int& lclcellidx, const Int& tracer_idx, - // Current tracer mass in this cell. - const Real& Qm, - // Minimum and maximum permitted tracer mass in this cell. - const Real& Qm_min, const Real& Qm_max, - // If mass conservation is requested, provide the previous Qm, - // which will be summed to give the desired global mass. - const Real Qm_prev = -1); - - // Run the QLT algorithm with the values set by set_{rho,Q}. - void run(); - - // Get a cell's tracer mass Qm after the QLT algorithm has run. + const Real& Qm, const Real& Qm_min, const Real& Qm_max, + const Real Qm_prev = -1) override; + + void run() override; + KOKKOS_INLINE_FUNCTION - Real get_Qm(const Int& lclcellidx, const Int& tracer_idx); + Real get_Qm(const Int& lclcellidx, const Int& tracer_idx) override; private: typedef Kokkos::View IntList; @@ -167,7 +142,7 @@ class QLT { // The only problem not supported is conservation alone. 
It makes very // little sense to use QLT for conservation alone. // The remaining 6 fall into 4 categories of details. These 4 categories - // are traceked by QLT; which of the original 6 problems being solved is + // are tracked by QLT; which of the original 6 problems being solved is // not important. enum { // l2r: rhom, (Qm_min, Qm, Qm_max)*; l2r, r2l: Qm* diff --git a/cedr/cedr_test.hpp b/cedr/cedr_test.hpp new file mode 100644 index 0000000..afa9b5c --- /dev/null +++ b/cedr/cedr_test.hpp @@ -0,0 +1,22 @@ +#ifndef INCLUDE_CEDR_TEST_HPP +#define INCLUDE_CEDR_TEST_HPP + +#include "cedr.hpp" +#include "cedr_mpi.hpp" + +namespace cedr { +namespace test { +namespace transport1d { + +struct Input { + Int ncells; + bool verbose; +}; + +Int run(const mpi::Parallel::Ptr& p, const Input& in); + +} // namespace transport1d +} // namespace test +} // namespace cedr + +#endif diff --git a/cedr/cedr_test_1d_transport.cpp b/cedr/cedr_test_1d_transport.cpp index 119d5ce..3b28329 100644 --- a/cedr/cedr_test_1d_transport.cpp +++ b/cedr/cedr_test_1d_transport.cpp @@ -269,8 +269,8 @@ Int run (const mpi::Parallel::Ptr& parallel, const Input& in) { true /* imbalanced */); typedef qlt::QLT QLTT; QLTT qlt(parallel, in.ncells, tree); - qlt.declare_tracer(QLTT::ProblemType::conserve | - QLTT::ProblemType::shapepreserve); + qlt.declare_tracer(cedr::ProblemType::conserve | + cedr::ProblemType::shapepreserve); qlt.end_tracer_declarations(); for (Int i = 0; i < in.ncells; ++i) qlt.set_rhom(i, p.area(i)); diff --git a/cedr/cedr_test_randomized.cpp b/cedr/cedr_test_randomized.cpp new file mode 100644 index 0000000..8aafee1 --- /dev/null +++ b/cedr/cedr_test_randomized.cpp @@ -0,0 +1,413 @@ +#include "cedr_test_randomized.hpp" + +namespace cedr { +namespace test { + +std::string TestRandomized::Tracer::str () const { + std::stringstream ss; + ss << "(ti " << idx; + if (problem_type & PT::conserve) ss << " c"; + if (problem_type & PT::shapepreserve) ss << " s"; + if (problem_type & 
PT::consistent) ss << " t"; + ss << " pt " << perturbation_type << " ssh " << safe_should_hold + << " lsh " << local_should_hold << ")"; + return ss.str(); +} + +TestRandomized::Writer::~Writer () { + if ( ! fh) return; + fprintf(fh.get(), " return s\n"); +} + +void TestRandomized::init_tracers_vector () { + typedef Tracer::PT PT; + static const Int pts[] = { + PT::conserve | PT::shapepreserve | PT::consistent, + PT::shapepreserve, // Test a noncanonical problem type. + PT::conserve | PT::consistent, + PT::consistent + }; + Int tracer_idx = 0; + for (Int perturb = 0; perturb < 6; ++perturb) + for (Int ti = 0; ti < 4; ++ti) { + Tracer t; + t.problem_type = pts[ti]; + const bool shapepreserve = t.problem_type & PT::shapepreserve; + t.idx = tracer_idx++; + t.perturbation_type = perturb; + t.safe_should_hold = true; + t.no_change_should_hold = perturb == 0; + t.local_should_hold = perturb < 4 && shapepreserve; + t.write = perturb == 2 && ti == 2; + tracers_.push_back(t); + } +} + +static Real urand () { return rand() / ((Real) RAND_MAX + 1.0); } + +void TestRandomized::generate_rho (Values& v) { + auto r = v.rhom(); + const Int n = v.ncells(); + for (Int i = 0; i < n; ++i) + r[i] = 0.5 + 1.5*urand(); +} + +void TestRandomized::generate_Q (const Tracer& t, Values& v) { + Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), + * Qm_max = v.Qm_max(t.idx), * Qm_prev = v.Qm_prev(t.idx); + const Int n = v.ncells(); + for (Int i = 0; i < n; ++i) { + const Real + q_min = 0.1 + 0.8*urand(), + q_max = std::min(1, q_min + (0.9 - q_min)*urand()), + q = q_min + (q_max - q_min)*urand(); + // Check correctness up to FP. + cedr_assert(q_min >= 0 && + q_max <= 1 + 10*std::numeric_limits::epsilon() && + q_min <= q && q <= q_max); + Qm_min[i] = q_min*rhom[i]; + Qm_max[i] = q_max*rhom[i]; + // Protect against FP error. + Qm[i] = std::max(Qm_min[i], std::min(Qm_max[i], q*rhom[i])); + // Set previous Qm to the current unperturbed value. 
+ Qm_prev[i] = Qm[i]; + } +} + +static void gen_rand_perm (const size_t n, std::vector& p) { + p.resize(n); + for (size_t i = 0; i < n; ++i) + p[i] = i; + for (size_t i = 0; i < n; ++i) { + const int j = urand()*n, k = urand()*n; + std::swap(p[j], p[k]); + } +} + +// Permuting the Qm array, even just on a rank as long as there is > 1 cell, +// produces a problem likely requiring considerable reconstruction, which +// reconstruction assuredly satisfies the properties. But because this is a +// local operation only, it doesn't test the 1 cell/rank case. +void TestRandomized::permute_Q (const Tracer& t, Values& v) { + Real* const Qm = v.Qm(t.idx); + const Int N = v.ncells(); + std::vector p; + gen_rand_perm(N, p); + std::vector Qm_orig(N); + std::copy(Qm, Qm + N, Qm_orig.begin()); + for (Int i = 0; i < N; ++i) + Qm[i] = Qm_orig[p[i]]; +} + +void TestRandomized +::add_const_to_Q (const Tracer& t, Values& v, + // Move 0 < alpha <= 1 of the way to the QLT or safety + // feasibility bound. + const Real& alpha, + // Whether the modification should be done in a + // mass-conserving way. + const bool conserve_mass, + // Only safety problem is feasible. + const bool safety_problem) { + // Some of these reductions aren't used at present. Might add more test + // options later that use them. 
+ Real rhom, Qm, Qm_max; { + Real Qm_sum_lcl[3] = {0}; + for (Int i = 0; i < v.ncells(); ++i) { + Qm_sum_lcl[0] += v.rhom()[i]; + Qm_sum_lcl[1] += v.Qm(t.idx)[i]; + Qm_sum_lcl[2] += v.Qm_max(t.idx)[i]; + } + Real Qm_sum_gbl[3] = {0}; + mpi::all_reduce(*p_, Qm_sum_lcl, Qm_sum_gbl, 3, MPI_SUM); + rhom = Qm_sum_gbl[0]; Qm = Qm_sum_gbl[1]; Qm_max = Qm_sum_gbl[2]; + } + Real Qm_max_safety = 0; + if (safety_problem) { + Real q_safety_lcl = v.Qm_max(t.idx)[0] / v.rhom()[0]; + for (Int i = 1; i < v.ncells(); ++i) + q_safety_lcl = std::max(q_safety_lcl, v.Qm_max(t.idx)[i] / v.rhom()[i]); + Real q_safety_gbl = 0; + mpi::all_reduce(*p_, &q_safety_lcl, &q_safety_gbl, 1, MPI_MAX); + Qm_max_safety = q_safety_gbl*rhom; + } + const Real dQm = safety_problem ? + ((Qm_max - Qm) + alpha * (Qm_max_safety - Qm_max)) / ncells_ : + alpha * (Qm_max - Qm) / ncells_; + for (Int i = 0; i < v.ncells(); ++i) + v.Qm(t.idx)[i] += dQm; + // Now permute Qm so that it's a little more interesting. + permute_Q(t, v); + // Adjust Qm_prev. Qm_prev is used to test the PT::conserve case, and also + // simply to record the correct total mass. The modification above modified + // Q's total mass. If conserve_mass, then Qm_prev needs to be made to sum to + // the same new mass. If ! conserve_mass, we want Qm_prev to be modified in + // an interesting way, so that PT::conserve doesn't trivially undo the mod + // that was made above when the root fixes the mass discrepancy. + const Real + relax = 0.9, + dQm_prev = (conserve_mass ? dQm : + (safety_problem ? + ((Qm_max - Qm) + relax*alpha * (Qm_max_safety - Qm_max)) / ncells_ : + relax*alpha * (Qm_max - Qm) / ncells_)); + for (Int i = 0; i < v.ncells(); ++i) + v.Qm_prev(t.idx)[i] += dQm_prev; +} + +void TestRandomized::perturb_Q (const Tracer& t, Values& v) { + // QLT is naturally mass conserving. But if QLT isn't being asked to impose + // mass conservation, then the caller better have a conservative + // method. 
Here, we model that by saying that Qm_prev and Qm should sum to + // the same mass. + const bool cm = ! (t.problem_type & Tracer::PT::conserve); + // For the edge cases, we cannot be exactly on the edge and still expect the + // q-limit checks to pass to machine precision. Thus, back away from the + // edge by an amount that bounds the error in the global mass due to FP, + // assuming each cell's mass is O(1). + const Real edg = 1 - ncells_*std::numeric_limits::epsilon(); + switch (t.perturbation_type) { + case 0: + // Do nothing, to test that QLT doesn't make any changes if none is + // needed. + break; + case 1: permute_Q(t, v); break; + case 2: add_const_to_Q(t, v, 0.5, cm, false); break; + case 3: add_const_to_Q(t, v, edg, cm, false); break; + case 4: add_const_to_Q(t, v, 0.5, cm, true ); break; + case 5: add_const_to_Q(t, v, edg, cm, true ); break; + } +} + +std::string TestRandomized::get_tracer_name (const Tracer& t) { + std::stringstream ss; + ss << "t" << t.idx; + return ss.str(); +} + +void TestRandomized::init_writer () { + if (p_->amroot()) { + w_ = std::make_shared(); + w_->fh = std::unique_ptr(fopen("out_QLT.py", "w")); + int n = gcis_.size(); + w_->ngcis.resize(p_->size()); + mpi::gather(*p_, &n, 1, w_->ngcis.data(), 1, p_->root()); + w_->displs.resize(p_->size() + 1); + w_->displs[0] = 0; + for (size_t i = 0; i < w_->ngcis.size(); ++i) + w_->displs[i+1] = w_->displs[i] + w_->ngcis[i]; + cedr_assert(w_->displs.back() == ncells_); + w_->gcis.resize(ncells_); + mpi::gatherv(*p_, gcis_.data(), gcis_.size(), w_->gcis.data(), w_->ngcis.data(), + w_->displs.data(), p_->root()); + } else { + int n = gcis_.size(); + mpi::gather(*p_, &n, 1, static_cast(nullptr), 0, p_->root()); + Int* Inull = nullptr; + const int* inull = nullptr; + mpi::gatherv(*p_, gcis_.data(), gcis_.size(), Inull, inull, inull, p_->root()); + } + write_inited_ = true; +} + +void TestRandomized +::gather_field (const Real* Qm_lcl, std::vector& Qm_gbl, + std::vector& wrk) { + if 
(p_->amroot()) { + Qm_gbl.resize(ncells_); + wrk.resize(ncells_); + mpi::gatherv(*p_, Qm_lcl, gcis_.size(), wrk.data(), w_->ngcis.data(), + w_->displs.data(), p_->root()); + for (Int i = 0; i < ncells_; ++i) + Qm_gbl[w_->gcis[i]] = wrk[i]; + } else { + Real* rnull = nullptr; + const int* inull = nullptr; + mpi::gatherv(*p_, Qm_lcl, gcis_.size(), rnull, inull, inull, p_->root()); + } +} + +void TestRandomized +::write_field (const std::string& tracer_name, const std::string& field_name, + const std::vector& Qm) { + if ( ! p_->amroot()) return; + fprintf(w_->fh.get(), " s.%s.%s = [", tracer_name.c_str(), field_name.c_str()); + for (const auto& e : Qm) + fprintf(w_->fh.get(), "%1.15e, ", e); + fprintf(w_->fh.get(), "]\n"); +} + +void TestRandomized::write_pre (const Tracer& t, Values& v) { + if ( ! t.write) return; + std::vector f, wrk; + if ( ! write_inited_) { + init_writer(); + if (w_) + fprintf(w_->fh.get(), + "def getsolns():\n" + " class Struct:\n" + " pass\n" + " s = Struct()\n" + " s.all = Struct()\n"); + gather_field(v.rhom(), f, wrk); + write_field("all", "rhom", f); + } + const auto name = get_tracer_name(t); + if (w_) + fprintf(w_->fh.get(), " s.%s = Struct()\n", name.c_str()); + gather_field(v.Qm_min(t.idx), f, wrk); + write_field(name, "Qm_min", f); + gather_field(v.Qm_prev(t.idx), f, wrk); + write_field(name, "Qm_orig", f); + gather_field(v.Qm(t.idx), f, wrk); + write_field(name, "Qm_pre", f); + gather_field(v.Qm_max(t.idx), f, wrk); + write_field(name, "Qm_max", f); +} + +void TestRandomized::write_post (const Tracer& t, Values& v) { + if ( ! 
t.write) return; + const auto name = get_tracer_name(t); + std::vector Qm, wrk; + gather_field(v.Qm(t.idx), Qm, wrk); + write_field(name, "Qm_qlt", Qm); +} + +Int TestRandomized +::check (const mpi::Parallel& p, const std::vector& ts, const Values& v) { + static const bool details = true; + static const Real ulp3 = 3*std::numeric_limits::epsilon(); + Int nerr = 0; + std::vector lcl_mass(2*ts.size()), q_min_lcl(ts.size()), q_max_lcl(ts.size()); + std::vector t_ok(ts.size(), 1), local_violated(ts.size(), 0); + for (size_t ti = 0; ti < ts.size(); ++ti) { + const auto& t = ts[ti]; + + cedr_assert(t.safe_should_hold); + const bool safe_only = ! t.local_should_hold; + const Int n = v.ncells(); + const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), + * Qm_max = v.Qm_max(t.idx), * Qm_prev = v.Qm_prev(t.idx); + + q_min_lcl[ti] = 1; + q_max_lcl[ti] = 0; + for (Int i = 0; i < n; ++i) { + const bool lv = (Qm[i] < Qm_min[i] || Qm[i] > Qm_max[i]); + if (lv) local_violated[ti] = 1; + if ( ! safe_only && lv) { + // If this fails at ~ machine eps, check r2l_nl_adjust_bounds code in + // solve_node_problem. + if (details) + pr("check q " << t.str() << ": " << Qm[i] << " " << + (Qm[i] < Qm_min[i] ? 
Qm[i] - Qm_min[i] : Qm[i] - Qm_max[i])); + t_ok[ti] = false; + ++nerr; + } + if (t.no_change_should_hold && Qm[i] != Qm_prev[i]) { + if (details) + pr("Q should be unchanged but is not: " << Qm_prev[i] << " changed to " << + Qm[i] << " in " << t.str()); + t_ok[ti] = false; + ++nerr; + } + lcl_mass[2*ti ] += Qm_prev[i]; + lcl_mass[2*ti + 1] += Qm[i]; + q_min_lcl[ti] = std::min(q_min_lcl[ti], Qm_min[i]/rhom[i]); + q_max_lcl[ti] = std::max(q_max_lcl[ti], Qm_max[i]/rhom[i]); + } + } + + std::vector q_min_gbl(ts.size(), 0), q_max_gbl(ts.size(), 0); + mpi::all_reduce(p, q_min_lcl.data(), q_min_gbl.data(), q_min_lcl.size(), MPI_MIN); + mpi::all_reduce(p, q_max_lcl.data(), q_max_gbl.data(), q_max_lcl.size(), MPI_MAX); + + for (size_t ti = 0; ti < ts.size(); ++ti) { + // Check safety problem. If local_should_hold and it does, then the safety + // problem is by construction also solved (since it's a relaxation of the + // local problem). + const auto& t = ts[ti]; + const bool safe_only = ! t.local_should_hold; + if (safe_only) { + const Int n = v.ncells(); + const Real* rhom = v.rhom(), * Qm_min = v.Qm_min(t.idx), * Qm = v.Qm(t.idx), + * Qm_max = v.Qm_max(t.idx); + const Real q_min = q_min_gbl[ti], q_max = q_max_gbl[ti]; + for (Int i = 0; i < n; ++i) { + if (Qm[i] < q_min*rhom[i]*(1 - ulp3) || + Qm[i] > q_max*rhom[i]*(1 + ulp3)) { + if (details) + pr("check q " << t.str() << ": " << q_min*rhom[i] << " " << Qm_min[i] << + " " << Qm[i] << " " << Qm_max[i] << " " << q_max*rhom[i] << " | " << + (Qm[i] < q_min*rhom[i] ? 
+ Qm[i] - q_min*rhom[i] : + Qm[i] - q_max*rhom[i])); + t_ok[ti] = false; + ++nerr; + } + } + } + } + + std::vector glbl_mass(2*ts.size(), 0); + mpi::reduce(p, lcl_mass.data(), glbl_mass.data(), lcl_mass.size(), MPI_SUM, + p.root()); + std::vector t_ok_gbl(ts.size(), 0); + mpi::reduce(p, t_ok.data(), t_ok_gbl.data(), t_ok.size(), MPI_MIN, p.root()); + // Right now we're not using these: + std::vector local_violated_gbl(ts.size(), 0); + mpi::reduce(p, local_violated.data(), local_violated_gbl.data(), + local_violated.size(), MPI_MAX, p.root()); + + if (p.amroot()) { + const Real tol = 1e3*std::numeric_limits::epsilon(); + for (size_t ti = 0; ti < ts.size(); ++ti) { + // Check mass conservation. + const Real desired_mass = glbl_mass[2*ti], actual_mass = glbl_mass[2*ti+1], + rd = cedr::util::reldif(desired_mass, actual_mass); + const bool mass_failed = rd > tol; + if (mass_failed) { + ++nerr; + t_ok_gbl[ti] = false; + } + if ( ! t_ok_gbl[ti]) { + std::cout << "FAIL " << ts[ti].str(); + if (mass_failed) std::cout << " mass re " << rd; + std::cout << "\n"; + } + } + } + + return nerr; +} + +TestRandomized +::TestRandomized (const mpi::Parallel::Ptr& p, const Int& ncells, + const bool verbose) + : p_(p), ncells_(ncells), write_inited_(false) +{} + +void TestRandomized::init () { + init_numbering(); + init_tracers_vector(); + init_tracers(); +} + +Int TestRandomized::run (const Int nrepeat, const bool write) { + const Int nt = tracers_.size(), nlclcells = gcis_.size(); + Values v(nt, nlclcells); + generate_rho(v); + for (const auto& t : tracers_) { + generate_Q(t, v); + perturb_Q(t, v); + } + if (write) + for (const auto& t : tracers_) + write_pre(t, v); + run_impl(v, nrepeat, write); + if (write) + for (const auto& t : tracers_) + write_post(t, v); + return check(*p_, tracers_, v); +} + +} // namespace test +} // namespace cedr diff --git a/cedr/cedr_test_randomized.hpp b/cedr/cedr_test_randomized.hpp new file mode 100644 index 0000000..e2e813b --- /dev/null +++ 
b/cedr/cedr_test_randomized.hpp @@ -0,0 +1,119 @@ +#ifndef INCLUDE_CEDR_TEST_RANDOMIZED_HPP +#define INCLUDE_CEDR_TEST_RANDOMIZED_HPP + +#include "cedr_mpi.hpp" +#include "cedr_util.hpp" + +namespace cedr { +namespace test { + +class TestRandomized { +public: + TestRandomized(const mpi::Parallel::Ptr& p, const Int& ncells, + const bool verbose = false); + + void init(); + + Int run(const Int nrepeat = 1, const bool write=false); + +protected: + struct Tracer { + typedef ProblemType PT; + + Int idx; + Int problem_type; + Int perturbation_type; + bool no_change_should_hold, safe_should_hold, local_should_hold; + bool write; + + std::string str() const; + + Tracer () + : idx(-1), problem_type(-1), perturbation_type(-1), no_change_should_hold(false), + safe_should_hold(true), local_should_hold(true), write(false) + {} + }; + + struct Values { + Values (const Int ntracers, const Int ncells) + : ncells_(ncells), v_((4*ntracers + 1)*ncells) + {} + Int ncells () const { return ncells_; } + Real* rhom () { return v_.data(); } + Real* Qm_min (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti ); } + Real* Qm (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 1); } + Real* Qm_max (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 2); } + Real* Qm_prev (const Int& ti) { return v_.data() + ncells_*(1 + 4*ti + 3); } + const Real* rhom () const { return const_cast(this)->rhom(); } + const Real* Qm_min (const Int& ti) const + { return const_cast(this)->Qm_min (ti); } + const Real* Qm (const Int& ti) const + { return const_cast(this)->Qm (ti); } + const Real* Qm_max (const Int& ti) const + { return const_cast(this)->Qm_max (ti); } + const Real* Qm_prev (const Int& ti) const + { return const_cast(this)->Qm_prev(ti); } + private: + Int ncells_; + std::vector v_; + }; + + // For solution output, if requested. + struct Writer { + std::unique_ptr fh; + std::vector ngcis; // Number of i'th rank's gcis_ array. + std::vector displs; // Cumsum of above. 
+ std::vector gcis; // Global cell indices packed by rank's gcis_ vector. + ~Writer(); + }; + + const mpi::Parallel::Ptr p_; + const Int ncells_; + // Global mesh entity IDs, 1-1 with reduction array index or QLT leaf node. + std::vector gcis_; + std::vector tracers_; + // For optional output. + bool write_inited_; + std::shared_ptr w_; // Only on root. + + // Fill gcis_. + virtual void init_numbering() = 0; + + // Using tracers_, the vector of Tracers, initialize the CDR's tracers. + virtual void init_tracers() = 0; + + virtual void run_impl(Values& v, const Int nrepeat, const bool write) = 0; + +private: + void init_tracers_vector(); + + void add_const_to_Q( + const Tracer& t, Values& v, + // Move 0 < alpha <= 1 of the way to the QLT or safety feasibility bound. + const Real& alpha, + // Whether the modification should be done in a mass-conserving way. + const bool conserve_mass, + // Only safety problem is feasible. + const bool safety_problem); + + void perturb_Q(const Tracer& t, Values& v); + void init_writer(); + void gather_field(const Real* Qm_lcl, std::vector& Qm_gbl, + std::vector& wrk); + void write_field(const std::string& tracer_name, const std::string& field_name, + const std::vector& Qm); + void write_pre(const Tracer& t, Values& v); + void write_post(const Tracer& t, Values& v); + + static void generate_rho(Values& v); + static void generate_Q(const Tracer& t, Values& v); + static void permute_Q(const Tracer& t, Values& v); + static std::string get_tracer_name(const Tracer& t); + static Int check(const mpi::Parallel& p, const std::vector& ts, + const Values& v); +}; + +} // namespace test +} // namespace cedr + +#endif diff --git a/cedr/make_qltcpp.sh b/cedr/make_qltcpp.sh index 5362ec9..c0267c7 100644 --- a/cedr/make_qltcpp.sh +++ b/cedr/make_qltcpp.sh @@ -2,7 +2,7 @@ # mpicxx -Wall -pedantic -fopenmp -std=c++11 -I/home/ambradl/lib/kokkos/cpu/include qlt.cpp -L/home/ambradl/lib/kokkos/cpu/lib -lkokkos -ldl # OMP_PROC_BIND=false OMP_NUM_THREADS=2 
mpirun -np 14 ./a.out -t -(for f in cedr.hpp cedr_kokkos.hpp cedr_mpi.hpp cedr_util.hpp cedr_qlt.hpp cedr_local.hpp cedr_mpi_inl.hpp cedr_local_inl.hpp cedr_qlt_inl.hpp cedr_test.hpp cedr_util.cpp cedr_local.cpp cedr_mpi.cpp cedr_qlt.cpp cedr_test_1d_transport.cpp cedr_test.cpp; do +(for f in cedr_kokkos.hpp cedr.hpp cedr_mpi.hpp cedr_util.hpp cedr_cdr.hpp cedr_qlt.hpp cedr_caas.hpp cedr_caas_inl.hpp cedr_local.hpp cedr_mpi_inl.hpp cedr_local_inl.hpp cedr_qlt_inl.hpp cedr_test_randomized.hpp cedr_test.hpp cedr_util.cpp cedr_local.cpp cedr_mpi.cpp cedr_qlt.cpp cedr_caas.cpp cedr_test_randomized.cpp cedr_test_1d_transport.cpp cedr_test.cpp; do echo "//>> $f" cat $f echo "" From d9a4019e145286cfdf925728afe9cd3c1eb49da5 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 15 Jan 2018 10:35:31 -0700 Subject: [PATCH 24/28] SIQK: Prep this directory for COMPOSE. --- siqk/siqk_quadrature.hpp | 259 ++++++++++++++++++++++++++++++++++++++- siqk/siqk_runtests.py | 0 siqk/siqk_sqr.hpp | 26 ++-- 3 files changed, 270 insertions(+), 15 deletions(-) mode change 100644 => 100755 siqk/siqk_runtests.py diff --git a/siqk/siqk_quadrature.hpp b/siqk/siqk_quadrature.hpp index 18e5ccb..ce602ca 100644 --- a/siqk/siqk_quadrature.hpp +++ b/siqk/siqk_quadrature.hpp @@ -6,12 +6,31 @@ namespace siqk { /* For the TRISYM entries, see, e.g., + Triangular quadrature to use for integration Dunavant, D.A. "High Degree + Efficient Symmetrical Gaussian Quadrature Rules for the Triangle." + J. Numer. Meth. Eng., 21, pp 1129-1148. + and Zhang, Linbo, Tao Cui, and Hui Liu. "A set of symmetric quadrature rules on triangles and tetrahedra." J. of Computational Mathematics (2009): 89-96. For the TRITAYLOR, see Day, David M. and Mark A. Taylor, "A new 11 point degree 6 cubature formula - for the triangle", PAMM 7 (2007). + for the triangle", PAMM 7 (2007) + and + Taylor, Mark A., Beth A. Wingate, and Len P. Bos. "A cardinal function + algorithm for computing multivariate quadrature points." 
SIAM Journal on + Numerical Analysis 45.1 (2007): 193-205. */ + +// The symmetric order-12 quadrature rule gives 1 fewer digit of conservation +// than, e.g., the order-14 one, so switch to the Taylor et al rule. Part of the +// problem I think is that the Dunavant table results from double precision +// computations (rather than quad) and is recorded to perhaps one fewer digit +// than might have been available. But I can't find a table with an extra +// digit. The Taylor et al. rule has one fewer coordinate, so that's also an +// advantage. The loss of symmetry in the coordinates I think is not relevant to +// this application. +#define SIQK_USE_TRITAY12 + #define SIQK_QUADRATURE_TRISYM_ORDER4_COORD \ {0.108103018168070, 0.445948490915965, 0.445948490915965, \ 0.445948490915965, 0.108103018168070, 0.445948490915965, \ @@ -300,6 +319,222 @@ namespace siqk { 1.018615244613670e-01, 1.114218316600018e-01, 1.120094502629461e-01, \ 1.247875714375583e-01, 1.884034888373949e-01} +#define SIQK_QUADRATURE_TRITAY_ORDER12_COORD \ + {7.26510255160501828e-02, 9.27348974483949817e-01, 0.00000000000000000e+00, \ + 2.11790731803609689e-02, 2.35517332495786824e-02, 9.55269193570060349e-01, \ + 1.41841115784669236e-01, 5.40914911362088088e-17, 8.58158884215330708e-01, \ + 1.15143666726236216e-02, 9.45475073220970907e-01, 4.30105601064054710e-02, \ + 2.77555756156289135e-17, 1.54064601626856063e-01, 8.45935398373143910e-01, \ + 3.72684680767588483e-01, -1.88694080537681499e-16, 6.27315319232411683e-01, \ + 9.43134911146902510e-01, 2.71109713562557482e-02, 2.97541174968417414e-02, \ + 8.44725347421859452e-01, 1.46044961672175677e-01, 9.22969090596487129e-03, \ + 8.23277107647898521e-01, 2.11522233831219000e-02, 1.55570668968979586e-01, \ + 6.21586880750877868e-01, 1.45665147883470222e-02, 3.63846604460775103e-01, \ + 2.21919501597089841e-02, 7.88601719223131714e-01, 1.89206330617159302e-01, \ + 2.27722111443204644e-01, 7.49189739790679599e-01, 2.30881487661157569e-02, \ + 
7.38137544226065284e-02, 7.18714961015890358e-02, 8.54314749475804436e-01, \ + 6.43364629415364875e-01, 3.32129083947645065e-01, 2.45062866369900600e-02, \ + 2.28091126376529507e-02, 3.61181591189672080e-01, 6.16009296172674969e-01, \ + 6.63093778446759319e-01, 2.43458133948799671e-01, 9.34480876044410103e-02, \ + 2.51456820638045198e-02, 5.81689214740147453e-01, 3.93165103196048027e-01, \ + 4.29837040104380730e-01, 5.44446676271925334e-01, 2.57162836236939363e-02, \ + 9.40413011410586863e-02, 8.26003314017559997e-01, 7.99553848413813162e-02, \ + 7.94010795132135239e-01, 1.16386499067277244e-01, 8.96027058005875177e-02, \ + 7.83496599417470019e-02, 2.03768481077729741e-01, 7.17881858980523258e-01, \ + 2.25505520049374242e-01, 6.44132203822605637e-02, 7.10081259568365097e-01, \ + 6.43800731623786371e-01, 9.54285858105846096e-02, 2.60770682565629019e-01, \ + 5.43837635808460451e-01, 2.44982965093490213e-01, 2.11179399098049336e-01, \ + 4.32112641877997194e-01, 7.05667243440369213e-02, 4.97320633777965815e-01, \ + 2.55495747579340349e-01, 6.19381257362555782e-01, 1.25122995058103870e-01, \ + 1.22162380966293838e-01, 6.27682615680314027e-01, 2.50155003353392136e-01, \ + 4.47861373562203791e-01, 4.22605657433460014e-01, 1.29532969004336196e-01, \ + 4.09354529674576528e-01, 2.10785259391403995e-01, 3.79860210934019449e-01, \ + 1.24718320885524481e-01, 4.08963804491244809e-01, 4.66317874623230710e-01, \ + 2.28197277938737758e-01, 2.13777432530059680e-01, 5.58025289531202562e-01, \ + 2.88796329020881648e-01, 4.09786577770025306e-01, 3.01417093209092990e-01} + +#define SIQK_QUADRATURE_TRITAY_ORDER12_WEIGHT \ + {4.888049814660050e-03, 6.675900027367356e-03, 6.845534654343699e-03, \ + 7.119751436080721e-03, 7.714492373624846e-03, 9.654708742436301e-03, \ + 1.050932673560249e-02, 1.068084365762828e-02, 1.848368581123072e-02, \ + 1.854548042160657e-02, 2.062000411968213e-02, 2.168508541701153e-02, \ + 2.249074619915818e-02, 2.490407320150775e-02, 2.509917342768508e-02, \ + 
2.794373431987983e-02, 2.814555860521331e-02, 2.816965445973000e-02, \ + 3.052917241207244e-02, 3.057527760403899e-02, 3.957360579297199e-02, \ + 4.128188739546268e-02, 4.593784216579169e-02, 4.749957532530720e-02, \ + 4.814880503690738e-02, 5.096492487678762e-02, 5.335208304882109e-02, \ + 5.414687261316752e-02, 5.943783395113540e-02, 5.998970732710617e-02, \ + 6.316454642265663e-02, 7.522206260332436e-02} + +#define SIQK_QUADRATURE_TRITAY_ORDER16_COORD \ + {2.22044604925031308e-16, 1.00000000000000022e+00, -4.44089209850062616e-16, \ + 1.72652007459386422e-16, -1.72652007459386422e-16, 1.00000000000000000e+00, \ + 9.99999999999999556e-01, 1.67697146066824836e-16, 2.76392063783237780e-16, \ + 5.51287671788707190e-02, 9.39886358357719054e-01, 4.98487446341022711e-03, \ + 6.97876983249687277e-03, 5.43806683058353502e-02, 9.38640561861667777e-01, \ + 9.37963548813877668e-01, 9.39400491638755185e-03, 5.26424462697347786e-02, \ + 3.66619396286766500e-02, 1.64345086362403456e-02, 9.46903551735083004e-01, \ + 1.67139052970596280e-02, 9.46948726986246103e-01, 3.63373677166942688e-02, \ + 9.42217145243293808e-01, 4.26604005767651506e-02, 1.51224541799410417e-02, \ + 1.18395699389696601e-01, 1.22269495438720680e-02, 8.69377351066431325e-01, \ + 1.21386193179034985e-02, 8.67369652104666988e-01, 1.20491728577429513e-01, \ + 1.38549201074093298e-01, 8.45674402138906656e-01, 1.57763967870000466e-02, \ + 1.56119497522677064e-02, 1.39575963210261389e-01, 8.44812087037470905e-01, \ + 8.54716865118515079e-01, 1.31782174323082840e-01, 1.35009605584020809e-02, \ + 8.38676993516376368e-01, 1.57955126300247592e-02, 1.45527493853598866e-01, \ + 2.47883957465546700e-01, 7.36546288443630570e-01, 1.55697540908227294e-02, \ + 2.48047467521941595e-01, 1.39688430330388181e-02, 7.37983689445019575e-01, \ + 1.54489124190416716e-02, 2.54789518603903087e-01, 7.29761568977055242e-01, \ + 1.40536794130045051e-02, 7.31638652255490185e-01, 2.54307668331505310e-01, \ + 7.14650647525855276e-01, 
1.57253728950845356e-02, 2.69623979579060202e-01, \ + 7.19291320004516122e-01, 2.66230284364682601e-01, 1.44783956308012773e-02, \ + 7.34816524385439873e-02, 8.67350406521407824e-01, 5.91679410400481887e-02, \ + 6.23723757982518195e-02, 7.41493666956614256e-02, 8.63478257506086755e-01, \ + 5.64947509640178147e-01, 1.59285948360033090e-02, 4.19123895523818568e-01, \ + 4.03471605078646045e-01, 1.56061028067777056e-02, 5.80922292114576355e-01, \ + 3.93065372986517114e-01, 5.91009481748388743e-01, 1.59251452650941427e-02, \ + 1.58528135007360294e-02, 4.03477149688871994e-01, 5.80670036810391865e-01, \ + 1.55759225172019677e-02, 5.69474562852597677e-01, 4.14949514630200356e-01, \ + 8.56028762075832783e-01, 6.78493700650298209e-02, 7.61218678591373960e-02, \ + 5.57652171741686020e-01, 4.26596859027159547e-01, 1.57509692311544325e-02, \ + 1.58711917968908656e-01, 6.70982507889701790e-02, 7.74189831242121151e-01, \ + 1.65257027288124081e-01, 7.52831023147951472e-01, 8.19119495639244466e-02, \ + 6.69143759151381579e-02, 7.75372778355688519e-01, 1.57712845729173323e-01, \ + 8.06983742470389620e-02, 1.68907315778736744e-01, 7.50394309974224294e-01, \ + 7.60435265981276642e-01, 1.68733583291941547e-01, 7.08311507267818108e-02, \ + 7.41575866479260215e-01, 8.21244708436324466e-02, 1.76299662677107338e-01, \ + 2.90354968333863872e-01, 6.28870536334479868e-01, 8.07744953316562597e-02, \ + 6.13421339495847429e-01, 8.11413015265752130e-02, 3.05437358977577345e-01, \ + 8.03401946048588056e-02, 2.96911206508048198e-01, 6.22748598887093108e-01, \ + 2.98521053628375943e-01, 7.67542314170573392e-02, 6.24724714954566718e-01, \ + 7.65491844989589776e-02, 6.22302233384477099e-01, 3.01148582116563923e-01, \ + 6.11711534686959046e-01, 3.10378628805096313e-01, 7.79098365079446409e-02, \ + 4.57714874646253878e-01, 8.19218215186586080e-02, 4.60363303835087556e-01, \ + 4.46142332818981191e-01, 4.71702266501346945e-01, 8.21554006796718639e-02, \ + 8.15831550859882348e-02, 4.54660341525047307e-01, 
4.63756503388964458e-01, \ + 1.87663085257486151e-01, 1.70109133923693812e-01, 6.42227780818820149e-01, \ + 1.69570213325764829e-01, 6.40600432948674525e-01, 1.89829353725560646e-01, \ + 6.34777673094082173e-01, 1.91226758371660088e-01, 1.73995568534257739e-01, \ + 3.31577016252400436e-01, 1.88531576707023696e-01, 4.79891407040575868e-01, \ + 1.87871344418995001e-01, 4.77292995769074468e-01, 3.34835659811930531e-01, \ + 1.91505318098148747e-01, 3.12697462175977048e-01, 4.95797219725874205e-01, \ + 3.11122038514993648e-01, 4.96122594594562871e-01, 1.92755366890443480e-01, \ + 4.91017887987217960e-01, 1.92880531286706181e-01, 3.16101580726075804e-01, \ + 4.74506574489367838e-01, 3.36004145381649799e-01, 1.89489280128982363e-01, \ + 3.31914842734057136e-01, 3.33728055084797526e-01, 3.34357102181145338e-01} + +#define SIQK_QUADRATURE_TRITAY_ORDER16_WEIGHT \ + {3.101299925557040e-04, 3.157587355864167e-04, 3.543300779435999e-04, \ + 2.758185808404191e-03, 3.134620382788961e-03, 3.926570441300832e-03, \ + 4.727574193224073e-03, 4.891225563554369e-03, 4.993082174472287e-03, \ + 6.877690940807241e-03, 7.048958902004150e-03, 7.482343216857858e-03, \ + 7.804875180599580e-03, 7.884184667408244e-03, 8.789727319135741e-03, \ + 1.020569201350139e-02, 1.047814393079899e-02, 1.053567064989013e-02, \ + 1.088233801010153e-02, 1.111442043493028e-02, 1.120933468410323e-02, \ + 1.150613084965787e-02, 1.184069512498871e-02, 1.287323216839533e-02, \ + 1.289784008040242e-02, 1.290361638049960e-02, 1.301716160293398e-02, \ + 1.328840708045580e-02, 1.328923809154386e-02, 1.337661646188983e-02, \ + 1.878939033204372e-02, 1.915329470976454e-02, 1.924248475126509e-02, \ + 1.948099129262171e-02, 1.973020557737488e-02, 2.061823890489025e-02, \ + 2.564362192416913e-02, 2.582028209673193e-02, 2.591150211345546e-02, \ + 2.642639940905077e-02, 2.692527865136344e-02, 2.709476646596388e-02, \ + 2.923685732222178e-02, 2.964315841816427e-02, 2.971791383743251e-02, \ + 3.159001279314883e-02, 
3.164634225766622e-02, 3.203536808857846e-02, \ + 4.060202979591518e-02, 4.072187567651760e-02, 4.073396006206902e-02, \ + 4.075252740422450e-02, 4.075823324694786e-02, 4.084655298115641e-02, \ + 4.616091672652638e-02} + +#define SIQK_QUADRATURE_TRITAY_ORDER18_COORD \ + {7.07029890425770434e-03, 1.16731059668412299e-02, 9.81256595128901066e-01, \ + 1.18506636748826333e-02, 9.81003085838793698e-01, 7.14625048632366866e-03, \ + 9.77787974953233552e-01, 1.06966317091697870e-02, 1.15153933375966612e-02, \ + 1.21952425108865503e-02, 9.38247698355045179e-01, 4.95570591340682709e-02, \ + 5.03248860967756076e-02, 1.26627518417214337e-02, 9.37012362061502957e-01, \ + 9.28052601109434661e-01, 5.98109409983804755e-02, 1.21364578921848640e-02, \ + 9.24985307647630872e-01, 1.37363297926722354e-02, 6.12783625596968889e-02, \ + 6.29343769992106727e-02, 9.22952795940546356e-01, 1.41128270602429717e-02, \ + 1.46695353279870377e-02, 6.33107354992695215e-02, 9.22019729172743441e-01, \ + 8.38221442443636167e-01, 1.17265100334603151e-02, 1.50052047522903520e-01, \ + 1.20132291087278187e-02, 1.55472058732347040e-01, 8.32514712158925141e-01, \ + 1.53147795225895278e-01, 8.34329388898221724e-01, 1.25228158758829977e-02, \ + 1.26364459307456434e-02, 8.50163803195673196e-01, 1.37199750873581161e-01, \ + 1.39355658599882609e-01, 1.28816350521976618e-02, 8.47762706347919726e-01, \ + 8.35267146700183760e-01, 1.51080160895878751e-01, 1.36526924039374886e-02, \ + 4.12764350243855882e-01, 1.01917879216578220e-02, 5.77043861834486305e-01, \ + 1.19773841073520515e-02, 2.81337239930327110e-01, 7.06685375962320839e-01, \ + 2.75105559050908943e-01, 7.12437462850100567e-01, 1.24569780989904899e-02, \ + 7.11523343775096961e-01, 2.76302525086338957e-01, 1.21741311385640816e-02, \ + 5.69603491897309744e-01, 1.09658368560618374e-02, 4.19430671246628417e-01, \ + 1.11273414647166669e-02, 4.28911051788389452e-01, 5.59961606746893992e-01, \ + 5.66810345010056338e-01, 4.21542055511477942e-01, 
1.16475994784657200e-02, \ + 4.17052309556705914e-01, 5.71125859044442907e-01, 1.18218313988511792e-02, \ + 1.15242148311881509e-02, 5.82686827051090317e-01, 4.05788958117721532e-01, \ + 7.14440844241883699e-01, 1.30567806713246960e-02, 2.72502375086791593e-01, \ + 2.64452707580261070e-01, 1.30760400963919332e-02, 7.22471252323346969e-01, \ + 1.33578918342581732e-02, 7.26343706240674458e-01, 2.60298401925067369e-01, \ + 8.68135265415298840e-01, 6.87230068637382230e-02, 6.31417277209629368e-02, \ + 6.27086061132897665e-02, 8.65230210152941437e-01, 7.20611837337687966e-02, \ + 7.60967385052684769e-02, 6.48599071037368607e-02, 8.59043354390994662e-01, \ + 6.27716704398273706e-02, 1.48349494336207116e-01, 7.88878835223965513e-01, \ + 7.88170460224977831e-01, 6.24359898395942040e-02, 1.49393549935427972e-01, \ + 1.47224894550839758e-01, 7.87136901173502213e-01, 6.56382042756580297e-02, \ + 4.22525938278520530e-01, 5.19104921609511785e-02, 5.25563569560528299e-01, \ + 7.74048614563915161e-01, 1.54312992744383953e-01, 7.16383926917008862e-02, \ + 6.76067776910891149e-01, 2.61784274560294683e-01, 6.21479485288141675e-02, \ + 6.74530572355868108e-02, 7.66725787281281046e-01, 1.65821155483132143e-01, \ + 6.17776557233678525e-02, 2.58210367662733586e-01, 6.80011976613898561e-01, \ + 1.74941863707076289e-01, 6.79065925147429861e-02, 7.57151543778180725e-01, \ + 5.84917884088599349e-02, 5.29357827480425258e-01, 4.12150384110714807e-01, \ + 6.72145076162932620e-01, 6.66036150484161232e-02, 2.61251308788651271e-01, \ + 5.51208842356557649e-01, 5.85675461899432051e-02, 3.90223611453499153e-01, \ + 2.98183807982819626e-01, 6.44535360410836422e-02, 6.37362655976096759e-01, \ + 2.61427822878740113e-01, 6.74813842915130246e-01, 6.37583342061296410e-02, \ + 5.82159599068178268e-02, 3.91460231036876105e-01, 5.50323809056306068e-01, \ + 6.75570147429912504e-02, 6.48770149230717630e-01, 2.83672836026291120e-01, \ + 5.44832625703827067e-01, 3.94649822040802345e-01, 6.05175522553705880e-02, 
\ + 3.99787267113028255e-01, 5.39013715193329634e-01, 6.11990176936421104e-02, \ + 1.51078277618042822e-01, 1.62789508278475825e-01, 6.86132214103481353e-01, \ + 1.61959533146025403e-01, 6.81243632264066146e-01, 1.56796834589908451e-01, \ + 6.78965449795995379e-01, 1.54283287802020219e-01, 1.66751262401984401e-01, \ + 4.97246831616064200e-01, 2.52272775044453668e-01, 2.50480393339482132e-01, \ + 2.45792781854977660e-01, 2.54798153240703207e-01, 4.99409064904319133e-01, \ + 2.75839635471827105e-01, 1.48558054919434857e-01, 5.75602309608738039e-01, \ + 1.41286303940196589e-01, 2.93023960643619241e-01, 5.65689735416184170e-01, \ + 5.75308715344231558e-01, 2.80899127230990808e-01, 1.43792157424777634e-01, \ + 2.66045287116412177e-01, 4.82098959297083796e-01, 2.51855753586504028e-01, \ + 2.89515501140379161e-01, 5.64187824544361005e-01, 1.46296674315259834e-01, \ + 4.20272276953932211e-01, 1.30769964434388403e-01, 4.48957758611679414e-01, \ + 5.51913339122326096e-01, 1.47969222194756778e-01, 3.00117438682917126e-01, \ + 1.54754368775656848e-01, 5.63868422294592553e-01, 2.81377208929750600e-01, \ + 1.38678912478906013e-01, 4.36115742879047474e-01, 4.25205344642046457e-01, \ + 3.79754605982586757e-01, 3.60326393528548949e-01, 2.59919000488864349e-01, \ + 4.32257322202306393e-01, 4.22418833467425037e-01, 1.45323844330268570e-01, \ + 2.50087546338060018e-01, 3.71900183305238496e-01, 3.78012270356701430e-01, \ + 3.73879170813181227e-01, 2.41364500692846234e-01, 3.84756328493972566e-01} +#define SIQK_QUADRATURE_TRITAY_ORDER18_WEIGHT \ + {1.258287849322552e-03, 1.263672600361209e-03, 1.663464766659172e-03, \ + 4.075174606270012e-03, 4.306776287080819e-03, 4.389337308965301e-03, \ + 4.854979278083793e-03, 5.123310595743368e-03, 5.419884417037201e-03, \ + 6.469269508792310e-03, 6.816991179147562e-03, 6.923866407332497e-03, \ + 6.971077005242425e-03, 7.206069998379916e-03, 7.685172776701560e-03, \ + 8.124490112628030e-03, 8.485915214007324e-03, 8.504426621066338e-03, \ + 
8.547676033732530e-03, 8.694442727954849e-03, 8.727198121935910e-03, \ + 8.920337864331938e-03, 8.922343193968446e-03, 8.952316877617482e-03, \ + 9.062987810035171e-03, 9.239241944101240e-03, 9.289678218556065e-03, \ + 1.016085758882769e-02, 1.068858309045880e-02, 1.159584270491392e-02, \ + 1.372133554295597e-02, 1.451509611701859e-02, 1.472613692527127e-02, \ + 1.497181258145377e-02, 1.535134740593910e-02, 1.626316829313562e-02, \ + 1.639421042530506e-02, 1.656173375959963e-02, 1.730837634372872e-02, \ + 1.735406869880698e-02, 1.736860247019273e-02, 1.742643812271074e-02, \ + 1.743007805929840e-02, 1.777357849874442e-02, 1.800914981913493e-02, \ + 1.814631429213930e-02, 1.909488510415974e-02, 1.961264000589436e-02, \ + 2.413550629437514e-02, 2.449560607831186e-02, 2.486104169360984e-02, \ + 2.535328684929062e-02, 2.548859970214835e-02, 2.606800318335970e-02, \ + 2.617304374623586e-02, 2.622203417758513e-02, 2.637298224112941e-02, \ + 2.647245318638137e-02, 2.711977972504153e-02, 2.717351017096441e-02, \ + 2.735502743194343e-02, 2.786441729563326e-02, 2.888671321165472e-02, \ + 2.926968908113495e-02, 3.045196253398069e-02, 3.186369822247498e-02} + class TriangleQuadrature { const Real trisym_order4_coord_ [ 18] = SIQK_QUADRATURE_TRISYM_ORDER4_COORD; const Real trisym_order4_weight_ [ 6] = SIQK_QUADRATURE_TRISYM_ORDER4_WEIGHT; @@ -307,10 +542,19 @@ class TriangleQuadrature { const Real tritay_order6_weight_ [ 11] = SIQK_QUADRATURE_TRITAY_ORDER6_WEIGHT; const Real trisym_order8_coord_ [ 48] = SIQK_QUADRATURE_TRISYM_ORDER8_COORD; const Real trisym_order8_weight_ [ 16] = SIQK_QUADRATURE_TRISYM_ORDER8_WEIGHT; +#ifdef SIQK_USE_TRITAY12 + const Real tritay_order12_coord_ [ 96] = SIQK_QUADRATURE_TRITAY_ORDER12_COORD; + const Real tritay_order12_weight_[ 32] = SIQK_QUADRATURE_TRITAY_ORDER12_WEIGHT; +#else const Real trisym_order12_coord_ [ 99] = SIQK_QUADRATURE_TRISYM_ORDER12_COORD; const Real trisym_order12_weight_[ 33] = SIQK_QUADRATURE_TRISYM_ORDER12_WEIGHT; +#endif 
const Real trisym_order14_coord_ [138] = SIQK_QUADRATURE_TRISYM_ORDER14_COORD; const Real trisym_order14_weight_[ 46] = SIQK_QUADRATURE_TRISYM_ORDER14_WEIGHT; + const Real tritay_order16_coord_ [165] = SIQK_QUADRATURE_TRITAY_ORDER16_COORD; + const Real tritay_order16_weight_[ 55] = SIQK_QUADRATURE_TRITAY_ORDER16_WEIGHT; + const Real tritay_order18_coord_ [198] = SIQK_QUADRATURE_TRITAY_ORDER18_COORD; + const Real tritay_order18_weight_[ 66] = SIQK_QUADRATURE_TRITAY_ORDER18_WEIGHT; const Real trisym_order20_coord_ [264] = SIQK_QUADRATURE_TRISYM_ORDER20_COORD; const Real trisym_order20_weight_[ 88] = SIQK_QUADRATURE_TRISYM_ORDER20_WEIGHT; @@ -334,13 +578,26 @@ class TriangleQuadrature { weight = RawConstArray(trisym_order8_weight_, 16); break; case 12: +#ifdef SIQK_USE_TRITAY12 + coord = RawConstVec3s(tritay_order12_coord_, 32, 3); + weight = RawConstArray(tritay_order12_weight_, 32); +#else coord = RawConstVec3s(trisym_order12_coord_, 33, 3); weight = RawConstArray(trisym_order12_weight_, 33); +#endif break; case 14: coord = RawConstVec3s(trisym_order14_coord_, 46, 3); weight = RawConstArray(trisym_order14_weight_, 46); break; + case 16: + coord = RawConstVec3s(tritay_order16_coord_, 55, 3); + weight = RawConstArray(tritay_order16_weight_, 55); + break; + case 18: + coord = RawConstVec3s(tritay_order18_coord_, 66, 3); + weight = RawConstArray(tritay_order18_weight_, 66); + break; case 20: coord = RawConstVec3s(trisym_order20_coord_, 88, 3); weight = RawConstArray(trisym_order20_weight_, 88); diff --git a/siqk/siqk_runtests.py b/siqk/siqk_runtests.py old mode 100644 new mode 100755 diff --git a/siqk/siqk_sqr.hpp b/siqk/siqk_sqr.hpp index abace0a..c50f710 100644 --- a/siqk/siqk_sqr.hpp +++ b/siqk/siqk_sqr.hpp @@ -26,10 +26,6 @@ namespace sqr { // spherical quadrilateral <-> reference square */ namespace impl { -// In the implementation, (a,b) in [0,1] because convex combinations are used -// throughout; but in the user interface, (a,b) in [-1,1] to agree with the -// 
definition of the reference square. - // Compute T(i,:). template KOKKOS_INLINE_FUNCTION @@ -45,7 +41,9 @@ void calc_T_row (const ConstVec3sT& p, const Quad& e, const Int i, template KOKKOS_INLINE_FUNCTION void calc_ref_to_bilinear (const ConstVec3sT& p, const Quad& e, - const Real a, const Real b, Real q[3]) { + Real a, Real b, Real q[3]) { + a = 0.5*(a + 1); + b = 0.5*(b + 1); for (Int i = 0; i < 3; ++i) { Real t1, t2, t3, t4; impl::calc_T_row(p, e, i, t1, t2, t3, t4); @@ -71,8 +69,10 @@ void calc_residual (const ConstVec3sT& p, const Quad& e, const Real a, // calc_isoparametric_jacobian in slmmir.cpp. template KOKKOS_INLINE_FUNCTION -void calc_Jacobian (const ConstVec3sT& p, const Quad& e, const Real a, - const Real b, Real J[6]) { +void calc_Jacobian (const ConstVec3sT& p, const Quad& e, Real a, Real b, + Real J[6]) { + a = 0.5*(a + 1); + b = 0.5*(b + 1); Real r[3]; for (Int i = 0; i < 3; ++i) { Real t1, t2, t3, t4; @@ -113,8 +113,8 @@ void solve_Jxr (Real J[6], const Real r[3], Real dx[2]) { Qtr[j] += Jj[i]*r[i]; } // dx = R \ (Q' r). - dx[1] = Qtr[1] / n2; - dx[0] = (Qtr[0] - a*dx[1]) / n1; + dx[1] = 2*(Qtr[1] / n2); + dx[0] = 2*((Qtr[0] - a*dx[1]) / n1); } } // namespace impl @@ -133,7 +133,7 @@ void calc_ref_to_sphere ( // The point on the sphere. Real q[3]) { - impl::calc_ref_to_bilinear(p, e, 0.5*(a+1), 0.5*(b+1), q); + impl::calc_ref_to_bilinear(p, e, a, b, q); SphereGeometry::normalize(q); } @@ -155,7 +155,7 @@ void calc_sphere_to_ref ( { const Real tol2 = square(tol); Real rnorm2 = 1; - a = b = 0.5; + a = b = 0; Int it = 0; for (it = 1; it <= max_its; ++it) { // Newton's method. 
Real r[3], J[6]; @@ -168,8 +168,6 @@ void calc_sphere_to_ref ( a -= dx[0]; b -= dx[1]; } - a = 2*a - 1; - b = 2*b - 1; if (info) { info->success = rnorm2 <= tol2; info->n_iterations = it; @@ -218,7 +216,7 @@ class TestSphereToRefKernel { ij = k % square(n_a_test), i = ij / n_a_test, j = ij % n_a_test; - const Real a_t = 2*a_test[i]-1, b_t = 2*a_test[j]-1; + const Real a_t = a_test[i], b_t = a_test[j]; Real q[3]; sqr::calc_ref_to_sphere(p_, slice(e_, ei), a_t, b_t, q); Real a, b; From d85f752f67f2a1afccd2035f1c6166f29a4e736a Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 15 Jan 2018 11:48:24 -0700 Subject: [PATCH 25/28] CEDR: Update readme. --- cedr/readme.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cedr/readme.txt b/cedr/readme.txt index 339e6b2..0417972 100644 --- a/cedr/readme.txt +++ b/cedr/readme.txt @@ -10,4 +10,4 @@ $ ./kokkos/generate_makefile.bash --with-openmp --ldflags=-fPIC --prefix=/path/t make.inc.mymachine. Edit it with machine-specific information. Then $ ln -s make.inc.machine make.inc $ make -j2 - $ mpirun -np 4 ./testqlt -t # Look for PASS + $ mpirun -np 4 ./testcedr -t # Look for PASS From cc54b2a57134d24c9bee7671ddf69cf29166ed92 Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Mon, 15 Jan 2018 19:10:48 -0700 Subject: [PATCH 26/28] cedr: Finish initial impl of global ClipAndAssuredSum. 
--- cedr/Makefile | 1 + cedr/{readme.txt => README.md} | 0 cedr/cedr.hpp | 1 + cedr/cedr_caas.cpp | 198 ++++++++++++++++++++++++++------ cedr/cedr_caas.hpp | 16 ++- cedr/cedr_caas_inl.hpp | 14 ++- cedr/cedr_cdr.hpp | 2 + cedr/cedr_mpi.cpp | 1 + cedr/cedr_mpi.hpp | 8 +- cedr/cedr_qlt.cpp | 70 +++++------ cedr/cedr_qlt.hpp | 6 +- cedr/cedr_test.cpp | 5 +- cedr/cedr_test_1d_transport.cpp | 52 +++++---- cedr/cedr_test_randomized.cpp | 58 +++++++--- cedr/cedr_test_randomized.hpp | 29 +++-- 15 files changed, 318 insertions(+), 143 deletions(-) rename cedr/{readme.txt => README.md} (100%) diff --git a/cedr/Makefile b/cedr/Makefile index 01bca9f..99fab15 100644 --- a/cedr/Makefile +++ b/cedr/Makefile @@ -29,3 +29,4 @@ cedr_caas.o: cedr_mpi.hpp cedr_mpi_inl.hpp cedr_local.hpp cedr_local_inl.hpp \ cedr_test.o: cedr_qlt.hpp cedr_util.hpp cedr_test_1d_transport.o: cedr_qlt.hpp cedr_util.hpp cedr_local.o: cedr_local_inl.hpp +cedr_test_randomized.o: cedr_qlt.hpp cedr_util.hpp cedr_test_randomized.hpp diff --git a/cedr/readme.txt b/cedr/README.md similarity index 100% rename from cedr/readme.txt rename to cedr/README.md diff --git a/cedr/cedr.hpp b/cedr/cedr.hpp index 8dced2e..dea74e7 100644 --- a/cedr/cedr.hpp +++ b/cedr/cedr.hpp @@ -6,6 +6,7 @@ // Communication-Efficient Constrained Density Reconstructors namespace cedr { typedef int Int; +typedef long int Long; typedef std::size_t Size; typedef double Real; diff --git a/cedr/cedr_caas.cpp b/cedr/cedr_caas.cpp index 0c7ca1d..ee8f176 100644 --- a/cedr/cedr_caas.cpp +++ b/cedr/cedr_caas.cpp @@ -1,83 +1,207 @@ #include "cedr_caas.hpp" #include "cedr_util.hpp" +#include "cedr_test_randomized.hpp" namespace cedr { namespace caas { -struct OpData { int nsum, nmin, nmax; }; -static OpData g_op_data; -static void all_reduce_op (Real* in, Real* inout, int* len, - MPI_Datatype* /*datatype*/) { - const int n = g_op_data.nsum + g_op_data.nmin + g_op_data.nmax; - for (int i = 0; i < *len; ++i) { - int k = 0; - for ( ; k < g_op_data.nsum; 
++k) - inout[k] += in[k]; - for ( ; k < g_op_data.nmin; ++k) - inout[k] = std::min(inout[k], in[k]); - for ( ; k < g_op_data.nmax; ++k) - inout[k] = std::max(inout[k], in[k]); - in += n; - inout += n; - } -} - template CAAS::CAAS (const mpi::Parallel::Ptr& p, const Int nlclcells) - : p_(p), nlclcells_(nlclcells), ntracers_(0), op_(all_reduce_op, true) + : p_(p), nlclcells_(nlclcells), need_conserve_(false) { - cedr_throw_if(true, "WIP: Can't call yet."); + cedr_throw_if(nlclcells == 0, "CAAS does not support 0 cells on a rank."); + tracer_decls_ = std::make_shared >(); } template void CAAS::declare_tracer (int problem_type) { - cedr_throw_if( ! (problem_type & ProblemType::shapepreserve) || - (problem_type & ProblemType::conserve), - "CAAS is a WIP; only shapepreserve (=> consistent) is " - "supported right now."); - ++ntracers_; + cedr_throw_if( ! (problem_type & ProblemType::shapepreserve), + "CAAS is a WIP; ! shapepreserve is not supported yet."); + tracer_decls_->push_back(problem_type); + if (problem_type & ProblemType::conserve) + need_conserve_ = true; } template void CAAS::end_tracer_declarations () { - d_ = RealList("CAAS data", nlclcells_ * (3*ntracers_ + 1)); + tracers_ = IntList("CAAS tracers", static_cast(tracer_decls_->size())); + for (Int i = 0; i < tracers_.extent_int(0); ++i) + tracers_(i) = (*tracer_decls_)[i]; + tracer_decls_ = nullptr; + // (rho, Qm, Qm_min, Qm_max, [Qm_prev]) + const Int e = need_conserve_ ? 
1 : 0; + d_ = RealList("CAAS data", nlclcells_ * ((3+e)*tracers_.size() + 1)); + const auto nslots = 4*tracers_.size(); + // (e'Qm_clip, e'Qm, e'Qm_min, e'Qm_max, [e'Qm_prev]) + send_ = RealList("CAAS send", nslots); + recv_ = RealList("CAAS recv", nslots); } template int CAAS::get_problem_type (const Int& tracer_idx) const { - return ProblemType::shapepreserve | ProblemType::consistent; + cedr_assert(tracer_idx >= 0 && tracer_idx < tracers_.extent_int(0)); + return tracers_[tracer_idx]; } template Int CAAS::get_num_tracers () const { - return ntracers_; + return tracers_.extent_int(0); } template void CAAS::reduce_locally () { + const Int nt = tracers_.size(); + Int k = 0; + Int os = nlclcells_; + // Qm_clip + for ( ; k < nt; ++k) { + Real Qm_sum = 0, Qm_clip_sum = 0; + for (Int i = 0; i < nlclcells_; ++i) { + const Real Qm = d_(os+i); + Qm_sum += (tracers_(k) & ProblemType::conserve ? + d_(os + nlclcells_*3*nt + i) /* Qm_prev */ : + Qm); + const Real Qm_min = d_(os + nlclcells_* nt + i); + const Real Qm_max = d_(os + nlclcells_*2*nt + i); + const Real Qm_clip = cedr::impl::min(Qm_max, cedr::impl::max(Qm_min, Qm)); + Qm_clip_sum += Qm_clip; + d_(os+i) = Qm_clip; + } + send_( k) = Qm_clip_sum; + send_(nt + k) = Qm_sum; + os += nlclcells_; + } + k += nt; + // Qm_min, Qm_max + for ( ; k < 4*nt; ++k) { + Real accum = 0; + for (Int i = 0; i < nlclcells_; ++i) + accum += d_(os+i); + send_(k) = accum; + os += nlclcells_; + } } template void CAAS::reduce_globally () { - MPI_Type_contiguous(1 + 3*ntracers_, MPI_DOUBLE, &datatype_); - MPI_Type_commit(&datatype_); - g_op_data.nsum = 1 + ntracers_; - g_op_data.nmin = ntracers_; - g_op_data.nmax = ntracers_; - int err = MPI_Allreduce(send_.data(), recv_.data(), nlclcells_, datatype_, - op_.get(), p_->comm()); + int err = mpi::all_reduce(*p_, send_.data(), recv_.data(), send_.size(), MPI_SUM); + cedr_throw_if(err != MPI_SUCCESS, + "CAAS::reduce_globally MPI_Allreduce returned " << err); } template -void CAAS::caas () { +void 
CAAS::finish_locally () { + const Int nt = tracers_.size(); + Int os = nlclcells_; + for (Int k = 0; k < nt; ++k) { + const Real Qm_clip_sum = recv_( k); + const Real Qm_sum = recv_(nt + k); + const Real m = Qm_sum - Qm_clip_sum; + if (m < 0) { + const Real Qm_min_sum = recv_(2*nt + k); + Real fac = Qm_clip_sum - Qm_min_sum; + if (fac > 0) { + fac = m/fac; + for (Int i = 0; i < nlclcells_; ++i) { + const Real Qm_min = d_(os + nlclcells_* nt + i); + Real& Qm = d_(os+i); + Qm += fac*(Qm - Qm_min); + } + } + } else if (m > 0) { + const Real Qm_max_sum = recv_(3*nt + k); + Real fac = Qm_max_sum - Qm_clip_sum; + if (fac > 0) { + fac = m/fac; + for (Int i = 0; i < nlclcells_; ++i) { + const Real Qm_max = d_(os + nlclcells_*2*nt + i); + Real& Qm = d_(os+i); + Qm += fac*(Qm_max - Qm); + } + } + } + os += nlclcells_; + } } template void CAAS::run () { reduce_locally(); reduce_globally(); - caas(); + finish_locally(); } +namespace test { +struct TestCAAS : public cedr::test::TestRandomized { + typedef CAAS CAAST; + + TestCAAS (const mpi::Parallel::Ptr& p, const Int& ncells, const bool verbose) + : TestRandomized("CAAS", p, ncells, verbose), + p_(p) + { + const auto np = p->size(), rank = p->rank(); + nlclcells_ = ncells / np; + const Int todo = ncells - nlclcells_ * np; + if (rank < todo) ++nlclcells_; + caas_ = std::make_shared(p, nlclcells_); + init(); + } + + CDR& get_cdr () override { return *caas_; } + + void init_numbering () override { + const auto np = p_->size(), rank = p_->rank(); + Int start = 0; + for (Int lrank = 0; lrank < rank; ++lrank) + start += get_nllclcells(ncells_, np, lrank); + gcis_.resize(nlclcells_); + for (Int i = 0; i < nlclcells_; ++i) + gcis_[i] = start + i; + } + + void init_tracers () override { + // CAAS doesn't yet support everything, so remove a bunch of the tracers. + std::vector tracers; + Int idx = 0; + for (auto& t : tracers_) { + if ( ! (t.problem_type & ProblemType::shapepreserve) || + ! 
t.local_should_hold) + continue; + t.idx = idx++; + tracers.push_back(t); + caas_->declare_tracer(t.problem_type); + } + tracers_ = tracers; + caas_->end_tracer_declarations(); + } + + void run_impl (const Int trial) override { + caas_->run(); + } + +private: + mpi::Parallel::Ptr p_; + Int nlclcells_; + CAAST::Ptr caas_; + + static Int get_nllclcells (const Int& ncells, const Int& np, const Int& rank) { + Int nlclcells = ncells / np; + const Int todo = ncells - nlclcells * np; + if (rank < todo) ++nlclcells; + return nlclcells; + } +}; + +Int unittest (const mpi::Parallel::Ptr& p) { + const auto np = p->size(); + Int nerr = 0; + for (Int nlclcells : {1, 2, 4, 11}) { + Long ncells = np*nlclcells; + if (ncells > np) ncells -= np/2; + nerr += TestCAAS(p, ncells, false).run(1, false); + } + return nerr; +} +} // namespace test } // namespace caas } // namespace cedr diff --git a/cedr/cedr_caas.hpp b/cedr/cedr_caas.hpp index 09f53ff..bd8356f 100644 --- a/cedr/cedr_caas.hpp +++ b/cedr/cedr_caas.hpp @@ -25,7 +25,7 @@ class CAAS : public CDR { Int get_num_tracers() const override; - // lclcellidx is trivial, the user's index for the cell. + // lclcellidx is trivial; it is the user's index for the cell. 
KOKKOS_INLINE_FUNCTION void set_rhom(const Int& lclcellidx, const Real& rhom) override; @@ -42,18 +42,24 @@ class CAAS : public CDR { private: typedef Kokkos::View RealList; typedef cedr::impl::Unmanaged UnmanagedRealList; + typedef Kokkos::View IntList; mpi::Parallel::Ptr p_; - Int nlclcells_, ntracers_; - MPI_Datatype datatype_; - mpi::Op op_; + + Int nlclcells_; + std::shared_ptr > tracer_decls_; + bool need_conserve_; + IntList tracers_; RealList d_, send_, recv_; void reduce_locally(); void reduce_globally(); - void caas(); + void finish_locally(); }; +namespace test { +Int unittest(const mpi::Parallel::Ptr& p); +} // namespace test } // namespace caas } // namespace cedr diff --git a/cedr/cedr_caas_inl.hpp b/cedr/cedr_caas_inl.hpp index e4c4715..96dc754 100644 --- a/cedr/cedr_caas_inl.hpp +++ b/cedr/cedr_caas_inl.hpp @@ -1,12 +1,15 @@ #ifndef INCLUDE_CEDR_CAAS_INL_HPP #define INCLUDE_CEDR_CAAS_INL_HPP +#include "cedr_util.hpp" + namespace cedr { // ClipAndAssuredSum. namespace caas { template KOKKOS_INLINE_FUNCTION void CAAS::set_rhom (const Int& lclcellidx, const Real& rhom) { + cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_); d_(lclcellidx) = rhom; } @@ -15,13 +18,20 @@ void CAAS ::set_Qm (const Int& lclcellidx, const Int& tracer_idx, const Real& Qm, const Real& Qm_min, const Real& Qm_max, const Real Qm_prev) { + cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_); + cedr_kernel_assert(tracer_idx >= 0 && tracer_idx < tracers_.extent_int(0)); + const Int nt = tracers_.size(); d_((1 + tracer_idx)*nlclcells_ + lclcellidx) = Qm; - d_((1 + ntracers_ + tracer_idx)*nlclcells_ + lclcellidx) = Qm_min; - d_((1 + 2*ntracers_ + tracer_idx)*nlclcells_ + lclcellidx) = Qm_max; + d_((1 + nt + tracer_idx)*nlclcells_ + lclcellidx) = Qm_min; + d_((1 + 2*nt + tracer_idx)*nlclcells_ + lclcellidx) = Qm_max; + if (need_conserve_) + d_((1 + 3*nt + tracer_idx)*nlclcells_ + lclcellidx) = Qm_prev; } template KOKKOS_INLINE_FUNCTION Real CAAS::get_Qm (const 
Int& lclcellidx, const Int& tracer_idx) { + cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_); + cedr_kernel_assert(tracer_idx >= 0 && tracer_idx < tracers_.extent_int(0)); return d_((1 + tracer_idx)*nlclcells_ + lclcellidx); } diff --git a/cedr/cedr_cdr.hpp b/cedr/cedr_cdr.hpp index e0f4300..c49fd74 100644 --- a/cedr/cedr_cdr.hpp +++ b/cedr/cedr_cdr.hpp @@ -6,6 +6,8 @@ namespace cedr { // Constrained Density Reconstructor interface. struct CDR { + virtual void print(std::ostream& os) const {} + // Set up QLT tracer metadata. Once end_tracer_declarations is called, it is // an error to call declare_tracer again. Call declare_tracer in order of the // tracer index in the caller's numbering. It is an error to call this diff --git a/cedr/cedr_mpi.cpp b/cedr/cedr_mpi.cpp index e561f30..1569a66 100644 --- a/cedr/cedr_mpi.cpp +++ b/cedr/cedr_mpi.cpp @@ -21,6 +21,7 @@ Int Parallel::rank () const { template <> MPI_Datatype get_type() { return MPI_INT; } template <> MPI_Datatype get_type() { return MPI_DOUBLE; } +template <> MPI_Datatype get_type() { return MPI_LONG; } /* Long (long int) maps to MPI_LONG; MPI_LONG_INT is the {long,int} pair type reserved for MPI_MAXLOC/MPI_MINLOC. */ int waitany (int count, MPI_Request* reqs, int* index, MPI_Status* stats) { return MPI_Waitany(count, reqs, index, stats ? 
stats : MPI_STATUS_IGNORE); diff --git a/cedr/cedr_mpi.hpp b/cedr/cedr_mpi.hpp index c2b8750..1f28594 100644 --- a/cedr/cedr_mpi.hpp +++ b/cedr/cedr_mpi.hpp @@ -58,15 +58,15 @@ struct Op { typedef std::shared_ptr Ptr; Op (MPI_User_function* function, bool commute) { - MPI_Op_create(function, static_cast(commute), op_); + MPI_Op_create(function, static_cast(commute), &op_); } - ~Op () { MPI_Op_free(op_); } + ~Op () { MPI_Op_free(&op_); } - MPI_Op* get () const { return op_; } + const MPI_Op& get () const { return op_; } private: - MPI_Op* op_; + MPI_Op op_; }; } // namespace mpi diff --git a/cedr/cedr_qlt.cpp b/cedr/cedr_qlt.cpp index 68e5f5b..e5d93da 100644 --- a/cedr/cedr_qlt.cpp +++ b/cedr/cedr_qlt.cpp @@ -499,7 +499,7 @@ Int unittest (const Parallel::Ptr& p, const NodeSets::ConstPtr& ns, template void QLT::init (const std::string& name, IntList& d, typename IntList::HostMirror& h, size_t n) { - d = IntList(name, n); + d = IntList("QLT " + name, n); h = Kokkos::create_mirror_view(d); } @@ -589,8 +589,8 @@ void QLT::MetaData::init (const MetaDataBuilder& mdb) { template void QLT::BulkData::init (const MetaData& md, const Int& nslots) { - l2r_data_ = RealList("l2r_data", md.a_h.prob2bl2r[md.nprobtypes]*nslots); - r2l_data_ = RealList("r2l_data", md.a_h.prob2br2l[md.nprobtypes]*nslots); + l2r_data_ = RealList("QLT l2r_data", md.a_h.prob2bl2r[md.nprobtypes]*nslots); + r2l_data_ = RealList("QLT r2l_data", md.a_h.prob2br2l[md.nprobtypes]*nslots); l2r_data = l2r_data_; r2l_data = r2l_data_; } @@ -615,6 +615,7 @@ void QLT::init_ordinals () { template QLT::QLT (const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree) { init(p, ncells, tree); + cedr_throw_if(nlclcells() == 0, "QLT does not support 0 cells on a rank."); } template @@ -631,7 +632,7 @@ Int QLT::nlclcells () const { return ns_->levels[0].nodes.size(); } // and instead uses the information from get_owned_glblcells to determine // local cell indices. 
template -void QLT::get_owned_glblcells (std::vector& gcis) const { +void QLT::get_owned_glblcells (std::vector& gcis) const { gcis.resize(ns_->levels[0].nodes.size()); for (const auto& n : ns_->levels[0].nodes) gcis[n->offset] = n->id; @@ -645,7 +646,7 @@ Int QLT::gci2lci (const Int& gci) const { const auto it = gci2lci_.find(gci); if (it == gci2lci_.end()) { pr(puf(gci)); - std::vector gcis; + std::vector gcis; get_owned_glblcells(gcis); mprarr(gcis); } @@ -856,7 +857,8 @@ class TestQLT : public cedr::test::TestRandomized { TestQLT (const Parallel::Ptr& p, const tree::Node::Ptr& tree, const Int& ncells, const bool verbose=false) - : TestRandomized(p, ncells, verbose), qlt_(p, ncells, tree), tree_(tree) + : TestRandomized("QLT", p, ncells, verbose), + qlt_(p, ncells, tree), tree_(tree) { if (verbose) qlt_.print(std::cout); init(); @@ -865,7 +867,8 @@ class TestQLT : public cedr::test::TestRandomized { private: QLTT qlt_; tree::Node::Ptr tree_; - std::vector i2lci_; + + CDR& get_cdr () override { return qlt_; } void init_numbering () override { init_numbering(tree_); @@ -877,10 +880,8 @@ class TestQLT : public cedr::test::TestRandomized { // geometry to the test problem. However, use *some* ordering to model what // a real problem must do. if ( ! 
node->nkids) { - if (node->rank == p_->rank()) { + if (node->rank == p_->rank()) gcis_.push_back(node->cellidx); - i2lci_.push_back(qlt_.gci2lci(gcis_.back())); - } return; } for (Int i = 0; i < node->nkids; ++i) @@ -889,7 +890,7 @@ class TestQLT : public cedr::test::TestRandomized { static void check (const QLTT& qlt) { const Int n = qlt.nlclcells(); - std::vector gcis; + std::vector gcis; qlt.get_owned_glblcells(gcis); cedr_assert(static_cast(gcis.size()) == n); for (Int i = 0; i < n; ++i) @@ -906,44 +907,25 @@ class TestQLT : public cedr::test::TestRandomized { ProblemType::consistent)); } - void run_impl (Values& v, const Int nrepeat, const bool write) override { - const Int nt = qlt_.get_num_tracers(), nlclcells = qlt_.nlclcells(); - { - Real* rhom = v.rhom(); - for (Int i = 0; i < nlclcells; ++i) - qlt_.set_rhom(i2lci_[i], rhom[i]); - } - for (Int trial = 0; trial <= nrepeat; ++trial) { - for (Int ti = 0; ti < nt; ++ti) { - Real* Qm_min = v.Qm_min(ti), * Qm = v.Qm(ti), * Qm_max = v.Qm_max(ti), - * Qm_prev = v.Qm_prev(ti); - for (Int i = 0; i < nlclcells; ++i) - qlt_.set_Qm(i2lci_[i], ti, Qm[i], Qm_min[i], Qm_max[i], Qm_prev[i]); - } - MPI_Barrier(p_->comm()); - Timer::start(Timer::qltrun); - qlt_.run(); - MPI_Barrier(p_->comm()); - Timer::stop(Timer::qltrun); - if (trial == 0) { - Timer::reset(Timer::qltrun); - Timer::reset(Timer::qltrunl2r); - Timer::reset(Timer::qltrunr2l); - Timer::reset(Timer::waitall); - Timer::reset(Timer::snp); - } - } - for (Int ti = 0; ti < nt; ++ti) { - Real* Qm = v.Qm(ti); - for (Int i = 0; i < nlclcells; ++i) - Qm[i] = qlt_.get_Qm(i2lci_[i], ti); + void run_impl (const Int trial) override { + MPI_Barrier(p_->comm()); + Timer::start(Timer::qltrun); + qlt_.run(); + MPI_Barrier(p_->comm()); + Timer::stop(Timer::qltrun); + if (trial == 0) { + Timer::reset(Timer::qltrun); + Timer::reset(Timer::qltrunl2r); + Timer::reset(Timer::qltrunr2l); + Timer::reset(Timer::waitall); + Timer::reset(Timer::snp); } } }; // Test all QLT variations and 
situations. Int test_qlt (const Parallel::Ptr& p, const tree::Node::Ptr& tree, const Int& ncells, - const int nrepeat = 1, + const Int nrepeat = 1, // Diagnostic output for dev and illustration purposes. To be // clear, no QLT unit test requires output to be checked; each // checks in-memory data and returns a failure count. @@ -979,7 +961,7 @@ struct Mesh { nranks_ = p->size(); p_ = p; pd_ = parallel_decomp; - cedr_assert(nranks_ <= nc_); + cedr_throw_if(nranks_ > nc_, "#GIDs < #ranks is not supported."); } Int ncell () const { return nc_; } diff --git a/cedr/cedr_qlt.hpp b/cedr/cedr_qlt.hpp index 2e85cf4..f36bc69 100644 --- a/cedr/cedr_qlt.hpp +++ b/cedr/cedr_qlt.hpp @@ -25,7 +25,7 @@ struct Node { typedef std::shared_ptr Ptr; const Node* parent; // (Can't be a shared_ptr: would be a circular dependency.) Int rank; // Owning rank. - Int cellidx; // If a leaf, the cell to which this node corresponds. + Long cellidx; // If a leaf, the cell to which this node corresponds. Int nkids; // 0 at leaf, 1 or 2 otherwise. Node::Ptr kids[2]; void* reserved; // For internal use. @@ -48,7 +48,7 @@ class QLT : public cedr::CDR { // Set up QLT topology and communication data structures based on a tree. QLT(const Parallel::Ptr& p, const Int& ncells, const tree::Node::Ptr& tree); - void print(std::ostream& os) const; + void print(std::ostream& os) const override; // Number of cells owned by this rank. Int nlclcells() const; @@ -57,7 +57,7 @@ class QLT : public cedr::CDR { // gci2lci(gcis[i]) == i. Ideally, the caller never actually calls gci2lci(), // and instead uses the information from get_owned_glblcells to determine // local cell indices. 
- void get_owned_glblcells(std::vector& gcis) const; + void get_owned_glblcells(std::vector& gcis) const; // For global cell index cellidx, i.e., the globally unique ordinal associated // with a cell in the caller's tree, return this rank's local index for diff --git a/cedr/cedr_test.cpp b/cedr/cedr_test.cpp index 94010c5..ece27e7 100644 --- a/cedr/cedr_test.cpp +++ b/cedr/cedr_test.cpp @@ -1,4 +1,5 @@ #include "cedr_qlt.hpp" +#include "cedr_caas.hpp" #include "cedr_mpi.hpp" #include "cedr_util.hpp" #include "cedr_test.hpp" @@ -81,8 +82,10 @@ int main (int argc, char** argv) { try { cedr::InputParser inp(argc, argv, p); if (p->amroot()) inp.print(std::cout); - if (inp.qin.unittest) + if (inp.qin.unittest) { nerr += cedr::local::unittest(); + nerr += cedr::caas::test::unittest(p); + } if (inp.qin.unittest || inp.qin.perftest) nerr += cedr::qlt::test::run_unit_and_randomized_tests(p, inp.qin); if (inp.tin.ncells > 0) diff --git a/cedr/cedr_test_1d_transport.cpp b/cedr/cedr_test_1d_transport.cpp index 3b28329..d0a4711 100644 --- a/cedr/cedr_test_1d_transport.cpp +++ b/cedr/cedr_test_1d_transport.cpp @@ -1,5 +1,6 @@ #include "cedr_test.hpp" #include "cedr_qlt.hpp" +#include "cedr_caas.hpp" #include @@ -163,8 +164,7 @@ class Problem1D { xcp_.back() = 1 + xcp_[0]; } - static void run_qlt (const Problem1D& p, - qlt::QLT& qlt, + static void run_cdr (const Problem1D& p, CDR& cdr, const Real* yp, Real* y, const Int* dods) { const Int n = p.ncells(); for (Int i = 0; i < n; ++i) { @@ -176,11 +176,11 @@ class Problem1D { max = std::max(max, v); } const Real area_i = p.area(i); - qlt.set_Qm(i, 0, y[i]*area_i, min*area_i, max*area_i, yp[i]*area_i); + cdr.set_Qm(i, 0, y[i]*area_i, min*area_i, max*area_i, yp[i]*area_i); } - qlt.run(); + cdr.run(); for (Int i = 0; i < n; ++i) - y[i] = qlt.get_Qm(i, 0) / p.area(i); + y[i] = cdr.get_Qm(i, 0) / p.area(i); y[n] = y[0]; } @@ -224,8 +224,7 @@ class Problem1D { const std::vector get_xb () const { return xb_; } const std::vector get_xcp () 
const { return xcp_; } - void cycle (const Int& nsteps, const Real* y0, Real* yf, - qlt::QLT* qlt = nullptr) { + void cycle (const Int& nsteps, const Real* y0, Real* yf, CDR* cdr = nullptr) { const Int n = xcp_.size(); rwrk_.resize(2*n); iwrk_.resize(4*n); @@ -241,8 +240,8 @@ class Problem1D { for (Int ti = 0; ti < nsteps; ++ti) { interp::cubic_interp_periodic(xcp_.data(), n, ys[0], xcpi, n, ys[1], dod); - if (qlt) - run_qlt(*this, *qlt, ys[0], ys[1], dod); + if (cdr) + run_cdr(*this, *cdr, ys[0], ys[1], dod); else run_caas(*this, ys[0], ys[1], dod); std::swap(ys[0], ys[1]); @@ -257,7 +256,6 @@ class Problem1D { // - better, more canonical IC // - optional tree imbalance // - optional mesh nonuniformity -// - choice of preservation methods // - parallel? Int run (const mpi::Parallel::Ptr& parallel, const Input& in) { cedr_throw_if(parallel->size() > 1, "run_1d_transport_test runs in serial only."); @@ -269,12 +267,21 @@ Int run (const mpi::Parallel::Ptr& parallel, const Input& in) { true /* imbalanced */); typedef qlt::QLT QLTT; QLTT qlt(parallel, in.ncells, tree); - qlt.declare_tracer(cedr::ProblemType::conserve | - cedr::ProblemType::shapepreserve); - qlt.end_tracer_declarations(); - for (Int i = 0; i < in.ncells; ++i) - qlt.set_rhom(i, p.area(i)); - qlt.print(std::cout); + + typedef caas::CAAS CAAST; + CAAST caas(parallel, in.ncells); + + CDR* cdrs[] = {&qlt, &caas}; + const int ncdrs = sizeof(cdrs)/sizeof(*cdrs); + + for (CDR* cdr : cdrs) { + cdr->declare_tracer(cedr::ProblemType::conserve | + cedr::ProblemType::shapepreserve); + cdr->end_tracer_declarations(); + for (Int i = 0; i < in.ncells; ++i) + cdr->set_rhom(i, p.area(i)); + cdr->print(std::cout); + } std::vector y0(in.ncells+1); for (Int i = 0, nc = p.ncells(); i < nc; ++i) @@ -292,15 +299,18 @@ Int run (const mpi::Parallel::Ptr& parallel, const Input& in) { const Int nsteps = Int(3.17*in.ncells); const Int ncycles = 1; - std::copy(y0.begin(), y0.end(), yf.begin()); - for (Int i = 0; i < ncycles; ++i) - 
p.cycle(nsteps, yf.data(), yf.data(), &qlt); - w.write("yqlt", yf); + const char* names[] = {"yqlt", "ycaas"}; + for (int ic = 0; ic < ncdrs; ++ic) { + std::copy(y0.begin(), y0.end(), yf.begin()); + for (Int i = 0; i < ncycles; ++i) + p.cycle(nsteps, yf.data(), yf.data(), cdrs[ic]); + w.write(names[ic], yf); + } std::copy(y0.begin(), y0.end(), yf.begin()); for (Int i = 0; i < ncycles; ++i) p.cycle(nsteps, yf.data(), yf.data()); - w.write("ycaas", yf); + w.write("ylcaas", yf); return nerr; } diff --git a/cedr/cedr_test_randomized.cpp b/cedr/cedr_test_randomized.cpp index 8aafee1..7b48f0e 100644 --- a/cedr/cedr_test_randomized.cpp +++ b/cedr/cedr_test_randomized.cpp @@ -123,7 +123,7 @@ ::add_const_to_Q (const Tracer& t, Values& v, rhom = Qm_sum_gbl[0]; Qm = Qm_sum_gbl[1]; Qm_max = Qm_sum_gbl[2]; } Real Qm_max_safety = 0; - if (safety_problem) { + if (safety_problem && v.ncells()) { Real q_safety_lcl = v.Qm_max(t.idx)[0] / v.rhom()[0]; for (Int i = 1; i < v.ncells(); ++i) q_safety_lcl = std::max(q_safety_lcl, v.Qm_max(t.idx)[i] / v.rhom()[i]); @@ -202,9 +202,9 @@ void TestRandomized::init_writer () { } else { int n = gcis_.size(); mpi::gather(*p_, &n, 1, static_cast(nullptr), 0, p_->root()); - Int* Inull = nullptr; + Long* Lnull = nullptr; const int* inull = nullptr; - mpi::gatherv(*p_, gcis_.data(), gcis_.size(), Inull, inull, inull, p_->root()); + mpi::gatherv(*p_, gcis_.data(), gcis_.size(), Lnull, inull, inull, p_->root()); } write_inited_ = true; } @@ -273,8 +273,9 @@ void TestRandomized::write_post (const Tracer& t, Values& v) { } Int TestRandomized -::check (const mpi::Parallel& p, const std::vector& ts, const Values& v) { - static const bool details = true; +::check (const std::string& cdr_name, const mpi::Parallel& p, + const std::vector& ts, const Values& v) { + static const bool details = false; static const Real ulp3 = 3*std::numeric_limits::epsilon(); Int nerr = 0; std::vector lcl_mass(2*ts.size()), q_min_lcl(ts.size()), q_max_lcl(ts.size()); @@ -335,11 
+336,11 @@ ::check (const mpi::Parallel& p, const std::vector& ts, const Values& v) if (Qm[i] < q_min*rhom[i]*(1 - ulp3) || Qm[i] > q_max*rhom[i]*(1 + ulp3)) { if (details) - pr("check q " << t.str() << ": " << q_min*rhom[i] << " " << Qm_min[i] << - " " << Qm[i] << " " << Qm_max[i] << " " << q_max*rhom[i] << " | " << - (Qm[i] < q_min*rhom[i] ? - Qm[i] - q_min*rhom[i] : - Qm[i] - q_max*rhom[i])); + pr("check q (safety) " << t.str() << ": " << q_min*rhom[i] << " " + << Qm_min[i] << " " << Qm[i] << " " << Qm_max[i] << " " + << q_max*rhom[i] << " | " << (Qm[i] < q_min*rhom[i] ? + Qm[i] - q_min*rhom[i] : + Qm[i] - q_max*rhom[i])); t_ok[ti] = false; ++nerr; } @@ -369,7 +370,7 @@ ::check (const mpi::Parallel& p, const std::vector& ts, const Values& v) t_ok_gbl[ti] = false; } if ( ! t_ok_gbl[ti]) { - std::cout << "FAIL " << ts[ti].str(); + std::cout << "FAIL " << cdr_name << ": " << ts[ti].str(); if (mass_failed) std::cout << " mass re " << rd; std::cout << "\n"; } @@ -380,9 +381,9 @@ ::check (const mpi::Parallel& p, const std::vector& ts, const Values& v) } TestRandomized -::TestRandomized (const mpi::Parallel::Ptr& p, const Int& ncells, - const bool verbose) - : p_(p), ncells_(ncells), write_inited_(false) +::TestRandomized (const std::string& name, const mpi::Parallel::Ptr& p, + const Int& ncells, const bool verbose) + : cdr_name_(name), p_(p), ncells_(ncells), write_inited_(false) {} void TestRandomized::init () { @@ -393,20 +394,45 @@ void TestRandomized::init () { Int TestRandomized::run (const Int nrepeat, const bool write) { const Int nt = tracers_.size(), nlclcells = gcis_.size(); + Values v(nt, nlclcells); generate_rho(v); for (const auto& t : tracers_) { generate_Q(t, v); perturb_Q(t, v); } + if (write) for (const auto& t : tracers_) write_pre(t, v); - run_impl(v, nrepeat, write); + + CDR& cdr = get_cdr(); + { + Real* rhom = v.rhom(); + for (Int i = 0; i < nlclcells; ++i) + cdr.set_rhom(i, rhom[i]); + } + for (Int trial = 0; trial <= nrepeat; ++trial) { + for 
(Int ti = 0; ti < nt; ++ti) { + Real* Qm_min = v.Qm_min(ti), * Qm = v.Qm(ti), * Qm_max = v.Qm_max(ti), + * Qm_prev = v.Qm_prev(ti); + for (Int i = 0; i < nlclcells; ++i) + cdr.set_Qm(i, ti, Qm[i], Qm_min[i], Qm_max[i], Qm_prev[i]); + } + + run_impl(trial); + } + + for (Int ti = 0; ti < nt; ++ti) { + Real* Qm = v.Qm(ti); + for (Int i = 0; i < nlclcells; ++i) + Qm[i] = cdr.get_Qm(i, ti); + } + if (write) for (const auto& t : tracers_) write_post(t, v); - return check(*p_, tracers_, v); + return check(cdr_name_, *p_, tracers_, v); } } // namespace test diff --git a/cedr/cedr_test_randomized.hpp b/cedr/cedr_test_randomized.hpp index e2e813b..dd4f54d 100644 --- a/cedr/cedr_test_randomized.hpp +++ b/cedr/cedr_test_randomized.hpp @@ -1,6 +1,7 @@ #ifndef INCLUDE_CEDR_TEST_RANDOMIZED_HPP #define INCLUDE_CEDR_TEST_RANDOMIZED_HPP +#include "cedr_cdr.hpp" #include "cedr_mpi.hpp" #include "cedr_util.hpp" @@ -9,13 +10,17 @@ namespace test { class TestRandomized { public: - TestRandomized(const mpi::Parallel::Ptr& p, const Int& ncells, - const bool verbose = false); + TestRandomized(const std::string& cdr_name, const mpi::Parallel::Ptr& p, + const Int& ncells, const bool verbose = false); + // The subclass should call this, probably in its constructor. void init(); Int run(const Int nrepeat = 1, const bool write=false); +private: + const std::string cdr_name_; + protected: struct Tracer { typedef ProblemType PT; @@ -62,19 +67,19 @@ class TestRandomized { struct Writer { std::unique_ptr fh; std::vector ngcis; // Number of i'th rank's gcis_ array. + std::vector gcis; // Global cell indices packed by rank's gcis_ vector. std::vector displs; // Cumsum of above. - std::vector gcis; // Global cell indices packed by rank's gcis_ vector. ~Writer(); }; const mpi::Parallel::Ptr p_; const Int ncells_; // Global mesh entity IDs, 1-1 with reduction array index or QLT leaf node. - std::vector gcis_; + std::vector gcis_; std::vector tracers_; - // For optional output. 
- bool write_inited_; - std::shared_ptr w_; // Only on root. + + // Tell this class the CDR. + virtual CDR& get_cdr() = 0; // Fill gcis_. virtual void init_numbering() = 0; @@ -82,9 +87,13 @@ class TestRandomized { // Using tracers_, the vector of Tracers, initialize the CDR's tracers. virtual void init_tracers() = 0; - virtual void run_impl(Values& v, const Int nrepeat, const bool write) = 0; + virtual void run_impl(const Int trial) = 0; private: + // For optional output. + bool write_inited_; + std::shared_ptr w_; // Only on root. + void init_tracers_vector(); void add_const_to_Q( @@ -109,8 +118,8 @@ class TestRandomized { static void generate_Q(const Tracer& t, Values& v); static void permute_Q(const Tracer& t, Values& v); static std::string get_tracer_name(const Tracer& t); - static Int check(const mpi::Parallel& p, const std::vector& ts, - const Values& v); + static Int check(const std::string& cdr_name, const mpi::Parallel& p, + const std::vector& ts, const Values& v); }; } // namespace test From 1e7bc198ca698d18fb724d5f82966d229e2f2a1b Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Thu, 18 Jan 2018 18:40:32 -0700 Subject: [PATCH 27/28] CEDR: Generalize CDR interface for multiple rhos. In general, there may be n tracer mass fields Qm and m total mass fields rhom. So far, m has been assumed to be 1. If a problem is feasible that's fine. But eventually we want to take on infeasible problems. Generalize the CDR interface for this capability, although we're not yet implementing it. Since it's not being impl'ed yet, throw if the caller tries to use it. If-guard some MPI calls. 
--- cedr/cedr_caas.cpp | 39 ++++++++++------- cedr/cedr_caas.hpp | 17 +++++--- cedr/cedr_caas_inl.hpp | 9 ++-- cedr/cedr_cdr.hpp | 20 ++++++--- cedr/cedr_qlt.cpp | 75 ++++++++++++++++++--------------- cedr/cedr_qlt.hpp | 4 +- cedr/cedr_qlt_inl.hpp | 2 +- cedr/cedr_test_1d_transport.cpp | 8 ++-- cedr/cedr_test_randomized.cpp | 2 +- 9 files changed, 101 insertions(+), 75 deletions(-) diff --git a/cedr/cedr_caas.cpp b/cedr/cedr_caas.cpp index ee8f176..2694024 100644 --- a/cedr/cedr_caas.cpp +++ b/cedr/cedr_caas.cpp @@ -7,31 +7,38 @@ namespace caas { template CAAS::CAAS (const mpi::Parallel::Ptr& p, const Int nlclcells) - : p_(p), nlclcells_(nlclcells), need_conserve_(false) + : p_(p), nlclcells_(nlclcells), nrhomidxs_(0), need_conserve_(false) { cedr_throw_if(nlclcells == 0, "CAAS does not support 0 cells on a rank."); - tracer_decls_ = std::make_shared >(); + tracer_decls_ = std::make_shared >(); } template -void CAAS::declare_tracer (int problem_type) { +void CAAS::declare_tracer(int problem_type, const Int& rhomidx) { cedr_throw_if( ! (problem_type & ProblemType::shapepreserve), "CAAS is a WIP; ! 
shapepreserve is not supported yet."); - tracer_decls_->push_back(problem_type); + cedr_throw_if(rhomidx > 0, "rhomidx > 0 is not supported yet."); + tracer_decls_->push_back(Decl(problem_type, rhomidx)); if (problem_type & ProblemType::conserve) need_conserve_ = true; + nrhomidxs_ = std::max(nrhomidxs_, rhomidx+1); } template void CAAS::end_tracer_declarations () { - tracers_ = IntList("CAAS tracers", static_cast(tracer_decls_->size())); - for (Int i = 0; i < tracers_.extent_int(0); ++i) - tracers_(i) = (*tracer_decls_)[i]; + cedr_throw_if(tracer_decls_->size() == 0, "#tracers is 0."); + cedr_throw_if(nrhomidxs_ == 0, "#rhomidxs is 0."); + probs_ = IntList("CAAS probs", static_cast(tracer_decls_->size())); + t2r_ = IntList("CAAS t2r", static_cast(tracer_decls_->size())); + for (Int i = 0; i < probs_.extent_int(0); ++i) { + probs_(i) = (*tracer_decls_)[i].probtype; + t2r_(i) = (*tracer_decls_)[i].rhomidx; + } tracer_decls_ = nullptr; // (rho, Qm, Qm_min, Qm_max, [Qm_prev]) const Int e = need_conserve_ ? 
1 : 0; - d_ = RealList("CAAS data", nlclcells_ * ((3+e)*tracers_.size() + 1)); - const auto nslots = 4*tracers_.size(); + d_ = RealList("CAAS data", nlclcells_ * ((3+e)*probs_.size() + 1)); + const auto nslots = 4*probs_.size(); // (e'Qm_clip, e'Qm, e'Qm_min, e'Qm_max, [e'Qm_prev]) send_ = RealList("CAAS send", nslots); recv_ = RealList("CAAS recv", nslots); @@ -39,18 +46,18 @@ void CAAS::end_tracer_declarations () { template int CAAS::get_problem_type (const Int& tracer_idx) const { - cedr_assert(tracer_idx >= 0 && tracer_idx < tracers_.extent_int(0)); - return tracers_[tracer_idx]; + cedr_assert(tracer_idx >= 0 && tracer_idx < probs_.extent_int(0)); + return probs_[tracer_idx]; } template Int CAAS::get_num_tracers () const { - return tracers_.extent_int(0); + return probs_.extent_int(0); } template void CAAS::reduce_locally () { - const Int nt = tracers_.size(); + const Int nt = probs_.size(); Int k = 0; Int os = nlclcells_; // Qm_clip @@ -58,7 +65,7 @@ void CAAS::reduce_locally () { Real Qm_sum = 0, Qm_clip_sum = 0; for (Int i = 0; i < nlclcells_; ++i) { const Real Qm = d_(os+i); - Qm_sum += (tracers_(k) & ProblemType::conserve ? + Qm_sum += (probs_(k) & ProblemType::conserve ? 
d_(os + nlclcells_*3*nt + i) /* Qm_prev */ : Qm); const Real Qm_min = d_(os + nlclcells_* nt + i); @@ -91,7 +98,7 @@ void CAAS::reduce_globally () { template void CAAS::finish_locally () { - const Int nt = tracers_.size(); + const Int nt = probs_.size(); Int os = nlclcells_; for (Int k = 0; k < nt; ++k) { const Real Qm_clip_sum = recv_( k); @@ -169,7 +176,7 @@ struct TestCAAS : public cedr::test::TestRandomized { continue; t.idx = idx++; tracers.push_back(t); - caas_->declare_tracer(t.problem_type); + caas_->declare_tracer(t.problem_type, 0); } tracers_ = tracers; caas_->end_tracer_declarations(); diff --git a/cedr/cedr_caas.hpp b/cedr/cedr_caas.hpp index bd8356f..849636f 100644 --- a/cedr/cedr_caas.hpp +++ b/cedr/cedr_caas.hpp @@ -17,7 +17,7 @@ class CAAS : public CDR { public: CAAS(const mpi::Parallel::Ptr& p, const Int nlclcells); - void declare_tracer(int problem_type) override; + void declare_tracer(int problem_type, const Int& rhomidx) override; void end_tracer_declarations() override; @@ -27,7 +27,7 @@ class CAAS : public CDR { // lclcellidx is trivial; it is the user's index for the cell. 
KOKKOS_INLINE_FUNCTION - void set_rhom(const Int& lclcellidx, const Real& rhom) override; + void set_rhom(const Int& lclcellidx, const Int& rhomidx, const Real& rhom) override; KOKKOS_INLINE_FUNCTION void set_Qm(const Int& lclcellidx, const Int& tracer_idx, @@ -44,12 +44,19 @@ class CAAS : public CDR { typedef cedr::impl::Unmanaged UnmanagedRealList; typedef Kokkos::View IntList; + struct Decl { + int probtype; + Int rhomidx; + Decl (const int probtype_, const Int rhomidx_) + : probtype(probtype_), rhomidx(rhomidx_) {} + }; + mpi::Parallel::Ptr p_; - Int nlclcells_; - std::shared_ptr > tracer_decls_; + Int nlclcells_, nrhomidxs_; + std::shared_ptr > tracer_decls_; bool need_conserve_; - IntList tracers_; + IntList probs_, t2r_; RealList d_, send_, recv_; void reduce_locally(); diff --git a/cedr/cedr_caas_inl.hpp b/cedr/cedr_caas_inl.hpp index 96dc754..f1a64fd 100644 --- a/cedr/cedr_caas_inl.hpp +++ b/cedr/cedr_caas_inl.hpp @@ -8,8 +8,9 @@ namespace cedr { namespace caas { template KOKKOS_INLINE_FUNCTION -void CAAS::set_rhom (const Int& lclcellidx, const Real& rhom) { +void CAAS::set_rhom (const Int& lclcellidx, const Int& rhomidx, const Real& rhom) { cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_); + cedr_kernel_assert(rhomidx >= 0 && rhomidx < nrhomidxs_); d_(lclcellidx) = rhom; } @@ -19,8 +20,8 @@ ::set_Qm (const Int& lclcellidx, const Int& tracer_idx, const Real& Qm, const Real& Qm_min, const Real& Qm_max, const Real Qm_prev) { cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_); - cedr_kernel_assert(tracer_idx >= 0 && tracer_idx < tracers_.extent_int(0)); - const Int nt = tracers_.size(); + cedr_kernel_assert(tracer_idx >= 0 && tracer_idx < probs_.extent_int(0)); + const Int nt = probs_.size(); d_((1 + tracer_idx)*nlclcells_ + lclcellidx) = Qm; d_((1 + nt + tracer_idx)*nlclcells_ + lclcellidx) = Qm_min; d_((1 + 2*nt + tracer_idx)*nlclcells_ + lclcellidx) = Qm_max; @@ -31,7 +32,7 @@ ::set_Qm (const Int& lclcellidx, const Int& 
tracer_idx, template KOKKOS_INLINE_FUNCTION Real CAAS::get_Qm (const Int& lclcellidx, const Int& tracer_idx) { cedr_kernel_assert(lclcellidx >= 0 && lclcellidx < nlclcells_); - cedr_kernel_assert(tracer_idx >= 0 && tracer_idx < tracers_.extent_int(0)); + cedr_kernel_assert(tracer_idx >= 0 && tracer_idx < probs_.extent_int(0)); return d_((1 + tracer_idx)*nlclcells_ + lclcellidx); } diff --git a/cedr/cedr_cdr.hpp b/cedr/cedr_cdr.hpp index c49fd74..6dad452 100644 --- a/cedr/cedr_cdr.hpp +++ b/cedr/cedr_cdr.hpp @@ -6,13 +6,17 @@ namespace cedr { // Constrained Density Reconstructor interface. struct CDR { + typedef std::shared_ptr Ptr; + virtual void print(std::ostream& os) const {} - // Set up QLT tracer metadata. Once end_tracer_declarations is called, it is - // an error to call declare_tracer again. Call declare_tracer in order of the - // tracer index in the caller's numbering. It is an error to call this - // function from a parallel region. - virtual void declare_tracer(int problem_type) = 0; + // Set up QLT tracer metadata. Call declare_tracer in order of the tracer + // index in the caller's numbering. Once end_tracer_declarations is called, it + // is an error to call declare_tracer again. + // Associate the tracer with a rhom index. In many problems, there will be + // only one rhom, so rhomidx is always 0. + // It is an error to call this function from a parallel region. + virtual void declare_tracer(int problem_type, const Int& rhomidx) = 0; // It is an error to call this function from a parallel region. virtual void end_tracer_declarations() = 0; @@ -22,7 +26,7 @@ struct CDR { virtual Int get_num_tracers() const = 0; // set_{rhom,Qm}: Set cell values prior to running the QLT algorithm. - // set_rhom must be called before set_Qm. + // // Notation: // rho: Total density. // Q: Tracer density. @@ -32,8 +36,10 @@ struct CDR { // Some CDRs have a nontrivial local <-> global cell index map. For these // CDRs, lclcellidx may be nontrivial. 
For others, the caller should provide // the index into the local cell. + // + // set_rhom must be called before set_Qm. virtual void set_rhom( - const Int& lclcellidx, + const Int& lclcellidx, const Int& rhomidx, // Current total mass in this cell. const Real& rhom) = 0; diff --git a/cedr/cedr_qlt.cpp b/cedr/cedr_qlt.cpp index e5d93da..6a04f04 100644 --- a/cedr/cedr_qlt.cpp +++ b/cedr/cedr_qlt.cpp @@ -654,13 +654,11 @@ Int QLT::gci2lci (const Int& gci) const { return it->second; } -// Set up QLT tracer metadata. Once end_tracer_declarations is called, it is -// an error to call declare_tracer again. Call declare_tracer in order of the -// tracer index in the caller's numbering. template -void QLT::declare_tracer (int problem_type) { +void QLT::declare_tracer (int problem_type, const Int& rhomidx) { cedr_throw_if( ! mdb_, "end_tracer_declarations was already called; " "it is an error to call declare_tracer now."); + cedr_throw_if(rhomidx > 0, "rhomidx > 0 is not supported yet."); // For its exception side effect, and to get canonical problem type, since // some possible problem types map to the same canonical one: problem_type = md_.get_problem_type(md_.get_problem_type_idx(problem_type)); @@ -697,15 +695,17 @@ void QLT::run () { for (size_t il = 0; il < ns_->levels.size(); ++il) { auto& lvl = ns_->levels[il]; // Set up receives. - for (size_t i = 0; i < lvl.kids.size(); ++i) { - const auto& mmd = lvl.kids[i]; - mpi::irecv(*p_, &bd_.l2r_data(mmd.offset*l2rndps), mmd.size*l2rndps, mmd.rank, - NodeSets::mpitag, &lvl.kids_req[i]); + if (lvl.kids.size()) { + for (size_t i = 0; i < lvl.kids.size(); ++i) { + const auto& mmd = lvl.kids[i]; + mpi::irecv(*p_, &bd_.l2r_data(mmd.offset*l2rndps), mmd.size*l2rndps, mmd.rank, + NodeSets::mpitag, &lvl.kids_req[i]); + } + //todo Replace with simultaneous waitany and isend. 
+ Timer::start(Timer::waitall); + mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data()); + Timer::stop(Timer::waitall); } - //todo Replace with simultaneous waitany and isend. - Timer::start(Timer::waitall); - mpi::waitall(lvl.kids_req.size(), lvl.kids_req.data()); - Timer::stop(Timer::waitall); // Combine kids' data. //todo Kernelize, interacting with waitany todo above. for (const auto& n : lvl.nodes) { @@ -734,15 +734,17 @@ void QLT::run () { } } // Send to parents. - for (size_t i = 0; i < lvl.me.size(); ++i) { - const auto& mmd = lvl.me[i]; - mpi::isend(*p_, &bd_.l2r_data(mmd.offset*l2rndps), mmd.size*l2rndps, mmd.rank, - NodeSets::mpitag, &lvl.me_req[i]); - } - if (il+1 == ns_->levels.size()) { - Timer::start(Timer::waitall); - mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); - Timer::stop(Timer::waitall); + if (lvl.me.size()) { + for (size_t i = 0; i < lvl.me.size(); ++i) { + const auto& mmd = lvl.me[i]; + mpi::isend(*p_, &bd_.l2r_data(mmd.offset*l2rndps), mmd.size*l2rndps, mmd.rank, + NodeSets::mpitag, &lvl.me_req[i]); + } + if (il+1 == ns_->levels.size()) { + Timer::start(Timer::waitall); + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + Timer::stop(Timer::waitall); + } } } Timer::stop(Timer::qltrunl2r); Timer::start(Timer::qltrunr2l); @@ -776,15 +778,17 @@ void QLT::run () { // Root to leaves. for (size_t il = ns_->levels.size(); il > 0; --il) { auto& lvl = ns_->levels[il-1]; - for (size_t i = 0; i < lvl.me.size(); ++i) { - const auto& mmd = lvl.me[i]; - mpi::irecv(*p_, &bd_.r2l_data(mmd.offset*r2lndps), mmd.size*r2lndps, mmd.rank, - NodeSets::mpitag, &lvl.me_req[i]); + if (lvl.me.size()) { + for (size_t i = 0; i < lvl.me.size(); ++i) { + const auto& mmd = lvl.me[i]; + mpi::irecv(*p_, &bd_.r2l_data(mmd.offset*r2lndps), mmd.size*r2lndps, mmd.rank, + NodeSets::mpitag, &lvl.me_req[i]); + } + //todo Replace with simultaneous waitany and isend. 
+ Timer::start(Timer::waitall); + mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); + Timer::stop(Timer::waitall); } - //todo Replace with simultaneous waitany and isend. - Timer::start(Timer::waitall); - mpi::waitall(lvl.me_req.size(), lvl.me_req.data()); - Timer::stop(Timer::waitall); // Solve QP for kids' values. //todo Kernelize, interacting with waitany todo above. Timer::start(Timer::snp); @@ -829,11 +833,12 @@ void QLT::run () { } Timer::stop(Timer::snp); // Send. - for (size_t i = 0; i < lvl.kids.size(); ++i) { - const auto& mmd = lvl.kids[i]; - mpi::isend(*p_, &bd_.r2l_data(mmd.offset*r2lndps), mmd.size*r2lndps, mmd.rank, - NodeSets::mpitag, &lvl.kids_req[i]); - } + if (lvl.kids.size()) + for (size_t i = 0; i < lvl.kids.size(); ++i) { + const auto& mmd = lvl.kids[i]; + mpi::isend(*p_, &bd_.r2l_data(mmd.offset*r2lndps), mmd.size*r2lndps, mmd.rank, + NodeSets::mpitag, &lvl.kids_req[i]); + } } // Wait on sends to clean up. for (size_t il = 0; il < ns_->levels.size(); ++il) { @@ -899,7 +904,7 @@ class TestQLT : public cedr::test::TestRandomized { void init_tracers () override { for (const auto& t : tracers_) - qlt_.declare_tracer(t.problem_type); + qlt_.declare_tracer(t.problem_type, 0); qlt_.end_tracer_declarations(); cedr_assert(qlt_.get_num_tracers() == static_cast(tracers_.size())); for (size_t i = 0; i < tracers_.size(); ++i) diff --git a/cedr/cedr_qlt.hpp b/cedr/cedr_qlt.hpp index f36bc69..e923600 100644 --- a/cedr/cedr_qlt.hpp +++ b/cedr/cedr_qlt.hpp @@ -64,7 +64,7 @@ class QLT : public cedr::CDR { // it. This is not an efficient operation. Int gci2lci(const Int& gci) const; - void declare_tracer(int problem_type) override; + void declare_tracer(int problem_type, const Int& rhomidx) override; void end_tracer_declarations() override; @@ -74,7 +74,7 @@ class QLT : public cedr::CDR { // lclcellidx is gci2lci(cellidx). 
KOKKOS_INLINE_FUNCTION - void set_rhom(const Int& lclcellidx, const Real& rhom) override; + void set_rhom(const Int& lclcellidx, const Int& rhomidx, const Real& rhom) override; // lclcellidx is gci2lci(cellidx). KOKKOS_INLINE_FUNCTION diff --git a/cedr/cedr_qlt_inl.hpp b/cedr/cedr_qlt_inl.hpp index 0c1ff4a..f07f6db 100644 --- a/cedr/cedr_qlt_inl.hpp +++ b/cedr/cedr_qlt_inl.hpp @@ -9,7 +9,7 @@ namespace cedr { namespace qlt { template KOKKOS_INLINE_FUNCTION -void QLT::set_rhom (const Int& lclcellidx, const Real& rhom) { +void QLT::set_rhom (const Int& lclcellidx, const Int& rhomidx, const Real& rhom) { const Int ndps = md_.a_d.prob2bl2r[md_.nprobtypes]; bd_.l2r_data(ndps*lclcellidx) = rhom; } diff --git a/cedr/cedr_test_1d_transport.cpp b/cedr/cedr_test_1d_transport.cpp index d0a4711..dbf2d9b 100644 --- a/cedr/cedr_test_1d_transport.cpp +++ b/cedr/cedr_test_1d_transport.cpp @@ -261,10 +261,10 @@ Int run (const mpi::Parallel::Ptr& parallel, const Input& in) { cedr_throw_if(parallel->size() > 1, "run_1d_transport_test runs in serial only."); Int nerr = 0; - Problem1D p(in.ncells, true /* nonuniform_mesh */ ); + Problem1D p(in.ncells, false /* nonuniform_mesh */ ); auto tree = qlt::tree::make_tree_over_1d_mesh(parallel, in.ncells, - true /* imbalanced */); + false /* imbalanced */); typedef qlt::QLT QLTT; QLTT qlt(parallel, in.ncells, tree); @@ -276,10 +276,10 @@ Int run (const mpi::Parallel::Ptr& parallel, const Input& in) { for (CDR* cdr : cdrs) { cdr->declare_tracer(cedr::ProblemType::conserve | - cedr::ProblemType::shapepreserve); + cedr::ProblemType::shapepreserve, 0); cdr->end_tracer_declarations(); for (Int i = 0; i < in.ncells; ++i) - cdr->set_rhom(i, p.area(i)); + cdr->set_rhom(i, 0, p.area(i)); cdr->print(std::cout); } diff --git a/cedr/cedr_test_randomized.cpp b/cedr/cedr_test_randomized.cpp index 7b48f0e..32ede9b 100644 --- a/cedr/cedr_test_randomized.cpp +++ b/cedr/cedr_test_randomized.cpp @@ -410,7 +410,7 @@ Int TestRandomized::run (const Int nrepeat, 
const bool write) { { Real* rhom = v.rhom(); for (Int i = 0; i < nlclcells; ++i) - cdr.set_rhom(i, rhom[i]); + cdr.set_rhom(i, 0, rhom[i]); } for (Int trial = 0; trial <= nrepeat; ++trial) { for (Int ti = 0; ti < nt; ++ti) { From 71beabec9cd340e8851e4b5493bbdf2004e7f07b Mon Sep 17 00:00:00 2001 From: "Andrew M. Bradley" Date: Sun, 4 Feb 2018 17:36:54 -0700 Subject: [PATCH 28/28] Configure with CMake. Add CMake configuration and tests. Remove Makefiles. Fix deprecated Kokkos usage. --- CMakeLists.txt | 81 ++++++++++++++++++++++++++++++++++++++++ README.md | 22 ++++++++++- cedr/CMakeLists.txt | 15 ++++++++ cedr/Makefile | 32 ---------------- cedr/README.md | 13 ------- cedr/cedr_qlt_inl.hpp | 3 +- cedr/cedr_test.cpp | 6 ++- cedr/make.inc.ws | 4 -- siqk/CMakeLists.txt | 13 +++++++ siqk/Makefile | 17 --------- siqk/README.md | 1 - siqk/make.inc.amb | 3 -- siqk/siqk_defs.hpp | 23 +++++++++--- siqk/siqk_intersect.hpp | 8 ++-- siqk/siqk_quadrature.hpp | 18 ++++----- siqk/siqk_runtests.py | 62 ++++++++++++++++-------------- siqk/siqk_search.hpp | 4 +- siqk/siqk_test.cpp | 18 ++++----- 18 files changed, 210 insertions(+), 133 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 cedr/CMakeLists.txt delete mode 100644 cedr/Makefile delete mode 100644 cedr/README.md delete mode 100644 cedr/make.inc.ws create mode 100644 siqk/CMakeLists.txt delete mode 100644 siqk/Makefile delete mode 100644 siqk/README.md delete mode 100644 siqk/make.inc.amb diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..175ace3 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,81 @@ +cmake_minimum_required (VERSION 3.5) + +project (compose CXX) +set (CMAKE_CXX_STANDARD 11) + +function (prc var) + message ("${var}: ${${var}}") +endfunction () + +find_package (MPI REQUIRED) + +if (Kokkos_DIR) + include (${Kokkos_DIR}/kokkos.cmake) + set (Kokkos_INCLUDE ${Kokkos_DIR}/include) +else () + message (FATAL_ERROR "COMPOSE requires Kokkos_DIR") +endif () + +set (SOURCES + 
cedr/cedr_caas.cpp + cedr/cedr_local.cpp + cedr/cedr_mpi.cpp + cedr/cedr_qlt.cpp + cedr/cedr_test.cpp + cedr/cedr_test_1d_transport.cpp + cedr/cedr_test_randomized.cpp + cedr/cedr_util.cpp) + +set (HEADERS + cedr/cedr.hpp + cedr/cedr_caas.hpp + cedr/cedr_caas_inl.hpp + cedr/cedr_cdr.hpp + cedr/cedr_kokkos.hpp + cedr/cedr_local.hpp + cedr/cedr_local_inl.hpp + cedr/cedr_mpi.hpp + cedr/cedr_mpi_inl.hpp + cedr/cedr_qlt.hpp + cedr/cedr_qlt_inl.hpp + cedr/cedr_test.hpp + cedr/cedr_test_randomized.hpp + cedr/cedr_util.hpp + siqk/siqk.hpp + siqk/siqk_defs.hpp + siqk/siqk_geometry.hpp + siqk/siqk_intersect.hpp + siqk/siqk_quadrature.hpp + siqk/siqk_search.hpp + siqk/siqk_sqr.hpp) + +if (NOT COMPOSE_TEST_MPIRUN) + set (COMPOSE_TEST_MPIRUN mpirun) +endif () +if (NOT COMPOSE_TEST_NRANK) + set (COMPOSE_TEST_NRANK 8) +endif () + +set (COMPOSE_COMPILE_FLAGS "${MPI_COMPILE_FLAGS} ${KOKKOS_CXXFLAGS} ${CMAKE_CXX_FLAGS}") +set (COMPOSE_LINK_FLAGS "${MPI_LINK_FLAGS} ${KOKKOS_LDFLAGS}") +set (COMPOSE_INCLUDES "${Kokkos_INCLUDE}") +set (COMPOSE_LIBRARIES ${MPI_LIBRARIES} ${KOKKOS_LIBS}) + +prc(MPI_COMPILE_FLAGS) +prc(MPI_LINK_FLAGS) +prc(MPI_LIBRARIES) +add_library (${PROJECT_NAME} ${SOURCES}) +set_target_properties (${PROJECT_NAME} PROPERTIES + COMPILE_FLAGS ${COMPOSE_COMPILE_FLAGS} + LINK_FLAGS ${COMPOSE_LINK_FLAGS}) +target_include_directories (${PROJECT_NAME} PUBLIC cedr siqk) +target_include_directories (${PROJECT_NAME} PRIVATE siqk cedr) +target_include_directories (${PROJECT_NAME} PUBLIC ${COMPOSE_INCLUDES}) +target_link_libraries (${PROJECT_NAME} ${COMPOSE_LIBRARIES}) + +install (TARGETS ${PROJECT_NAME} ARCHIVE DESTINATION lib) +install (FILES ${HEADERS} DESTINATION include/compose) + +enable_testing () +add_subdirectory(siqk) +add_subdirectory(cedr) diff --git a/README.md b/README.md index 8dec62e..165fdd3 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,22 @@ # COMPOSE -Compact Multi-moment Performance-Portable Semi-Lagrangian methods for non-hydrostatic dynamics +Compact 
Multi-moment Performance-Portable Semi-Lagrangian methods + +COMPOSE provides libraries for semi-Lagrangian transport and, together or +separately, property preservation. + +CEDR: Communication-Efficient Constrained Density Reconstructors. +SIQK: Spherical Polygon Intersection and Quadrature. + +First, install Kokkos: + https://github.com/kokkos/kokkos +For example, in a typical environment using OpenMP, a simple build line is: + ./kokkos/generate_makefile.bash --with-serial --with-openmp --prefix=/path/to/my/libs --compiler=g++ + make -j8 install + +Second, configure, build, and test COMPOSE: + cmake \ + -D Kokkos_DIR=/path/to/my/kokkos/install \ + -D CMAKE_INSTALL_PREFIX=/path/to/my/compose/install \ + /path/to/compose/repo + make -j8 + ctest diff --git a/cedr/CMakeLists.txt b/cedr/CMakeLists.txt new file mode 100644 index 0000000..f0f5c88 --- /dev/null +++ b/cedr/CMakeLists.txt @@ -0,0 +1,15 @@ +add_executable (cedr_test cedr_test.cpp) +set_target_properties (cedr_test PROPERTIES + COMPILE_FLAGS ${COMPOSE_COMPILE_FLAGS} + LINK_FLAGS ${COMPOSE_LINK_FLAGS}) + +target_include_directories (cedr_test PRIVATE ${COMPOSE_INCLUDES}) +target_link_libraries (cedr_test ${PROJECT_NAME} ${COMPOSE_LIBRARIES}) + +add_test (NAME cedr-test-unit + COMMAND $ -t) +add_test (NAME cedr-test-unit-mpi + COMMAND ${COMPOSE_TEST_MPIRUN} -np ${COMPOSE_TEST_NRANK} + $ -t --proc-random -nc 111 -nt 11) +add_test (NAME cedr-test-t1d + COMMAND $ -t -t1d -nc 111) diff --git a/cedr/Makefile b/cedr/Makefile deleted file mode 100644 index 99fab15..0000000 --- a/cedr/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -include make.inc - -CXXFLAGS=$(opt) -Wall -pedantic -fopenmp -std=c++11 -I$(KOKKOS)/include -DQLT_TIME -LDFLAGS=-fopenmp -L$(KOKKOS)/lib -lkokkos -ldl -LINK_LAPACK_BLAS=-llapack -lblas - -SOURCES=cedr_mpi.cpp cedr_util.cpp cedr_qlt.cpp cedr_caas.cpp cedr_local.cpp \ - cedr_test.cpp cedr_test_randomized.cpp cedr_test_1d_transport.cpp - -OBJECTS=$(SOURCES:.cpp=.o) - -.cpp.o: - $(MPICXX) $(CFLAGS)
$(CXXFLAGS) -c $< -o $@ - -all: testcedr - -testcedr: cedr_test.o $(OBJECTS) - $(MPICXX) $(OBJECTS) $(LDFLAGS) -o testcedr - -clean: - rm -f *.o testcedr - -cedr_qlt.o: cedr_mpi.hpp cedr_mpi_inl.hpp cedr_local.hpp cedr_local_inl.hpp \ - cedr_qlt.hpp cedr_qlt_inl.hpp cedr_kokkos.hpp cedr_util.hpp \ - cedr_test_randomized.hpp -cedr_caas.o: cedr_mpi.hpp cedr_mpi_inl.hpp cedr_local.hpp cedr_local_inl.hpp \ - cedr_caas.hpp cedr_caas_inl.hpp cedr_kokkos.hpp cedr_util.hpp \ - cedr_test_randomized.hpp -cedr_test.o: cedr_qlt.hpp cedr_util.hpp -cedr_test_1d_transport.o: cedr_qlt.hpp cedr_util.hpp -cedr_local.o: cedr_local_inl.hpp -cedr_test_randomized.o: cedr_qlt.hpp cedr_util.hpp cedr_test_randomized.hpp diff --git a/cedr/README.md b/cedr/README.md deleted file mode 100644 index 0417972..0000000 --- a/cedr/README.md +++ /dev/null @@ -1,13 +0,0 @@ -For clarity, suppose your your C++ compiler is g++-4.8 in what follows. But it -can be something else. - -1. Get and install the standalone Kokkos TPL: - -$ git clone https://github.com/kokkos/kokkos.git -$ ./kokkos/generate_makefile.bash --with-openmp --ldflags=-fPIC --prefix=/path/to/desired/installation --compiler=g++-4.8 - -2. cp an existing make.inc.* file to one for your machine, say, -make.inc.mymachine. Edit it with machine-specific information. 
Then - $ ln -s make.inc.machine make.inc - $ make -j2 - $ mpirun -np 4 ./testcedr -t # Look for PASS diff --git a/cedr/cedr_qlt_inl.hpp b/cedr/cedr_qlt_inl.hpp index f07f6db..fb9290f 100644 --- a/cedr/cedr_qlt_inl.hpp +++ b/cedr/cedr_qlt_inl.hpp @@ -38,7 +38,8 @@ void QLT::set_Qm (const Int& lclcellidx, const Int& tracer_idx, cedr_kernel_throw_if(true, "set_Q: invalid problem_type."); } if (problem_type & ProblemType::conserve) { - cedr_kernel_throw_if(Qm_prev < 0, "Qm_prev was not provided to set_Q."); + cedr_kernel_throw_if(Qm_prev < -0.5, + "Qm_prev was not provided to set_Q."); bd[3] = Qm_prev; } } diff --git a/cedr/cedr_test.cpp b/cedr/cedr_test.cpp index ece27e7..ebb76f1 100644 --- a/cedr/cedr_test.cpp +++ b/cedr/cedr_test.cpp @@ -74,7 +74,7 @@ struct InputParser { } // namespace cedr int main (int argc, char** argv) { - int nerr = 0; + int nerr = 0, retval = 0; MPI_Init(&argc, &argv); auto p = cedr::mpi::make_parallel(MPI_COMM_WORLD); srand(p->rank()); @@ -93,15 +93,17 @@ int main (int argc, char** argv) { { int gnerr; cedr::mpi::reduce(*p, &nerr, &gnerr, 1, MPI_SUM, p->root()); + retval = gnerr != 0 ? -1 : 0; if (p->amroot()) std::cout << (gnerr != 0 ? 
"FAIL" : "PASS") << "\n"; } } catch (const std::exception& e) { if (p->amroot()) std::cerr << e.what(); + retval = -1; } Kokkos::finalize_all(); if (nerr) prc(nerr); MPI_Finalize(); - return 0; + return retval; } diff --git a/cedr/make.inc.ws b/cedr/make.inc.ws deleted file mode 100644 index daeeeff..0000000 --- a/cedr/make.inc.ws +++ /dev/null @@ -1,4 +0,0 @@ -opt= -MPICXX=mpicxx -KOKKOS=/home/ambradl/lib/kokkos/cpu -#KOKKOS=/home/ambradl/lib/kokkos/cpu-serial diff --git a/siqk/CMakeLists.txt b/siqk/CMakeLists.txt new file mode 100644 index 0000000..9ff299c --- /dev/null +++ b/siqk/CMakeLists.txt @@ -0,0 +1,13 @@ +add_executable (siqk_test siqk_test.cpp) +set_target_properties (siqk_test PROPERTIES + COMPILE_FLAGS ${COMPOSE_COMPILE_FLAGS} + LINK_FLAGS ${COMPOSE_LINK_FLAGS}) +target_include_directories (siqk_test PRIVATE ${COMPOSE_INCLUDES}) +target_link_libraries (siqk_test ${COMPOSE_LIBRARIES}) + +configure_file (siqk_runtests.py siqk_runtests.py) + +add_test (NAME siqk-test-area + COMMAND python siqk_runtests.py $ 0) +add_test (NAME siqk-test-cube + COMMAND python siqk_runtests.py $ 1) diff --git a/siqk/Makefile b/siqk/Makefile deleted file mode 100644 index 525107a..0000000 --- a/siqk/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -include make.inc - -CXXFLAGS=$(opt) -Wall -pedantic -fopenmp -std=c++11 -I$(KOKKOS)/include -DSIQK_TIME -Wno-unused-function -LDFLAGS=-fopenmp -L$(KOKKOS)/lib -lkokkos -ldl - -OBJECTS=$(SOURCES:.cpp=.o) - -.cpp.o: - $(CXX) $(CFLAGS) $(CXXFLAGS) -c $< -o $@ - -all: siqk_test - -siqk_test: $(OBJECTS) siqk_test.o - $(CXX) $(OBJECTS) siqk_test.o $(LDFLAGS) -o siqk_test - -clean: - rm -f *.o siqk_test diff --git a/siqk/README.md b/siqk/README.md deleted file mode 100644 index da86fd7..0000000 --- a/siqk/README.md +++ /dev/null @@ -1 +0,0 @@ -Sphere Intersection and Quadrature with Kokkos. 
diff --git a/siqk/make.inc.amb b/siqk/make.inc.amb deleted file mode 100644 index 71ed8b5..0000000 --- a/siqk/make.inc.amb +++ /dev/null @@ -1,3 +0,0 @@ -opt=-O3 -CXX=g++ -KOKKOS=/home/ambradl/lib/kokkos/cpu diff --git a/siqk/siqk_defs.hpp b/siqk/siqk_defs.hpp index ad3c896..9c3cbd0 100644 --- a/siqk/siqk_defs.hpp +++ b/siqk/siqk_defs.hpp @@ -155,20 +155,20 @@ const_slice (const VT& v, Int i) { return ko::subview(v, i, ko::ALL()); } #else template KOKKOS_FORCEINLINE_FUNCTION typename VT::value_type* -slice (const VT& v, Int i) { return v.ptr_on_device() + v.dimension_1()*i; } +slice (const VT& v, Int i) { return v.data() + v.extent(1)*i; } template KOKKOS_FORCEINLINE_FUNCTION typename VT::const_value_type* -const_slice (const VT& v, Int i) { return v.ptr_on_device() + v.dimension_1()*i; } +const_slice (const VT& v, Int i) { return v.data() + v.extent(1)*i; } #endif // Number of slices in a 2D array, where each row is a slice. template KOKKOS_FORCEINLINE_FUNCTION -Int nslices (const A2D& a) { return static_cast(a.dimension_0()); } +Int nslices (const A2D& a) { return static_cast(a.extent(0)); } // Number of entries in a 2D array's row. 
template KOKKOS_FORCEINLINE_FUNCTION -Int szslice (const A2D& a) { return static_cast(a.dimension_1()); } +Int szslice (const A2D& a) { return static_cast(a.extent(1)); } template KOKKOS_INLINE_FUNCTION @@ -184,8 +184,19 @@ void resize_and_copy (DV& d, const SV& s, } template -void resize_and_copy (DV& d, const SV& s, - typename std::enable_if::type* = 0) { +void resize_and_copy ( + DV& d, const SV& s, + typename std::enable_if::type* = 0) +{ + ko::resize(d, nslices(s)); + ko::deep_copy(d, s); +} + +template +void resize_and_copy ( + DV& d, const SV& s, + typename std::enable_if::type* = 0) +{ ko::resize(d, nslices(s), szslice(s)); ko::deep_copy(d, s); } diff --git a/siqk/siqk_intersect.hpp b/siqk/siqk_intersect.hpp index d380002..6fcde7b 100644 --- a/siqk/siqk_intersect.hpp +++ b/siqk/siqk_intersect.hpp @@ -195,7 +195,7 @@ void fill_normals (sh::Mesh& m) { // Fill. Idxs::HostMirror en("en", nslices(m.e), szslice(m.e)); ko::deep_copy(en, -1); - Vec3s::HostMirror nml("nml", ne, 3); + Vec3s::HostMirror nml("nml", ne); Int ie = 0; for (Int ip = 0; ip < nslices(m.e); ++ip) for (Int iv = 0; iv < szslice(m.e); ++iv) @@ -248,9 +248,9 @@ class AreaOTFunctor { // In and out vertex lists. 
Real buf[9*max_nvert]; RawVec3s - vi(buf, max_nvert, 3), - vo(buf + 3*max_nvert, max_nvert, 3), - wrk(buf + 6*max_nvert, max_nvert, 3); + vi(buf, max_nvert), + vo(buf + 3*max_nvert, max_nvert), + wrk(buf + 6*max_nvert, max_nvert); Int ni; ni = 0; for (Int i = 0; i < szslice(e_); ++i) { diff --git a/siqk/siqk_quadrature.hpp b/siqk/siqk_quadrature.hpp index ce602ca..42164ad 100644 --- a/siqk/siqk_quadrature.hpp +++ b/siqk/siqk_quadrature.hpp @@ -566,40 +566,40 @@ class TriangleQuadrature { RawConstArray& weight) const { switch (order) { case 4: - coord = RawConstVec3s(trisym_order4_coord_, 6, 3); + coord = RawConstVec3s(trisym_order4_coord_, 6); weight = RawConstArray(trisym_order4_weight_, 6); break; case 6: - coord = RawConstVec3s(tritay_order6_coord_, 11, 3); + coord = RawConstVec3s(tritay_order6_coord_, 11); weight = RawConstArray(tritay_order6_weight_, 11); break; case 8: - coord = RawConstVec3s(trisym_order8_coord_, 16, 3); + coord = RawConstVec3s(trisym_order8_coord_, 16); weight = RawConstArray(trisym_order8_weight_, 16); break; case 12: #ifdef SIQK_USE_TRITAY12 - coord = RawConstVec3s(tritay_order12_coord_, 32, 3); + coord = RawConstVec3s(tritay_order12_coord_, 32); weight = RawConstArray(tritay_order12_weight_, 32); #else - coord = RawConstVec3s(trisym_order12_coord_, 33, 3); + coord = RawConstVec3s(trisym_order12_coord_, 33); weight = RawConstArray(trisym_order12_weight_, 33); #endif break; case 14: - coord = RawConstVec3s(trisym_order14_coord_, 46, 3); + coord = RawConstVec3s(trisym_order14_coord_, 46); weight = RawConstArray(trisym_order14_weight_, 46); break; case 16: - coord = RawConstVec3s(tritay_order16_coord_, 55, 3); + coord = RawConstVec3s(tritay_order16_coord_, 55); weight = RawConstArray(tritay_order16_weight_, 55); break; case 18: - coord = RawConstVec3s(tritay_order18_coord_, 66, 3); + coord = RawConstVec3s(tritay_order18_coord_, 66); weight = RawConstArray(tritay_order18_weight_, 66); break; case 20: - coord = 
RawConstVec3s(trisym_order20_coord_, 88, 3); + coord = RawConstVec3s(trisym_order20_coord_, 88); weight = RawConstArray(trisym_order20_weight_, 88); break; default: diff --git a/siqk/siqk_runtests.py b/siqk/siqk_runtests.py index eea9e21..8a05d9a 100755 --- a/siqk/siqk_runtests.py +++ b/siqk/siqk_runtests.py @@ -1,10 +1,12 @@ #!/usr/bin/python -import os +import os, sys quick = True +exe = sys.argv[1] +testno = int(sys.argv[2]) -stride = 1 +stride = 2 biggest = 1111 xlates = [4.2*10**f for f in range(-17, 0, stride)] @@ -17,37 +19,39 @@ fails = [] cnt = 0 -# Test 1 -for n in [4, 20, 40, 79]: - if quick and n > 20: break - for angle in angles: - cmd = ('OMP_NUM_THREADS=8 ./siqk_test --testno 1 --angle {angle:1.15e} -n {n:d}'. - format(angle=angle, n=n)) - stat = os.system(cmd + ' |& grep PASSED &> /dev/null') - if stat: - fails.append(cmd) - else: - cnt += 1 - print len(fails) - -# Test 0 -for n in [4, 50, 511, biggest]: - if quick and n > 50: break - for angle in angles: - for xlate in xlates: - for ylate in ylates: - cmd = ('OMP_NUM_THREADS=8 ./siqk_test --testno 0 --xlate {xlate:1.15e} --ylate {ylate:1.14e} --angle {angle:1.15e} -n {n:d}'. - format(xlate=xlate, ylate=ylate, angle=angle, n=n)) - stat = os.system(cmd + ' |& grep PASSED &> /dev/null') - if stat: - fails.append(cmd) - else: - cnt += 1 - print len(fails) +if testno == 0: + for n in [4, 50, 511, biggest]: + if quick and n > 50: break + for angle in angles: + for xlate in xlates: + for ylate in ylates: + cmd = ('OMP_NUM_THREADS=8 {exe:s} --testno 0 --xlate {xlate:1.15e} --ylate {ylate:1.14e} --angle {angle:1.15e} -n {n:d}'. + format(exe=exe, xlate=xlate, ylate=ylate, angle=angle, n=n)) + stat = os.system(cmd + ' |& grep PASSED &> /dev/null') + if stat: + fails.append(cmd) + else: + cnt += 1 + print len(fails) + +elif testno == 1: + for n in [4, 20, 40, 79]: + if quick and n > 20: break + for angle in angles: + cmd = ('OMP_NUM_THREADS=8 {exe:s} --testno 1 --angle {angle:1.15e} -n {n:d}'. 
+ format(exe=exe, angle=angle, n=n)) + stat = os.system(cmd + ' |& grep PASSED &> /dev/null') + if stat: + fails.append(cmd) + else: + cnt += 1 + print len(fails) if len(fails) > 0: print 'FAILED' for f in fails: print f + sys.exit(-1) else: print 'PASSED ({0:d})'.format(cnt) + sys.exit(0) diff --git a/siqk/siqk_search.hpp b/siqk/siqk_search.hpp index 381ac99..1e517e3 100644 --- a/siqk/siqk_search.hpp +++ b/siqk/siqk_search.hpp @@ -236,7 +236,7 @@ class Octree { // Get OT's bounding box. calc_bb(p, bb_); // Get elements' bounding boxes. - Vec6s::HostMirror ebbs("ebbs", nslices(e), 6); + Vec6s::HostMirror ebbs("ebbs", nslices(e)); calc_bb(p, e, ebbs); // Static element lists for work. Each level has active work space. std::vector buf(max_depth_*nslices(e)); @@ -305,7 +305,7 @@ class Octree { void init_static_ds (const DynNodes nodes, const DynIntList& offsets, const DynIntList& elems) { { - ko::resize(nodes_, nodes.n(), 8); + ko::resize(nodes_, nodes.n()); auto nodes_hm = ko::create_mirror_view(nodes_); for (Int i = 0; i < nodes.n(); ++i) for (Int j = 0; j < 8; ++j) diff --git a/siqk/siqk_test.cpp b/siqk/siqk_test.cpp index 7d73cdf..e08ab2a 100644 --- a/siqk/siqk_test.cpp +++ b/siqk/siqk_test.cpp @@ -35,7 +35,7 @@ static void make_planar_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, const Int n) { const Real d = std::sqrt(0.5); ko::resize(e, n*n, 4); - ko::resize(p, (n+1)*(n+1), 3); + ko::resize(p, (n+1)*(n+1)); for (Int iy = 0; iy < n+1; ++iy) for (Int ix = 0; ix < n+1; ++ix) { const auto idx = (n+1)*iy + ix; @@ -111,9 +111,9 @@ static void remove_unused_vertices (Vec3s::HostMirror& p, Idxs::HostMirror& e, for (Int k = 0; k < szslice(e); ++k) e(ei,k) -= adjust[e(ei,k)]; // Remove unused from p. 
- Vec3s::HostMirror pc("copy", nslices(p), szslice(p)); + Vec3s::HostMirror pc("copy", nslices(p)); ko::deep_copy(pc, p); - ko::resize(p, nslices(p) - rmcnt, szslice(p)); + ko::resize(p, nslices(p) - rmcnt); for (Int i = 0, j = 0; i < nslices(pc); ++i) { if (pc(i,0) == unused) continue; for (Int k = 0; k < szslice(pc); ++k) p(j,k) = pc(i,k); @@ -143,11 +143,11 @@ void make_cubesphere_mesh (Vec3s::HostMirror& p, Idxs::HostMirror& e, Idxs::HostMirror& e_ref = es[0]; make_planar_mesh(p_ref, e_ref, n); ko::resize(e, 6*nslices(e_ref), 4); - ko::resize(p, 6*nslices(p_ref), 3); + ko::resize(p, 6*nslices(p_ref)); for (Int i = 1; i < 6; ++i) { ko::resize(es[i], nslices(e_ref), 4); ko::deep_copy(es[i], e_ref); - ko::resize(ps[i], nslices(p_ref), 3); + ko::resize(ps[i], nslices(p_ref)); ko::deep_copy(ps[i], p_ref); transform_planar_mesh(R[i], xlate[i], ps[i]); } @@ -366,17 +366,17 @@ static Real calc_true_area ( const ConstVec3s::HostMirror& p, const ConstIdxs::HostMirror& e, const bool wm) { - Vec3s::HostMirror clip_poly("clip_poly", 4, 3), poly("poly", 4, 3), - nml("nml", 4, 3); + Vec3s::HostMirror clip_poly("clip_poly", 4), poly("poly", 4), + nml("nml", 4); fill_quad(cp, clip_poly); fill_quad(p, poly); for (Int i = 0; i < 4; ++i) Geo::edge_normal(slice(clip_poly, i), slice(clip_poly, (i+1) % 4), slice(nml, i)); - Vec3s::HostMirror vo("vo", test::max_nvert, 3); + Vec3s::HostMirror vo("vo", test::max_nvert); Int no; { - Vec3s::HostMirror wrk("wrk", test::max_nvert, 3); + Vec3s::HostMirror wrk("wrk", test::max_nvert); sh::clip_against_poly(clip_poly, nml, poly, 4, vo, no, wrk); } if (wm) {