From febd84d20becf6250dc9a51ce80ec8246790b32a Mon Sep 17 00:00:00 2001 From: Patrick Brosi Date: Tue, 3 Sep 2024 15:16:54 +0200 Subject: [PATCH] automatic referencing of duplicate geometries (most) --- src/spatialjoin/SpatialJoinMain.cpp | 5 ++ src/spatialjoin/Sweeper.cpp | 123 ++++++++++++++++++++++++++-- src/spatialjoin/Sweeper.h | 3 + src/spatialjoin/tests/TestMain.cpp | 1 + 4 files changed, 124 insertions(+), 8 deletions(-) diff --git a/src/spatialjoin/SpatialJoinMain.cpp b/src/spatialjoin/SpatialJoinMain.cpp index 3f33d16..fb50ba8 100755 --- a/src/spatialjoin/SpatialJoinMain.cpp +++ b/src/spatialjoin/SpatialJoinMain.cpp @@ -295,6 +295,11 @@ int main(int argc, char** argv) { LOGTO(INFO, std::cerr) << "done (" << TOOK(ts) / 1000000000.0 << "s)."; + ts = TIME(); + LOGTO(INFO, std::cerr) << "Ref'ing duplicates..."; + sweeper.refDuplicates(); + LOGTO(INFO, std::cerr) << "done (" << TOOK(ts) / 1000000000.0 << "s)."; + if (preSortCache) { ts = TIME(); LOGTO(INFO, std::cerr) << "Pre-sorting cache..."; diff --git a/src/spatialjoin/Sweeper.cpp b/src/spatialjoin/Sweeper.cpp index b344617..0c168a6 100644 --- a/src/spatialjoin/Sweeper.cpp +++ b/src/spatialjoin/Sweeper.cpp @@ -69,7 +69,7 @@ void Sweeper::add(const I32MultiLine& a, const std::string& gid, bool side, // _____________________________________________________________________________ void Sweeper::add(const I32MultiPoint& a, const std::string& gid, bool side, - WriteBatch& batch) const { + WriteBatch& batch) const { uint16_t subid = 0; // a subid of 0 means "single point" if (a.size() > 1) subid = 1; @@ -97,8 +97,8 @@ void Sweeper::add(const I32MultiLine& a, const std::string& gid, size_t subId, } // _____________________________________________________________________________ -void Sweeper::add(const I32MultiPoint& a, const std::string& gid, - size_t subid, bool side, WriteBatch& batch) const { +void Sweeper::add(const I32MultiPoint& a, const std::string& gid, size_t subid, + bool side, WriteBatch& batch) const { size_t newId = subid; for (const auto& point : a) { add(point, gid, newId, side, batch); @@ -875,6 +875,114 @@ void Sweeper::diskAdd(const BoxVal& bv) { _curSweepId++; } +// _____________________________________________________________________________ +void Sweeper::refDuplicates() { + // start at beginning of _file + lseek(_file, 0, SEEK_SET); + + const size_t RBUF_SIZE = 100000; + unsigned char* buf = new unsigned char[sizeof(BoxVal) * RBUF_SIZE]; + + ssize_t len; + size_t jj = 0; + + size_t numEvents = _curSweepId; + + BoxVal prev{0, 0, 0, 0, 0, POLYGON, 0, {}, 0}; + bool first = false; + + while ((len = util::readAll(_file, buf, sizeof(BoxVal) * RBUF_SIZE)) > 0) { + for (ssize_t i = 0; i < len; i += sizeof(BoxVal)) { + auto cur = reinterpret_cast(buf + i); + jj++; + + if (jj % 500000 == 0) { + log(std::to_string(jj / 2) + " / " + std::to_string(numEvents / 2) + + " (" + std::to_string((((1.0 * jj) / (1.0 * numEvents)) * 100)) + + "%)"); + } + + if (!cur->out) { + if (!first) { + // filter special event + if (cur->type != POINT || !(cur->loY == 1 && cur->upY == 0)) { + prev = *cur; + first = true; + } + continue; + } + + if (cur->type == POINT && !(cur->loY == 1 && cur->upY == 0) && + cur->type == prev.type && cur->val == prev.val && + cur->loY == prev.loY && cur->upY == prev.upY && + cur->b45 == prev.b45 && cur->side == prev.side) { + const auto prevC = _pointCache.get(prev.id, 0); + if (prevC->subId == 0) { + const auto curC = _pointCache.get(cur->id, 0); + _refs[prevC->id][curC->id] = curC->subId; + _lateRefGeoms.insert(cur->id); + } + } + + if (cur->type == SIMPLE_LINE && cur->type == prev.type && + cur->val == prev.val && cur->loY == prev.loY && + cur->upY == prev.upY && cur->areaOrLen == prev.areaOrLen && + cur->b45 == prev.b45 && cur->side == prev.side) { + const auto prevC = _simpleLineCache.get(prev.id, 0); + const auto curC = _simpleLineCache.get(cur->id, 0); + if (prevC->a == curC->a && prevC->b == curC->b) { + _refs[prevC->id][curC->id] = 0; + _lateRefGeoms.insert(cur->id); + } + } + + if (cur->type == LINE && cur->type == prev.type && + cur->val == prev.val && cur->loY == prev.loY && + cur->upY == prev.upY && cur->areaOrLen == prev.areaOrLen && + cur->b45 == prev.b45 && cur->side == prev.side) { + const auto prevC = _lineCache.get(prev.id, 0); + const auto curC = _lineCache.get(cur->id, 0); + if (prevC->subId == 0 && prevC->box == curC->box && prevC->geom == curC->geom) { + _refs[prevC->id][curC->id] = curC->subId; + _lateRefGeoms.insert(cur->id); + } + } + + if (cur->type == POLYGON && cur->type == prev.type && + cur->val == prev.val && cur->loY == prev.loY && + cur->upY == prev.upY && cur->areaOrLen == prev.areaOrLen && + cur->b45 == prev.b45 && cur->side == prev.side) { + const auto prevC = _areaCache.get(prev.id, 0); + const auto curC = _areaCache.get(cur->id, 0); + if (prevC->subId == 0 && prevC->box == curC->box && prevC->area == curC->area && + prevC->geom == curC->geom) { + _refs[prevC->id][curC->id] = curC->subId; + _lateRefGeoms.insert(cur->id); + } + } + + if (cur->type == SIMPLE_POLYGON && cur->type == prev.type && + cur->val == prev.val && cur->loY == prev.loY && + cur->upY == prev.upY && cur->areaOrLen == prev.areaOrLen && + cur->b45 == prev.b45 && cur->side == prev.side) { + const auto prevC = _simpleAreaCache.get(prev.id, 0); + const auto curC = _simpleAreaCache.get(cur->id, 0); + if (prevC->geom == curC->geom) { + _refs[prevC->id][curC->id] = 0; + _lateRefGeoms.insert(cur->id); + } + } + + if (cur->type != POINT || !(cur->loY == 1 && cur->upY == 0)) { + prev = *cur; + } + } + } + } + + delete[] buf; +} + // _____________________________________________________________________________ void Sweeper::sortCache() { // start at beginning of _file @@ -1069,7 +1177,8 @@ RelStats Sweeper::sweep() { if (!cur->out && cur->loY == 1 && cur->upY == 0 && cur->type == POINT) { // special multi-IN _activeMultis[cur->side].insert(cur->id); - } else if (!cur->out) { + } else if (!cur->out && + _lateRefGeoms.find(cur->id) == _lateRefGeoms.end()) { // IN event actives[cur->side].insert({cur->loY, cur->upY}, {cur->id, cur->type, cur->b45, cur->side}); @@ -1100,7 +1209,7 @@ RelStats Sweeper::sweep() { _cfg.sweepProgressCb(jj / 2); if (jj % 200000 == 0) clearMultis(false); - } else { + } else if (_lateRefGeoms.find(cur->id) == _lateRefGeoms.end()) { // OUT event actives[cur->side].erase({cur->loY, cur->upY}, {cur->id, cur->type}); @@ -1190,7 +1299,6 @@ sj::Area Sweeper::areaFromSimpleArea(const SimpleArea* sa) const { std::tuple Sweeper::check(const Area* a, const Area* b, size_t t) const { - // cheap equivalence check if (a->box == b->box && a->area == b->area && a->geom == b->geom) { // equivalent! @@ -2238,8 +2346,7 @@ void Sweeper::doCheck(const BoxVal cur, const SweepVal sv, size_t t) { auto b = _lineCache.get(sv.id, t); _stats[t].timeGeoCacheRetrievalLine += TOOK(ts); - if (a->id == b->id) - return; // no self-checks in multigeometries + if (a->id == b->id) return; // no self-checks in multigeometries _stats[t].lineCmps++; _stats[t].lineLenSum += std::max(a->length, b->length); diff --git a/src/spatialjoin/Sweeper.h b/src/spatialjoin/Sweeper.h index b1f6556..07ef9cf 100644 --- a/src/spatialjoin/Sweeper.h +++ b/src/spatialjoin/Sweeper.h @@ -256,6 +256,7 @@ class Sweeper { RelStats sweep(); void sortCache(); + void refDuplicates(); size_t numElements() const { return _curSweepId / 2; } @@ -460,6 +461,8 @@ class Sweeper { std::unordered_map> _refs; + + std::unordered_set _lateRefGeoms; }; } // namespace sj diff --git a/src/spatialjoin/tests/TestMain.cpp b/src/spatialjoin/tests/TestMain.cpp index 9942ae3..85222a7 100644 --- a/src/spatialjoin/tests/TestMain.cpp +++ b/src/spatialjoin/tests/TestMain.cpp @@ -49,6 +49,7 @@ std::string fullRun(const std::string& file, const sj::SweeperCfg& cfg) { sweeper.flush(); + sweeper.refDuplicates(); sweeper.sweep();