Skip to content

Commit

Permalink
automatic referencing of duplicate geometries (most)
Browse files Browse the repository at this point in the history
  • Loading branch information
patrickbr committed Sep 3, 2024
1 parent a37821e commit febd84d
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 8 deletions.
5 changes: 5 additions & 0 deletions src/spatialjoin/SpatialJoinMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,11 @@ int main(int argc, char** argv) {

LOGTO(INFO, std::cerr) << "done (" << TOOK(ts) / 1000000000.0 << "s).";

ts = TIME();
LOGTO(INFO, std::cerr) << "Ref'ing duplicates...";
sweeper.refDuplicates();
LOGTO(INFO, std::cerr) << "done (" << TOOK(ts) / 1000000000.0 << "s).";

if (preSortCache) {
ts = TIME();
LOGTO(INFO, std::cerr) << "Pre-sorting cache...";
Expand Down
123 changes: 115 additions & 8 deletions src/spatialjoin/Sweeper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ void Sweeper::add(const I32MultiLine& a, const std::string& gid, bool side,

// _____________________________________________________________________________
void Sweeper::add(const I32MultiPoint& a, const std::string& gid, bool side,
WriteBatch& batch) const {
WriteBatch& batch) const {
uint16_t subid = 0; // a subid of 0 means "single point"
if (a.size() > 1) subid = 1;

Expand Down Expand Up @@ -97,8 +97,8 @@ void Sweeper::add(const I32MultiLine& a, const std::string& gid, size_t subId,
}

// _____________________________________________________________________________
void Sweeper::add(const I32MultiPoint& a, const std::string& gid,
size_t subid, bool side, WriteBatch& batch) const {
void Sweeper::add(const I32MultiPoint& a, const std::string& gid, size_t subid,
bool side, WriteBatch& batch) const {
size_t newId = subid;
for (const auto& point : a) {
add(point, gid, newId, side, batch);
Expand Down Expand Up @@ -875,6 +875,114 @@ void Sweeper::diskAdd(const BoxVal& bv) {
_curSweepId++;
}

// _____________________________________________________________________________
void Sweeper::refDuplicates() {
// start at beginning of _file
lseek(_file, 0, SEEK_SET);

const size_t RBUF_SIZE = 100000;
unsigned char* buf = new unsigned char[sizeof(BoxVal) * RBUF_SIZE];

ssize_t len;
size_t jj = 0;

size_t numEvents = _curSweepId;

BoxVal prev{0, 0, 0, 0, 0, POLYGON, 0, {}, 0};
bool first = false;

while ((len = util::readAll(_file, buf, sizeof(BoxVal) * RBUF_SIZE)) > 0) {
for (ssize_t i = 0; i < len; i += sizeof(BoxVal)) {
auto cur = reinterpret_cast<const BoxVal*>(buf + i);
jj++;

if (jj % 500000 == 0) {
log(std::to_string(jj / 2) + " / " + std::to_string(numEvents / 2) +
" (" + std::to_string((((1.0 * jj) / (1.0 * numEvents)) * 100)) +
"%)");
}

if (!cur->out) {
if (!first) {
// filter special event
if (cur->type != POINT || !(cur->loY == 1 && cur->upY == 0)) {
prev = *cur;
first = true;
}
continue;
}

if (cur->type == POINT && !(cur->loY == 1 && cur->upY == 0) &&
cur->type == prev.type && cur->val == prev.val &&
cur->loY == prev.loY && cur->upY == prev.upY &&
cur->b45 == prev.b45 && cur->side == prev.side) {
const auto prevC = _pointCache.get(prev.id, 0);
if (prevC->subId == 0) {
const auto curC = _pointCache.get(cur->id, 0);
_refs[prevC->id][curC->id] = curC->subId;
_lateRefGeoms.insert(cur->id);
}
}

if (cur->type == SIMPLE_LINE && cur->type == prev.type &&
cur->val == prev.val && cur->loY == prev.loY &&
cur->upY == prev.upY && cur->areaOrLen == prev.areaOrLen &&
cur->b45 == prev.b45 && cur->side == prev.side) {
const auto prevC = _simpleLineCache.get(prev.id, 0);
const auto curC = _simpleLineCache.get(cur->id, 0);
if (prevC->a == curC->a && prevC->b == curC->b) {
_refs[prevC->id][curC->id] = 0;
_lateRefGeoms.insert(cur->id);
}
}

if (cur->type == LINE && cur->type == prev.type &&
cur->val == prev.val && cur->loY == prev.loY &&
cur->upY == prev.upY && cur->areaOrLen == prev.areaOrLen &&
cur->b45 == prev.b45 && cur->side == prev.side) {
const auto prevC = _lineCache.get(prev.id, 0);
const auto curC = _lineCache.get(cur->id, 0);
if (prevC->subId == 0 && prevC->box == curC->box && prevC->geom == curC->geom) {
_refs[prevC->id][curC->id] = curC->subId;
_lateRefGeoms.insert(cur->id);
}
}

if (cur->type == POLYGON && cur->type == prev.type &&
cur->val == prev.val && cur->loY == prev.loY &&
cur->upY == prev.upY && cur->areaOrLen == prev.areaOrLen &&
cur->b45 == prev.b45 && cur->side == prev.side) {
const auto prevC = _areaCache.get(prev.id, 0);
const auto curC = _areaCache.get(cur->id, 0);
if (prevC->subId == 0 && prevC->box == curC->box && prevC->area == curC->area &&
prevC->geom == curC->geom) {
_refs[prevC->id][curC->id] = curC->subId;
_lateRefGeoms.insert(cur->id);
}
}

if (cur->type == SIMPLE_POLYGON && cur->type == prev.type &&
cur->val == prev.val && cur->loY == prev.loY &&
cur->upY == prev.upY && cur->areaOrLen == prev.areaOrLen &&
cur->b45 == prev.b45 && cur->side == prev.side) {
const auto prevC = _simpleAreaCache.get(prev.id, 0);
const auto curC = _simpleAreaCache.get(cur->id, 0);
if (prevC->geom == curC->geom) {
_refs[prevC->id][curC->id] = 0;
_lateRefGeoms.insert(cur->id);
}
}

if (cur->type != POINT || !(cur->loY == 1 && cur->upY == 0)) {
prev = *cur;
}
}
}
}

delete[] buf;
}

// _____________________________________________________________________________
void Sweeper::sortCache() {
// start at beginning of _file
Expand Down Expand Up @@ -1069,7 +1177,8 @@ RelStats Sweeper::sweep() {
if (!cur->out && cur->loY == 1 && cur->upY == 0 && cur->type == POINT) {
// special multi-IN
_activeMultis[cur->side].insert(cur->id);
} else if (!cur->out) {
} else if (!cur->out &&
_lateRefGeoms.find(cur->id) == _lateRefGeoms.end()) {
// IN event
actives[cur->side].insert({cur->loY, cur->upY},
{cur->id, cur->type, cur->b45, cur->side});
Expand Down Expand Up @@ -1100,7 +1209,7 @@ RelStats Sweeper::sweep() {
_cfg.sweepProgressCb(jj / 2);

if (jj % 200000 == 0) clearMultis(false);
} else {
} else if (_lateRefGeoms.find(cur->id) == _lateRefGeoms.end()) {
// OUT event
actives[cur->side].erase({cur->loY, cur->upY}, {cur->id, cur->type});

Expand Down Expand Up @@ -1190,7 +1299,6 @@ sj::Area Sweeper::areaFromSimpleArea(const SimpleArea* sa) const {
std::tuple<bool, bool, bool, bool, bool> Sweeper::check(const Area* a,
const Area* b,
size_t t) const {

// cheap equivalence check
if (a->box == b->box && a->area == b->area && a->geom == b->geom) {
// equivalent!
Expand Down Expand Up @@ -2238,8 +2346,7 @@ void Sweeper::doCheck(const BoxVal cur, const SweepVal sv, size_t t) {
auto b = _lineCache.get(sv.id, t);
_stats[t].timeGeoCacheRetrievalLine += TOOK(ts);

if (a->id == b->id)
return; // no self-checks in multigeometries
if (a->id == b->id) return; // no self-checks in multigeometries

_stats[t].lineCmps++;
_stats[t].lineLenSum += std::max(a->length, b->length);
Expand Down
3 changes: 3 additions & 0 deletions src/spatialjoin/Sweeper.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ class Sweeper {

RelStats sweep();
void sortCache();
void refDuplicates();

size_t numElements() const { return _curSweepId / 2; }

Expand Down Expand Up @@ -460,6 +461,8 @@ class Sweeper {

std::unordered_map<std::string, std::unordered_map<std::string, size_t>>
_refs;

std::unordered_set<size_t> _lateRefGeoms;
};
} // namespace sj

Expand Down
1 change: 1 addition & 0 deletions src/spatialjoin/tests/TestMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ std::string fullRun(const std::string& file, const sj::SweeperCfg& cfg) {


sweeper.flush();
sweeper.refDuplicates();

sweeper.sweep();

Expand Down

0 comments on commit febd84d

Please sign in to comment.