Skip to content

Commit

Permalink
implement batch delete for paths
Browse files Browse the repository at this point in the history
  • Loading branch information
jeizenga committed Jul 19, 2024
1 parent 97e42e0 commit 39cb4fd
Show file tree
Hide file tree
Showing 6 changed files with 180 additions and 60 deletions.
5 changes: 5 additions & 0 deletions bdsg/include/bdsg/hash_graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,11 @@ class HashGraph : public MutablePathDeletableHandleGraph, public SerializableHan
*/
void destroy_path(const path_handle_t& path);

/**
* Destroy the given set of paths. Invalidates handles to all the paths and their steps.
* Deleting the paths as one batch lets the implementation purge each node's
* occurrence list once for the whole batch instead of once per path.
*/
void destroy_paths(const std::vector<path_handle_t>& paths);

/**
* Create a path with the given name. The caller must ensure that no path
* with the given name exists already, or the behavior is undefined.
Expand Down
103 changes: 66 additions & 37 deletions bdsg/include/bdsg/internal/base_packed_graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,11 @@ class BasePackedGraph {
* Destroy the given path. Invalidates handles to the path and its node steps.
*/
void destroy_path(const path_handle_t& path);

/**
* Destroy the given set of paths. Invalidates handles to all the paths and their steps.
* Deleting the paths as one batch lets the implementation purge each node's
* path membership records once for the whole batch, and defragment once at
* the end, instead of repeating that work per path.
*/
void destroy_paths(const std::vector<path_handle_t>& paths);

/**
* Create a path with the given name. The caller must ensure that no path
Expand Down Expand Up @@ -2607,52 +2612,76 @@ string BasePackedGraph<Backend>::decode_path_name(const int64_t& path_idx) const

template<typename Backend>
void BasePackedGraph<Backend>::destroy_path(const path_handle_t& path) {
    // Deleting one path is simply a batch deletion whose batch has size one.
    const std::vector<path_handle_t> single_path_batch(1, path);
    destroy_paths(single_path_batch);
}

/**
 * Destroy every path in the given batch. Invalidates handles to all of the
 * paths and their steps.
 *
 * For each node touched by the batch, the node's membership list is walked
 * and every record belonging to any path in the batch is unlinked. The graph
 * is defragmented once after the whole batch has been processed.
 */
template<typename Backend>
void BasePackedGraph<Backend>::destroy_paths(const std::vector<path_handle_t>& paths) {
    
    // hashed copy of the batch so we can ask "is this path being deleted?" per record
    std::unordered_set<path_handle_t> paths_set(paths.begin(), paths.end());
    
    // nodes whose membership lists have already been purged of the whole batch
    PackedSet<Backend> nodes_visited;
    
    for (const auto& path : paths) {
        
        // the parameter shadows the member container of paths, hence this->
        PackedPath& packed_path = this->paths.at(as_integer(path));
        
        // remove node membership records corresponding to this path
        bool first_iter = true;
        for (uint64_t step_offset = path_head_iv.get(as_integer(path));
             step_offset != 0 && (step_offset != path_head_iv.get(as_integer(path)) || first_iter);
             step_offset = get_step_next(packed_path, step_offset)) {
            
            // Clear the flag before anything can `continue`: the loop condition
            // relies on it to stop once the walk comes back around to the head
            // step. If it were only cleared at the bottom of the body, a path
            // whose links return to the head and whose nodes were all visited
            // by earlier paths in the batch would never terminate.
            first_iter = false;
            
            uint64_t trav = get_step_trav(packed_path, step_offset);
            // if there are multiple paths, we check for whether we've gone over the same
            // node multiple times (which would be wasteful)
            if (paths.size() > 1) {
                nid_t node_id = get_id(decode_traversal(trav));
                if (nodes_visited.find(node_id)) {
                    // every record of the batch was already removed from this node
                    continue;
                }
                nodes_visited.insert(node_id);
            }
            
            size_t node_member_idx = graph_index_to_node_member_index(graph_iv_index(decode_traversal(trav)));
            
            // walk the node's whole membership list, unlinking every record that
            // belongs to a path in the batch (0 terminates the list and also
            // marks "no previous record")
            size_t prev = 0;
            size_t here = path_membership_node_iv.get(node_member_idx);
            while (here) {
                if (paths_set.count(as_path_handle(get_membership_path(here)))) {
                    // this is a membership record for a path that we're deleting
                    if (prev == 0) {
                        // this was the first record, set following one to be the head
                        path_membership_node_iv.set(node_member_idx, get_next_membership(here));
                    }
                    else {
                        // make the link from the previous record skip over the current one
                        set_next_membership(prev, get_next_membership(here));
                    }
                    
                    ++deleted_membership_records;
                }
                else {
                    // record survives, so it becomes the predecessor of whatever follows
                    prev = here;
                }
                
                here = get_next_membership(here);
            }
        }
        
        // forget the path's name and reset all of its per-path bookkeeping
        path_id.erase(extract_encoded_path_name(as_integer(path)));
        
        path_is_deleted_iv.set(as_integer(path), true);
        packed_path.steps_iv.clear();
        packed_path.links_iv.clear();
        path_head_iv.set(as_integer(path), 0);
        path_tail_iv.set(as_integer(path), 0);
        path_deleted_steps_iv.set(as_integer(path), 0);
    }
    
    // reclaim storage once for the whole batch
    defragment();
}

Expand Down
10 changes: 4 additions & 6 deletions bdsg/include/bdsg/internal/packed_structs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1370,9 +1370,6 @@ inline uint64_t PackedSet<Backend>::optimal_anchor() const {

template<typename Backend>
inline void PackedSet<Backend>::rehash(bool shrink) {
// Keeping the RNG state in the class isn't bvery portable, so we make one
// per thread in thread storage.
static thread_local default_random_engine gen(random_device{}());

// move to the next size in the schedule
if (shrink) {
Expand All @@ -1390,6 +1387,10 @@ inline void PackedSet<Backend>::rehash(bool shrink) {
PackedVec new_table;
new_table.resize(bdsg_packed_set_size_schedule[schedule_val]);

// we use random-at-coding-time linear congruential generator to get a pseudo-random seed
// that makes the coefficients in a deterministic, system-independent way
uint64_t seed = 17340519333326003581ull * (schedule_val + 14057138822558802453ull) + 6918838906415272680ull;
std::mt19937_64 gen(seed);
std::uniform_int_distribution<uint64_t> distr(0, new_table.size() - 1);
for (size_t i = 0; i < 5; ++i) {
coefs[i] = distr(gen);
Expand Down Expand Up @@ -1511,9 +1512,6 @@ inline void PackedSet<Backend>::set_load_factors(double min_load_factor, double
assert(max_load_factor < 1.0);
min_load = min_load_factor;
max_load = max_load_factor;
assert(max_load > min_load);
assert(min_load >= 0.0);
assert(max_load < 1.0);
if (table.size() > 0) {
while (num_items <= min_load * table.size()) {
rehash(true);
Expand Down
53 changes: 37 additions & 16 deletions bdsg/src/hash_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -592,25 +592,46 @@ namespace bdsg {
}
return true;
}

void HashGraph::destroy_path(const path_handle_t& path) {
destroy_paths({path});
}

/// Destroy every path in the given batch. Invalidates handles to all of the
/// paths and their steps.
///
/// Each node reached by a step of a batch path has its occurrence list purged
/// of every mapping that belongs to any path in the batch; afterwards the path
/// itself is erased from the name index and from the path container.
void HashGraph::destroy_paths(const vector<path_handle_t>& paths) {
    
    // integer IDs of all paths in the batch, for per-occurrence membership queries
    unordered_set<int64_t> path_ids;
    for (auto path : paths) {
        path_ids.emplace(as_integer(path));
    }
    // nodes whose occurrence lists have already been purged of the whole batch
    unordered_set<nid_t> nodes_visited;
    
    for (auto path : paths) {
        // remove the records of nodes occurring on this path
        for_each_step_in_path(path, [&](const step_handle_t& step) {
            path_mapping_t* mapping = (path_mapping_t*) intptr_t(as_integers(step)[1]);
            if (paths.size() > 1) {
                bool did_insert = nodes_visited.insert(get_id(mapping->handle)).second;
                if (!did_insert) {
                    // we've already deleted on this node
                    return;
                }
            }
            vector<path_mapping_t*>& node_occs = graph[get_id(mapping->handle)].occurrences;
            // swap-and-pop removal: the element moved into slot i has not been
            // examined yet, so i only advances when the current entry is kept
            for (size_t i = 0; i < node_occs.size(); ) {
                if (path_ids.count(node_occs[i]->path_id)) {
                    node_occs[i] = node_occs.back();
                    node_occs.pop_back();
                }
                else {
                    ++i;
                }
            }
        });
        
        // erase the path itself (the parameter shadows the member container, hence this->)
        path_id.erase(this->paths[as_integer(path)].name);
        this->paths.erase(as_integer(path));
    }
}

path_handle_t HashGraph::create_path_handle(const string& name, bool is_circular) {
Expand Down
67 changes: 67 additions & 0 deletions bdsg/src/test_libbdsg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2285,6 +2285,73 @@ void test_deletable_handle_graphs() {
}
}

// batch deletion of paths works as expected
{
    vector<MutablePathDeletableHandleGraph*> implementations;
    
    // Add implementations
    
    PackedGraph pg;
    implementations.push_back(&pg);
    
    HashGraph hg;
    implementations.push_back(&hg);
    
    MappedPackedGraph mpg;
    implementations.push_back(&mpg);
    
    for (MutablePathDeletableHandleGraph* implementation : implementations) {
        
        MutablePathDeletableHandleGraph& graph = *implementation;
        
        // a three-node chain that every path walks end to end
        auto h1 = graph.create_handle("A");
        auto h2 = graph.create_handle("A");
        auto h3 = graph.create_handle("A");
        
        graph.create_edge(h1, h2);
        graph.create_edge(h2, h3);
        
        auto p1 = graph.create_path_handle("1");
        auto p2 = graph.create_path_handle("2");
        auto p3 = graph.create_path_handle("3");
        auto p4 = graph.create_path_handle("4");
        auto p5 = graph.create_path_handle("5");
        
        for (const auto& p : {p1, p2, p3, p4, p5}) {
            for (auto h : {h1, h2, h3}) {
                graph.append_step(p, h);
            }
        }
        
        // delete three of the five paths in a single batch
        graph.destroy_paths({p1, p3, p4});
        
        // the surviving paths are exactly p2 and p5, each still visiting h1, h2, h3
        set<path_handle_t> paths_seen;
        set<path_handle_t> paths_expected{p2, p5};
        graph.for_each_path_handle([&](const path_handle_t& path) {
            assert(!paths_seen.count(path));
            paths_seen.insert(path);
            std::vector<handle_t> handles;
            std::vector<handle_t> handles_expected{h1, h2, h3};
            for (auto h : graph.scan_path(path)) {
                handles.push_back(h);
            }
            assert(handles == handles_expected);
        });
        
        assert(paths_seen == paths_expected);
        
        // every node must report steps from exactly the surviving paths, once each
        graph.for_each_handle([&](const handle_t& h) {
            set<path_handle_t> paths;
            graph.for_each_step_on_handle(h, [&](const step_handle_t& step) {
                auto p = graph.get_path_handle_of_step(step);
                assert(!paths.count(p));
                paths.insert(p);
            });
            // compare the per-node set that was just collected (not paths_seen,
            // which was already verified above)
            assert(paths == paths_expected);
        });
    }
}

cerr << "DeletableHandleGraph tests successful!" << endl;
}

Expand Down

0 comments on commit 39cb4fd

Please sign in to comment.