From 78b639c64a2ca8925009d3c712c8ae1ab9c2da49 Mon Sep 17 00:00:00 2001 From: jermp Date: Sun, 20 Mar 2022 22:28:26 +0100 Subject: [PATCH 01/40] compute runs --- include/builder/build.cpp | 166 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/include/builder/build.cpp b/include/builder/build.cpp index 9dab699..aa61015 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -113,6 +113,14 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b } }; + struct seq_abundance_endpoints { + seq_abundance_endpoints(uint32_t seq_id_, uint32_t first_ab_, uint32_t last_ab_) + : seq_id(seq_id_), first_ab(first_ab_), last_ab(last_ab_) {} + uint32_t seq_id, first_ab, last_ab; + }; + + std::vector seq_abundance_endpoints_vec; + auto parse_header = [&]() { if (sequence.empty()) return; @@ -156,11 +164,17 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b bool kmers_have_all_mfa = true; bool kmers_have_different_abundances = false; + uint64_t first_ab = constants::invalid; + uint64_t last_ab = constants::invalid; + for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid; j != seq_len - k + 1; ++j, ++num_kmers) { uint64_t ab = std::strtoull(sequence.data() + i, &end, 10); i = sequence.find_first_of(' ', i) + 1; + if (j == 0) { first_ab = ab; } + if (j == seq_len - k) { last_ab = ab; } + if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false; if (j > 0) { if (ab != prev_ab) { kmers_have_different_abundances = true; } @@ -197,6 +211,9 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b num_sequences_diff_abs += kmers_have_different_abundances; num_sequences_all_mfa += kmers_have_all_mfa; + + seq_abundance_endpoints_vec.emplace_back(num_sequences, first_ab, last_ab); + // std::cerr << "seq:" << num_sequences << "[" << first_ab << "," << last_ab << "]\n"; }; while (!is.eof()) { @@ -278,6 +295,155 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b << "%)" << std::endl; data.abundances_builder.push_abundance_interval(ab_value, ab_length); data.abundances_builder.finalize(data.num_kmers); + + /* sort on first component */ + std::sort(seq_abundance_endpoints_vec.begin(), seq_abundance_endpoints_vec.end(), + [](auto const& x, auto const& y) { + if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab; + if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab; + return x.seq_id < y.seq_id; + }); + + /* (abundance, num_seqs_with_front_abundance=abundance) */ + std::unordered_map abundance_map; + for (auto const& x : seq_abundance_endpoints_vec) { + // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab << "]\n"; + auto it = abundance_map.find(x.first_ab); + if (it != abundance_map.cend()) { // found + (*it).second += 1; + } else { + abundance_map[x.first_ab] = 1; + } + } + + std::cout << "calculating lower bound..." << std::endl; + uint64_t R = data.abundances_builder.num_abundance_intervals(); + + for (auto const& x : seq_abundance_endpoints_vec) { + uint64_t back = x.last_ab; + auto it = abundance_map.find(back); + if (it != abundance_map.cend()) { // found + if ((*it).second > 0) { // if it is 0, we cannot find a match + (*it).second -= 1; + // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab + // << "] got paired"; + // std::cerr << " (remaining " << (*it).second << ")\n"; + R -= 1; + } + // else { + // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab + // << "] NOT paired!\n"; + // } + } + } + std::cout << "R = " << R << std::endl; + + assert(num_sequences == seq_abundance_endpoints_vec.size()); + + // visited array + std::vector visited(num_sequences, false); + + // edges: a map from abundance "x" to offset "t", where offset "t" is the position into + // seq_abundance_endpoints_vec of the next egde having first abundance equal to "x" + for (auto& p : abundance_map) { p.second = 0; } + for (auto const& x : seq_abundance_endpoints_vec) { + uint64_t front = x.first_ab; + assert(abundance_map.find(front) != abundance_map.cend()); // found + abundance_map[front] += 1; + } + { + std::vector> offsets; // (abundance,offset) + offsets.reserve(abundance_map.size()); + for (auto const& p : abundance_map) { offsets.emplace_back(p.first, p.second); } + assert(offsets.size() > 0); + std::sort(offsets.begin(), offsets.end(), + [](auto const& x, auto const& y) { return x.first < y.first; }); + uint64_t offset = 0; + for (auto const& p : offsets) { + uint64_t ab = p.first; + abundance_map[ab] = offset; + offset += p.second; + } + } + + typedef std::vector walk_t; + walk_t walk; + std::vector walks; + + while (true) { + walk.clear(); + + // take an unvisited vertex + uint64_t i = 0; + for (; i != num_sequences; ++i) { + uint64_t seq_id = seq_abundance_endpoints_vec[i].seq_id; + if (visited[seq_id] == false) break; + } + + if (i == num_sequences) break; // all visited + + // std::cout << " i = " << i << std::endl; + + while (true) { + uint64_t seq_id = seq_abundance_endpoints_vec[i].seq_id; + if (visited[seq_id] == true) break; + + visited[seq_id] = true; + uint64_t front = seq_abundance_endpoints_vec[i].first_ab; + uint64_t back = seq_abundance_endpoints_vec[i].last_ab; + walk.emplace_back(seq_id, front, back); + // std::cout << "added " << seq_id << " to walk-" << walks.size() << std::endl; + + uint64_t next_offset = abundance_map[back]; + + while (visited[seq_id] == true) { + next_offset += 1; + if (next_offset == num_sequences) break; + auto ab_endpoint = seq_abundance_endpoints_vec[next_offset]; + if (ab_endpoint.first_ab != back) break; + seq_id = ab_endpoint.seq_id; + } + + if (next_offset == num_sequences) { + abundance_map[back] = next_offset - 1; + break; + } + + if (seq_abundance_endpoints_vec[next_offset].first_ab != back) { + abundance_map[back] = next_offset - 1; + i = next_offset - 1; + } else { + abundance_map[back] = next_offset; + i = next_offset; + } + } + + if (!walk.empty()) { + // add walk to walks + walks.push_back(walk); + std::cout << "walks.size() = " << walks.size() << std::endl; + } + } + + uint64_t num_runs = data.abundances_builder.num_abundance_intervals(); + std::cout << "computed " << walks.size() << " walks" << std::endl; + std::fill(visited.begin(), visited.end(), false); + for (auto const& walk : walks) { + num_runs -= walk.size() - 1; + uint64_t prev_back = walk.front().first_ab; + for (auto const& w : walk) { + if (visited[w.seq_id] == true) { + std::cout << "ERROR: duplicate vertex." << std::endl; + } + if (w.first_ab != prev_back) { std::cout << "ERROR: path is broken." << std::endl; } + prev_back = w.last_ab; + visited[w.seq_id] = true; + std::cout << w.seq_id << ":[" << w.first_ab << "," << w.last_ab << "] "; + } + std::cout << std::endl; + } + + std::cout << "num_runs " << num_runs << std::endl; } } From 8d1dc2addf2a082117def7d8be92e9cc6fae8dfe Mon Sep 17 00:00:00 2001 From: jermp Date: Sun, 20 Mar 2022 22:59:46 +0100 Subject: [PATCH 02/40] compute runs with cover --- include/builder/build.cpp | 138 ++++--------------------------------- include/builder/cover.hpp | 140 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 124 deletions(-) create mode 100644 include/builder/cover.hpp diff --git a/include/builder/build.cpp b/include/builder/build.cpp index aa61015..4ee8592 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -8,6 +8,7 @@ #include "../../external/pthash/external/essentials/include/essentials.hpp" #include "build_util_types.hpp" +#include "cover.hpp" #include // for std::partial_sum #include @@ -113,13 +114,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b } }; - struct seq_abundance_endpoints { - seq_abundance_endpoints(uint32_t seq_id_, uint32_t first_ab_, uint32_t last_ab_) - : seq_id(seq_id_), first_ab(first_ab_), last_ab(last_ab_) {} - uint32_t seq_id, first_ab, last_ab; - }; - - std::vector seq_abundance_endpoints_vec; + std::vector vertices; auto parse_header = [&]() { if (sequence.empty()) return; @@ -212,7 +207,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b num_sequences_diff_abs += kmers_have_different_abundances; num_sequences_all_mfa += kmers_have_all_mfa; - seq_abundance_endpoints_vec.emplace_back(num_sequences, first_ab, last_ab); + vertices.emplace_back(num_sequences, first_ab, last_ab); // std::cerr << "seq:" << num_sequences << "[" << first_ab << "," << last_ab << "]\n"; }; @@ -296,17 +291,15 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b data.abundances_builder.push_abundance_interval(ab_value, ab_length); data.abundances_builder.finalize(data.num_kmers); - /* sort on first component */ - std::sort(seq_abundance_endpoints_vec.begin(), seq_abundance_endpoints_vec.end(), - [](auto const& x, auto const& y) { - if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab; - if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab; - return x.seq_id < y.seq_id; - }); + std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { + if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab; + if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab; + return x.seq_id < y.seq_id; + }); /* (abundance, num_seqs_with_front_abundance=abundance) */ std::unordered_map abundance_map; - for (auto const& x : seq_abundance_endpoints_vec) { + for (auto const& x : vertices) { // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab << "]\n"; auto it = abundance_map.find(x.first_ab); if (it != abundance_map.cend()) { // found @@ -317,9 +310,10 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b } std::cout << "calculating lower bound..." << std::endl; - uint64_t R = data.abundances_builder.num_abundance_intervals(); + uint64_t num_runs_abundances = data.abundances_builder.num_abundance_intervals(); - for (auto const& x : seq_abundance_endpoints_vec) { + uint64_t R = num_runs_abundances; + for (auto const& x : vertices) { uint64_t back = x.last_ab; auto it = abundance_map.find(back); if (it != abundance_map.cend()) { // found @@ -338,112 +332,8 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b } std::cout << "R = " << R << std::endl; - assert(num_sequences == seq_abundance_endpoints_vec.size()); - - // visited array - std::vector visited(num_sequences, false); - - // edges: a map from abundance "x" to offset "t", where offset "t" is the position into - // seq_abundance_endpoints_vec of the next egde having first abundance equal to "x" - for (auto& p : abundance_map) { p.second = 0; } - for (auto const& x : seq_abundance_endpoints_vec) { - uint64_t front = x.first_ab; - assert(abundance_map.find(front) != abundance_map.cend()); // found - abundance_map[front] += 1; - } - { - std::vector> offsets; // (abundance,offset) - offsets.reserve(abundance_map.size()); - for (auto const& p : abundance_map) { offsets.emplace_back(p.first, p.second); } - assert(offsets.size() > 0); - std::sort(offsets.begin(), offsets.end(), - [](auto const& x, auto const& y) { return x.first < y.first; }); - uint64_t offset = 0; - for (auto const& p : offsets) { - uint64_t ab = p.first; - abundance_map[ab] = offset; - offset += p.second; - } - } - - typedef std::vector walk_t; - walk_t walk; - std::vector walks; - - while (true) { - walk.clear(); - - // take an unvisited vertex - uint64_t i = 0; - for (; i != num_sequences; ++i) { - uint64_t seq_id = seq_abundance_endpoints_vec[i].seq_id; - if (visited[seq_id] == false) break; - } - - if (i == num_sequences) break; // all visited - - // std::cout << " i = " << i << std::endl; - - while (true) { - uint64_t seq_id = seq_abundance_endpoints_vec[i].seq_id; - if (visited[seq_id] == true) break; - - visited[seq_id] = true; - uint64_t front = seq_abundance_endpoints_vec[i].first_ab; - uint64_t back = seq_abundance_endpoints_vec[i].last_ab; - walk.emplace_back(seq_id, front, back); - // std::cout << "added " << seq_id << " to walk-" << walks.size() << std::endl; - - uint64_t next_offset = abundance_map[back]; - - while (visited[seq_id] == true) { - next_offset += 1; - if (next_offset == num_sequences) break; - auto ab_endpoint = seq_abundance_endpoints_vec[next_offset]; - if (ab_endpoint.first_ab != back) break; - seq_id = ab_endpoint.seq_id; - } - - if (next_offset == num_sequences) { - abundance_map[back] = next_offset - 1; - break; - } - - if (seq_abundance_endpoints_vec[next_offset].first_ab != back) { - abundance_map[back] = next_offset - 1; - i = next_offset - 1; - } else { - abundance_map[back] = next_offset; - i = next_offset; - } - } - - if (!walk.empty()) { - // add walk to walks - walks.push_back(walk); - std::cout << "walks.size() = " << walks.size() << std::endl; - } - } - - uint64_t num_runs = data.abundances_builder.num_abundance_intervals(); - std::cout << "computed " << walks.size() << " walks" << std::endl; - std::fill(visited.begin(), visited.end(), false); - for (auto const& walk : walks) { - num_runs -= walk.size() - 1; - uint64_t prev_back = walk.front().first_ab; - for (auto const& w : walk) { - if (visited[w.seq_id] == true) { - std::cout << "ERROR: duplicate vertex." << std::endl; - } - if (w.first_ab != prev_back) { std::cout << "ERROR: path is broken." << std::endl; } - prev_back = w.last_ab; - visited[w.seq_id] = true; - std::cout << w.seq_id << ":[" << w.first_ab << "," << w.last_ab << "] "; - } - std::cout << std::endl; - } - - std::cout << "num_runs " << num_runs << std::endl; + cover c; + c.compute(vertices, num_runs_abundances); } } diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp new file mode 100644 index 0000000..ee41929 --- /dev/null +++ b/include/builder/cover.hpp @@ -0,0 +1,140 @@ +#pragma once + +namespace sshash { + +struct vertex { + vertex(uint32_t s, uint32_t f, uint32_t b) : seq_id(s), first_ab(f), last_ab(b) {} + uint32_t seq_id, first_ab, last_ab; +}; + +struct cover { + typedef std::vector walk_t; + + void compute(std::vector& vertices, + uint64_t num_runs_abundances // TODO: remove from here + ) { + /* (abundance, num_seqs_with_front_abundance=abundance) */ + std::unordered_map abundance_map; + std::vector visited; + + while (true) { + uint64_t num_vertices = vertices.size(); + + std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { + if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab; + if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab; + return x.seq_id < y.seq_id; + }); + + visited.resize(num_vertices); + std::fill(visited.begin(), visited.end(), false); + + for (auto const& x : vertices) { + auto it = abundance_map.find(x.first_ab); + if (it != abundance_map.cend()) { // found + (*it).second += 1; + } else { + abundance_map[x.first_ab] = 1; + } + } + + { + std::vector> offsets; // (abundance,offset) + offsets.reserve(abundance_map.size()); + for (auto const& p : abundance_map) { offsets.emplace_back(p.first, p.second); } + assert(offsets.size() > 0); + std::sort(offsets.begin(), offsets.end(), + [](auto const& x, auto const& y) { return x.first < y.first; }); + uint64_t offset = 0; + for (auto const& p : offsets) { + uint64_t ab = p.first; + abundance_map[ab] = offset; + offset += p.second; + } + } + + walk_t walk; + + while (true) { + walk.clear(); + + // take an unvisited vertex + uint64_t i = 0; + for (; i != num_vertices; ++i) { + uint64_t seq_id = vertices[i].seq_id; + if (visited[seq_id] == false) break; + } + + if (i == num_vertices) break; // all visited + + // std::cout << " i = " << i << std::endl; + + while (true) { + uint64_t seq_id = vertices[i].seq_id; + if (visited[seq_id] == true) break; + + visited[seq_id] = true; + uint64_t front = vertices[i].first_ab; + uint64_t back = vertices[i].last_ab; + walk.emplace_back(seq_id, front, back); + // std::cout << "added " << seq_id << " to walk-" << walks.size() << std::endl; + + uint64_t next_offset = abundance_map[back]; + + while (visited[seq_id] == true) { + next_offset += 1; + if (next_offset == num_vertices) break; + auto ab_endpoint = vertices[next_offset]; + if (ab_endpoint.first_ab != back) break; + seq_id = ab_endpoint.seq_id; + } + + if (next_offset == num_vertices) { + abundance_map[back] = next_offset - 1; + break; + } + + if (vertices[next_offset].first_ab != back) { + abundance_map[back] = next_offset - 1; + i = next_offset - 1; + } else { + abundance_map[back] = next_offset; + i = next_offset; + } + } + + if (!walk.empty()) { + // add walk to walks + walks.push_back(walk); + std::cout << "walks.size() = " << walks.size() << std::endl; + } + } + + break; + } + + uint64_t num_runs = num_runs_abundances; + std::cout << "computed " << walks.size() << " walks" << std::endl; + std::fill(visited.begin(), visited.end(), false); + for (auto const& walk : walks) { + num_runs -= walk.size() - 1; + uint64_t prev_back = walk.front().first_ab; + for (auto const& w : walk) { + if (visited[w.seq_id] == true) { + std::cout << "ERROR: duplicate vertex." << std::endl; + } + if (w.first_ab != prev_back) { std::cout << "ERROR: path is broken." << std::endl; } + prev_back = w.last_ab; + visited[w.seq_id] = true; + std::cout << w.seq_id << ":[" << w.first_ab << "," << w.last_ab << "] "; + } + std::cout << std::endl; + } + std::cout << "num_runs " << num_runs << std::endl; + } + +private: + std::vector walks; +}; + +} // namespace sshash \ No newline at end of file From 3d1db5b9768fd0934e38b06270d2d1c5c98aee28 Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 21 Mar 2022 10:28:49 +0100 Subject: [PATCH 03/40] compute runs with cover --- include/builder/build.cpp | 32 ++++---- include/builder/cover.hpp | 158 ++++++++++++++++++++++++++------------ 2 files changed, 125 insertions(+), 65 deletions(-) diff --git a/include/builder/build.cpp b/include/builder/build.cpp index 4ee8592..f2b7866 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -121,7 +121,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b /* Heder format: - >[seq_id] LN:i:[seq_len] ab:Z:[ab_seq] + >[id] LN:i:[seq_len] ab:Z:[ab_seq] where [ab_seq] is a space-separated sequence of integer counters (the abundances), whose length is equal to [seq_len]-k+1 */ @@ -159,16 +159,16 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b bool kmers_have_all_mfa = true; bool kmers_have_different_abundances = false; - uint64_t first_ab = constants::invalid; - uint64_t last_ab = constants::invalid; + uint64_t front = constants::invalid; + uint64_t back = constants::invalid; for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid; j != seq_len - k + 1; ++j, ++num_kmers) { uint64_t ab = std::strtoull(sequence.data() + i, &end, 10); i = sequence.find_first_of(' ', i) + 1; - if (j == 0) { first_ab = ab; } - if (j == seq_len - k) { last_ab = ab; } + if (j == 0) { front = ab; } + if (j == seq_len - k) { back = ab; } if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false; if (j > 0) { @@ -207,8 +207,8 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b num_sequences_diff_abs += kmers_have_different_abundances; num_sequences_all_mfa += kmers_have_all_mfa; - vertices.emplace_back(num_sequences, first_ab, last_ab); - // std::cerr << "seq:" << num_sequences << "[" << first_ab << "," << last_ab << "]\n"; + vertices.emplace_back(num_sequences, front, back); + // std::cerr << "seq:" << num_sequences << "[" << front << "," << back << "]\n"; }; while (!is.eof()) { @@ -292,20 +292,20 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b data.abundances_builder.finalize(data.num_kmers); std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { - if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab; - if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab; - return x.seq_id < y.seq_id; + if (x.front != y.front) return x.front < y.front; + if (x.back != y.back) return x.back < y.back; + return x.id < y.id; }); /* (abundance, num_seqs_with_front_abundance=abundance) */ std::unordered_map abundance_map; for (auto const& x : vertices) { - // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab << "]\n"; - auto it = abundance_map.find(x.first_ab); + // std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back << "]\n"; + auto it = abundance_map.find(x.front); if (it != abundance_map.cend()) { // found (*it).second += 1; } else { - abundance_map[x.first_ab] = 1; + abundance_map[x.front] = 1; } } @@ -314,18 +314,18 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b uint64_t R = num_runs_abundances; for (auto const& x : vertices) { - uint64_t back = x.last_ab; + uint64_t back = x.back; auto it = abundance_map.find(back); if (it != abundance_map.cend()) { // found if ((*it).second > 0) { // if it is 0, we cannot find a match (*it).second -= 1; - // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab + // std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back // << "] got paired"; // std::cerr << " (remaining " << (*it).second << ")\n"; R -= 1; } // else { - // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab + // std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back // << "] NOT paired!\n"; // } } diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index ee41929..2aed435 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -3,38 +3,54 @@ namespace sshash { struct vertex { - vertex(uint32_t s, uint32_t f, uint32_t b) : seq_id(s), first_ab(f), last_ab(b) {} - uint32_t seq_id, first_ab, last_ab; + vertex(uint32_t s, uint32_t f, uint32_t b) : id(s), front(f), back(b) {} + uint32_t id, front, back; }; struct cover { typedef std::vector walk_t; + typedef std::vector walks_t; + + enum color_t { + white = 0, // unvisited + gray = 1, // visited by not merged + black = 2 // visited and merged + }; void compute(std::vector& vertices, uint64_t num_runs_abundances // TODO: remove from here ) { /* (abundance, num_seqs_with_front_abundance=abundance) */ std::unordered_map abundance_map; - std::vector visited; + + std::vector colors; + std::vector tmp_vertices; + uint64_t num_runs = num_runs_abundances; while (true) { + std::cout << "round " << rounds.size() << std::endl; + uint64_t num_vertices = vertices.size(); + std::cout << "num_vertices " << num_vertices << std::endl; + tmp_vertices.clear(); + abundance_map.clear(); + + /* all unvisited */ + colors.resize(num_vertices); + std::fill(colors.begin(), colors.end(), color_t::white); std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { - if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab; - if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab; - return x.seq_id < y.seq_id; + if (x.front != y.front) return x.front < y.front; + if (x.back != y.back) return x.back < y.back; + return x.id < y.id; }); - visited.resize(num_vertices); - std::fill(visited.begin(), visited.end(), false); - for (auto const& x : vertices) { - auto it = abundance_map.find(x.first_ab); + auto it = abundance_map.find(x.front); if (it != abundance_map.cend()) { // found (*it).second += 1; } else { - abundance_map[x.first_ab] = 1; + abundance_map[x.front] = 1; } } @@ -54,6 +70,7 @@ struct cover { } walk_t walk; + walks_t walks_in_round; while (true) { walk.clear(); @@ -61,32 +78,34 @@ struct cover { // take an unvisited vertex uint64_t i = 0; for (; i != num_vertices; ++i) { - uint64_t seq_id = vertices[i].seq_id; - if (visited[seq_id] == false) break; + uint64_t id = vertices[i].id; + if (colors[id] == color_t::white) { break; } } if (i == num_vertices) break; // all visited - // std::cout << " i = " << i << std::endl; - + // create new walk while (true) { - uint64_t seq_id = vertices[i].seq_id; - if (visited[seq_id] == true) break; + uint64_t id = vertices[i].id; + if (colors[id] == color_t::black or colors[id] == color_t::gray) break; - visited[seq_id] = true; - uint64_t front = vertices[i].first_ab; - uint64_t back = vertices[i].last_ab; - walk.emplace_back(seq_id, front, back); - // std::cout << "added " << seq_id << " to walk-" << walks.size() << std::endl; + // colors[id] = color_t::gray; + colors[id] = color_t::black; + uint64_t front = vertices[i].front; + uint64_t back = vertices[i].back; + // if (walk.size() > 0) { colors[walk.back().id] = color_t::black; } + walk.emplace_back(id, front, back); + // std::cout << "added " << id << " to walk-" << walks.size() << std::endl; uint64_t next_offset = abundance_map[back]; - while (visited[seq_id] == true) { + while (colors[id] == color_t::black) { next_offset += 1; if (next_offset == num_vertices) break; - auto ab_endpoint = vertices[next_offset]; - if (ab_endpoint.first_ab != back) break; - seq_id = ab_endpoint.seq_id; + auto vertex = vertices[next_offset]; + if (vertex.front != back) break; + id = vertex.id; + // if (colors[vertex.id] != color_t::white) break; // visited } if (next_offset == num_vertices) { @@ -94,7 +113,7 @@ struct cover { break; } - if (vertices[next_offset].first_ab != back) { + if (vertices[next_offset].front != back) { abundance_map[back] = next_offset - 1; i = next_offset - 1; } else { @@ -103,38 +122,79 @@ struct cover { } } - if (!walk.empty()) { - // add walk to walks - walks.push_back(walk); - std::cout << "walks.size() = " << walks.size() << std::endl; + if (walk.empty()) { continue; } + + if (walk.size() == 1) { + colors[walk.front().id] = color_t::gray; // visited but not merged } + + /* create a new vertex for next round */ + tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front, + walk.back().back); + walks_in_round.push_back(walk); } - break; - } + std::cout << "num_walks_in_round " << rounds.size() << ": " << walks_in_round.size() + << std::endl; - uint64_t num_runs = num_runs_abundances; - std::cout << "computed " << walks.size() << " walks" << std::endl; - std::fill(visited.begin(), visited.end(), false); - for (auto const& walk : walks) { - num_runs -= walk.size() - 1; - uint64_t prev_back = walk.front().first_ab; - for (auto const& w : walk) { - if (visited[w.seq_id] == true) { - std::cout << "ERROR: duplicate vertex." << std::endl; + // if all walks are singletons, then new merging were found + + bool all_singletons = true; + for (auto const& walk : walks_in_round) { + if (walk.size() > 1) { + all_singletons = false; + break; } - if (w.first_ab != prev_back) { std::cout << "ERROR: path is broken." << std::endl; } - prev_back = w.last_ab; - visited[w.seq_id] = true; - std::cout << w.seq_id << ":[" << w.first_ab << "," << w.last_ab << "] "; } - std::cout << std::endl; + if (all_singletons) { + std::cout << "all walks are singletons: no new mergings were found" << std::endl; + break; + } + + { // print walks + std::fill(colors.begin(), colors.end(), color_t::white); + for (auto const& walk : walks_in_round) { + num_runs -= walk.size() - 1; + uint64_t prev_back = walk.front().front; + for (auto const& w : walk) { + if (colors[w.id] == color_t::black) { + std::cout << "ERROR: duplicate vertex." << std::endl; + } + if (w.front != prev_back) { + std::cout << "ERROR: path is broken." << std::endl; + } + prev_back = w.back; + colors[w.id] = color_t::black; + std::cout << w.id << ":[" << w.front << "," << w.back << "] "; + } + std::cout << std::endl; + } + std::cout << "num_runs " << num_runs << std::endl; + } + + rounds.push_back(walks_in_round); + vertices.swap(tmp_vertices); } - std::cout << "num_runs " << num_runs << std::endl; + + // uint64_t num_runs = num_runs_abundances; + // std::cout << "computed " << walks.size() << " walks" << std::endl; + // std::fill(colors.begin(), colors.end(), false); + // for (auto const& walk : walks) { + // num_runs -= walk.size() - 1; + // uint64_t prev_back = walk.front().front; + // for (auto const& w : walk) { + // if (colors[w.id] == true) { std::cout << "ERROR: duplicate vertex." << + // std::endl; } if (w.front != prev_back) { std::cout << "ERROR: path is broken." << + // std::endl; } prev_back = w.back; colors[w.id] = true; std::cout << w.id << ":[" + // << w.front << "," << w.back << "] "; + // } + // std::cout << std::endl; + // } + // std::cout << "num_runs " << num_runs << std::endl; } private: - std::vector walks; + std::vector rounds; }; } // namespace sshash \ No newline at end of file From 2c33fa99f9050c85cc7559c70ec407199c27fceb Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 21 Mar 2022 12:58:47 +0100 Subject: [PATCH 04/40] compute runs with cover --- include/builder/cover.hpp | 123 +++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 40 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 2aed435..279ad97 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -75,87 +75,118 @@ struct cover { while (true) { walk.clear(); - // take an unvisited vertex + /* take an unvisited vertex */ uint64_t i = 0; for (; i != num_vertices; ++i) { uint64_t id = vertices[i].id; - if (colors[id] == color_t::white) { break; } + if (colors[id] == color_t::white) break; } if (i == num_vertices) break; // all visited - // create new walk + /* create a new walk */ while (true) { uint64_t id = vertices[i].id; - if (colors[id] == color_t::black or colors[id] == color_t::gray) break; - // colors[id] = color_t::gray; - colors[id] = color_t::black; + // std::cout << "visiting vertex " << id << ":[" << vertices[i].front << "," + // << vertices[i].back << "]" << std::endl; + // std::cout << "at offset " << i << std::endl; + + if (colors[id] == color_t::black) break; + + colors[id] = color_t::gray; uint64_t front = vertices[i].front; uint64_t back = vertices[i].back; - // if (walk.size() > 0) { colors[walk.back().id] = color_t::black; } + if (walk.size() > 0) { + assert(walk.back().id != id); + colors[walk.back().id] = color_t::black; + colors[id] = color_t::black; + } walk.emplace_back(id, front, back); // std::cout << "added " << id << " to walk-" << walks.size() << std::endl; - uint64_t next_offset = abundance_map[back]; + uint64_t offset = abundance_map[back]; - while (colors[id] == color_t::black) { - next_offset += 1; - if (next_offset == num_vertices) break; - auto vertex = vertices[next_offset]; - if (vertex.front != back) break; - id = vertex.id; - // if (colors[vertex.id] != color_t::white) break; // visited - } + /* search for a match */ + bool no_match_found = false; + while (true) { + if (offset == num_vertices) break; + auto vertex = vertices[offset]; + // std::cout << " vertex " << vertex.id << ":[" << vertex.front << "," + // << vertex.back << "]" << std::endl; - if (next_offset == num_vertices) { - abundance_map[back] = next_offset - 1; - break; - } + /* skip */ + if (vertex.id == id) { + offset += 1; + continue; + } + + /* checked all candidate matches */ + if (vertex.front != back) { + // std::cout << " MISfound match " << vertex.id << ":[" << vertex.front + // << "," << vertex.back << "]" << std::endl; + // std::cout << " at offset " << offset << std::endl; + // std::cout << "NO MATCHES FOUND" << std::endl; + no_match_found = true; + break; + } - if (vertices[next_offset].front != back) { - abundance_map[back] = next_offset - 1; - i = next_offset - 1; - } else { - abundance_map[back] = next_offset; - i = next_offset; + /* match found */ + if (colors[vertex.id] != color_t::black) { + // std::cout << " found match " << vertex.id << ":[" << vertex.front + // << "," << vertex.back << "]" << std::endl; + // std::cout << " at offset " << offset << std::endl; + break; + } + offset += 1; } + + assert(offset <= num_vertices); + if (no_match_found or offset == num_vertices) break; + + /* valid match was found, then visit it next */ + i = offset; } if (walk.empty()) { continue; } if (walk.size() == 1) { colors[walk.front().id] = color_t::gray; // visited but not merged + continue; } + /* invariant: all vertices belonging to a walk are all black */ + assert(std::all_of(walk.begin(), walk.end(), [&](vertex const& v) { + return colors[v.id] == color_t::black; + })); + /* create a new vertex for next round */ tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front, walk.back().back); walks_in_round.push_back(walk); } + /* add all gray vertices (singleton walks) */ + for (auto const& v : vertices) { + if (colors[v.id] == color_t::gray) { + tmp_vertices.emplace_back(walks_in_round.size(), v.front, v.back); + walk_t walk; + walk.push_back(v); + walks_in_round.push_back(walk); + } + } + std::cout << "num_walks_in_round " << rounds.size() << ": " << walks_in_round.size() << std::endl; - // if all walks are singletons, then new merging were found - bool all_singletons = true; - for (auto const& walk : walks_in_round) { - if (walk.size() > 1) { - all_singletons = false; - break; - } - } - if (all_singletons) { - std::cout << "all walks are singletons: no new mergings were found" << std::endl; - break; - } - - { // print walks + { std::fill(colors.begin(), colors.end(), color_t::white); for (auto const& walk : walks_in_round) { + if (walk.size() > 1) all_singletons = false; num_runs -= walk.size() - 1; uint64_t prev_back = walk.front().front; + std::cout << "=>"; for (auto const& w : walk) { if (colors[w.id] == color_t::black) { std::cout << "ERROR: duplicate vertex." << std::endl; @@ -170,12 +201,24 @@ struct cover { std::cout << std::endl; } std::cout << "num_runs " << num_runs << std::endl; + + std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; + for (auto const& v : tmp_vertices) { + std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; + } + } + + if (all_singletons) { + std::cout << "all walks are singletons: no new mergings were found" << std::endl; + break; } rounds.push_back(walks_in_round); vertices.swap(tmp_vertices); } + // TODO: form final walks and check that all vertex id are present + // uint64_t num_runs = num_runs_abundances; // std::cout << "computed " << walks.size() << " walks" << std::endl; // std::fill(colors.begin(), colors.end(), false); From ad95e9faaef02a061711359f9b5cd9af3bfddacd Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 21 Mar 2022 13:14:12 +0100 Subject: [PATCH 05/40] compute runs with cover --- include/builder/cover.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 279ad97..98e26a1 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -17,15 +17,17 @@ struct cover { black = 2 // visited and merged }; + cover() : num_sequences(0) {} + void compute(std::vector& vertices, uint64_t num_runs_abundances // TODO: remove from here ) { /* (abundance, num_seqs_with_front_abundance=abundance) */ std::unordered_map abundance_map; - std::vector colors; std::vector tmp_vertices; uint64_t num_runs = num_runs_abundances; + num_sequences = vertices.size(); while (true) { std::cout << "round " << rounds.size() << std::endl; @@ -148,10 +150,10 @@ struct cover { i = offset; } - if (walk.empty()) { continue; } + assert(!walk.empty()); if (walk.size() == 1) { - colors[walk.front().id] = color_t::gray; // visited but not merged + assert(colors[walk.front().id] == color_t::gray); // visited but not merged continue; } @@ -237,6 +239,7 @@ struct cover { } private: + uint64_t num_sequences; std::vector rounds; }; From e94747c8011e75b59dbb6cd105224098f9143411 Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 21 Mar 2022 13:18:17 +0100 Subject: [PATCH 06/40] compute runs with cover --- include/builder/cover.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 98e26a1..cb28ec1 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -89,13 +89,12 @@ struct cover { /* create a new walk */ while (true) { uint64_t id = vertices[i].id; + assert(colors[id] != color_t::black); // std::cout << "visiting vertex " << id << ":[" << vertices[i].front << "," // << vertices[i].back << "]" << std::endl; // std::cout << "at offset " << i << std::endl; - if (colors[id] == color_t::black) break; - colors[id] = color_t::gray; uint64_t front = vertices[i].front; uint64_t back = vertices[i].back; From 64a236e678ad9867eccea6b66cb7f950156fb785 Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 21 Mar 2022 17:19:12 +0100 Subject: [PATCH 07/40] compute runs with cover --- include/builder/cover.hpp | 88 ++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 38 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index cb28ec1..b36c2b1 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -11,20 +11,28 @@ struct cover { typedef std::vector walk_t; typedef std::vector walks_t; - enum color_t { + enum color { white = 0, // unvisited gray = 1, // visited by not merged black = 2 // visited and merged }; + struct range { + range() {} + range(uint64_t b, uint64_t e, uint64_t p) : begin(b), end(e), position(p) {} + uint64_t begin; + uint64_t end; + uint64_t position; + }; + cover() : num_sequences(0) {} void compute(std::vector& vertices, uint64_t num_runs_abundances // TODO: remove from here ) { /* (abundance, num_seqs_with_front_abundance=abundance) */ - std::unordered_map abundance_map; - std::vector colors; + std::unordered_map abundance_map; + std::vector colors; std::vector tmp_vertices; uint64_t num_runs = num_runs_abundances; num_sequences = vertices.size(); @@ -39,7 +47,7 @@ struct cover { /* all unvisited */ colors.resize(num_vertices); - std::fill(colors.begin(), colors.end(), color_t::white); + std::fill(colors.begin(), colors.end(), color::white); std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { if (x.front != y.front) return x.front < y.front; @@ -50,23 +58,28 @@ struct cover { for (auto const& x : vertices) { auto it = abundance_map.find(x.front); if (it != abundance_map.cend()) { // found - (*it).second += 1; + (*it).second.begin += 1; } else { - abundance_map[x.front] = 1; + abundance_map[x.front] = {1, 0, 0}; } } { std::vector> offsets; // (abundance,offset) offsets.reserve(abundance_map.size()); - for (auto const& p : abundance_map) { offsets.emplace_back(p.first, p.second); } + for (auto const& p : abundance_map) { + offsets.emplace_back(p.first, p.second.begin); + } assert(offsets.size() > 0); std::sort(offsets.begin(), offsets.end(), [](auto const& x, auto const& y) { return x.first < y.first; }); uint64_t offset = 0; for (auto const& p : offsets) { uint64_t ab = p.first; - abundance_map[ab] = offset; + auto& r = abundance_map[ab]; + r.end = r.begin; + r.begin = offset; + r.position = 0; offset += p.second; } } @@ -81,7 +94,7 @@ struct cover { uint64_t i = 0; for (; i != num_vertices; ++i) { uint64_t id = vertices[i].id; - if (colors[id] == color_t::white) break; + if (colors[id] == color::white) break; } if (i == num_vertices) break; // all visited @@ -89,32 +102,35 @@ struct cover { /* create a new walk */ while (true) { uint64_t id = vertices[i].id; - assert(colors[id] != color_t::black); + assert(colors[id] != color::black); - // std::cout << "visiting vertex " << id << ":[" << vertices[i].front << "," - // << vertices[i].back << "]" << std::endl; - // std::cout << "at offset " << i << std::endl; - - colors[id] = color_t::gray; + colors[id] = color::gray; uint64_t front = vertices[i].front; uint64_t back = vertices[i].back; if (walk.size() > 0) { assert(walk.back().id != id); - colors[walk.back().id] = color_t::black; - colors[id] = color_t::black; + colors[walk.back().id] = color::black; + colors[id] = color::black; } walk.emplace_back(id, front, back); - // std::cout << "added " << id << " to walk-" << walks.size() << std::endl; - uint64_t offset = abundance_map[back]; + uint64_t offset = 0; + bool no_match_found = false; + + auto it = abundance_map.find(back); + + /* abundance back is not found, so no match is possible */ + if (it == abundance_map.cend()) { + no_match_found = true; + break; + } + offset = (*it).second.begin; /* search for a match */ - bool no_match_found = false; while (true) { + // std::cout << "offset " << offset << "/" << num_vertices << std::endl; if (offset == num_vertices) break; auto vertex = vertices[offset]; - // std::cout << " vertex " << vertex.id << ":[" << vertex.front << "," - // << vertex.back << "]" << std::endl; /* skip */ if (vertex.id == id) { @@ -124,21 +140,18 @@ struct cover { /* checked all candidate matches */ if (vertex.front != back) { - // std::cout << " MISfound match " << vertex.id << ":[" << vertex.front - // << "," << vertex.back << "]" << std::endl; - // std::cout << " at offset " << offset << std::endl; - // std::cout << "NO MATCHES FOUND" << std::endl; no_match_found = true; break; } /* match found */ - if (colors[vertex.id] != color_t::black) { - // std::cout << " found match " << vertex.id << ":[" << vertex.front - // << "," << vertex.back << "]" << std::endl; - // std::cout << " at offset " << offset << std::endl; + if (colors[vertex.id] != color::black) { + uint64_t& position = (*it).second.position; + assert((*it).second.begin + position <= offset); + position += 1; break; } + offset += 1; } @@ -152,14 +165,13 @@ struct cover { assert(!walk.empty()); if (walk.size() == 1) { - assert(colors[walk.front().id] == color_t::gray); // visited but not merged + assert(colors[walk.front().id] == color::gray); // visited but not merged continue; } /* invariant: all vertices belonging to a walk are all black */ - assert(std::all_of(walk.begin(), walk.end(), [&](vertex const& v) { - return colors[v.id] == color_t::black; - })); + assert(std::all_of(walk.begin(), walk.end(), + [&](vertex const& v) { return colors[v.id] == color::black; })); /* create a new vertex for next round */ tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front, @@ -169,7 +181,7 @@ struct cover { /* add all gray vertices (singleton walks) */ for (auto const& v : vertices) { - if (colors[v.id] == color_t::gray) { + if (colors[v.id] == color::gray) { tmp_vertices.emplace_back(walks_in_round.size(), v.front, v.back); walk_t walk; walk.push_back(v); @@ -182,21 +194,21 @@ struct cover { bool all_singletons = true; { - std::fill(colors.begin(), colors.end(), color_t::white); + std::fill(colors.begin(), colors.end(), color::white); for (auto const& walk : walks_in_round) { if (walk.size() > 1) all_singletons = false; num_runs -= walk.size() - 1; uint64_t prev_back = walk.front().front; std::cout << "=>"; for (auto const& w : walk) { - if (colors[w.id] == color_t::black) { + if (colors[w.id] == color::black) { std::cout << "ERROR: duplicate vertex." << std::endl; } if (w.front != prev_back) { std::cout << "ERROR: path is broken." << std::endl; } prev_back = w.back; - colors[w.id] = color_t::black; + colors[w.id] = color::black; std::cout << w.id << ":[" << w.front << "," << w.back << "] "; } std::cout << std::endl; From fc863c67d7ddd9f02fe0187e24221d1560b318aa Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 21 Mar 2022 17:49:15 +0100 Subject: [PATCH 08/40] compute runs with cover --- include/builder/cover.hpp | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index b36c2b1..09ff91a 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -30,11 +30,16 @@ struct cover { void compute(std::vector& vertices, uint64_t num_runs_abundances // TODO: remove from here ) { + essentials::timer_type timer; + + timer.start(); + /* (abundance, num_seqs_with_front_abundance=abundance) */ std::unordered_map abundance_map; std::vector colors; std::vector tmp_vertices; uint64_t num_runs = num_runs_abundances; + std::cout << "initial number of runs = " << num_runs << std::endl; num_sequences = vertices.size(); while (true) { @@ -194,12 +199,13 @@ struct cover { bool all_singletons = true; { + uint64_t num_mergings_in_round = 0; std::fill(colors.begin(), colors.end(), color::white); for (auto const& walk : walks_in_round) { if (walk.size() > 1) all_singletons = false; - num_runs -= walk.size() - 1; + num_mergings_in_round += walk.size() - 1; uint64_t prev_back = walk.front().front; - std::cout << "=>"; + // std::cout << "=>"; for (auto const& w : walk) { if (colors[w.id] == color::black) { std::cout << "ERROR: duplicate vertex." << std::endl; @@ -209,16 +215,18 @@ struct cover { } prev_back = w.back; colors[w.id] = color::black; - std::cout << w.id << ":[" << w.front << "," << w.back << "] "; + // std::cout << w.id << ":[" << w.front << "," << w.back << "] "; } - std::cout << std::endl; + // std::cout << std::endl; } + num_runs -= num_mergings_in_round; + std::cout << "num_mergings_in_round = " << num_mergings_in_round << std::endl; std::cout << "num_runs " << num_runs << std::endl; - std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; - for (auto const& v : tmp_vertices) { - std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; - } + // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; + // for (auto const& v : tmp_vertices) { + // std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; + // } } if (all_singletons) { @@ -230,6 +238,10 @@ struct cover { vertices.swap(tmp_vertices); } + timer.stop(); + + std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl; + // TODO: form final walks and check that all vertex id are present // uint64_t num_runs = num_runs_abundances; From 89234867c9d92016903884f7dbd61f7cbd72f871 Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 21 Mar 2022 18:08:35 +0100 Subject: [PATCH 09/40] compute runs with cover --- include/builder/build.cpp | 3 +-- include/builder/cover.hpp | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/builder/build.cpp b/include/builder/build.cpp index f2b7866..19aa713 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -309,7 +309,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b } } - std::cout << "calculating lower bound..." << std::endl; uint64_t num_runs_abundances = data.abundances_builder.num_abundance_intervals(); uint64_t R = num_runs_abundances; @@ -330,7 +329,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b // } } } - std::cout << "R = " << R << std::endl; + std::cout << "computed lower bound: R = " << R << std::endl; cover c; c.compute(vertices, num_runs_abundances); diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 09ff91a..93923c5 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -46,7 +46,7 @@ struct cover { std::cout << "round " << rounds.size() << std::endl; uint64_t num_vertices = vertices.size(); - std::cout << "num_vertices " << num_vertices << std::endl; + std::cout << " num_vertices " << num_vertices << std::endl; tmp_vertices.clear(); abundance_map.clear(); @@ -129,7 +129,7 @@ struct cover { no_match_found = true; break; } - offset = (*it).second.begin; + offset = (*it).second.begin + (*it).second.position; // skip to position /* search for a match */ while (true) { @@ -194,16 +194,18 @@ struct cover { } } - std::cout << "num_walks_in_round " << rounds.size() << ": " << walks_in_round.size() - << std::endl; + std::cout << " num_walks = " << walks_in_round.size() << std::endl; bool all_singletons = true; { - uint64_t num_mergings_in_round = 0; +#ifndef NDEBUG std::fill(colors.begin(), colors.end(), color::white); +#endif + uint64_t num_mergings_in_round = 0; for (auto const& walk : walks_in_round) { if (walk.size() > 1) all_singletons = false; num_mergings_in_round += walk.size() - 1; +#ifndef NDEBUG uint64_t prev_back = walk.front().front; // std::cout << "=>"; for (auto const& w : walk) { @@ -218,10 +220,11 @@ struct cover { // std::cout << w.id << ":[" << w.front << "," << w.back << "] "; } // std::cout << std::endl; +#endif } num_runs -= num_mergings_in_round; - std::cout << "num_mergings_in_round = " << num_mergings_in_round << std::endl; - std::cout << "num_runs " << num_runs << std::endl; + std::cout << " num_mergings = " << num_mergings_in_round << std::endl; + std::cout << " num_runs " << num_runs << std::endl; // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; // for (auto const& v : tmp_vertices) { @@ -230,7 +233,8 @@ struct cover { } if (all_singletons) { - std::cout << "all walks are singletons: no new mergings were found" << std::endl; + std::cout << "STOP: all walks are singletons --> no new mergings were found" + << std::endl; break; } From a05d53b735aa8c88b8a9facbea6131c05a155c56 Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 21 Mar 2022 21:59:44 +0100 Subject: [PATCH 10/40] optimize for the most frequent case --- include/builder/cover.hpp | 81 +++++++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 16 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 93923c5..2ec3d25 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -14,7 +14,8 @@ struct cover { enum color { white = 0, // unvisited gray = 1, // visited by not merged - black = 2 // visited and merged + black = 2, // visited and merged + invalid = -1 }; struct range { @@ -42,17 +43,64 @@ struct cover { std::cout << "initial number of runs = " << num_runs << std::endl; num_sequences = vertices.size(); + walk_t walk; + walks_t walks_in_round; + + { + /* + optimize for the most frequent case: + push vertices of the form v:[mfa,mfa] to the bottom, and remove them + forming a single (usually, long) walk. + Warning: here we are assuming the mfa is also the smallest abundance. + */ + + std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { + if (x.front != y.front) return x.front > y.front; + if (x.back != y.back) return x.back > y.back; + return x.id > y.id; + }); + + uint64_t count = 0; + uint64_t total = vertices.size(); + while (!vertices.empty()) { + auto v = vertices.back(); + if (v.front == constants::most_frequent_abundance and + v.back == constants::most_frequent_abundance) { + walk.push_back(v); + vertices.pop_back(); + count += 1; + } else { + break; + } + } + std::cout << "num vertices of the form v:[" << constants::most_frequent_abundance << "," + << constants::most_frequent_abundance << "] = " << count << "/" << total + << "(" << (count * 100.0) / total << "%)" << std::endl; + + /* create a new vertex for next round */ + tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front, walk.back().back); + walks_in_round.push_back(walk); + + walk.clear(); + } + while (true) { std::cout << "round " << rounds.size() << std::endl; uint64_t num_vertices = vertices.size(); std::cout << " num_vertices " << num_vertices << std::endl; - tmp_vertices.clear(); - abundance_map.clear(); /* all unvisited */ - colors.resize(num_vertices); - std::fill(colors.begin(), colors.end(), color::white); + if (rounds.size() == 0) { + /* remember: we removed some vertices but the id-space still spans + [0..num_sequences-1] */ + colors.resize(num_sequences); + std::fill(colors.begin(), colors.end(), color::invalid); + for (auto const& v : vertices) colors[v.id] = color::white; + } else { + colors.resize(num_vertices); + std::fill(colors.begin(), colors.end(), color::white); + } std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { if (x.front != y.front) return x.front < y.front; @@ -89,9 +137,6 @@ struct cover { } } - walk_t walk; - walks_t walks_in_round; - while (true) { walk.clear(); @@ -106,18 +151,18 @@ struct cover { /* create a new walk */ while (true) { - uint64_t id = vertices[i].id; + auto vertex = vertices[i]; + uint64_t id = vertex.id; assert(colors[id] != color::black); colors[id] = color::gray; - uint64_t front = vertices[i].front; - uint64_t back = vertices[i].back; + uint64_t back = vertex.back; if (walk.size() > 0) { assert(walk.back().id != id); colors[walk.back().id] = color::black; colors[id] = color::black; } - walk.emplace_back(id, front, back); + walk.push_back(vertex); uint64_t offset = 0; bool no_match_found = false; @@ -135,22 +180,22 @@ struct cover { while (true) { // std::cout << "offset " << offset << "/" << num_vertices << std::endl; if (offset == num_vertices) break; - auto vertex = vertices[offset]; + auto candidate = vertices[offset]; /* skip */ - if (vertex.id == id) { + if (candidate.id == id) { offset += 1; continue; } /* checked all candidate matches */ - if (vertex.front != back) { + if (candidate.front != back) { no_match_found = true; break; } /* match found */ - if (colors[vertex.id] != color::black) { + if (colors[candidate.id] != color::black) { uint64_t& position = (*it).second.position; assert((*it).second.begin + position <= offset); position += 1; @@ -239,7 +284,11 @@ struct cover { } rounds.push_back(walks_in_round); + vertices.swap(tmp_vertices); + tmp_vertices.clear(); + walks_in_round.clear(); + abundance_map.clear(); } timer.stop(); From db92ebefa9c07c2bca5ea09c88b67c2e4734c2e2 Mon Sep 17 00:00:00 2001 From: jermp Date: Tue, 22 Mar 2022 10:42:47 +0100 Subject: [PATCH 11/40] simplification --- include/builder/cover.hpp | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 2ec3d25..ba010c9 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -108,34 +108,19 @@ struct cover { return x.id < y.id; }); + uint64_t prev_front = vertices.front().front; + uint64_t prev_offset = 0; + uint64_t offset = 0; for (auto const& x : vertices) { - auto it = abundance_map.find(x.front); - if (it != abundance_map.cend()) { // found - (*it).second.begin += 1; - } else { - abundance_map[x.front] = {1, 0, 0}; - } - } - - { - std::vector> offsets; // (abundance,offset) - offsets.reserve(abundance_map.size()); - for (auto const& p : abundance_map) { - offsets.emplace_back(p.first, p.second.begin); - } - assert(offsets.size() > 0); - std::sort(offsets.begin(), offsets.end(), - [](auto const& x, auto const& y) { return x.first < y.first; }); - uint64_t offset = 0; - for (auto const& p : offsets) { - uint64_t ab = p.first; - auto& r = abundance_map[ab]; - r.end = r.begin; - r.begin = offset; - r.position = 0; - offset += p.second; + if (x.front != prev_front) { + abundance_map[prev_front] = {prev_offset, offset, 0}; + prev_offset = offset; } + offset += 1; + prev_front = x.front; } + abundance_map[prev_front] = {prev_offset, offset, 0}; + assert(offset == vertices.size()); while (true) { walk.clear(); @@ -178,7 +163,6 @@ struct cover { /* search for a match */ while (true) { - // std::cout << "offset " << offset << "/" << num_vertices << std::endl; if (offset == num_vertices) break; auto candidate = vertices[offset]; From c21dc1c4f9b0084dca43f0a7878adb038ee23f49 Mon Sep 17 00:00:00 2001 From: jermp Date: Tue, 22 Mar 2022 11:48:36 +0100 Subject: [PATCH 12/40] re-construction of cover and saving to file --- include/builder/build.cpp | 2 ++ include/builder/cover.hpp | 63 +++++++++++++++++++++++---------------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/include/builder/build.cpp b/include/builder/build.cpp index 19aa713..9bf8ea4 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -333,6 +333,8 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b cover c; c.compute(vertices, num_runs_abundances); + std::string output_cover_filename("test_cover.txt"); + c.save(output_cover_filename); } } diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index ba010c9..c353ff0 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -28,6 +28,8 @@ struct cover { cover() : num_sequences(0) {} + uint64_t num_vertices() const { return num_sequences; } + void compute(std::vector& vertices, uint64_t num_runs_abundances // TODO: remove from here ) { @@ -236,7 +238,7 @@ struct cover { num_mergings_in_round += walk.size() - 1; #ifndef NDEBUG uint64_t prev_back = walk.front().front; - // std::cout << "=>"; + std::cout << "=>"; for (auto const& w : walk) { if (colors[w.id] == color::black) { std::cout << "ERROR: duplicate vertex." << std::endl; @@ -246,9 +248,9 @@ struct cover { } prev_back = w.back; colors[w.id] = color::black; - // std::cout << w.id << ":[" << w.front << "," << w.back << "] "; + std::cout << w.id << ":[" << w.front << "," << w.back << "] "; } - // std::cout << std::endl; + std::cout << std::endl; #endif } num_runs -= num_mergings_in_round; @@ -261,46 +263,55 @@ struct cover { // } } - if (all_singletons) { - std::cout << "STOP: all walks are singletons --> no new mergings were found" - << std::endl; - break; - } - rounds.push_back(walks_in_round); vertices.swap(tmp_vertices); tmp_vertices.clear(); walks_in_round.clear(); abundance_map.clear(); + + if (all_singletons) { + std::cout << "STOP: all walks are singletons --> no new mergings were found" + << std::endl; + break; + } } timer.stop(); std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl; + } - // TODO: form final walks and check that all vertex id are present - - // uint64_t num_runs = num_runs_abundances; - // std::cout << "computed " << walks.size() << " walks" << std::endl; - // std::fill(colors.begin(), colors.end(), false); - // for (auto const& walk : walks) { - // num_runs -= walk.size() - 1; - // uint64_t prev_back = walk.front().front; - // for (auto const& w : walk) { - // if (colors[w.id] == true) { std::cout << "ERROR: duplicate vertex." << - // std::endl; } if (w.front != prev_back) { std::cout << "ERROR: path is broken." << - // std::endl; } prev_back = w.back; colors[w.id] = true; std::cout << w.id << ":[" - // << w.front << "," << w.back << "] "; - // } - // std::cout << std::endl; - // } - // std::cout << "num_runs " << num_runs << std::endl; + void save(std::string const& filename) { + std::ofstream out(filename.c_str()); + int r = rounds.size() - 1; + const auto& walks = rounds[r]; + for (auto const& walk : walks) { + for (auto const& vertex : walk) visit(vertex.id, r, out); + } + out.close(); } private: uint64_t num_sequences; std::vector rounds; + + /* visit walk of index w in round of index r */ + void visit(int w, int r, std::ofstream& out) const { + if (r > 0) { + assert(size_t(w) < rounds[r].size()); + auto const& walk = rounds[r][w]; + for (auto const& vertex : walk) { visit(vertex.id, r - 1, out); } + } else { // print + assert(size_t(w) < rounds[0].size()); + auto const& walk = rounds[0][w]; + for (auto const& vertex : walk) { + // out << vertex.id << ":[" << vertex.front << "," << vertex.back << "] "; + out << vertex.id << '\n'; + } + // out << '\n'; + } + } }; } // namespace sshash \ No newline at end of file From 9773ce4a9a96c4b55275f57bb3a48186ff646a3e Mon Sep 17 00:00:00 2001 From: jermp Date: Tue, 22 Mar 2022 14:57:58 +0100 Subject: [PATCH 13/40] now all included into a new executable 'permute' --- CMakeLists.txt | 3 + include/builder/build.cpp | 67 +---- include/builder/cover.hpp | 21 +- .../{build_util_types.hpp => util.hpp} | 7 + src/permute.cpp | 255 ++++++++++++++++++ 5 files changed, 281 insertions(+), 72 deletions(-) rename include/builder/{build_util_types.hpp => util.hpp} (96%) create mode 100644 src/permute.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index c54ca24..820c2ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,3 +42,6 @@ target_link_libraries(check z) add_executable(query src/query.cpp) target_link_libraries(query z) + +add_executable(permute src/permute.cpp) +target_link_libraries(permute z) \ No newline at end of file diff --git a/include/builder/build.cpp b/include/builder/build.cpp index 9bf8ea4..af4a359 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -6,22 +6,13 @@ #include "../info.cpp" #include "../../external/pthash/include/pthash.hpp" #include "../../external/pthash/external/essentials/include/essentials.hpp" - -#include "build_util_types.hpp" -#include "cover.hpp" +#include "util.hpp" #include // for std::partial_sum #include namespace sshash { -void expect(char got, char expected) { - if (got != expected) { - std::cout << "got '" << got << "' but expected '" << expected << "'" << std::endl; - throw parse_runtime_error(); - } -} - struct parse_data { parse_data() : num_kmers(0) {} uint64_t num_kmers; @@ -114,8 +105,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b } }; - std::vector vertices; - auto parse_header = [&]() { if (sequence.empty()) return; @@ -159,17 +148,11 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b bool kmers_have_all_mfa = true; bool kmers_have_different_abundances = false; - uint64_t front = constants::invalid; - uint64_t back = constants::invalid; - for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid; j != seq_len - k + 1; ++j, ++num_kmers) { uint64_t ab = std::strtoull(sequence.data() + i, &end, 10); i = sequence.find_first_of(' ', i) + 1; - if (j == 0) { front = ab; } - if (j == seq_len - k) { back = ab; } - if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false; if (j > 0) { if (ab != prev_ab) { kmers_have_different_abundances = true; } @@ -206,9 +189,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b num_sequences_diff_abs += kmers_have_different_abundances; num_sequences_all_mfa += kmers_have_all_mfa; - - vertices.emplace_back(num_sequences, front, back); - // std::cerr << "seq:" << num_sequences << "[" << front << "," << back << "]\n"; }; while (!is.eof()) { @@ -290,51 +270,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b << "%)" << std::endl; data.abundances_builder.push_abundance_interval(ab_value, ab_length); data.abundances_builder.finalize(data.num_kmers); - - std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { - if (x.front != y.front) return x.front < y.front; - if (x.back != y.back) return x.back < y.back; - return x.id < y.id; - }); - - /* (abundance, num_seqs_with_front_abundance=abundance) */ - std::unordered_map abundance_map; - for (auto const& x : vertices) { - // std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back << "]\n"; - auto it = abundance_map.find(x.front); - if (it != abundance_map.cend()) { // found - (*it).second += 1; - } else { - abundance_map[x.front] = 1; - } - } - - uint64_t num_runs_abundances = data.abundances_builder.num_abundance_intervals(); - - uint64_t R = num_runs_abundances; - for (auto const& x : vertices) { - uint64_t back = x.back; - auto it = abundance_map.find(back); - if (it != abundance_map.cend()) { // found - if ((*it).second > 0) { // if it is 0, we cannot find a match - (*it).second -= 1; - // std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back - // << "] got paired"; - // std::cerr << " (remaining " << (*it).second << ")\n"; - R -= 1; - } - // else { - // std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back - // << "] NOT paired!\n"; - // } - } - } - std::cout << "computed lower bound: R = " << R << std::endl; - - cover c; - c.compute(vertices, num_runs_abundances); - std::string output_cover_filename("test_cover.txt"); - c.save(output_cover_filename); } } diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index c353ff0..f4618db 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -1,8 +1,15 @@ #pragma once +#include +#include + +#include "../util.hpp" + namespace sshash { struct vertex { + // We assume there are less than 2^32 sequences and that + // the largest abundance fits into a 32-bit uint. vertex(uint32_t s, uint32_t f, uint32_t b) : id(s), front(f), back(b) {} uint32_t id, front, back; }; @@ -47,6 +54,8 @@ struct cover { walk_t walk; walks_t walks_in_round; + walk.reserve(vertices.size()); // at most + walks_in_round.reserve(vertices.size()); // at most { /* @@ -113,13 +122,13 @@ struct cover { uint64_t prev_front = vertices.front().front; uint64_t prev_offset = 0; uint64_t offset = 0; - for (auto const& x : vertices) { - if (x.front != prev_front) { + for (auto const& vertex : vertices) { + if (vertex.front != prev_front) { abundance_map[prev_front] = {prev_offset, offset, 0}; prev_offset = offset; } offset += 1; - prev_front = x.front; + prev_front = vertex.front; } abundance_map[prev_front] = {prev_offset, offset, 0}; assert(offset == vertices.size()); @@ -238,7 +247,7 @@ struct cover { num_mergings_in_round += walk.size() - 1; #ifndef NDEBUG uint64_t prev_back = walk.front().front; - std::cout << "=>"; + // std::cout << "=>"; for (auto const& w : walk) { if (colors[w.id] == color::black) { std::cout << "ERROR: duplicate vertex." << std::endl; @@ -248,9 +257,9 @@ struct cover { } prev_back = w.back; colors[w.id] = color::black; - std::cout << w.id << ":[" << w.front << "," << w.back << "] "; + // std::cout << w.id << ":[" << w.front << "," << w.back << "] "; } - std::cout << std::endl; + // std::cout << std::endl; #endif } num_runs -= num_mergings_in_round; diff --git a/include/builder/build_util_types.hpp b/include/builder/util.hpp similarity index 96% rename from include/builder/build_util_types.hpp rename to include/builder/util.hpp index 41151c9..725c56e 100644 --- a/include/builder/build_util_types.hpp +++ b/include/builder/util.hpp @@ -12,6 +12,13 @@ struct parse_runtime_error : public std::runtime_error { : std::runtime_error("did you provide an input file with abundance counts?") {} }; +void expect(char got, char expected) { + if (got != expected) { + std::cout << "got '" << got << "' but expected '" << expected << "'" << std::endl; + throw parse_runtime_error(); + } +} + struct compact_string_pool { compact_string_pool() {} diff --git a/src/permute.cpp b/src/permute.cpp new file mode 100644 index 0000000..ac843d0 --- /dev/null +++ b/src/permute.cpp @@ -0,0 +1,255 @@ +#include + +#include "../include/gz/zip_stream.cpp" +#include "../external/pthash/external/essentials/include/essentials.hpp" +#include "../external/pthash/external/cmd_line_parser/include/parser.hpp" +#include "../include/builder/cover.hpp" +#include "../include/builder/util.hpp" +#include "../include/abundances.hpp" + +using namespace sshash; + +struct permute_data { + permute_data() : num_runs_abundances(0) {} + uint64_t num_runs_abundances; + std::vector vertices; +}; + +void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) { + std::string sequence; + uint64_t k = build_config.k; + uint64_t num_sequences = 0; + uint64_t num_bases = 0; + uint64_t num_kmers = 0; + + uint64_t seq_len = 0; + uint64_t sum_of_abundances = 0; + abundances::builder abundances_builder; + abundances_builder.init(constants::most_frequent_abundance); + + /* intervals of abundances */ + uint64_t ab_value = constants::invalid; + uint64_t ab_length = 1; + + uint64_t num_sequences_diff_abs = 0; // num sequences whose kmers have different abundances + uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa + + auto process_abundance = [&](uint64_t ab) { + if (ab == ab_value) { + ab_length += 1; + } else { + if (ab_value != constants::invalid) { + abundances_builder.push_abundance_interval(ab_value, ab_length); + } + ab_value = ab; + ab_length = 1; + } + }; + + auto parse_header = [&]() { + if (sequence.empty()) return; + + /* + Heder format: + >[id] LN:i:[seq_len] ab:Z:[ab_seq] + where [ab_seq] is a space-separated sequence of integer counters (the abundances), + whose length is equal to [seq_len]-k+1 + */ + + // example header: '>12 LN:i:41 ab:Z:2 2 2 2 2 2 2 2 2 2 2' + + expect(sequence[0], '>'); + uint64_t i = 0; + i = sequence.find_first_of(' ', i); + if (i == std::string::npos) throw parse_runtime_error(); + + i += 1; + expect(sequence[i + 0], 'L'); + expect(sequence[i + 1], 'N'); + expect(sequence[i + 2], ':'); + expect(sequence[i + 3], 'i'); + expect(sequence[i + 4], ':'); + i += 5; + uint64_t j = sequence.find_first_of(' ', i); + if (j == std::string::npos) throw parse_runtime_error(); + + char* end; + seq_len = std::strtoull(sequence.data() + i, &end, 10); + i = j + 1; + expect(sequence[i + 0], 'a'); + expect(sequence[i + 1], 'b'); + expect(sequence[i + 2], ':'); + expect(sequence[i + 3], 'Z'); + expect(sequence[i + 4], ':'); + i += 5; + + bool kmers_have_all_mfa = true; + bool kmers_have_different_abundances = false; + uint64_t front = constants::invalid; + uint64_t back = constants::invalid; + + for (uint64_t j = 0, kmer_id = num_kmers, prev_ab = constants::invalid; + j != seq_len - k + 1; ++j, ++kmer_id) { + uint64_t ab = std::strtoull(sequence.data() + i, &end, 10); + i = sequence.find_first_of(' ', i) + 1; + + if (j == 0) front = ab; + if (j == seq_len - k) back = ab; + + if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false; + if (j > 0) { + if (ab != prev_ab) kmers_have_different_abundances = true; + } + prev_ab = ab; + + abundances_builder.eat(ab); + sum_of_abundances += ab; + + process_abundance(ab); + } + + num_sequences_diff_abs += kmers_have_different_abundances; + num_sequences_all_mfa += kmers_have_all_mfa; + + data.vertices.emplace_back(num_sequences, front, back); + }; + + while (!is.eof()) { + std::getline(is, sequence); // header sequence + parse_header(); + + std::getline(is, sequence); // DNA sequence + if (sequence.size() < k) continue; + + if (++num_sequences % 100000 == 0) { + std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " + << num_kmers << " kmers" << std::endl; + } + + num_bases += sequence.size(); + num_kmers += sequence.size() - k + 1; + + if (seq_len != sequence.size()) { + std::cout << "ERROR: expected a sequence of length " << seq_len + << " but got one of length " << sequence.size() << std::endl; + throw std::runtime_error("file is malformed"); + } + } + + std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " << num_kmers + << " kmers" << std::endl; + std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; + + std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs + << "/" << num_sequences << " (" << (num_sequences_diff_abs * 100.0) / num_sequences + << "%)" << std::endl; + std::cout << "num_sequences whose kmers all have the same abundance != mfa: " + << (num_sequences - num_sequences_diff_abs) << "/" << num_sequences << " (" + << ((num_sequences - num_sequences_diff_abs) * 100.0) / num_sequences << "%)" + << std::endl; + std::cout << "num_sequences whose kmers all have the same abundance == mfa: " + << num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs) << " (" + << (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs) << "%)" + << std::endl; + + abundances_builder.push_abundance_interval(ab_value, ab_length); + abundances_builder.finalize(num_kmers); + + data.num_runs_abundances = abundances_builder.num_abundance_intervals(); +} + +permute_data parse_file(std::string const& filename, build_configuration const& build_config) { + std::ifstream is(filename.c_str()); + if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); + std::cout << "reading file '" << filename << "'..." << std::endl; + permute_data data; + if (util::ends_with(filename, ".gz")) { + zip_istream zis(is); + parse_file(zis, data, build_config); + } else { + parse_file(is, data, build_config); + } + is.close(); + return data; +} + +int main(int argc, char** argv) { + cmd_line_parser::parser parser(argc, argv); + + /* mandatory arguments */ + parser.add("input_filename", + "Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:\n" + "\t- without duplicate nor invalid kmers\n" + "\t- one DNA sequence per line\n" + "\t- with also kmers' abundances.\n" + "\tFor example, it could be the de Bruijn graph topology output by BCALM."); + + parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ")."); + + /* optional arguments */ + parser.add("output_filename", "Output file where the permuted collection will be written.", + "-o", false); + parser.add("verbose", "Verbose output during construction.", "--verbose", true); + + if (!parser.parse()) return 1; + + auto input_filename = parser.get("input_filename"); + auto k = parser.get("k"); + + build_configuration build_config; + build_config.k = k; + build_config.verbose = parser.get("verbose"); + + std::string output_filename = input_filename + ".permuted"; + if (parser.parsed("output_filename")) { + output_filename = parser.get("output_filename"); + } + + auto data = parse_file(input_filename, build_config); + uint64_t num_runs_abundances = data.num_runs_abundances; + + { + std::sort(data.vertices.begin(), data.vertices.end(), [](auto const& x, auto const& y) { + /* sort on front */ + if (x.front != y.front) return x.front < y.front; + if (x.back != y.back) return x.back < y.back; + return x.id < y.id; + }); + + /* (abundance, num_seqs_with_front=abundance) */ + // We assume there are less than 2^32 distinct abundances and that + // the largest abundance fits into a 32-bit uint. + std::unordered_map abundance_map; + + uint64_t prev_front = data.vertices.front().front; + uint64_t count = 0; + for (auto const& vertex : data.vertices) { + if (vertex.front != prev_front) { + abundance_map[prev_front] = count; + count = 0; + } + count += 1; + prev_front = vertex.front; + } + abundance_map[prev_front] = count; + + uint64_t R = num_runs_abundances; + for (auto const& vertex : data.vertices) { + uint64_t back = vertex.back; + auto it = abundance_map.find(back); + if (it != abundance_map.cend()) { // found + if ((*it).second > 0) { // if it is 0, we cannot find a match + (*it).second -= 1; + R -= 1; + } + } + } + std::cout << "computed lower bound: R = " << R << std::endl; + } + + cover c; + c.compute(data.vertices, num_runs_abundances); + c.save(output_filename); + + return 0; +} From bf6bb5d80869b6a312c02a5a66392c7fe8125f2d Mon Sep 17 00:00:00 2001 From: jermp Date: Tue, 22 Mar 2022 15:23:08 +0100 Subject: [PATCH 14/40] do not use abundances::builder in src/permute --- src/permute.cpp | 59 ++++++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/src/permute.cpp b/src/permute.cpp index ac843d0..b4a1d69 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -5,7 +5,6 @@ #include "../external/pthash/external/cmd_line_parser/include/parser.hpp" #include "../include/builder/cover.hpp" #include "../include/builder/util.hpp" -#include "../include/abundances.hpp" using namespace sshash; @@ -21,30 +20,13 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t num_sequences = 0; uint64_t num_bases = 0; uint64_t num_kmers = 0; - uint64_t seq_len = 0; - uint64_t sum_of_abundances = 0; - abundances::builder abundances_builder; - abundances_builder.init(constants::most_frequent_abundance); - - /* intervals of abundances */ - uint64_t ab_value = constants::invalid; - uint64_t ab_length = 1; + uint64_t prev_abundance = constants::invalid; + uint64_t sum_of_abundances = 0; uint64_t num_sequences_diff_abs = 0; // num sequences whose kmers have different abundances uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa - - auto process_abundance = [&](uint64_t ab) { - if (ab == ab_value) { - ab_length += 1; - } else { - if (ab_value != constants::invalid) { - abundances_builder.push_abundance_interval(ab_value, ab_length); - } - ab_value = ab; - ab_length = 1; - } - }; + data.num_runs_abundances = 0; auto parse_header = [&]() { if (sequence.empty()) return; @@ -88,24 +70,25 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t front = constants::invalid; uint64_t back = constants::invalid; - for (uint64_t j = 0, kmer_id = num_kmers, prev_ab = constants::invalid; - j != seq_len - k + 1; ++j, ++kmer_id) { - uint64_t ab = std::strtoull(sequence.data() + i, &end, 10); + for (uint64_t j = 0, kmer_id = num_kmers; j != seq_len - k + 1; ++j, ++kmer_id) { + uint64_t abundance = std::strtoull(sequence.data() + i, &end, 10); + sum_of_abundances += abundance; i = sequence.find_first_of(' ', i) + 1; - if (j == 0) front = ab; - if (j == seq_len - k) back = ab; + /* set front and back */ + if (j == 0) front = abundance; + if (j == seq_len - k) back = abundance; - if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false; - if (j > 0) { - if (ab != prev_ab) kmers_have_different_abundances = true; - } - prev_ab = ab; + /* accumulate statistics */ + if (abundance != constants::most_frequent_abundance) kmers_have_all_mfa = false; + if (j > 0 and abundance != prev_abundance) kmers_have_different_abundances = true; - abundances_builder.eat(ab); - sum_of_abundances += ab; + /* count the number of runs */ + if (prev_abundance != constants::invalid and abundance != prev_abundance) { + data.num_runs_abundances += 1; + } - process_abundance(ab); + prev_abundance = abundance; } num_sequences_diff_abs += kmers_have_different_abundances; @@ -136,10 +119,11 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& } } + data.num_runs_abundances += 1; + std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " << num_kmers << " kmers" << std::endl; std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; - std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs << "/" << num_sequences << " (" << (num_sequences_diff_abs * 100.0) / num_sequences << "%)" << std::endl; @@ -151,11 +135,6 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& << num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs) << " (" << (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs) << "%)" << std::endl; - - abundances_builder.push_abundance_interval(ab_value, ab_length); - abundances_builder.finalize(num_kmers); - - data.num_runs_abundances = abundances_builder.num_abundance_intervals(); } permute_data parse_file(std::string const& filename, build_configuration const& build_config) { From ae23a269fb2546b9503b28a292b0fe09fd3abaf1 Mon Sep 17 00:00:00 2001 From: jermp Date: Wed, 23 Mar 2022 18:34:30 +0100 Subject: [PATCH 15/40] minor fixes --- include/builder/cover.hpp | 41 +++++++++++++++++++++------------------ src/permute.cpp | 20 ++++++++++++------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index f4618db..48e55ca 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -119,19 +119,21 @@ struct cover { return x.id < y.id; }); - uint64_t prev_front = vertices.front().front; - uint64_t prev_offset = 0; - uint64_t offset = 0; - for (auto const& vertex : vertices) { - if (vertex.front != prev_front) { - abundance_map[prev_front] = {prev_offset, offset, 0}; - prev_offset = offset; + if (num_vertices > 0) { + uint64_t prev_front = vertices.front().front; + uint64_t prev_offset = 0; + uint64_t offset = 0; + for (auto const& vertex : vertices) { + if (vertex.front != prev_front) { + abundance_map[prev_front] = {prev_offset, offset, 0}; + prev_offset = offset; + } + offset += 1; + prev_front = vertex.front; } - offset += 1; - prev_front = vertex.front; + abundance_map[prev_front] = {prev_offset, offset, 0}; + assert(offset == vertices.size()); } - abundance_map[prev_front] = {prev_offset, offset, 0}; - assert(offset == vertices.size()); while (true) { walk.clear(); @@ -247,7 +249,7 @@ struct cover { num_mergings_in_round += walk.size() - 1; #ifndef NDEBUG uint64_t prev_back = walk.front().front; - // std::cout << "=>"; + std::cout << "=>"; for (auto const& w : walk) { if (colors[w.id] == color::black) { std::cout << "ERROR: duplicate vertex." << std::endl; @@ -257,19 +259,19 @@ struct cover { } prev_back = w.back; colors[w.id] = color::black; - // std::cout << w.id << ":[" << w.front << "," << w.back << "] "; + std::cout << w.id << ":[" << w.front << "," << w.back << "] "; } - // std::cout << std::endl; + std::cout << std::endl; #endif } num_runs -= num_mergings_in_round; std::cout << " num_mergings = " << num_mergings_in_round << std::endl; std::cout << " num_runs " << num_runs << std::endl; - // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; - // for (auto const& v : tmp_vertices) { - // std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; - // } + std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; + for (auto const& v : tmp_vertices) { + std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; + } } rounds.push_back(walks_in_round); @@ -279,7 +281,7 @@ struct cover { walks_in_round.clear(); abundance_map.clear(); - if (all_singletons) { + if (all_singletons and rounds.size() > 1) { std::cout << "STOP: all walks are singletons --> no new mergings were found" << std::endl; break; @@ -293,6 +295,7 @@ struct cover { void save(std::string const& filename) { std::ofstream out(filename.c_str()); + assert(rounds.size() > 0); int r = rounds.size() - 1; const auto& walks = rounds[r]; for (auto const& walk : walks) { diff --git a/src/permute.cpp b/src/permute.cpp index b4a1d69..3898ccc 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -22,7 +22,6 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t num_kmers = 0; uint64_t seq_len = 0; - uint64_t prev_abundance = constants::invalid; uint64_t sum_of_abundances = 0; uint64_t num_sequences_diff_abs = 0; // num sequences whose kmers have different abundances uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa @@ -70,7 +69,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t front = constants::invalid; uint64_t back = constants::invalid; - for (uint64_t j = 0, kmer_id = num_kmers; j != seq_len - k + 1; ++j, ++kmer_id) { + for (uint64_t j = 0, prev_abundance = constants::invalid; j != seq_len - k + 1; ++j) { uint64_t abundance = std::strtoull(sequence.data() + i, &end, 10); sum_of_abundances += abundance; i = sequence.find_first_of(' ', i) + 1; @@ -84,9 +83,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& if (j > 0 and abundance != prev_abundance) kmers_have_different_abundances = true; /* count the number of runs */ - if (prev_abundance != constants::invalid and abundance != prev_abundance) { - data.num_runs_abundances += 1; - } + if (abundance != prev_abundance) data.num_runs_abundances += 1; prev_abundance = abundance; } @@ -94,6 +91,10 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& num_sequences_diff_abs += kmers_have_different_abundances; num_sequences_all_mfa += kmers_have_all_mfa; + std::cout << "num_runs_abundances in seq " << num_sequences << " = " + << data.num_runs_abundances << std::endl; + std::cout << "vertex " << num_sequences << ":[" << front << "," << back << "]" << std::endl; + data.vertices.emplace_back(num_sequences, front, back); }; @@ -119,7 +120,8 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& } } - data.num_runs_abundances += 1; + assert(data.vertices.size() == num_sequences); + assert(data.num_runs_abundances >= num_sequences); std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " << num_kmers << " kmers" << std::endl; @@ -188,6 +190,10 @@ int main(int argc, char** argv) { uint64_t num_runs_abundances = data.num_runs_abundances; { + std::cout + << "The trivial lower bound assumes we are able to concatenate all sequences : R' = " + << num_runs_abundances - data.vertices.size() + 1 << std::endl; + std::sort(data.vertices.begin(), data.vertices.end(), [](auto const& x, auto const& y) { /* sort on front */ if (x.front != y.front) return x.front < y.front; @@ -223,7 +229,7 @@ int main(int argc, char** argv) { } } } - std::cout << "computed lower bound: R = " << R << std::endl; + std::cout << "Computed lower bound: R = " << R << std::endl; } cover c; From a47a74a5eb66b4395ec87eb16db85bbe36cac7ca Mon Sep 17 00:00:00 2001 From: jermp Date: Thu, 24 Mar 2022 10:51:01 +0100 Subject: [PATCH 16/40] minor fixes --- include/builder/cover.hpp | 58 ++++++++++++++++++++----------------- src/permute.cpp | 60 +++++++++++++++++++++------------------ 2 files changed, 64 insertions(+), 54 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 48e55ca..f35dd08 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -33,13 +33,12 @@ struct cover { uint64_t position; }; - cover() : num_sequences(0) {} + cover(uint64_t num_sequences, uint64_t num_runs_abundances) + : m_num_sequences(num_sequences), m_num_runs_abundances(num_runs_abundances) {} - uint64_t num_vertices() const { return num_sequences; } + void compute(std::vector& vertices) { + assert(vertices.size() == m_num_sequences); - void compute(std::vector& vertices, - uint64_t num_runs_abundances // TODO: remove from here - ) { essentials::timer_type timer; timer.start(); @@ -48,9 +47,7 @@ struct cover { std::unordered_map abundance_map; std::vector colors; std::vector tmp_vertices; - uint64_t num_runs = num_runs_abundances; - std::cout << "initial number of runs = " << num_runs << std::endl; - num_sequences = vertices.size(); + std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl; walk_t walk; walks_t walks_in_round; @@ -71,28 +68,33 @@ struct cover { return x.id > y.id; }); - uint64_t count = 0; - uint64_t total = vertices.size(); + uint64_t num_special_vertices = 0; + uint64_t num_vertices = vertices.size(); while (!vertices.empty()) { auto v = vertices.back(); if (v.front == constants::most_frequent_abundance and v.back == constants::most_frequent_abundance) { walk.push_back(v); vertices.pop_back(); - count += 1; + num_special_vertices += 1; } else { break; } } std::cout << "num vertices of the form v:[" << constants::most_frequent_abundance << "," - << constants::most_frequent_abundance << "] = " << count << "/" << total - << "(" << (count * 100.0) / total << "%)" << std::endl; + << constants::most_frequent_abundance << "] = " << num_special_vertices << "/" + << num_vertices << "(" << (num_special_vertices * 100.0) / num_vertices + << "%)" << std::endl; - /* create a new vertex for next round */ - tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front, walk.back().back); - walks_in_round.push_back(walk); + if (num_special_vertices != 0) { + /* create a new vertex for next round */ + assert(!walk.empty()); + tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front, + walk.back().back); + walks_in_round.push_back(walk); - walk.clear(); + walk.clear(); + } } while (true) { @@ -104,8 +106,8 @@ struct cover { /* all unvisited */ if (rounds.size() == 0) { /* remember: we removed some vertices but the id-space still spans - [0..num_sequences-1] */ - colors.resize(num_sequences); + [0..m_num_sequences-1] */ + colors.resize(m_num_sequences); std::fill(colors.begin(), colors.end(), color::invalid); for (auto const& v : vertices) colors[v.id] = color::white; } else { @@ -264,14 +266,15 @@ struct cover { std::cout << std::endl; #endif } - num_runs -= num_mergings_in_round; + assert(m_num_runs_abundances > num_mergings_in_round); + m_num_runs_abundances -= num_mergings_in_round; std::cout << " num_mergings = " << num_mergings_in_round << std::endl; - std::cout << " num_runs " << num_runs << std::endl; + std::cout << " num_runs " << m_num_runs_abundances << std::endl; - std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; - for (auto const& v : tmp_vertices) { - std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; - } + // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; + // for (auto const& v : tmp_vertices) { + // std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; + // } } rounds.push_back(walks_in_round); @@ -291,6 +294,8 @@ struct cover { timer.stop(); std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl; + + assert(m_num_runs_abundances >= 1); } void save(std::string const& filename) { @@ -305,7 +310,8 @@ struct cover { } private: - uint64_t num_sequences; + uint64_t m_num_sequences; + uint64_t m_num_runs_abundances; std::vector rounds; /* visit walk of index w in round of index r */ diff --git a/src/permute.cpp b/src/permute.cpp index 3898ccc..3f0f3ce 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -9,15 +9,15 @@ using namespace sshash; struct permute_data { - permute_data() : num_runs_abundances(0) {} + permute_data() : num_runs_abundances(0), num_sequences(0) {} uint64_t num_runs_abundances; + uint64_t num_sequences; std::vector vertices; }; void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) { std::string sequence; uint64_t k = build_config.k; - uint64_t num_sequences = 0; uint64_t num_bases = 0; uint64_t num_kmers = 0; uint64_t seq_len = 0; @@ -26,6 +26,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t num_sequences_diff_abs = 0; // num sequences whose kmers have different abundances uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa data.num_runs_abundances = 0; + data.num_sequences = 0; auto parse_header = [&]() { if (sequence.empty()) return; @@ -91,11 +92,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& num_sequences_diff_abs += kmers_have_different_abundances; num_sequences_all_mfa += kmers_have_all_mfa; - std::cout << "num_runs_abundances in seq " << num_sequences << " = " - << data.num_runs_abundances << std::endl; - std::cout << "vertex " << num_sequences << ":[" << front << "," << back << "]" << std::endl; - - data.vertices.emplace_back(num_sequences, front, back); + data.vertices.emplace_back(data.num_sequences, front, back); }; while (!is.eof()) { @@ -105,8 +102,8 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& std::getline(is, sequence); // DNA sequence if (sequence.size() < k) continue; - if (++num_sequences % 100000 == 0) { - std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " + if (++data.num_sequences % 100000 == 0) { + std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, " << num_kmers << " kmers" << std::endl; } @@ -120,23 +117,24 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& } } - assert(data.vertices.size() == num_sequences); - assert(data.num_runs_abundances >= num_sequences); + assert(data.vertices.size() == data.num_sequences); + assert(data.num_runs_abundances >= data.num_sequences); - std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " << num_kmers - << " kmers" << std::endl; + std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, " + << num_kmers << " kmers" << std::endl; std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs - << "/" << num_sequences << " (" << (num_sequences_diff_abs * 100.0) / num_sequences - << "%)" << std::endl; + << "/" << data.num_sequences << " (" + << (num_sequences_diff_abs * 100.0) / data.num_sequences << "%)" << std::endl; std::cout << "num_sequences whose kmers all have the same abundance != mfa: " - << (num_sequences - num_sequences_diff_abs) << "/" << num_sequences << " (" - << ((num_sequences - num_sequences_diff_abs) * 100.0) / num_sequences << "%)" - << std::endl; + << (data.num_sequences - num_sequences_diff_abs) << "/" << data.num_sequences << " (" + << ((data.num_sequences - num_sequences_diff_abs) * 100.0) / data.num_sequences + << "%)" << std::endl; std::cout << "num_sequences whose kmers all have the same abundance == mfa: " - << num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs) << " (" - << (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs) << "%)" - << std::endl; + << num_sequences_all_mfa << "/" << (data.num_sequences - num_sequences_diff_abs) + << " (" + << (num_sequences_all_mfa * 100.0) / (data.num_sequences - num_sequences_diff_abs) + << "%)" << std::endl; } permute_data parse_file(std::string const& filename, build_configuration const& build_config) { @@ -187,12 +185,12 @@ int main(int argc, char** argv) { } auto data = parse_file(input_filename, build_config); - uint64_t num_runs_abundances = data.num_runs_abundances; { - std::cout - << "The trivial lower bound assumes we are able to concatenate all sequences : R' = " - << num_runs_abundances - data.vertices.size() + 1 << std::endl; + uint64_t R_lower = data.num_runs_abundances - data.vertices.size() + 1; + std::cout << "The trivial lower bound (too optimistic) assumes we are able to concatenate " + "all sequences : R_lower = " + << R_lower << std::endl; std::sort(data.vertices.begin(), data.vertices.end(), [](auto const& x, auto const& y) { /* sort on front */ @@ -218,12 +216,17 @@ int main(int argc, char** argv) { } abundance_map[prev_front] = count; - uint64_t R = num_runs_abundances; + uint64_t R = data.num_runs_abundances; for (auto const& vertex : data.vertices) { uint64_t back = vertex.back; auto it = abundance_map.find(back); if (it != abundance_map.cend()) { // found if ((*it).second > 0) { // if it is 0, we cannot find a match + + /* We clearly cannot create more mergings than num_sequences - 1, + thus R cannot be lower than R_lower. */ + if (R == R_lower) break; + (*it).second -= 1; R -= 1; } @@ -232,8 +235,9 @@ int main(int argc, char** argv) { std::cout << "Computed lower bound: R = " << R << std::endl; } - cover c; - c.compute(data.vertices, num_runs_abundances); + cover c(data.num_sequences, data.num_runs_abundances); + assert(data.vertices.size() == data.num_sequences); + c.compute(data.vertices); c.save(output_filename); return 0; From d337a4dd6f4cd0aafcd7c317f20fbba592b61468 Mon Sep 17 00:00:00 2001 From: jermp Date: Thu, 24 Mar 2022 12:06:36 +0100 Subject: [PATCH 17/40] faster --- include/builder/cover.hpp | 55 ++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index f35dd08..c0a16d9 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -29,7 +29,7 @@ struct cover { range() {} range(uint64_t b, uint64_t e, uint64_t p) : begin(b), end(e), position(p) {} uint64_t begin; - uint64_t end; + uint64_t end; // TODO: currently unused uint64_t position; }; @@ -62,6 +62,9 @@ struct cover { Warning: here we are assuming the mfa is also the smallest abundance. */ + essentials::timer_type timer; + timer.start(); + std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { if (x.front != y.front) return x.front > y.front; if (x.back != y.back) return x.back > y.back; @@ -95,6 +98,9 @@ struct cover { walk.clear(); } + + timer.stop(); + std::cout << " time: " << timer.elapsed() / 1000000 << " [sec]" << std::endl; } while (true) { @@ -103,6 +109,9 @@ struct cover { uint64_t num_vertices = vertices.size(); std::cout << " num_vertices " << num_vertices << std::endl; + essentials::timer_type round_timer; + round_timer.start(); + /* all unvisited */ if (rounds.size() == 0) { /* remember: we removed some vertices but the id-space still spans @@ -137,26 +146,37 @@ struct cover { assert(offset == vertices.size()); } + uint64_t i = 0; // offset of unvisited vertex + while (true) { walk.clear(); - /* take an unvisited vertex */ - uint64_t i = 0; + /* 1. take an unvisited vertex */ + + // try to visit forward for (; i != num_vertices; ++i) { uint64_t id = vertices[i].id; if (colors[id] == color::white) break; } + // if a vertex is not found, try to visit backward + if (i == num_vertices) { + i = 0; + for (; i != num_vertices; ++i) { + uint64_t id = vertices[i].id; + if (colors[id] == color::white) break; + } + } + if (i == num_vertices) break; // all visited - /* create a new walk */ + /* 2. create a new walk */ while (true) { auto vertex = vertices[i]; uint64_t id = vertex.id; assert(colors[id] != color::black); colors[id] = color::gray; - uint64_t back = vertex.back; if (walk.size() > 0) { assert(walk.back().id != id); colors[walk.back().id] = color::black; @@ -164,19 +184,17 @@ struct cover { } walk.push_back(vertex); - uint64_t offset = 0; - bool no_match_found = false; - + uint64_t back = vertex.back; auto it = abundance_map.find(back); /* abundance back is not found, so no match is possible */ - if (it == abundance_map.cend()) { - no_match_found = true; - break; - } - offset = (*it).second.begin + (*it).second.position; // skip to position + if (it == abundance_map.cend()) break; + + bool no_match_found = false; + uint64_t offset = + (*it).second.begin + (*it).second.position; // skip to position - /* search for a match */ + /* 3. search for a match */ while (true) { if (offset == num_vertices) break; auto candidate = vertices[offset]; @@ -251,7 +269,7 @@ struct cover { num_mergings_in_round += walk.size() - 1; #ifndef NDEBUG uint64_t prev_back = walk.front().front; - std::cout << "=>"; + // std::cout << "=>"; for (auto const& w : walk) { if (colors[w.id] == color::black) { std::cout << "ERROR: duplicate vertex." << std::endl; @@ -261,9 +279,9 @@ struct cover { } prev_back = w.back; colors[w.id] = color::black; - std::cout << w.id << ":[" << w.front << "," << w.back << "] "; + // std::cout << w.id << ":[" << w.front << "," << w.back << "] "; } - std::cout << std::endl; + // std::cout << std::endl; #endif } assert(m_num_runs_abundances > num_mergings_in_round); @@ -271,6 +289,8 @@ struct cover { std::cout << " num_mergings = " << num_mergings_in_round << std::endl; std::cout << " num_runs " << m_num_runs_abundances << std::endl; + round_timer.stop(); + std::cout << " time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl; // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; // for (auto const& v : tmp_vertices) { // std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; @@ -292,7 +312,6 @@ struct cover { } timer.stop(); - std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl; assert(m_num_runs_abundances >= 1); From 38c10e6674d283fa0916e9e66ae9e161f698743e Mon Sep 17 00:00:00 2001 From: jermp Date: Thu, 24 Mar 2022 13:57:04 +0100 Subject: [PATCH 18/40] less code --- include/builder/cover.hpp | 45 +++++++++++++-------------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index c0a16d9..91cd25e 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -25,14 +25,6 @@ struct cover { invalid = -1 }; - struct range { - range() {} - range(uint64_t b, uint64_t e, uint64_t p) : begin(b), end(e), position(p) {} - uint64_t begin; - uint64_t end; // TODO: currently unused - uint64_t position; - }; - cover(uint64_t num_sequences, uint64_t num_runs_abundances) : m_num_sequences(num_sequences), m_num_runs_abundances(num_runs_abundances) {} @@ -43,8 +35,8 @@ struct cover { timer.start(); - /* (abundance, num_seqs_with_front_abundance=abundance) */ - std::unordered_map abundance_map; + /* (abundance, position of a candidate abundance with front = abundance) */ + std::unordered_map abundance_map; std::vector colors; std::vector tmp_vertices; std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl; @@ -131,22 +123,17 @@ struct cover { }); if (num_vertices > 0) { - uint64_t prev_front = vertices.front().front; - uint64_t prev_offset = 0; + uint64_t prev_front = constants::invalid; uint64_t offset = 0; for (auto const& vertex : vertices) { - if (vertex.front != prev_front) { - abundance_map[prev_front] = {prev_offset, offset, 0}; - prev_offset = offset; - } + if (vertex.front != prev_front) abundance_map[vertex.front] = offset; offset += 1; prev_front = vertex.front; } - abundance_map[prev_front] = {prev_offset, offset, 0}; assert(offset == vertices.size()); } - uint64_t i = 0; // offset of unvisited vertex + uint64_t i = 0; // position of unvisited vertex in vertices while (true) { walk.clear(); @@ -191,17 +178,16 @@ struct cover { if (it == abundance_map.cend()) break; bool no_match_found = false; - uint64_t offset = - (*it).second.begin + (*it).second.position; // skip to position + uint64_t candidate_i = (*it).second; // candidate position /* 3. search for a match */ while (true) { - if (offset == num_vertices) break; - auto candidate = vertices[offset]; + if (candidate_i == num_vertices) break; + auto candidate = vertices[candidate_i]; /* skip */ if (candidate.id == id) { - offset += 1; + candidate_i += 1; continue; } @@ -213,20 +199,19 @@ struct cover { /* match found */ if (colors[candidate.id] != color::black) { - uint64_t& position = (*it).second.position; - assert((*it).second.begin + position <= offset); - position += 1; + assert((*it).second <= candidate_i); + (*it).second += 1; break; } - offset += 1; + candidate_i += 1; } - assert(offset <= num_vertices); - if (no_match_found or offset == num_vertices) break; + assert(candidate_i <= num_vertices); + if (no_match_found or candidate_i == num_vertices) break; /* valid match was found, then visit it next */ - i = offset; + i = candidate_i; } assert(!walk.empty()); From 8ea1d329663e43efa6bddae736f652455f9fe156 Mon Sep 17 00:00:00 2001 From: jermp Date: Thu, 24 Mar 2022 19:10:14 +0100 Subject: [PATCH 19/40] minor --- include/builder/cover.hpp | 3 ++- src/permute.cpp | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 91cd25e..c2487c8 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -36,7 +36,7 @@ struct cover { timer.start(); /* (abundance, position of a candidate abundance with front = abundance) */ - std::unordered_map abundance_map; + std::unordered_map abundance_map; std::vector colors; std::vector tmp_vertices; std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl; @@ -298,6 +298,7 @@ struct cover { timer.stop(); std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl; + std::cout << "final number of runs R = " << m_num_runs_abundances << std::endl; assert(m_num_runs_abundances >= 1); } diff --git a/src/permute.cpp b/src/permute.cpp index 3f0f3ce..3aa554f 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -168,7 +168,7 @@ int main(int argc, char** argv) { /* optional arguments */ parser.add("output_filename", "Output file where the permuted collection will be written.", "-o", false); - parser.add("verbose", "Verbose output during construction.", "--verbose", true); + // parser.add("verbose", "Verbose output during construction.", "--verbose", true); if (!parser.parse()) return 1; @@ -177,7 +177,7 @@ int main(int argc, char** argv) { build_configuration build_config; build_config.k = k; - build_config.verbose = parser.get("verbose"); + // build_config.verbose = parser.get("verbose"); std::string output_filename = input_filename + ".permuted"; if (parser.parsed("output_filename")) { @@ -189,7 +189,7 @@ int main(int argc, char** argv) { { uint64_t R_lower = data.num_runs_abundances - data.vertices.size() + 1; std::cout << "The trivial lower bound (too optimistic) assumes we are able to concatenate " - "all sequences : R_lower = " + "all sequences : R_lo = " << R_lower << std::endl; std::sort(data.vertices.begin(), data.vertices.end(), [](auto const& x, auto const& y) { @@ -200,7 +200,7 @@ int main(int argc, char** argv) { }); /* (abundance, num_seqs_with_front=abundance) */ - // We assume there are less than 2^32 distinct abundances and that + // We assume there are less than 2^32 sequences and that // the largest abundance fits into a 32-bit uint. std::unordered_map abundance_map; @@ -232,7 +232,7 @@ int main(int argc, char** argv) { } } } - std::cout << "Computed lower bound: R = " << R << std::endl; + std::cout << "Computed lower bound: R_hi = " << R << std::endl; } cover c(data.num_sequences, data.num_runs_abundances); From 6fa2c7fc7d877bd017c946924d377e079d020ce5 Mon Sep 17 00:00:00 2001 From: jermp Date: Thu, 24 Mar 2022 22:11:04 +0100 Subject: [PATCH 20/40] more --- ..._sakai.BA000007.3.k31_ust.abundances.fa.gz | Bin 1683039 -> 1683050 bytes include/builder/cover.hpp | 31 ++++-- src/permute.cpp | 96 ++++++++++++++++-- 3 files changed, 109 insertions(+), 18 deletions(-) diff --git a/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz index d5c697e272b4a2befcf49083ee7d86f985620251..7607fd9af8191997568ea53cfb7e6f2a02033bc5 100644 GIT binary patch delta 98 zcmV~$yA6T>06@VgDvAiG_}kz)0rLP#Nn;#<;`L5K$2KJNOyS)$HuKp0o>5tS+gI7v q?@xy;WGR8HWGx#BWh*<0WG}HCB#~4yIm$`SlFLP|a+CYOqv-!by&Vkz delta 87 zcmV~$xe?!TkOG0pkc7CIA2c diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index c2487c8..f0ca73f 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -308,8 +308,14 @@ struct cover { assert(rounds.size() > 0); int r = rounds.size() - 1; const auto& walks = rounds[r]; + uint64_t num_sequences = 0; for (auto const& walk : walks) { - for (auto const& vertex : walk) visit(vertex.id, r, out); + for (auto const& vertex : walk) num_sequences += visit(vertex.id, r, out); + } + if (num_sequences != m_num_sequences) { + std::cerr << "Error: expected to write " << m_num_sequences << " but written " + << num_sequences << std::endl; + throw std::runtime_error("wrong number of sequences written"); } out.close(); } @@ -320,20 +326,23 @@ struct cover { std::vector rounds; /* visit walk of index w in round of index r */ - void visit(int w, int r, std::ofstream& out) const { + uint64_t visit(int w, int r, std::ofstream& out) const { if (r > 0) { assert(size_t(w) < rounds[r].size()); auto const& walk = rounds[r][w]; - for (auto const& vertex : walk) { visit(vertex.id, r - 1, out); } - } else { // print - assert(size_t(w) < rounds[0].size()); - auto const& walk = rounds[0][w]; - for (auto const& vertex : walk) { - // out << vertex.id << ":[" << vertex.front << "," << vertex.back << "] "; - out << vertex.id << '\n'; - } - // out << '\n'; + uint64_t num_sequences = 0; + for (auto const& vertex : walk) num_sequences += visit(vertex.id, r - 1, out); + return num_sequences; + } + /* print */ + assert(size_t(w) < rounds[0].size()); + auto const& walk = rounds[0][w]; + for (auto const& vertex : walk) { + // out << vertex.id << ":[" << vertex.front << "," << vertex.back << "] "; + out << vertex.id << '\n'; } + // out << '\n'; + return walk.size(); } }; diff --git a/src/permute.cpp b/src/permute.cpp index 3aa554f..6fd3e7d 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -55,8 +55,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t j = sequence.find_first_of(' ', i); if (j == std::string::npos) throw parse_runtime_error(); - char* end; - seq_len = std::strtoull(sequence.data() + i, &end, 10); + seq_len = std::strtoull(sequence.data() + i, nullptr, 10); i = j + 1; expect(sequence[i + 0], 'a'); expect(sequence[i + 1], 'b'); @@ -71,7 +70,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t back = constants::invalid; for (uint64_t j = 0, prev_abundance = constants::invalid; j != seq_len - k + 1; ++j) { - uint64_t abundance = std::strtoull(sequence.data() + i, &end, 10); + uint64_t abundance = std::strtoull(sequence.data() + i, nullptr, 10); sum_of_abundances += abundance; i = sequence.find_first_of(' ', i) + 1; @@ -152,6 +151,62 @@ permute_data parse_file(std::string const& filename, build_configuration const& return data; } +void permute_and_write(std::istream& is, std::string const& output_filename, + pthash::compact_vector const& permutation) { + // std::vector filenames; + // constexpr uint64_t bytes = 2 * essentials::GB; + std::vector> buffer; // (header, dna) + + std::string header_sequence; + std::string dna_sequence; + uint64_t num_sequences = permutation.size(); + uint64_t num_bases = 0; + + for (uint64_t i = 0; i != num_sequences; ++i) { + std::getline(is, header_sequence); + std::getline(is, dna_sequence); + buffer.emplace_back(header_sequence, dna_sequence); + + num_bases += dna_sequence.size(); + + // check for bytes used and flush + + if (i != 0 and i % 100000 == 0) { + std::cout << "read " << i << " sequences, " << num_bases << " bases" << std::endl; + } + } + + std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases" << std::endl; + + std::cout << "sorting..." << std::endl; + std::sort(buffer.begin(), buffer.end(), [&](auto const& p_x, auto const& p_y) { + assert(p_x.first.front() == '>'); + assert(p_y.first.front() == '>'); + uint64_t seq_id_x = std::strtoull(p_x.first.data() + 1, nullptr, 10); + uint64_t seq_id_y = std::strtoull(p_y.first.data() + 1, nullptr, 10); + return permutation[seq_id_x] < permutation[seq_id_y]; + }); + + std::cout << "saving to file '" << output_filename << "'..." << std::endl; + std::ofstream out(output_filename.c_str()); + for (auto const& seq : buffer) out << seq.first << '\n' << seq.second << '\n'; + out.close(); +} + +void permute_and_write(std::string const& input_filename, std::string const& output_filename, + pthash::compact_vector const& permutation) { + std::ifstream is(input_filename.c_str()); + if (!is.good()) throw std::runtime_error("error in opening the file '" + input_filename + "'"); + std::cout << "reading file '" << input_filename << "'..." << std::endl; + if (util::ends_with(input_filename, ".gz")) { + zip_istream zis(is); + permute_and_write(zis, output_filename, permutation); + } else { + permute_and_write(is, output_filename, permutation); + } + is.close(); +} + int main(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); @@ -184,6 +239,8 @@ int main(int argc, char** argv) { output_filename = parser.get("output_filename"); } + std::string permutation_filename = input_filename + ".permutation"; + auto data = parse_file(input_filename, build_config); { @@ -235,10 +292,35 @@ int main(int argc, char** argv) { std::cout << "Computed lower bound: R_hi = " << R << std::endl; } - cover c(data.num_sequences, data.num_runs_abundances); - assert(data.vertices.size() == data.num_sequences); - c.compute(data.vertices); - c.save(output_filename); + { + /* compute cover */ + cover c(data.num_sequences, data.num_runs_abundances); + assert(data.vertices.size() == data.num_sequences); + c.compute(data.vertices); + c.save(permutation_filename); + std::vector().swap(data.vertices); + } + + /* permute */ + pthash::compact_vector permutation; + { + std::ifstream is(permutation_filename.c_str()); + if (!is.good()) { + throw std::runtime_error("error in opening the file '" + permutation_filename + "'"); + } + pthash::compact_vector::builder cv_builder(data.num_sequences, + std::ceil(std::log2(data.num_sequences))); + for (uint64_t i = 0; i != data.num_sequences; ++i) { + uint64_t position = 0; + is >> position; + cv_builder.set(position, i); + } + is.close(); + cv_builder.build(permutation); + } + + permute_and_write(input_filename, output_filename, permutation); + std::remove(permutation_filename.c_str()); return 0; } From 61e60f4d399e12fb30aebe8c0997415720ec2f1d Mon Sep 17 00:00:00 2001 From: jermp Date: Fri, 25 Mar 2022 10:40:12 +0100 Subject: [PATCH 21/40] faster identification of candidate vertex for matching --- include/builder/cover.hpp | 82 +++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 7 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index f0ca73f..62836c1 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -101,9 +101,16 @@ struct cover { uint64_t num_vertices = vertices.size(); std::cout << " num_vertices " << num_vertices << std::endl; - essentials::timer_type round_timer; + essentials::timer_type init_timer; // colors init + sorting + filling abundance_map + + // add all gray vertices + essentials::timer_type unvisited_timer; // taking unvisited vertex + essentials::timer_type walk_timer; // creating walks + + essentials::timer_type round_timer; // total time of round round_timer.start(); + init_timer.start(); + /* all unvisited */ if (rounds.size() == 0) { /* remember: we removed some vertices but the id-space still spans @@ -133,11 +140,17 @@ struct cover { assert(offset == vertices.size()); } + init_timer.stop(); + uint64_t i = 0; // position of unvisited vertex in vertices - while (true) { - walk.clear(); + uint64_t total_vertices_visited_to_find_matches = 0; + uint64_t total_vertices_visited_to_failure = 0; + uint64_t total_vertices_visited_to_success = 0; + uint64_t total_vertices = 0; + while (true) { + unvisited_timer.start(); /* 1. take an unvisited vertex */ // try to visit forward @@ -155,10 +168,16 @@ struct cover { } } + unvisited_timer.stop(); + if (i == num_vertices) break; // all visited /* 2. create a new walk */ + walk_timer.start(); + walk.clear(); while (true) { + total_vertices += 1; + auto vertex = vertices[i]; uint64_t id = vertex.id; assert(colors[id] != color::black); @@ -181,8 +200,18 @@ struct cover { uint64_t candidate_i = (*it).second; // candidate position /* 3. search for a match */ + uint64_t tmp_total_vertices_visited_to_failure = 0; + uint64_t tmp_total_vertices_visited_to_success = 0; while (true) { - if (candidate_i == num_vertices) break; + total_vertices_visited_to_find_matches += 1; + tmp_total_vertices_visited_to_failure += 1; + tmp_total_vertices_visited_to_success += 1; + + if (candidate_i == num_vertices) { + total_vertices_visited_to_failure += + tmp_total_vertices_visited_to_failure; + break; + } auto candidate = vertices[candidate_i]; /* skip */ @@ -194,13 +223,15 @@ struct cover { /* checked all candidate matches */ if (candidate.front != back) { no_match_found = true; + total_vertices_visited_to_failure += + tmp_total_vertices_visited_to_failure; break; } /* match found */ if (colors[candidate.id] != color::black) { - assert((*it).second <= candidate_i); - (*it).second += 1; + total_vertices_visited_to_success += + tmp_total_vertices_visited_to_success; break; } @@ -208,13 +239,20 @@ struct cover { } assert(candidate_i <= num_vertices); + + constexpr uint64_t num_optimized_rounds_for_speed = 2; + if (rounds.size() <= num_optimized_rounds_for_speed) { + /* update candidate position in abundance_map */ + (*it).second = candidate_i; + } + if (no_match_found or candidate_i == num_vertices) break; /* valid match was found, then visit it next */ i = candidate_i; } - assert(!walk.empty()); + walk_timer.stop(); if (walk.size() == 1) { assert(colors[walk.front().id] == color::gray); // visited but not merged @@ -231,6 +269,28 @@ struct cover { walks_in_round.push_back(walk); } + if (total_vertices_visited_to_find_matches > total_vertices) { + // std::cout << " total_vertices " << total_vertices << std::endl; + // std::cout << " total_vertices_visited_to_find_matches " + // << total_vertices_visited_to_find_matches << std::endl; + // std::cout << " total_vertices_visited_to_success " + // << total_vertices_visited_to_success << std::endl; + // std::cout << " total_vertices_visited_to_failure " + // << total_vertices_visited_to_failure << std::endl; + std::cout << " avg. num. vertices visited to find a match = " + << static_cast(total_vertices_visited_to_find_matches) / + total_vertices + << std::endl; + std::cout << " avg. num. vertices visited to success = " + << static_cast(total_vertices_visited_to_success) / total_vertices + << std::endl; + std::cout << " avg. num. vertices visited to failure = " + << static_cast(total_vertices_visited_to_failure) / total_vertices + << std::endl; + } + + init_timer.start(); + /* add all gray vertices (singleton walks) */ for (auto const& v : vertices) { if (colors[v.id] == color::gray) { @@ -274,8 +334,16 @@ struct cover { std::cout << " num_mergings = " << num_mergings_in_round << std::endl; std::cout << " num_runs " << m_num_runs_abundances << std::endl; + init_timer.stop(); + round_timer.stop(); std::cout << " time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl; + std::cout << " init: " << init_timer.elapsed() / 1000000 << " [sec]" + << std::endl; + std::cout << " walk: " << walk_timer.elapsed() / 1000000 << " [sec]" + << std::endl; + std::cout << " unvisited: " << unvisited_timer.elapsed() / 1000000 << " [sec]" + << std::endl; // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; // for (auto const& v : tmp_vertices) { // std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; From 1f7615f5908029ebcb368bd23a3f27606237987c Mon Sep 17 00:00:00 2001 From: jermp Date: Fri, 25 Mar 2022 12:06:40 +0100 Subject: [PATCH 22/40] faster identification of unvisited vertex --- include/builder/cover.hpp | 78 ++++++++++++--------------------------- 1 file changed, 23 insertions(+), 55 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 62836c1..5038653 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include "../util.hpp" @@ -39,12 +40,15 @@ struct cover { std::unordered_map abundance_map; std::vector colors; std::vector tmp_vertices; - std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl; - + std::unordered_set unvisited_vertices; walk_t walk; walks_t walks_in_round; - walk.reserve(vertices.size()); // at most - walks_in_round.reserve(vertices.size()); // at most + + unvisited_vertices.reserve(vertices.size()); // at most + walk.reserve(vertices.size()); // at most + walks_in_round.reserve(vertices.size()); // at most + + std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl; { /* @@ -101,16 +105,9 @@ struct cover { uint64_t num_vertices = vertices.size(); std::cout << " num_vertices " << num_vertices << std::endl; - essentials::timer_type init_timer; // colors init + sorting + filling abundance_map + - // add all gray vertices - essentials::timer_type unvisited_timer; // taking unvisited vertex - essentials::timer_type walk_timer; // creating walks - essentials::timer_type round_timer; // total time of round round_timer.start(); - init_timer.start(); - /* all unvisited */ if (rounds.size() == 0) { /* remember: we removed some vertices but the id-space still spans @@ -123,6 +120,8 @@ struct cover { std::fill(colors.begin(), colors.end(), color::white); } + for (uint64_t i = 0; i != num_vertices; ++i) unvisited_vertices.insert(i); + std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { if (x.front != y.front) return x.front < y.front; if (x.back != y.back) return x.back < y.back; @@ -140,54 +139,39 @@ struct cover { assert(offset == vertices.size()); } - init_timer.stop(); - - uint64_t i = 0; // position of unvisited vertex in vertices + uint64_t i = 0; // position of an unvisited vertex in vertices uint64_t total_vertices_visited_to_find_matches = 0; uint64_t total_vertices_visited_to_failure = 0; uint64_t total_vertices_visited_to_success = 0; uint64_t total_vertices = 0; - while (true) { - unvisited_timer.start(); + while (!unvisited_vertices.empty()) { /* 1. take an unvisited vertex */ - - // try to visit forward - for (; i != num_vertices; ++i) { - uint64_t id = vertices[i].id; - if (colors[id] == color::white) break; - } - - // if a vertex is not found, try to visit backward - if (i == num_vertices) { - i = 0; - for (; i != num_vertices; ++i) { - uint64_t id = vertices[i].id; - if (colors[id] == color::white) break; - } - } - - unvisited_timer.stop(); - - if (i == num_vertices) break; // all visited + i = *(unvisited_vertices.begin()); /* 2. create a new walk */ - walk_timer.start(); walk.clear(); while (true) { total_vertices += 1; auto vertex = vertices[i]; uint64_t id = vertex.id; - assert(colors[id] != color::black); + assert(colors[id] != color::black); colors[id] = color::gray; + + /* vertex has been visited, so erase it from unvisited_vertices */ + if (unvisited_vertices.find(i) != unvisited_vertices.cend()) { + unvisited_vertices.erase(i); + } + if (walk.size() > 0) { assert(walk.back().id != id); colors[walk.back().id] = color::black; colors[id] = color::black; } + walk.push_back(vertex); uint64_t back = vertex.back; @@ -252,7 +236,6 @@ struct cover { i = candidate_i; } assert(!walk.empty()); - walk_timer.stop(); if (walk.size() == 1) { assert(colors[walk.front().id] == color::gray); // visited but not merged @@ -268,15 +251,9 @@ struct cover { walk.back().back); walks_in_round.push_back(walk); } + assert(unvisited_vertices.empty()); if (total_vertices_visited_to_find_matches > total_vertices) { - // std::cout << " total_vertices " << total_vertices << std::endl; - // std::cout << " total_vertices_visited_to_find_matches " - // << total_vertices_visited_to_find_matches << std::endl; - // std::cout << " total_vertices_visited_to_success " - // << total_vertices_visited_to_success << std::endl; - // std::cout << " total_vertices_visited_to_failure " - // << total_vertices_visited_to_failure << std::endl; std::cout << " avg. num. vertices visited to find a match = " << static_cast(total_vertices_visited_to_find_matches) / total_vertices @@ -289,8 +266,6 @@ struct cover { << std::endl; } - init_timer.start(); - /* add all gray vertices (singleton walks) */ for (auto const& v : vertices) { if (colors[v.id] == color::gray) { @@ -334,16 +309,9 @@ struct cover { std::cout << " num_mergings = " << num_mergings_in_round << std::endl; std::cout << " num_runs " << m_num_runs_abundances << std::endl; - init_timer.stop(); - round_timer.stop(); std::cout << " time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl; - std::cout << " init: " << init_timer.elapsed() / 1000000 << " [sec]" - << std::endl; - std::cout << " walk: " << walk_timer.elapsed() / 1000000 << " [sec]" - << std::endl; - std::cout << " unvisited: " << unvisited_timer.elapsed() / 1000000 << " [sec]" - << std::endl; + // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; // for (auto const& v : tmp_vertices) { // std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; From 5ac0615e196708f76f0e702e71b3d2162e905199 Mon Sep 17 00:00:00 2001 From: jermp Date: Fri, 25 Mar 2022 15:11:05 +0100 Subject: [PATCH 23/40] permuting in external memory --- include/util.hpp | 1 + src/permute.cpp | 185 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 160 insertions(+), 26 deletions(-) diff --git a/include/util.hpp b/include/util.hpp index ff01b2f..0727dac 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -18,6 +18,7 @@ constexpr double c = 3.0; // for PThash constexpr uint64_t min_l = 6; constexpr uint64_t max_l = 12; constexpr uint64_t most_frequent_abundance = 1; // usual value +static const std::string default_tmp_dirname("."); } // namespace constants typedef pthash::murmurhash2_64 base_hasher_type; diff --git a/src/permute.cpp b/src/permute.cpp index 6fd3e7d..ac202e9 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -151,58 +151,186 @@ permute_data parse_file(std::string const& filename, build_configuration const& return data; } +struct lines_iterator { + lines_iterator(uint8_t const* begin, uint8_t const* end) : m_begin(begin), m_end(end) { + advance_to_next(); + } + + void advance_to_next() { + m_header = read_line(); + m_sequence = read_line(); + } + + std::string const& header() const { return m_header; } + std::string const& sequence() const { return m_sequence; } + bool has_next() const { return !m_header.empty() and !m_sequence.empty(); } + +private: + uint8_t const* m_begin; + uint8_t const* m_end; + + std::string m_header; + std::string m_sequence; + + std::string read_line() { + uint8_t const* begin = m_begin; + while (m_begin != m_end and *m_begin++ != '\n') + ; + if (begin == m_begin) return ""; + return std::string(reinterpret_cast(begin), m_begin - begin - 1); + } +}; + void permute_and_write(std::istream& is, std::string const& output_filename, - pthash::compact_vector const& permutation) { - // std::vector filenames; - // constexpr uint64_t bytes = 2 * essentials::GB; + std::string const& tmp_dirname, pthash::compact_vector const& permutation) { + std::vector filenames; + constexpr uint64_t limit = 1 * essentials::GB; std::vector> buffer; // (header, dna) + std::string run_identifier = + std::to_string(pthash::clock_type::now().time_since_epoch().count()); + std::string header_sequence; std::string dna_sequence; uint64_t num_sequences = permutation.size(); uint64_t num_bases = 0; + uint64_t bytes = 0; + + auto sort_and_flush = [&]() { + if (buffer.empty()) return; + + std::cout << "sorting buffer..." << std::endl; + std::sort(buffer.begin(), buffer.end(), [&](auto const& p_x, auto const& p_y) { + assert(p_x.first.front() == '>'); + assert(p_y.first.front() == '>'); + uint64_t seq_id_x = std::strtoull(p_x.first.data() + 1, nullptr, 10); + uint64_t seq_id_y = std::strtoull(p_y.first.data() + 1, nullptr, 10); + return permutation[seq_id_x] < permutation[seq_id_y]; + }); + + std::string next_tmp_output_filename = + tmp_dirname + "/tmp.run" + run_identifier + "." + std::to_string(filenames.size()); + std::cout << "saving to file '" << next_tmp_output_filename << "'..." << std::endl; + std::ofstream out(next_tmp_output_filename.c_str()); + if (!out.is_open()) throw std::runtime_error("cannot open file"); + for (auto const& seq : buffer) out << seq.first << '\n' << seq.second << '\n'; + out.close(); + + filenames.push_back(next_tmp_output_filename); + + buffer.clear(); + bytes = 0; + }; for (uint64_t i = 0; i != num_sequences; ++i) { std::getline(is, header_sequence); std::getline(is, dna_sequence); - buffer.emplace_back(header_sequence, dna_sequence); - num_bases += dna_sequence.size(); + uint64_t seq_bytes = header_sequence.size() + dna_sequence.size() + 16; + if (bytes + seq_bytes > limit) sort_and_flush(); - // check for bytes used and flush + bytes += seq_bytes; + buffer.emplace_back(header_sequence, dna_sequence); + num_bases += dna_sequence.size(); if (i != 0 and i % 100000 == 0) { std::cout << "read " << i << " sequences, " << num_bases << " bases" << std::endl; } } + sort_and_flush(); std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases" << std::endl; - std::cout << "sorting..." << std::endl; - std::sort(buffer.begin(), buffer.end(), [&](auto const& p_x, auto const& p_y) { - assert(p_x.first.front() == '>'); - assert(p_y.first.front() == '>'); - uint64_t seq_id_x = std::strtoull(p_x.first.data() + 1, nullptr, 10); - uint64_t seq_id_y = std::strtoull(p_y.first.data() + 1, nullptr, 10); - return permutation[seq_id_x] < permutation[seq_id_y]; - }); - - std::cout << "saving to file '" << output_filename << "'..." << std::endl; - std::ofstream out(output_filename.c_str()); - for (auto const& seq : buffer) out << seq.first << '\n' << seq.second << '\n'; - out.close(); + /* merge */ + { + uint64_t num_files_to_merge = filenames.size(); + assert(num_files_to_merge > 0); + std::cout << "files to merge = " << num_files_to_merge << std::endl; + + std::ofstream out(output_filename.c_str()); + if (!out.is_open()) throw std::runtime_error("cannot open file"); + + // iterators and heap + std::vector iterators; + std::vector idx_heap; + iterators.reserve(num_files_to_merge); + idx_heap.reserve(num_files_to_merge); + + // heap functions + auto heap_idx_comparator = [&](uint32_t i, uint32_t j) { + assert(iterators[i].header().front() == '>'); + assert(iterators[j].header().front() == '>'); + uint64_t seq_id_x = std::strtoull(iterators[i].header().data() + 1, nullptr, 10); + uint64_t seq_id_y = std::strtoull(iterators[j].header().data() + 1, nullptr, 10); + return permutation[seq_id_x] < permutation[seq_id_y]; + }; + auto advance_heap_head = [&]() { + auto idx = idx_heap.front(); + iterators[idx].advance_to_next(); + if (iterators[idx].has_next()) { + // percolate down the head + uint64_t pos = 0; + uint64_t size = idx_heap.size(); + while (2 * pos + 1 < size) { + uint64_t i = 2 * pos + 1; + if (i + 1 < size and heap_idx_comparator(idx_heap[i], idx_heap[i + 1])) ++i; + if (heap_idx_comparator(idx_heap[i], idx_heap[pos])) break; + std::swap(idx_heap[pos], idx_heap[i]); + pos = i; + } + } else { + std::pop_heap(idx_heap.begin(), idx_heap.end(), heap_idx_comparator); + idx_heap.pop_back(); + } + }; + + std::vector> mm_files(num_files_to_merge); + + // create the input iterators and the heap + for (uint64_t i = 0; i != num_files_to_merge; ++i) { + auto const& filename = filenames[i]; + mm_files[i].open(filename, mm::advice::sequential); + iterators.emplace_back(mm_files[i].data(), mm_files[i].data() + mm_files[i].size()); + idx_heap.push_back(i); + } + std::make_heap(idx_heap.begin(), idx_heap.end(), heap_idx_comparator); + + uint64_t num_written_sequences = 0; + while (!idx_heap.empty()) { + auto const& it = iterators[idx_heap.front()]; + out << it.header() << '\n'; + out << it.sequence() << '\n'; + num_written_sequences += 1; + if (num_written_sequences % 100000 == 0) { + std::cout << "written sequences = " << num_written_sequences << "/" << num_sequences + << std::endl; + } + advance_heap_head(); + } + std::cout << "written sequences = " << num_written_sequences << "/" << num_sequences + << std::endl; + out.close(); + assert(num_written_sequences == num_sequences); + + /* remove tmp files */ + for (uint64_t i = 0; i != filenames.size(); ++i) { + mm_files[i].close(); + auto const& filename = filenames[i]; + std::remove(filename.c_str()); + } + } } void permute_and_write(std::string const& input_filename, std::string const& output_filename, - pthash::compact_vector const& permutation) { + std::string const& tmp_dirname, pthash::compact_vector const& permutation) { std::ifstream is(input_filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + input_filename + "'"); std::cout << "reading file '" << input_filename << "'..." << std::endl; if (util::ends_with(input_filename, ".gz")) { zip_istream zis(is); - permute_and_write(zis, output_filename, permutation); + permute_and_write(zis, output_filename, tmp_dirname, permutation); } else { - permute_and_write(is, output_filename, permutation); + permute_and_write(is, output_filename, tmp_dirname, permutation); } is.close(); } @@ -223,7 +351,10 @@ int main(int argc, char** argv) { /* optional arguments */ parser.add("output_filename", "Output file where the permuted collection will be written.", "-o", false); - // parser.add("verbose", "Verbose output during construction.", "--verbose", true); + parser.add("tmp_dirname", + "Temporary directory used for merging in external memory. Default is directory '" + + constants::default_tmp_dirname + "'.", + "-d", false); if (!parser.parse()) return 1; @@ -232,14 +363,16 @@ int main(int argc, char** argv) { build_configuration build_config; build_config.k = k; - // build_config.verbose = parser.get("verbose"); std::string output_filename = input_filename + ".permuted"; if (parser.parsed("output_filename")) { output_filename = parser.get("output_filename"); } - std::string permutation_filename = input_filename + ".permutation"; + std::string tmp_dirname = constants::default_tmp_dirname; + if (parser.parsed("tmp_dirname")) tmp_dirname = parser.get("tmp_dirname"); + + std::string permutation_filename = tmp_dirname + "/tmp.permutation"; auto data = parse_file(input_filename, build_config); @@ -319,7 +452,7 @@ int main(int argc, char** argv) { cv_builder.build(permutation); } - permute_and_write(input_filename, output_filename, permutation); + permute_and_write(input_filename, output_filename, tmp_dirname, permutation); std::remove(permutation_filename.c_str()); return 0; From 906ed741c0cefd3be896725664d044758bc4920f Mon Sep 17 00:00:00 2001 From: jermp Date: Fri, 25 Mar 2022 17:11:22 +0100 Subject: [PATCH 24/40] fix in heap comparator --- src/permute.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/permute.cpp b/src/permute.cpp index ac202e9..451420c 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -250,7 +250,7 @@ void permute_and_write(std::istream& is, std::string const& output_filename, std::ofstream out(output_filename.c_str()); if (!out.is_open()) throw std::runtime_error("cannot open file"); - // iterators and heap + // iterators and heap std::vector iterators; std::vector idx_heap; iterators.reserve(num_files_to_merge); @@ -262,7 +262,7 @@ void permute_and_write(std::istream& is, std::string const& output_filename, assert(iterators[j].header().front() == '>'); uint64_t seq_id_x = std::strtoull(iterators[i].header().data() + 1, nullptr, 10); uint64_t seq_id_y = std::strtoull(iterators[j].header().data() + 1, nullptr, 10); - return permutation[seq_id_x] < permutation[seq_id_y]; + return permutation[seq_id_x] > permutation[seq_id_y]; }; auto advance_heap_head = [&]() { auto idx = idx_heap.front(); From e078799431af4ac69278aa9dd7d62cd9eac373a9 Mon Sep 17 00:00:00 2001 From: jermp Date: Fri, 25 Mar 2022 18:52:05 +0100 Subject: [PATCH 25/40] minor --- src/permute.cpp | 50 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/src/permute.cpp b/src/permute.cpp index 451420c..ca247e8 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -168,7 +168,6 @@ struct lines_iterator { private: uint8_t const* m_begin; uint8_t const* m_end; - std::string m_header; std::string m_sequence; @@ -176,14 +175,13 @@ struct lines_iterator { uint8_t const* begin = m_begin; while (m_begin != m_end and *m_begin++ != '\n') ; - if (begin == m_begin) return ""; + if (begin == m_begin) return std::string(""); return std::string(reinterpret_cast(begin), m_begin - begin - 1); } }; void permute_and_write(std::istream& is, std::string const& output_filename, std::string const& tmp_dirname, pthash::compact_vector const& permutation) { - std::vector filenames; constexpr uint64_t limit = 1 * essentials::GB; std::vector> buffer; // (header, dna) @@ -195,6 +193,11 @@ void permute_and_write(std::istream& is, std::string const& output_filename, uint64_t num_sequences = permutation.size(); uint64_t num_bases = 0; uint64_t bytes = 0; + uint64_t num_files_to_merge = 0; + + auto get_tmp_output_filename = [&](uint64_t id) { + return tmp_dirname + "/tmp.run" + run_identifier + "." + std::to_string(id); + }; auto sort_and_flush = [&]() { if (buffer.empty()) return; @@ -208,18 +211,16 @@ void permute_and_write(std::istream& is, std::string const& output_filename, return permutation[seq_id_x] < permutation[seq_id_y]; }); - std::string next_tmp_output_filename = - tmp_dirname + "/tmp.run" + run_identifier + "." + std::to_string(filenames.size()); - std::cout << "saving to file '" << next_tmp_output_filename << "'..." << std::endl; - std::ofstream out(next_tmp_output_filename.c_str()); + auto tmp_output_filename = get_tmp_output_filename(num_files_to_merge); + std::cout << "saving to file '" << tmp_output_filename << "'..." << std::endl; + std::ofstream out(tmp_output_filename.c_str()); if (!out.is_open()) throw std::runtime_error("cannot open file"); for (auto const& seq : buffer) out << seq.first << '\n' << seq.second << '\n'; out.close(); - filenames.push_back(next_tmp_output_filename); - buffer.clear(); bytes = 0; + num_files_to_merge += 1; }; for (uint64_t i = 0; i != num_sequences; ++i) { @@ -243,20 +244,15 @@ void permute_and_write(std::istream& is, std::string const& output_filename, /* merge */ { - uint64_t num_files_to_merge = filenames.size(); assert(num_files_to_merge > 0); std::cout << "files to merge = " << num_files_to_merge << std::endl; - std::ofstream out(output_filename.c_str()); - if (!out.is_open()) throw std::runtime_error("cannot open file"); - - // iterators and heap std::vector iterators; std::vector idx_heap; iterators.reserve(num_files_to_merge); idx_heap.reserve(num_files_to_merge); + std::vector> mm_files(num_files_to_merge); - // heap functions auto heap_idx_comparator = [&](uint32_t i, uint32_t j) { assert(iterators[i].header().front() == '>'); assert(iterators[j].header().front() == '>'); @@ -264,11 +260,11 @@ void permute_and_write(std::istream& is, std::string const& output_filename, uint64_t seq_id_y = std::strtoull(iterators[j].header().data() + 1, nullptr, 10); return permutation[seq_id_x] > permutation[seq_id_y]; }; + auto advance_heap_head = [&]() { auto idx = idx_heap.front(); iterators[idx].advance_to_next(); - if (iterators[idx].has_next()) { - // percolate down the head + if (iterators[idx].has_next()) { // percolate down the head uint64_t pos = 0; uint64_t size = idx_heap.size(); while (2 * pos + 1 < size) { @@ -284,22 +280,22 @@ void permute_and_write(std::istream& is, std::string const& output_filename, } }; - std::vector> mm_files(num_files_to_merge); - - // create the input iterators and the heap + /* create the input iterators and make the heap */ for (uint64_t i = 0; i != num_files_to_merge; ++i) { - auto const& filename = filenames[i]; - mm_files[i].open(filename, mm::advice::sequential); + auto tmp_output_filename = get_tmp_output_filename(i); + mm_files[i].open(tmp_output_filename, mm::advice::sequential); iterators.emplace_back(mm_files[i].data(), mm_files[i].data() + mm_files[i].size()); idx_heap.push_back(i); } std::make_heap(idx_heap.begin(), idx_heap.end(), heap_idx_comparator); + std::ofstream out(output_filename.c_str()); + if (!out.is_open()) throw std::runtime_error("cannot open file"); + uint64_t num_written_sequences = 0; while (!idx_heap.empty()) { auto const& it = iterators[idx_heap.front()]; - out << it.header() << '\n'; - out << it.sequence() << '\n'; + out << it.header() << '\n' << it.sequence() << '\n'; num_written_sequences += 1; if (num_written_sequences % 100000 == 0) { std::cout << "written sequences = " << num_written_sequences << "/" << num_sequences @@ -313,10 +309,10 @@ void permute_and_write(std::istream& is, std::string const& output_filename, assert(num_written_sequences == num_sequences); /* remove tmp files */ - for (uint64_t i = 0; i != filenames.size(); ++i) { + for (uint64_t i = 0; i != num_files_to_merge; ++i) { mm_files[i].close(); - auto const& filename = filenames[i]; - std::remove(filename.c_str()); + auto tmp_output_filename = get_tmp_output_filename(i); + std::remove(tmp_output_filename.c_str()); } } } From d498c470206e2e9ffb1b6ca2c80382137ff93adc Mon Sep 17 00:00:00 2001 From: jermp Date: Sat, 26 Mar 2022 13:34:19 +0100 Subject: [PATCH 26/40] take into account the orientation of the vertices --- include/builder/cover.hpp | 223 +++++++++++++++++++++++++------------- src/permute.cpp | 8 +- 2 files changed, 150 insertions(+), 81 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 5038653..313955e 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -11,8 +11,10 @@ namespace sshash { struct vertex { // We assume there are less than 2^32 sequences and that // the largest abundance fits into a 32-bit uint. - vertex(uint32_t s, uint32_t f, uint32_t b) : id(s), front(f), back(b) {} + vertex(uint32_t i, uint32_t f, uint32_t b, bool s) : id(i), front(f), back(b), sign(s) {} uint32_t id, front, back; + bool sign; // '+' --> forward + // '-' --> backward }; struct cover { @@ -41,12 +43,14 @@ struct cover { std::vector colors; std::vector tmp_vertices; std::unordered_set unvisited_vertices; + /* map from vertex id to (offset+,offset-)*/ + std::vector> id_to_offsets; walk_t walk; walks_t walks_in_round; unvisited_vertices.reserve(vertices.size()); // at most walk.reserve(vertices.size()); // at most - walks_in_round.reserve(vertices.size()); // at most + walks_in_round.reserve(2 * vertices.size()); // at most std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl; @@ -86,15 +90,25 @@ struct cover { << "%)" << std::endl; if (num_special_vertices != 0) { - /* create a new vertex for next round */ + /* create new vertices for next round */ assert(!walk.empty()); - tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front, - walk.back().back); + uint32_t id = walks_in_round.size(); + uint32_t front = walk.front().front; + uint32_t back = walk.back().back; + tmp_vertices.emplace_back(id, front, back, true); + tmp_vertices.emplace_back(id, back, front, false); walks_in_round.push_back(walk); - walk.clear(); } + /* now add all the vertices with backward orientation */ + num_vertices = vertices.size(); + for (uint64_t i = 0; i != num_vertices; ++i) { + auto vertex = vertices[i]; + assert(vertex.sign == true); + vertices.emplace_back(vertex.id, vertex.back, vertex.front, false); + } + timer.stop(); std::cout << " time: " << timer.elapsed() / 1000000 << " [sec]" << std::endl; } @@ -104,6 +118,7 @@ struct cover { uint64_t num_vertices = vertices.size(); std::cout << " num_vertices " << num_vertices << std::endl; + assert(num_vertices % 2 == 0); essentials::timer_type round_timer; // total time of round round_timer.start(); @@ -112,15 +127,19 @@ struct cover { if (rounds.size() == 0) { /* remember: we removed some vertices but the id-space still spans [0..m_num_sequences-1] */ + id_to_offsets.resize(m_num_sequences); colors.resize(m_num_sequences); std::fill(colors.begin(), colors.end(), color::invalid); - for (auto const& v : vertices) colors[v.id] = color::white; + for (auto const& vertex : vertices) colors[vertex.id] = color::white; } else { - colors.resize(num_vertices); + id_to_offsets.resize(num_vertices / 2); + colors.resize(num_vertices / 2); std::fill(colors.begin(), colors.end(), color::white); } - for (uint64_t i = 0; i != num_vertices; ++i) unvisited_vertices.insert(i); + for (auto const& vertex : vertices) unvisited_vertices.insert(vertex.id); + std::cout << " num_unvisited_vertices " << unvisited_vertices.size() << std::endl; + assert(unvisited_vertices.size() == num_vertices / 2); std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { if (x.front != y.front) return x.front < y.front; @@ -128,7 +147,8 @@ struct cover { return x.id < y.id; }); - if (num_vertices > 0) { + // if (num_vertices > 0) + { uint64_t prev_front = constants::invalid; uint64_t offset = 0; for (auto const& vertex : vertices) { @@ -139,16 +159,33 @@ struct cover { assert(offset == vertices.size()); } + /* fill id_to_offsets map */ + { + uint64_t offset = 0; + for (auto const& vertex : vertices) { + if (vertex.sign) { + id_to_offsets[vertex.id].first = offset; + } else { + id_to_offsets[vertex.id].second = offset; + } + offset += 1; + } + } + uint64_t i = 0; // position of an unvisited vertex in vertices uint64_t total_vertices_visited_to_find_matches = 0; uint64_t total_vertices_visited_to_failure = 0; uint64_t total_vertices_visited_to_success = 0; uint64_t total_vertices = 0; + bool no_more_matches_possible = true; while (!unvisited_vertices.empty()) { /* 1. take an unvisited vertex */ - i = *(unvisited_vertices.begin()); + { + uint32_t unvisited_vertex_id = *(unvisited_vertices.begin()); + i = id_to_offsets[unvisited_vertex_id].first; + } /* 2. create a new walk */ walk.clear(); @@ -158,12 +195,14 @@ struct cover { auto vertex = vertices[i]; uint64_t id = vertex.id; + // std::cout << " visiting vertex " << id << " at offset " << i << std::endl; + assert(colors[id] != color::black); colors[id] = color::gray; /* vertex has been visited, so erase it from unvisited_vertices */ - if (unvisited_vertices.find(i) != unvisited_vertices.cend()) { - unvisited_vertices.erase(i); + if (unvisited_vertices.find(id) != unvisited_vertices.cend()) { + unvisited_vertices.erase(id); } if (walk.size() > 0) { @@ -174,66 +213,80 @@ struct cover { walk.push_back(vertex); - uint64_t back = vertex.back; - auto it = abundance_map.find(back); - - /* abundance back is not found, so no match is possible */ - if (it == abundance_map.cend()) break; - - bool no_match_found = false; - uint64_t candidate_i = (*it).second; // candidate position - - /* 3. search for a match */ - uint64_t tmp_total_vertices_visited_to_failure = 0; - uint64_t tmp_total_vertices_visited_to_success = 0; - while (true) { - total_vertices_visited_to_find_matches += 1; - tmp_total_vertices_visited_to_failure += 1; - tmp_total_vertices_visited_to_success += 1; - - if (candidate_i == num_vertices) { - total_vertices_visited_to_failure += - tmp_total_vertices_visited_to_failure; - break; - } - auto candidate = vertices[candidate_i]; + auto search_match = [&](uint64_t back, uint64_t candidate_i) { + bool no_match_found = false; + uint64_t tmp_total_vertices_visited_to_failure = 0; + uint64_t tmp_total_vertices_visited_to_success = 0; + + while (true) { + total_vertices_visited_to_find_matches += 1; + tmp_total_vertices_visited_to_failure += 1; + tmp_total_vertices_visited_to_success += 1; + + if (candidate_i == num_vertices) { + total_vertices_visited_to_failure += + tmp_total_vertices_visited_to_failure; + break; + } + auto candidate = vertices[candidate_i]; + + /* skip */ + if (candidate.id == id) { + candidate_i += 1; + continue; + } + + /* checked all candidate matches */ + if (candidate.front != back) { + no_match_found = true; + total_vertices_visited_to_failure += + tmp_total_vertices_visited_to_failure; + break; + } + + /* match found */ + if (colors[candidate.id] != color::black) { + total_vertices_visited_to_success += + tmp_total_vertices_visited_to_success; + break; + } - /* skip */ - if (candidate.id == id) { candidate_i += 1; - continue; } - /* checked all candidate matches */ - if (candidate.front != back) { - no_match_found = true; - total_vertices_visited_to_failure += - tmp_total_vertices_visited_to_failure; - break; - } + assert(candidate_i <= num_vertices); - /* match found */ - if (colors[candidate.id] != color::black) { - total_vertices_visited_to_success += - tmp_total_vertices_visited_to_success; - break; + constexpr uint64_t num_optimized_rounds_for_speed = 2; + if (rounds.size() <= num_optimized_rounds_for_speed) { + /* update candidate position in abundance_map */ + abundance_map[back] = candidate_i; } - candidate_i += 1; - } + if (no_match_found or candidate_i == num_vertices) return false; - assert(candidate_i <= num_vertices); + /* valid match was found, then visit it next */ + i = candidate_i; + return true; + }; - constexpr uint64_t num_optimized_rounds_for_speed = 2; - if (rounds.size() <= num_optimized_rounds_for_speed) { - /* update candidate position in abundance_map */ - (*it).second = candidate_i; + /* 3. search for a match */ + uint64_t back = vertex.back; + uint64_t candidate_i = abundance_map[back]; + bool found = search_match(back, candidate_i); + + if (!found and walk.size() == 1) { + back = vertex.front; + candidate_i = abundance_map[back]; + found = search_match(back, candidate_i); + if (found) { + /* change orientation of the vertex */ + walk[0] = {vertex.id, vertex.back, vertex.front, !vertex.sign}; + } } - if (no_match_found or candidate_i == num_vertices) break; + if (!found) break; - /* valid match was found, then visit it next */ - i = candidate_i; + no_more_matches_possible = false; } assert(!walk.empty()); @@ -246,9 +299,12 @@ struct cover { assert(std::all_of(walk.begin(), walk.end(), [&](vertex const& v) { return colors[v.id] == color::black; })); - /* create a new vertex for next round */ - tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front, - walk.back().back); + /* create new vertices for next round */ + uint32_t id = walks_in_round.size(); + uint32_t front = walk.front().front; + uint32_t back = walk.back().back; + tmp_vertices.emplace_back(id, front, back, true); + tmp_vertices.emplace_back(id, back, front, false); walks_in_round.push_back(walk); } assert(unvisited_vertices.empty()); @@ -269,27 +325,38 @@ struct cover { /* add all gray vertices (singleton walks) */ for (auto const& v : vertices) { if (colors[v.id] == color::gray) { - tmp_vertices.emplace_back(walks_in_round.size(), v.front, v.back); + uint32_t id = walks_in_round.size(); + uint32_t front = v.front; + uint32_t back = v.back; + + if (v.sign) { + tmp_vertices.emplace_back(id, front, back, true); + tmp_vertices.emplace_back(id, back, front, false); + } else { + tmp_vertices.emplace_back(id, front, back, false); + tmp_vertices.emplace_back(id, back, front, true); + } + walk_t walk; walk.push_back(v); walks_in_round.push_back(walk); + + colors[v.id] = color::black; } } std::cout << " num_walks = " << walks_in_round.size() << std::endl; - bool all_singletons = true; { #ifndef NDEBUG std::fill(colors.begin(), colors.end(), color::white); #endif uint64_t num_mergings_in_round = 0; for (auto const& walk : walks_in_round) { - if (walk.size() > 1) all_singletons = false; num_mergings_in_round += walk.size() - 1; #ifndef NDEBUG uint64_t prev_back = walk.front().front; - // std::cout << "=>"; + std::cout << "=>"; for (auto const& w : walk) { if (colors[w.id] == color::black) { std::cout << "ERROR: duplicate vertex." << std::endl; @@ -299,9 +366,10 @@ struct cover { } prev_back = w.back; colors[w.id] = color::black; - // std::cout << w.id << ":[" << w.front << "," << w.back << "] "; + std::cout << w.id << ":[" << w.front << "," << w.back << "," + << (w.sign ? '+' : '-') << "] "; } - // std::cout << std::endl; + std::cout << std::endl; #endif } assert(m_num_runs_abundances > num_mergings_in_round); @@ -314,7 +382,8 @@ struct cover { // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; // for (auto const& v : tmp_vertices) { - // std::cout << v.id << ":[" << v.front << "," << v.back << "]\n"; + // std::cout << v.id << ":[" << v.front << "," << v.back << "," + // << (v.sign ? '+' : '-') << "]\n"; // } } @@ -325,9 +394,8 @@ struct cover { walks_in_round.clear(); abundance_map.clear(); - if (all_singletons and rounds.size() > 1) { - std::cout << "STOP: all walks are singletons --> no new mergings were found" - << std::endl; + if (no_more_matches_possible) { + std::cout << "STOP: no more matches possible." << std::endl; break; } } @@ -345,9 +413,7 @@ struct cover { int r = rounds.size() - 1; const auto& walks = rounds[r]; uint64_t num_sequences = 0; - for (auto const& walk : walks) { - for (auto const& vertex : walk) num_sequences += visit(vertex.id, r, out); - } + for (uint64_t w = 0; w != walks.size(); ++w) num_sequences += visit(w, r, out); if (num_sequences != m_num_sequences) { std::cerr << "Error: expected to write " << m_num_sequences << " but written " << num_sequences << std::endl; @@ -375,7 +441,8 @@ struct cover { auto const& walk = rounds[0][w]; for (auto const& vertex : walk) { // out << vertex.id << ":[" << vertex.front << "," << vertex.back << "] "; - out << vertex.id << '\n'; + out << vertex.id; + out << (vertex.sign ? " +\n" : " -\n"); } // out << '\n'; return walk.size(); diff --git a/src/permute.cpp b/src/permute.cpp index ca247e8..0c041d7 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -91,7 +91,8 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& num_sequences_diff_abs += kmers_have_different_abundances; num_sequences_all_mfa += kmers_have_all_mfa; - data.vertices.emplace_back(data.num_sequences, front, back); + constexpr bool sign = true; + data.vertices.emplace_back(data.num_sequences, front, back, sign); }; while (!is.eof()) { @@ -437,8 +438,9 @@ int main(int argc, char** argv) { if (!is.good()) { throw std::runtime_error("error in opening the file '" + permutation_filename + "'"); } - pthash::compact_vector::builder cv_builder(data.num_sequences, - std::ceil(std::log2(data.num_sequences))); + pthash::compact_vector::builder cv_builder( + data.num_sequences, + data.num_sequences == 1 ? 1 : std::ceil(std::log2(data.num_sequences))); for (uint64_t i = 0; i != data.num_sequences; ++i) { uint64_t position = 0; is >> position; From 485a19c9f70b144e218db1ec0f0a8793a8b942ed Mon Sep 17 00:00:00 2001 From: jermp Date: Sat, 26 Mar 2022 14:09:11 +0100 Subject: [PATCH 27/40] take into account the orientation of the vertices --- include/builder/cover.hpp | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 313955e..98d4b39 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -43,8 +43,8 @@ struct cover { std::vector colors; std::vector tmp_vertices; std::unordered_set unvisited_vertices; - /* map from vertex id to (offset+,offset-)*/ - std::vector> id_to_offsets; + /* map from vertex id to offset+ into vertices */ + std::vector id_to_offset; walk_t walk; walks_t walks_in_round; @@ -127,12 +127,12 @@ struct cover { if (rounds.size() == 0) { /* remember: we removed some vertices but the id-space still spans [0..m_num_sequences-1] */ - id_to_offsets.resize(m_num_sequences); + id_to_offset.resize(m_num_sequences); colors.resize(m_num_sequences); std::fill(colors.begin(), colors.end(), color::invalid); for (auto const& vertex : vertices) colors[vertex.id] = color::white; } else { - id_to_offsets.resize(num_vertices / 2); + id_to_offset.resize(num_vertices / 2); colors.resize(num_vertices / 2); std::fill(colors.begin(), colors.end(), color::white); } @@ -147,7 +147,7 @@ struct cover { return x.id < y.id; }); - // if (num_vertices > 0) + /* fill abundance_map */ { uint64_t prev_front = constants::invalid; uint64_t offset = 0; @@ -159,15 +159,11 @@ struct cover { assert(offset == vertices.size()); } - /* fill id_to_offsets map */ + /* fill id_to_offset map */ { uint64_t offset = 0; for (auto const& vertex : vertices) { - if (vertex.sign) { - id_to_offsets[vertex.id].first = offset; - } else { - id_to_offsets[vertex.id].second = offset; - } + if (vertex.sign) id_to_offset[vertex.id] = offset; offset += 1; } } @@ -184,7 +180,7 @@ struct cover { /* 1. take an unvisited vertex */ { uint32_t unvisited_vertex_id = *(unvisited_vertices.begin()); - i = id_to_offsets[unvisited_vertex_id].first; + i = id_to_offset[unvisited_vertex_id]; } /* 2. create a new walk */ @@ -195,8 +191,6 @@ struct cover { auto vertex = vertices[i]; uint64_t id = vertex.id; - // std::cout << " visiting vertex " << id << " at offset " << i << std::endl; - assert(colors[id] != color::black); colors[id] = color::gray; @@ -256,11 +250,8 @@ struct cover { assert(candidate_i <= num_vertices); - constexpr uint64_t num_optimized_rounds_for_speed = 2; - if (rounds.size() <= num_optimized_rounds_for_speed) { - /* update candidate position in abundance_map */ - abundance_map[back] = candidate_i; - } + /* update candidate position in abundance_map */ + abundance_map[back] = candidate_i; if (no_match_found or candidate_i == num_vertices) return false; From 7c8d4c35a1d06ee8d76d9037c0eca239ff17a65e Mon Sep 17 00:00:00 2001 From: jermp Date: Sat, 26 Mar 2022 18:47:39 +0100 Subject: [PATCH 28/40] take into account the orientation of the vertices: less code; refactored some names --- include/builder/cover.hpp | 311 +++++++++++++++++--------------------- src/permute.cpp | 28 ++-- 2 files changed, 149 insertions(+), 190 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 98d4b39..7c06655 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -8,105 +8,105 @@ namespace sshash { -struct vertex { +struct node { // We assume there are less than 2^32 sequences and that // the largest abundance fits into a 32-bit uint. - vertex(uint32_t i, uint32_t f, uint32_t b, bool s) : id(i), front(f), back(b), sign(s) {} + node(uint32_t i, uint32_t f, uint32_t b, bool s) : id(i), front(f), back(b), sign(s) {} uint32_t id, front, back; bool sign; // '+' --> forward // '-' --> backward }; struct cover { - typedef std::vector walk_t; + typedef std::vector walk_t; typedef std::vector walks_t; - enum color { - white = 0, // unvisited - gray = 1, // visited by not merged - black = 2, // visited and merged - invalid = -1 - }; - cover(uint64_t num_sequences, uint64_t num_runs_abundances) : m_num_sequences(num_sequences), m_num_runs_abundances(num_runs_abundances) {} - void compute(std::vector& vertices) { - assert(vertices.size() == m_num_sequences); + void compute(std::vector& nodes) { + assert(nodes.size() == m_num_sequences); essentials::timer_type timer; - timer.start(); /* (abundance, position of a candidate abundance with front = abundance) */ std::unordered_map abundance_map; - std::vector colors; - std::vector tmp_vertices; - std::unordered_set unvisited_vertices; - /* map from vertex id to offset+ into vertices */ + + /* visited[node.id] = true if node has been visited; false otherwise */ + std::vector visited; + + /* set of unvisited nodes */ + std::unordered_set unvisited_nodes; + + /* map from node id to offset+ into nodes */ std::vector id_to_offset; + + /* nodes created for next round */ + std::vector tmp_nodes; + walk_t walk; walks_t walks_in_round; - unvisited_vertices.reserve(vertices.size()); // at most - walk.reserve(vertices.size()); // at most - walks_in_round.reserve(2 * vertices.size()); // at most + unvisited_nodes.reserve(nodes.size()); // at most + walk.reserve(nodes.size()); // at most + walks_in_round.reserve(2 * nodes.size()); // at most std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl; { /* optimize for the most frequent case: - push vertices of the form v:[mfa,mfa] to the bottom, and remove them + push nodes of the form v:[mfa,mfa] to the bottom, and remove them forming a single (usually, long) walk. - Warning: here we are assuming the mfa is also the smallest abundance. + Warning: here we are assuming the mfa is also the *smallest* abundance. */ essentials::timer_type timer; timer.start(); - std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { + std::sort(nodes.begin(), nodes.end(), [](auto const& x, auto const& y) { if (x.front != y.front) return x.front > y.front; if (x.back != y.back) return x.back > y.back; return x.id > y.id; }); - uint64_t num_special_vertices = 0; - uint64_t num_vertices = vertices.size(); - while (!vertices.empty()) { - auto v = vertices.back(); + uint64_t num_special_nodes = 0; + uint64_t num_nodes = nodes.size(); + while (!nodes.empty()) { + auto v = nodes.back(); if (v.front == constants::most_frequent_abundance and v.back == constants::most_frequent_abundance) { walk.push_back(v); - vertices.pop_back(); - num_special_vertices += 1; + nodes.pop_back(); + num_special_nodes += 1; } else { break; } } - std::cout << "num vertices of the form v:[" << constants::most_frequent_abundance << "," - << constants::most_frequent_abundance << "] = " << num_special_vertices << "/" - << num_vertices << "(" << (num_special_vertices * 100.0) / num_vertices - << "%)" << std::endl; + std::cout << "num nodes of the form v:[" << constants::most_frequent_abundance << "," + << constants::most_frequent_abundance << "] = " << num_special_nodes << "/" + << num_nodes << "(" << (num_special_nodes * 100.0) / num_nodes << "%)" + << std::endl; - if (num_special_vertices != 0) { - /* create new vertices for next round */ + if (num_special_nodes != 0) { + /* create new nodes for next round */ assert(!walk.empty()); uint32_t id = walks_in_round.size(); uint32_t front = walk.front().front; uint32_t back = walk.back().back; - tmp_vertices.emplace_back(id, front, back, true); - tmp_vertices.emplace_back(id, back, front, false); + tmp_nodes.emplace_back(id, front, back, true); + tmp_nodes.emplace_back(id, back, front, false); walks_in_round.push_back(walk); walk.clear(); } - /* now add all the vertices with backward orientation */ - num_vertices = vertices.size(); - for (uint64_t i = 0; i != num_vertices; ++i) { - auto vertex = vertices[i]; - assert(vertex.sign == true); - vertices.emplace_back(vertex.id, vertex.back, vertex.front, false); + /* add all the nodes with backward orientation */ + num_nodes = nodes.size(); + for (uint64_t i = 0; i != num_nodes; ++i) { + auto node = nodes[i]; + assert(node.sign == true); + nodes.emplace_back(node.id, node.back, node.front, false); } timer.stop(); @@ -116,32 +116,30 @@ struct cover { while (true) { std::cout << "round " << rounds.size() << std::endl; - uint64_t num_vertices = vertices.size(); - std::cout << " num_vertices " << num_vertices << std::endl; - assert(num_vertices % 2 == 0); + uint64_t num_nodes = nodes.size(); + std::cout << " num_nodes " << num_nodes << std::endl; + assert(num_nodes % 2 == 0); essentials::timer_type round_timer; // total time of round round_timer.start(); - /* all unvisited */ if (rounds.size() == 0) { - /* remember: we removed some vertices but the id-space still spans + /* remember: we removed some nodes but the id-space still spans [0..m_num_sequences-1] */ id_to_offset.resize(m_num_sequences); - colors.resize(m_num_sequences); - std::fill(colors.begin(), colors.end(), color::invalid); - for (auto const& vertex : vertices) colors[vertex.id] = color::white; + visited.resize(m_num_sequences); } else { - id_to_offset.resize(num_vertices / 2); - colors.resize(num_vertices / 2); - std::fill(colors.begin(), colors.end(), color::white); + id_to_offset.resize(num_nodes / 2); + visited.resize(num_nodes / 2); } - for (auto const& vertex : vertices) unvisited_vertices.insert(vertex.id); - std::cout << " num_unvisited_vertices " << unvisited_vertices.size() << std::endl; - assert(unvisited_vertices.size() == num_vertices / 2); + /* all nodes unvisited */ + std::fill(visited.begin(), visited.end(), false); - std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) { + for (auto const& node : nodes) unvisited_nodes.insert(node.id); + assert(unvisited_nodes.size() == num_nodes / 2); + + std::sort(nodes.begin(), nodes.end(), [](auto const& x, auto const& y) { if (x.front != y.front) return x.front < y.front; if (x.back != y.back) return x.back < y.back; return x.id < y.id; @@ -151,78 +149,71 @@ struct cover { { uint64_t prev_front = constants::invalid; uint64_t offset = 0; - for (auto const& vertex : vertices) { - if (vertex.front != prev_front) abundance_map[vertex.front] = offset; + for (auto const& node : nodes) { + if (node.front != prev_front) abundance_map[node.front] = offset; offset += 1; - prev_front = vertex.front; + prev_front = node.front; } - assert(offset == vertices.size()); + assert(offset == nodes.size()); } /* fill id_to_offset map */ { uint64_t offset = 0; - for (auto const& vertex : vertices) { - if (vertex.sign) id_to_offset[vertex.id] = offset; + for (auto const& node : nodes) { + if (node.sign) id_to_offset[node.id] = offset; offset += 1; } } - uint64_t i = 0; // position of an unvisited vertex in vertices + uint64_t i = 0; // position of an unvisited node in nodes + bool no_more_matches_possible = true; // to stop the algorithm - uint64_t total_vertices_visited_to_find_matches = 0; - uint64_t total_vertices_visited_to_failure = 0; - uint64_t total_vertices_visited_to_success = 0; - uint64_t total_vertices = 0; - bool no_more_matches_possible = true; + /* some statistics */ + uint64_t total_nodes_visited_to_find_matches = 0; + uint64_t total_nodes_visited_to_failure = 0; + uint64_t total_nodes_visited_to_success = 0; + uint64_t total_nodes = 0; - while (!unvisited_vertices.empty()) { - /* 1. take an unvisited vertex */ + while (!unvisited_nodes.empty()) { + /* 1. take an unvisited node */ { - uint32_t unvisited_vertex_id = *(unvisited_vertices.begin()); - i = id_to_offset[unvisited_vertex_id]; + uint32_t unvisited_node_id = *(unvisited_nodes.begin()); + i = id_to_offset[unvisited_node_id]; } /* 2. create a new walk */ walk.clear(); while (true) { - total_vertices += 1; - - auto vertex = vertices[i]; - uint64_t id = vertex.id; - - assert(colors[id] != color::black); - colors[id] = color::gray; + total_nodes += 1; - /* vertex has been visited, so erase it from unvisited_vertices */ - if (unvisited_vertices.find(id) != unvisited_vertices.cend()) { - unvisited_vertices.erase(id); - } + auto node = nodes[i]; + uint64_t id = node.id; + assert(visited[id] == false); + visited[id] = true; + walk.push_back(node); - if (walk.size() > 0) { - assert(walk.back().id != id); - colors[walk.back().id] = color::black; - colors[id] = color::black; + /* node has been visited, so erase it from unvisited_nodes */ + if (unvisited_nodes.find(id) != unvisited_nodes.cend()) { + unvisited_nodes.erase(id); } - walk.push_back(vertex); - auto search_match = [&](uint64_t back, uint64_t candidate_i) { bool no_match_found = false; - uint64_t tmp_total_vertices_visited_to_failure = 0; - uint64_t tmp_total_vertices_visited_to_success = 0; + uint64_t tmp_total_nodes_visited_to_failure = 0; + uint64_t tmp_total_nodes_visited_to_success = 0; while (true) { - total_vertices_visited_to_find_matches += 1; - tmp_total_vertices_visited_to_failure += 1; - tmp_total_vertices_visited_to_success += 1; + total_nodes_visited_to_find_matches += 1; + tmp_total_nodes_visited_to_failure += 1; + tmp_total_nodes_visited_to_success += 1; - if (candidate_i == num_vertices) { - total_vertices_visited_to_failure += - tmp_total_vertices_visited_to_failure; + if (candidate_i == num_nodes) { + total_nodes_visited_to_failure += + tmp_total_nodes_visited_to_failure; break; } - auto candidate = vertices[candidate_i]; + auto candidate = nodes[candidate_i]; /* skip */ if (candidate.id == id) { @@ -233,27 +224,26 @@ struct cover { /* checked all candidate matches */ if (candidate.front != back) { no_match_found = true; - total_vertices_visited_to_failure += - tmp_total_vertices_visited_to_failure; + total_nodes_visited_to_failure += + tmp_total_nodes_visited_to_failure; break; } /* match found */ - if (colors[candidate.id] != color::black) { - total_vertices_visited_to_success += - tmp_total_vertices_visited_to_success; + if (visited[candidate.id] == false) { + total_nodes_visited_to_success += + tmp_total_nodes_visited_to_success; break; } candidate_i += 1; } - - assert(candidate_i <= num_vertices); + assert(candidate_i <= num_nodes); /* update candidate position in abundance_map */ abundance_map[back] = candidate_i; - if (no_match_found or candidate_i == num_vertices) return false; + if (no_match_found or candidate_i == num_nodes) return false; /* valid match was found, then visit it next */ i = candidate_i; @@ -261,118 +251,87 @@ struct cover { }; /* 3. search for a match */ - uint64_t back = vertex.back; + + /* first, try to match back abundance */ + uint64_t back = node.back; uint64_t candidate_i = abundance_map[back]; bool found = search_match(back, candidate_i); + /* if a match is not found and the walk is singleton, + then the only node in the walk could be matched on + front abundance */ if (!found and walk.size() == 1) { - back = vertex.front; + back = node.front; candidate_i = abundance_map[back]; found = search_match(back, candidate_i); if (found) { - /* change orientation of the vertex */ - walk[0] = {vertex.id, vertex.back, vertex.front, !vertex.sign}; + /* change orientation of the node */ + walk[0] = {node.id, node.back, node.front, !node.sign}; } } if (!found) break; - no_more_matches_possible = false; } assert(!walk.empty()); - if (walk.size() == 1) { - assert(colors[walk.front().id] == color::gray); // visited but not merged - continue; - } - - /* invariant: all vertices belonging to a walk are all black */ - assert(std::all_of(walk.begin(), walk.end(), - [&](vertex const& v) { return colors[v.id] == color::black; })); - - /* create new vertices for next round */ + /* create new nodes for next round */ uint32_t id = walks_in_round.size(); uint32_t front = walk.front().front; uint32_t back = walk.back().back; - tmp_vertices.emplace_back(id, front, back, true); - tmp_vertices.emplace_back(id, back, front, false); + tmp_nodes.emplace_back(id, front, back, true); + tmp_nodes.emplace_back(id, back, front, false); walks_in_round.push_back(walk); } - assert(unvisited_vertices.empty()); + assert(unvisited_nodes.empty()); - if (total_vertices_visited_to_find_matches > total_vertices) { - std::cout << " avg. num. vertices visited to find a match = " - << static_cast(total_vertices_visited_to_find_matches) / - total_vertices + if (total_nodes_visited_to_find_matches > total_nodes) { + std::cout << " avg. num. nodes visited to find a match = " + << static_cast(total_nodes_visited_to_find_matches) / total_nodes << std::endl; - std::cout << " avg. num. vertices visited to success = " - << static_cast(total_vertices_visited_to_success) / total_vertices + std::cout << " avg. num. nodes visited to success = " + << static_cast(total_nodes_visited_to_success) / total_nodes << std::endl; - std::cout << " avg. num. vertices visited to failure = " - << static_cast(total_vertices_visited_to_failure) / total_vertices + std::cout << " avg. num. nodes visited to failure = " + << static_cast(total_nodes_visited_to_failure) / total_nodes << std::endl; } - /* add all gray vertices (singleton walks) */ - for (auto const& v : vertices) { - if (colors[v.id] == color::gray) { - uint32_t id = walks_in_round.size(); - uint32_t front = v.front; - uint32_t back = v.back; - - if (v.sign) { - tmp_vertices.emplace_back(id, front, back, true); - tmp_vertices.emplace_back(id, back, front, false); - } else { - tmp_vertices.emplace_back(id, front, back, false); - tmp_vertices.emplace_back(id, back, front, true); - } - - walk_t walk; - walk.push_back(v); - walks_in_round.push_back(walk); - - colors[v.id] = color::black; - } - } - std::cout << " num_walks = " << walks_in_round.size() << std::endl; { #ifndef NDEBUG - std::fill(colors.begin(), colors.end(), color::white); + std::fill(visited.begin(), visited.end(), false); #endif - uint64_t num_mergings_in_round = 0; + uint64_t num_matches_in_round = 0; for (auto const& walk : walks_in_round) { - num_mergings_in_round += walk.size() - 1; + num_matches_in_round += walk.size() - 1; #ifndef NDEBUG uint64_t prev_back = walk.front().front; std::cout << "=>"; for (auto const& w : walk) { - if (colors[w.id] == color::black) { - std::cout << "ERROR: duplicate vertex." << std::endl; - } + if (visited[w.id]) std::cout << "ERROR: duplicate node." << std::endl; if (w.front != prev_back) { std::cout << "ERROR: path is broken." << std::endl; } prev_back = w.back; - colors[w.id] = color::black; + visited[w.id] = true; std::cout << w.id << ":[" << w.front << "," << w.back << "," << (w.sign ? '+' : '-') << "] "; } std::cout << std::endl; #endif } - assert(m_num_runs_abundances > num_mergings_in_round); - m_num_runs_abundances -= num_mergings_in_round; - std::cout << " num_mergings = " << num_mergings_in_round << std::endl; + assert(m_num_runs_abundances > num_matches_in_round); + m_num_runs_abundances -= num_matches_in_round; + std::cout << " num_matches = " << num_matches_in_round << std::endl; std::cout << " num_runs " << m_num_runs_abundances << std::endl; round_timer.stop(); std::cout << " time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl; - // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl; - // for (auto const& v : tmp_vertices) { + // std::cout << "created nodes in round " << rounds.size() << ":" << std::endl; + // for (auto const& v : tmp_nodes) { // std::cout << v.id << ":[" << v.front << "," << v.back << "," // << (v.sign ? '+' : '-') << "]\n"; // } @@ -380,8 +339,8 @@ struct cover { rounds.push_back(walks_in_round); - vertices.swap(tmp_vertices); - tmp_vertices.clear(); + nodes.swap(tmp_nodes); + tmp_nodes.clear(); walks_in_round.clear(); abundance_map.clear(); @@ -424,16 +383,16 @@ struct cover { assert(size_t(w) < rounds[r].size()); auto const& walk = rounds[r][w]; uint64_t num_sequences = 0; - for (auto const& vertex : walk) num_sequences += visit(vertex.id, r - 1, out); + for (auto const& node : walk) num_sequences += visit(node.id, r - 1, out); return num_sequences; } /* print */ assert(size_t(w) < rounds[0].size()); auto const& walk = rounds[0][w]; - for (auto const& vertex : walk) { - // out << vertex.id << ":[" << vertex.front << "," << vertex.back << "] "; - out << vertex.id; - out << (vertex.sign ? " +\n" : " -\n"); + for (auto const& node : walk) { + // out << node.id << ":[" << node.front << "," << node.back << "] "; + out << node.id; + out << (node.sign ? " +\n" : " -\n"); } // out << '\n'; return walk.size(); diff --git a/src/permute.cpp b/src/permute.cpp index 0c041d7..b1c00cc 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -12,7 +12,7 @@ struct permute_data { permute_data() : num_runs_abundances(0), num_sequences(0) {} uint64_t num_runs_abundances; uint64_t num_sequences; - std::vector vertices; + std::vector nodes; }; void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) { @@ -92,7 +92,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& num_sequences_all_mfa += kmers_have_all_mfa; constexpr bool sign = true; - data.vertices.emplace_back(data.num_sequences, front, back, sign); + data.nodes.emplace_back(data.num_sequences, front, back, sign); }; while (!is.eof()) { @@ -117,7 +117,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& } } - assert(data.vertices.size() == data.num_sequences); + assert(data.nodes.size() == data.num_sequences); assert(data.num_runs_abundances >= data.num_sequences); std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, " @@ -374,12 +374,12 @@ int main(int argc, char** argv) { auto data = parse_file(input_filename, build_config); { - uint64_t R_lower = data.num_runs_abundances - data.vertices.size() + 1; + uint64_t R_lower = data.num_runs_abundances - data.nodes.size() + 1; std::cout << "The trivial lower bound (too optimistic) assumes we are able to concatenate " "all sequences : R_lo = " << R_lower << std::endl; - std::sort(data.vertices.begin(), data.vertices.end(), [](auto const& x, auto const& y) { + std::sort(data.nodes.begin(), data.nodes.end(), [](auto const& x, auto const& y) { /* sort on front */ if (x.front != y.front) return x.front < y.front; if (x.back != y.back) return x.back < y.back; @@ -391,21 +391,21 @@ int main(int argc, char** argv) { // the largest abundance fits into a 32-bit uint. std::unordered_map abundance_map; - uint64_t prev_front = data.vertices.front().front; + uint64_t prev_front = data.nodes.front().front; uint64_t count = 0; - for (auto const& vertex : data.vertices) { - if (vertex.front != prev_front) { + for (auto const& node : data.nodes) { + if (node.front != prev_front) { abundance_map[prev_front] = count; count = 0; } count += 1; - prev_front = vertex.front; + prev_front = node.front; } abundance_map[prev_front] = count; uint64_t R = data.num_runs_abundances; - for (auto const& vertex : data.vertices) { - uint64_t back = vertex.back; + for (auto const& node : data.nodes) { + uint64_t back = node.back; auto it = abundance_map.find(back); if (it != abundance_map.cend()) { // found if ((*it).second > 0) { // if it is 0, we cannot find a match @@ -425,10 +425,10 @@ int main(int argc, char** argv) { { /* compute cover */ cover c(data.num_sequences, data.num_runs_abundances); - assert(data.vertices.size() == data.num_sequences); - c.compute(data.vertices); + assert(data.nodes.size() == data.num_sequences); + c.compute(data.nodes); c.save(permutation_filename); - std::vector().swap(data.vertices); + std::vector().swap(data.nodes); } /* permute */ From a9c8ca73548993f427a2f234174ff79b4b6cd164 Mon Sep 17 00:00:00 2001 From: jermp Date: Sun, 27 Mar 2022 12:14:55 +0200 Subject: [PATCH 29/40] take into account the orientation of the vertices: done --- include/abundances.hpp | 15 ++++---- include/builder/cover.hpp | 43 ++++++++++++++++++----- src/permute.cpp | 73 +++++++++++++++++++++++++++++++++++---- 3 files changed, 110 insertions(+), 21 deletions(-) diff --git a/include/abundances.hpp b/include/abundances.hpp index f0d3684..7a5e555 100644 --- a/include/abundances.hpp +++ b/include/abundances.hpp @@ -80,17 +80,20 @@ struct abundances { return x.first < y.first; }); - /* If this test fails, then we need to change the value of - constants::most_frequent_abundance. */ - if (m_most_frequent_abundance != m_abundances.front().first) { - throw std::runtime_error("the most frequent abundance is not " + - std::to_string(constants::most_frequent_abundance) + - " but " + std::to_string(m_abundances.front().first)); + if (m_kmer_id_interval_values.size() != 0) { // optimize_mfa + /* If this test fails, then we need to change the value of + constants::most_frequent_abundance. */ + if (m_most_frequent_abundance != m_abundances.front().first) { + throw std::runtime_error("the most frequent abundance is not " + + std::to_string(constants::most_frequent_abundance) + + " but " + std::to_string(m_abundances.front().first)); + } } uint64_t rest = num_kmers - m_abundances.front().second; std::cout << "kmers that do not have the most frequent ab: " << rest << " (" << (rest * 100.0) / num_kmers << "%)" << std::endl; + if (m_kmer_id_interval_values.size() != 0) { // optimize_mfa std::cout << "cumulative_kmer_id_interval_lengths " << m_kmer_id_interval_lengths.back() << '/' << rest << std::endl; diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 7c06655..b664c99 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -363,7 +363,11 @@ struct cover { int r = rounds.size() - 1; const auto& walks = rounds[r]; uint64_t num_sequences = 0; - for (uint64_t w = 0; w != walks.size(); ++w) num_sequences += visit(w, r, out); + for (uint64_t w = 0; w != walks.size(); ++w) { + assert(walks[w].size() == 1); + assert(walks[w].front().sign == true); + num_sequences += visit(w, r, true, out); + } if (num_sequences != m_num_sequences) { std::cerr << "Error: expected to write " << m_num_sequences << " but written " << num_sequences << std::endl; @@ -378,23 +382,46 @@ struct cover { std::vector rounds; /* visit walk of index w in round of index r */ - uint64_t visit(int w, int r, std::ofstream& out) const { + uint64_t visit(int w, int r, bool sign, std::ofstream& out) const { if (r > 0) { assert(size_t(w) < rounds[r].size()); auto const& walk = rounds[r][w]; uint64_t num_sequences = 0; - for (auto const& node : walk) num_sequences += visit(node.id, r - 1, out); + for (auto const& node : walk) { + /* + + & + = + + - & - = + + + & - = - + - & + = - + */ + bool new_sign = sign * node.sign; + if (sign == false and node.sign == false) new_sign = true; + num_sequences += visit(node.id, r - 1, new_sign, out); + } return num_sequences; } /* print */ assert(size_t(w) < rounds[0].size()); auto const& walk = rounds[0][w]; - for (auto const& node : walk) { - // out << node.id << ":[" << node.front << "," << node.back << "] "; - out << node.id; - out << (node.sign ? " +\n" : " -\n"); + + if (sign) { + /* print forward */ + for (uint64_t i = 0; i != walk.size(); ++i) { + auto const& node = walk[i]; + // out << node.id << ":[" << node.front << "," << node.back << "] "; + out << node.id; + out << (node.sign ? " 1\n" : " 0\n"); + } + } else { + /* print backward and change orientation */ + for (int64_t i = walk.size() - 1; i >= 0; --i) { + auto const& node = walk[i]; + // out << node.id << ":[" << node.back << "," << node.front << "] "; + out << node.id; + out << (node.sign ? " 0\n" : " 1\n"); + } } - // out << '\n'; + return walk.size(); } }; diff --git a/src/permute.cpp b/src/permute.cpp index b1c00cc..6764eae 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -181,8 +181,41 @@ struct lines_iterator { } }; +void reverse_header(std::string const& input, std::string& output, uint64_t k) { + // Example header: + // >2 LN:i:61 ab:Z:4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 + // Expected output: + // >2 LN:i:61 ab:Z:1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 + + /* skip validation of input string */ + + uint64_t i = 0; + i = input.find_first_of(' ', i); + if (i == std::string::npos) throw parse_runtime_error(); + + i += 6; // skip ' LN:i': + uint64_t j = input.find_first_of(' ', i); + if (j == std::string::npos) throw parse_runtime_error(); + + uint64_t seq_len = std::strtoull(input.data() + i, nullptr, 10); + i = j + 6; // skip ' ab:Z:' + output.append(input.data(), input.data() + i); + + std::vector abundances; + abundances.reserve(seq_len - k + 1); + for (uint64_t j = 0; j != seq_len - k + 1; ++j) { + uint64_t abundance = std::strtoull(input.data() + i, nullptr, 10); + abundances.push_back(abundance); + i = input.find_first_of(' ', i) + 1; + } + + std::reverse(abundances.begin(), abundances.end()); + for (auto abundance : abundances) { output.append(std::to_string(abundance) + " "); } +} + void permute_and_write(std::istream& is, std::string const& output_filename, - std::string const& tmp_dirname, pthash::compact_vector const& permutation) { + std::string const& tmp_dirname, pthash::compact_vector const& permutation, + pthash::bit_vector const& signs, uint64_t k) { constexpr uint64_t limit = 1 * essentials::GB; std::vector> buffer; // (header, dna) @@ -228,6 +261,25 @@ void permute_and_write(std::istream& is, std::string const& output_filename, std::getline(is, header_sequence); std::getline(is, dna_sequence); + if (!signs[i]) { + /* compute reverse complement of dna_sequence + and reverse the abundances in header_sequence */ + std::string dna_sequence_rc(dna_sequence.size(), 0); + std::string header_sequence_r; + + util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(), + dna_sequence.size()); + reverse_header(header_sequence, header_sequence_r, k); + + // std::cout << "header_sequence '" << header_sequence << "'" << std::endl; + // std::cout << "dna_sequence '" << dna_sequence << "'" << std::endl; + // std::cout << "header_sequence_r '" << header_sequence_r << "'" << std::endl; + // std::cout << "dna_sequence_rc '" << dna_sequence_rc << "'" << std::endl; + + dna_sequence.swap(dna_sequence_rc); + header_sequence.swap(header_sequence_r); + } + uint64_t seq_bytes = header_sequence.size() + dna_sequence.size() + 16; if (bytes + seq_bytes > limit) sort_and_flush(); @@ -319,15 +371,16 @@ void permute_and_write(std::istream& is, std::string const& output_filename, } void permute_and_write(std::string const& input_filename, std::string const& output_filename, - std::string const& tmp_dirname, pthash::compact_vector const& permutation) { + std::string const& tmp_dirname, pthash::compact_vector const& permutation, + pthash::bit_vector const& signs, uint64_t k) { std::ifstream is(input_filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + input_filename + "'"); std::cout << "reading file '" << input_filename << "'..." << std::endl; if (util::ends_with(input_filename, ".gz")) { zip_istream zis(is); - permute_and_write(zis, output_filename, tmp_dirname, permutation); + permute_and_write(zis, output_filename, tmp_dirname, permutation, signs, k); } else { - permute_and_write(is, output_filename, tmp_dirname, permutation); + permute_and_write(is, output_filename, tmp_dirname, permutation, signs, k); } is.close(); } @@ -431,8 +484,8 @@ int main(int argc, char** argv) { std::vector().swap(data.nodes); } - /* permute */ pthash::compact_vector permutation; + pthash::bit_vector signs; { std::ifstream is(permutation_filename.c_str()); if (!is.good()) { @@ -441,17 +494,23 @@ int main(int argc, char** argv) { pthash::compact_vector::builder cv_builder( data.num_sequences, data.num_sequences == 1 ? 1 : std::ceil(std::log2(data.num_sequences))); + pthash::bit_vector_builder bv_builder(data.num_sequences); for (uint64_t i = 0; i != data.num_sequences; ++i) { uint64_t position = 0; + bool sign = 0; is >> position; + is >> sign; cv_builder.set(position, i); + bv_builder.set(position, sign); } is.close(); cv_builder.build(permutation); + signs.build(&bv_builder); } - permute_and_write(input_filename, output_filename, tmp_dirname, permutation); - std::remove(permutation_filename.c_str()); + /* permute */ + permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k); + // std::remove(permutation_filename.c_str()); return 0; } From 0f8dd889a1938a60d1268789db33c4ffde196858 Mon Sep 17 00:00:00 2001 From: jermp Date: Sun, 27 Mar 2022 12:39:19 +0200 Subject: [PATCH 30/40] take into account the orientation of the vertices: done --- src/permute.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/permute.cpp b/src/permute.cpp index 6764eae..28016b9 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -223,7 +223,11 @@ void permute_and_write(std::istream& is, std::string const& output_filename, std::to_string(pthash::clock_type::now().time_since_epoch().count()); std::string header_sequence; + std::string header_sequence_r; + std::string dna_sequence; + std::string dna_sequence_rc; + uint64_t num_sequences = permutation.size(); uint64_t num_bases = 0; uint64_t bytes = 0; @@ -264,18 +268,11 @@ void permute_and_write(std::istream& is, std::string const& output_filename, if (!signs[i]) { /* compute reverse complement of dna_sequence and reverse the abundances in header_sequence */ - std::string dna_sequence_rc(dna_sequence.size(), 0); - std::string header_sequence_r; - + dna_sequence_rc.resize(dna_sequence.size()); + header_sequence_r.clear(); util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(), dna_sequence.size()); reverse_header(header_sequence, header_sequence_r, k); - - // std::cout << "header_sequence '" << header_sequence << "'" << std::endl; - // std::cout << "dna_sequence '" << dna_sequence << "'" << std::endl; - // std::cout << "header_sequence_r '" << header_sequence_r << "'" << std::endl; - // std::cout << "dna_sequence_rc '" << dna_sequence_rc << "'" << std::endl; - dna_sequence.swap(dna_sequence_rc); header_sequence.swap(header_sequence_r); } @@ -508,9 +505,9 @@ int main(int argc, char** argv) { signs.build(&bv_builder); } - /* permute */ + /* permute and save to output file */ permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k); - // std::remove(permutation_filename.c_str()); + std::remove(permutation_filename.c_str()); return 0; } From 8d8e02fb9a3ff1ad405b4f7d64ae63d967f0b3d9 Mon Sep 17 00:00:00 2001 From: jermp Date: Sun, 27 Mar 2022 19:31:58 +0200 Subject: [PATCH 31/40] another strategy to compute R_hi --- include/builder/cover.hpp | 2 +- src/permute.cpp | 39 ++++++++++++++++++++++----------------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index b664c99..8780d5f 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -394,7 +394,7 @@ struct cover { + & - = - - & + = - */ - bool new_sign = sign * node.sign; + bool new_sign = sign and node.sign; if (sign == false and node.sign == false) new_sign = true; num_sequences += visit(node.id, r - 1, new_sign, out); } diff --git a/src/permute.cpp b/src/permute.cpp index 28016b9..8aea66b 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -424,10 +424,10 @@ int main(int argc, char** argv) { auto data = parse_file(input_filename, build_config); { - uint64_t R_lower = data.num_runs_abundances - data.nodes.size() + 1; + uint64_t R_lo = data.num_runs_abundances - data.nodes.size() + 1; std::cout << "The trivial lower bound (too optimistic) assumes we are able to concatenate " "all sequences : R_lo = " - << R_lower << std::endl; + << R_lo << std::endl; std::sort(data.nodes.begin(), data.nodes.end(), [](auto const& x, auto const& y) { /* sort on front */ @@ -436,11 +436,14 @@ int main(int argc, char** argv) { return x.id < y.id; }); - /* (abundance, num_seqs_with_front=abundance) */ + /* (back_abundance, num_seqs_with_front=back_abundance) */ // We assume there are less than 2^32 sequences and that // the largest abundance fits into a 32-bit uint. std::unordered_map abundance_map; + /* (back_abundance, num_seqs_with_back=back_abundance) */ + std::unordered_map back_abundance_freqs; + uint64_t prev_front = data.nodes.front().front; uint64_t count = 0; for (auto const& node : data.nodes) { @@ -452,24 +455,26 @@ int main(int argc, char** argv) { prev_front = node.front; } abundance_map[prev_front] = count; + // for (auto const& p : abundance_map) { + // std::cout << "ab:" << p.first << " - potential matches: " << p.second << std::endl; + // } - uint64_t R = data.num_runs_abundances; for (auto const& node : data.nodes) { - uint64_t back = node.back; - auto it = abundance_map.find(back); - if (it != abundance_map.cend()) { // found - if ((*it).second > 0) { // if it is 0, we cannot find a match - - /* We clearly cannot create more mergings than num_sequences - 1, - thus R cannot be lower than R_lower. */ - if (R == R_lower) break; - - (*it).second -= 1; - R -= 1; - } + auto it = back_abundance_freqs.find(node.back); + if (it == back_abundance_freqs.cend()) { + back_abundance_freqs[node.back] = 1; + } else { + (*it).second += 1; + } + } + uint64_t R_hi = R_lo; + for (auto const& p : back_abundance_freqs) { + // std::cout << "ab:" << p.first << " - freq:" << p.second << std::endl; + if (p.second == 1 and abundance_map.find(p.first) == abundance_map.cend()) { + R_hi += 1; } } - std::cout << "Computed lower bound: R_hi = " << R << std::endl; + std::cout << "Computed lower bound: R_hi = " << R_hi << std::endl; } { From 4c3ac4f88b9cb692250116b631192edb8208a891 Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 28 Mar 2022 11:06:25 +0200 Subject: [PATCH 32/40] another strategy to compute R_hi --- src/permute.cpp | 58 +++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/src/permute.cpp b/src/permute.cpp index 8aea66b..28e6b33 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -429,52 +429,54 @@ int main(int argc, char** argv) { "all sequences : R_lo = " << R_lo << std::endl; - std::sort(data.nodes.begin(), data.nodes.end(), [](auto const& x, auto const& y) { - /* sort on front */ - if (x.front != y.front) return x.front < y.front; - if (x.back != y.back) return x.back < y.back; - return x.id < y.id; - }); - - /* (back_abundance, num_seqs_with_front=back_abundance) */ // We assume there are less than 2^32 sequences and that // the largest abundance fits into a 32-bit uint. - std::unordered_map abundance_map; + + /* (front_abundance, num_seqs_with_front=front_abundance) */ + std::unordered_map front_abundance_freqs; /* (back_abundance, num_seqs_with_back=back_abundance) */ std::unordered_map back_abundance_freqs; - uint64_t prev_front = data.nodes.front().front; - uint64_t count = 0; for (auto const& node : data.nodes) { - if (node.front != prev_front) { - abundance_map[prev_front] = count; - count = 0; + auto it_front = front_abundance_freqs.find(node.front); + if (it_front == front_abundance_freqs.cend()) { + front_abundance_freqs[node.front] = 1; + } else { + (*it_front).second += 1; } - count += 1; - prev_front = node.front; - } - abundance_map[prev_front] = count; - // for (auto const& p : abundance_map) { - // std::cout << "ab:" << p.first << " - potential matches: " << p.second << std::endl; - // } - - for (auto const& node : data.nodes) { - auto it = back_abundance_freqs.find(node.back); - if (it == back_abundance_freqs.cend()) { + auto it_back = back_abundance_freqs.find(node.back); + if (it_back == back_abundance_freqs.cend()) { back_abundance_freqs[node.back] = 1; } else { - (*it).second += 1; + (*it_back).second += 1; } } + uint64_t R_hi = R_lo; for (auto const& p : back_abundance_freqs) { - // std::cout << "ab:" << p.first << " - freq:" << p.second << std::endl; - if (p.second == 1 and abundance_map.find(p.first) == abundance_map.cend()) { + /* if there is only one occurrence of this back abundance + and it does not appear in any front abundance, then it cannot be matched */ + if (p.second == 1 and + front_abundance_freqs.find(p.first) == front_abundance_freqs.cend()) { R_hi += 1; } } std::cout << "Computed lower bound: R_hi = " << R_hi << std::endl; + + /* + note that we do NOT do the same for front_abundance_freqs, + namely the following piece of code: + + for (auto const& p : front_abundance_freqs) { + if (p.second == 1 and + back_abundance_freqs.find(p.first) == back_abundance_freqs.cend()) { + R_hi += 1; + } + } + + because otherwise we will end up in potentially counting twice the number of cuts. + */ } { From 7662a6b068b4501e41b978a3211fcdcb2a26eeef Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 28 Mar 2022 11:15:09 +0200 Subject: [PATCH 33/40] minor --- include/builder/build.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/builder/build.cpp b/include/builder/build.cpp index af4a359..3abb581 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -132,8 +132,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b uint64_t j = sequence.find_first_of(' ', i); if (j == std::string::npos) throw parse_runtime_error(); - char* end; - seq_len = std::strtoull(sequence.data() + i, &end, 10); + seq_len = std::strtoull(sequence.data() + i, nullptr, 10); i = j + 1; expect(sequence[i + 0], 'a'); expect(sequence[i + 1], 'b'); @@ -150,7 +149,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid; j != seq_len - k + 1; ++j, ++num_kmers) { - uint64_t ab = std::strtoull(sequence.data() + i, &end, 10); + uint64_t ab = std::strtoull(sequence.data() + i, nullptr, 10); i = sequence.find_first_of(' ', i) + 1; if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false; From e6a10945c5f1f544fa30b99d51d373729d7b7573 Mon Sep 17 00:00:00 2001 From: jermp Date: Mon, 28 Mar 2022 14:00:10 +0200 Subject: [PATCH 34/40] refined R_lo --- src/permute.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/permute.cpp b/src/permute.cpp index 28e6b33..edcb85e 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -12,6 +12,7 @@ struct permute_data { permute_data() : num_runs_abundances(0), num_sequences(0) {} uint64_t num_runs_abundances; uint64_t num_sequences; + uint64_t num_distinct_abundances; std::vector nodes; }; @@ -27,6 +28,9 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa data.num_runs_abundances = 0; data.num_sequences = 0; + data.num_distinct_abundances = 0; + + std::unordered_set distinct_abundances; // count number of distinct abundances auto parse_header = [&]() { if (sequence.empty()) return; @@ -74,6 +78,8 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& sum_of_abundances += abundance; i = sequence.find_first_of(' ', i) + 1; + distinct_abundances.insert(abundance); + /* set front and back */ if (j == 0) front = abundance; if (j == seq_len - k) back = abundance; @@ -119,9 +125,11 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& assert(data.nodes.size() == data.num_sequences); assert(data.num_runs_abundances >= data.num_sequences); + data.num_distinct_abundances = distinct_abundances.size(); std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, " << num_kmers << " kmers" << std::endl; + std::cout << "found " << data.num_distinct_abundances << " distint abundances" << std::endl; std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs << "/" << data.num_sequences << " (" @@ -424,10 +432,16 @@ int main(int argc, char** argv) { auto data = parse_file(input_filename, build_config); { + /* R_lo = max(c,r-p+1) + where c = num. distinct abundances + r = num. runs in the abundances + p = num. sequences */ uint64_t R_lo = data.num_runs_abundances - data.nodes.size() + 1; - std::cout << "The trivial lower bound (too optimistic) assumes we are able to concatenate " - "all sequences : R_lo = " - << R_lo << std::endl; + if (R_lo < data.num_distinct_abundances) { + /* clearly we cannot have less than data.num_distinct_abundances runs */ + R_lo = data.num_distinct_abundances; + } + std::cout << "R_lo = " << R_lo << std::endl; // We assume there are less than 2^32 sequences and that // the largest abundance fits into a 32-bit uint. @@ -462,7 +476,7 @@ int main(int argc, char** argv) { R_hi += 1; } } - std::cout << "Computed lower bound: R_hi = " << R_hi << std::endl; + std::cout << "R_hi = " << R_hi << std::endl; /* note that we do NOT do the same for front_abundance_freqs, From 98168097e7a8a13da1f51932c3af1e181383d739 Mon Sep 17 00:00:00 2001 From: jermp Date: Tue, 29 Mar 2022 18:18:51 +0200 Subject: [PATCH 35/40] refined computation of lower bound: commit for reference --- include/builder/cover.hpp | 32 ++--- src/permute.cpp | 249 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 253 insertions(+), 28 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 8780d5f..acf30c0 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -306,21 +306,23 @@ struct cover { uint64_t num_matches_in_round = 0; for (auto const& walk : walks_in_round) { num_matches_in_round += walk.size() - 1; -#ifndef NDEBUG - uint64_t prev_back = walk.front().front; - std::cout << "=>"; - for (auto const& w : walk) { - if (visited[w.id]) std::cout << "ERROR: duplicate node." << std::endl; - if (w.front != prev_back) { - std::cout << "ERROR: path is broken." << std::endl; - } - prev_back = w.back; - visited[w.id] = true; - std::cout << w.id << ":[" << w.front << "," << w.back << "," - << (w.sign ? '+' : '-') << "] "; - } - std::cout << std::endl; -#endif + // #ifndef NDEBUG + // if (no_more_matches_possible) { + // // uint64_t prev_back = walk.front().front; + // std::cout << "=>"; + // for (auto const& w : walk) { + // // if (visited[w.id]) std::cout << "ERROR: duplicate node." << + // // std::endl; if (w.front != prev_back) { + // // std::cout << "ERROR: path is broken." << std::endl; + // // } + // // prev_back = w.back; + // // visited[w.id] = true; + // std::cout << w.id << ":[" << w.front << "," << w.back << "," + // << (w.sign ? '+' : '-') << "] "; + // } + // std::cout << std::endl; + // // #endif + // } } assert(m_num_runs_abundances > num_matches_in_round); m_num_runs_abundances -= num_matches_in_round; diff --git a/src/permute.cpp b/src/permute.cpp index edcb85e..41c21a7 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -13,9 +13,16 @@ struct permute_data { uint64_t num_runs_abundances; uint64_t num_sequences; uint64_t num_distinct_abundances; + uint64_t num_distinct_links; std::vector nodes; }; +struct pair_hash { + inline uint64_t operator()(std::pair const& v) const { + return static_cast(v.first) * 31 + static_cast(v.second); + } +}; + void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) { std::string sequence; uint64_t k = build_config.k; @@ -32,6 +39,10 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& std::unordered_set distinct_abundances; // count number of distinct abundances + std::unordered_map, uint32_t, pair_hash> distinct_links_freq; + std::unordered_map indegree; + std::unordered_map outdegree; + auto parse_header = [&]() { if (sequence.empty()) return; @@ -99,6 +110,33 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& constexpr bool sign = true; data.nodes.emplace_back(data.num_sequences, front, back, sign); + + if (front != back) { + { + auto it = indegree.find(back); + if (it != indegree.cend()) { + (*it).second += 1; + } else { + indegree[back] = 1; + } + } + { + auto it = outdegree.find(front); + if (it != outdegree.cend()) { + (*it).second += 1; + } else { + outdegree[front] = 1; + } + } + { + auto it = distinct_links_freq.find({front, back}); + if (it != distinct_links_freq.cend()) { + (*it).second += 1; + } else { + distinct_links_freq[{front, back}] = 1; + } + } + } }; while (!is.eof()) { @@ -126,6 +164,140 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& assert(data.nodes.size() == data.num_sequences); assert(data.num_runs_abundances >= data.num_sequences); data.num_distinct_abundances = distinct_abundances.size(); + data.num_distinct_links = distinct_links_freq.size(); + + { + /* (front_abundance, num_seqs_with_front=front_abundance) */ + std::unordered_map front_abundance_freqs; + + /* (back_abundance, num_seqs_with_back=back_abundance) */ + std::unordered_map back_abundance_freqs; + + for (auto const& node : data.nodes) { + auto it_front = front_abundance_freqs.find(node.front); + if (it_front == front_abundance_freqs.cend()) { + front_abundance_freqs[node.front] = 1; + } else { + (*it_front).second += 1; + } + auto it_back = back_abundance_freqs.find(node.back); + if (it_back == back_abundance_freqs.cend()) { + back_abundance_freqs[node.back] = 1; + } else { + (*it_back).second += 1; + } + } + + std::vector distinct_abundances_vec; + distinct_abundances_vec.reserve(distinct_abundances.size()); + for (auto x : distinct_abundances) distinct_abundances_vec.push_back(x); + std::sort(distinct_abundances_vec.begin(), distinct_abundances_vec.end()); + + uint64_t will_appear = 0; + for (auto ab : distinct_abundances_vec) { + int64_t freq_front = 0; + int64_t freq_back = 0; + auto it_front = front_abundance_freqs.find(ab); + auto it_back = back_abundance_freqs.find(ab); + if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second; } + if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; } + + /* abundance only appears for internal kmers */ + if (freq_front == 0 and freq_back == 0) continue; + + // std::cout << "ab:" << ab << " - freq_front:" << freq_front + // << " - freq_back:" << freq_back; + + uint64_t in_count = 0; + uint64_t out_count = 0; + auto it_in_count = indegree.find(ab); + auto it_out_count = outdegree.find(ab); + if (it_in_count != indegree.cend()) { in_count = (*it_in_count).second; } + if (it_out_count != outdegree.cend()) { out_count = (*it_out_count).second; } + + // std::cout << " - indegree:" << in_count << " - outdegree:" << out_count; + + /* special case */ + if (in_count == 0 and out_count == 0) { + // std::cout << " - **2**\n"; + will_appear += 2; // will appear as singleton, so count twice + continue; + } + + uint64_t diff = std::abs(freq_front - freq_back); + if (diff % 2 == 0) { + // std::cout << " - 0\n"; + } else { + // std::cout << " - 1\n"; + will_appear += 1; + } + } + + uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size(); + std::cout << "will_appear = " << will_appear / 2 << std::endl; + std::cout << "computed lower bound = " << (num_runs_abundances_internal + will_appear / 2) + << std::endl; + + // uint64_t cuts = 0; + // for (uint64_t i = 0; i != distinct_abundances_vec.size(); ++i) { + // uint64_t front = distinct_abundances_vec[i]; + // if (i + 1 == distinct_abundances_vec.size()) break; + // uint64_t back = distinct_abundances_vec[i + 1]; + + // std::cout << "trying to glue [" << front << "," << back << "]; "; + + // int64_t freq_forward = 0; // signed + // int64_t freq_bckward = 0; // signed + // auto it_forward = distinct_links_freq.find({front, back}); + // auto it_bckward = distinct_links_freq.find({back, front}); + // if (it_forward != distinct_links_freq.cend()) { freq_forward = (*it_forward).second; + // } if (it_bckward != distinct_links_freq.cend()) { freq_bckward = + // (*it_bckward).second; } + + // std::cout << "freq of link [" << front << "," << back << "] = " << freq_forward << "; + // "; std::cout << "freq of link [" << back << "," << front << "] = " << freq_bckward << + // "; "; + + // /* if the difference in abs value is even, then the link + // disappears and we cannot glue anymore, i.e., we originate a cut */ + // uint64_t diff = std::abs(freq_forward - freq_bckward); + // if (diff % 2 == 0) { + // // std::cout << "NOT GLUEABLE"; + // // cuts += 1; + + // int64_t freq_front = 0; // signed + // int64_t freq_back = 0; // signed + // auto it_front = front_abundance_freqs.find(front); + // auto it_back = back_abundance_freqs.find(front); + // if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second; + // } if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; } + + // bool front_appears = (std::abs(freq_front - freq_back) % 2 != 0); + // // or (freq_front == 1 and freq_back == 1); + + // freq_front = 0; // signed + // freq_back = 0; // signed + // it_front = front_abundance_freqs.find(back); + // it_back = back_abundance_freqs.find(back); + // if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second; + // } if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; } + + // bool back_appears = (std::abs(freq_front - freq_back) % 2 != 0); + // // or (freq_front == 1 and freq_back == 1); + + // if (front_appears and back_appears) { // and link is absent + // std::cout << "NOT GLUEABLE"; + // cuts += 1; + // } + + // } else { + // std::cout << "glueable"; + // } + + // std::cout << '\n'; + // } + // std::cout << "cuts = " << cuts << std::endl; + } std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, " << num_kmers << " kmers" << std::endl; @@ -436,47 +608,95 @@ int main(int argc, char** argv) { where c = num. distinct abundances r = num. runs in the abundances p = num. sequences */ + uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size(); + std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl; + std::cout << "num_runs_abundances_internal = " << num_runs_abundances_internal << std::endl; + std::cout << "num_distinct_links = " << data.num_distinct_links << std::endl; + uint64_t R_lo = data.num_runs_abundances - data.nodes.size() + 1; + uint64_t R_hi = R_lo; if (R_lo < data.num_distinct_abundances) { /* clearly we cannot have less than data.num_distinct_abundances runs */ R_lo = data.num_distinct_abundances; + // R_hi = data.num_runs_abundances_internal; + R_hi = data.num_distinct_abundances + num_runs_abundances_internal - + data.num_distinct_links; } - std::cout << "R_lo = " << R_lo << std::endl; - + // else { // We assume there are less than 2^32 sequences and that // the largest abundance fits into a 32-bit uint. /* (front_abundance, num_seqs_with_front=front_abundance) */ - std::unordered_map front_abundance_freqs; + std::unordered_map> front_abundance_freqs; /* (back_abundance, num_seqs_with_back=back_abundance) */ - std::unordered_map back_abundance_freqs; + std::unordered_map> back_abundance_freqs; + uint64_t offset = 0; for (auto const& node : data.nodes) { auto it_front = front_abundance_freqs.find(node.front); if (it_front == front_abundance_freqs.cend()) { - front_abundance_freqs[node.front] = 1; + front_abundance_freqs[node.front] = {1, offset}; } else { - (*it_front).second += 1; + (*it_front).second.first += 1; + (*it_front).second.second = -1; } auto it_back = back_abundance_freqs.find(node.back); if (it_back == back_abundance_freqs.cend()) { - back_abundance_freqs[node.back] = 1; + back_abundance_freqs[node.back] = {1, offset}; } else { - (*it_back).second += 1; + (*it_back).second.first += 1; + (*it_back).second.second = -1; } + ++offset; } - uint64_t R_hi = R_lo; + // std::cout << "front_abundance_freqs.size() = " << front_abundance_freqs.size() << + // std::endl; std::cout << "back_abundance_freqs.size() = " << back_abundance_freqs.size() + // << std::endl; + + // for (auto const& p : back_abundance_freqs) { + // std::cout << "back-ab:" << p.first << " - freq:" << p.second.first << std::endl; + // } + // for (auto const& p : front_abundance_freqs) { + // std::cout << "front-ab:" << p.first << " - freq:" << p.second.first << std::endl; + // } + for (auto const& p : back_abundance_freqs) { /* if there is only one occurrence of this back abundance and it does not appear in any front abundance, then it cannot be matched */ - if (p.second == 1 and - front_abundance_freqs.find(p.first) == front_abundance_freqs.cend()) { - R_hi += 1; + if (p.second.first == 1) { + auto it = front_abundance_freqs.find(p.first); + if (it == front_abundance_freqs.cend()) { R_hi += 1; } + // else { + // if ((*it).second.first == 1 and p.second.second == (*it).second.second) { + // // uint64_t offset = p.second.second; + // // auto node = data.nodes[offset]; + // // std::cout << "node " << node.id << ":[" << node.front << "," << + // // node.back + // // << "]" << std::endl; + // R_hi += 1; + // } + // } } + // else { + // auto it = front_abundance_freqs.find(p.first); + // if (it != front_abundance_freqs.cend()) { + // // abundance is present both as back and front + + // // and appears an even number of times, will originate a walk + // // with both sides unmatchable + // if ((*it).second.first % 2 == 0 and p.second.first % 2 == 0) { R_hi += 1; } + // } + // } } - std::cout << "R_hi = " << R_hi << std::endl; + + // for (auto const& p : front_abundance_freqs) { + // if (p.second.first == 1 and + // back_abundance_freqs.find(p.first) == back_abundance_freqs.cend()) { + // R_hi += 1; + // } + // } /* note that we do NOT do the same for front_abundance_freqs, @@ -491,6 +711,9 @@ int main(int argc, char** argv) { because otherwise we will end up in potentially counting twice the number of cuts. */ + // } + std::cout << "R_lo = " << R_lo << std::endl; + std::cout << "R_hi = " << R_hi << std::endl; } { From 10a4b71585478f917b3dc0bf38c1a93203c0d07b Mon Sep 17 00:00:00 2001 From: jermp Date: Tue, 29 Mar 2022 18:56:52 +0200 Subject: [PATCH 36/40] refined computation of lower bound: cleaned code --- include/builder/cover.hpp | 38 ++--- src/permute.cpp | 315 ++++++++------------------------------ 2 files changed, 83 insertions(+), 270 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index acf30c0..3dc681d 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -306,23 +306,21 @@ struct cover { uint64_t num_matches_in_round = 0; for (auto const& walk : walks_in_round) { num_matches_in_round += walk.size() - 1; - // #ifndef NDEBUG - // if (no_more_matches_possible) { - // // uint64_t prev_back = walk.front().front; - // std::cout << "=>"; - // for (auto const& w : walk) { - // // if (visited[w.id]) std::cout << "ERROR: duplicate node." << - // // std::endl; if (w.front != prev_back) { - // // std::cout << "ERROR: path is broken." << std::endl; - // // } - // // prev_back = w.back; - // // visited[w.id] = true; - // std::cout << w.id << ":[" << w.front << "," << w.back << "," - // << (w.sign ? '+' : '-') << "] "; - // } - // std::cout << std::endl; - // // #endif - // } +#ifndef NDEBUG + uint64_t prev_back = walk.front().front; + // std::cout << "=>"; + for (auto const& w : walk) { + if (visited[w.id]) std::cout << "ERROR: duplicate node." << std::endl; + if (w.front != prev_back) { + std::cout << "ERROR: path is broken." << std::endl; + } + prev_back = w.back; + visited[w.id] = true; + // std::cout << w.id << ":[" << w.front << "," << w.back << "," + // << (w.sign ? '+' : '-') << "] "; + } + // std::cout << std::endl; +#endif } assert(m_num_runs_abundances > num_matches_in_round); m_num_runs_abundances -= num_matches_in_round; @@ -331,12 +329,6 @@ struct cover { round_timer.stop(); std::cout << " time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl; - - // std::cout << "created nodes in round " << rounds.size() << ":" << std::endl; - // for (auto const& v : tmp_nodes) { - // std::cout << v.id << ":[" << v.front << "," << v.back << "," - // << (v.sign ? '+' : '-') << "]\n"; - // } } rounds.push_back(walks_in_round); diff --git a/src/permute.cpp b/src/permute.cpp index 41c21a7..3a84f71 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -12,8 +12,6 @@ struct permute_data { permute_data() : num_runs_abundances(0), num_sequences(0) {} uint64_t num_runs_abundances; uint64_t num_sequences; - uint64_t num_distinct_abundances; - uint64_t num_distinct_links; std::vector nodes; }; @@ -33,15 +31,12 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t sum_of_abundances = 0; uint64_t num_sequences_diff_abs = 0; // num sequences whose kmers have different abundances uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa + data.num_runs_abundances = 0; data.num_sequences = 0; - data.num_distinct_abundances = 0; - - std::unordered_set distinct_abundances; // count number of distinct abundances - std::unordered_map, uint32_t, pair_hash> distinct_links_freq; - std::unordered_map indegree; - std::unordered_map outdegree; + /* count number of distinct abundances */ + std::unordered_set distinct_abundances; auto parse_header = [&]() { if (sequence.empty()) return; @@ -110,33 +105,6 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& constexpr bool sign = true; data.nodes.emplace_back(data.num_sequences, front, back, sign); - - if (front != back) { - { - auto it = indegree.find(back); - if (it != indegree.cend()) { - (*it).second += 1; - } else { - indegree[back] = 1; - } - } - { - auto it = outdegree.find(front); - if (it != outdegree.cend()) { - (*it).second += 1; - } else { - outdegree[front] = 1; - } - } - { - auto it = distinct_links_freq.find({front, back}); - if (it != distinct_links_freq.cend()) { - (*it).second += 1; - } else { - distinct_links_freq[{front, back}] = 1; - } - } - } }; while (!is.eof()) { @@ -163,9 +131,25 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& assert(data.nodes.size() == data.num_sequences); assert(data.num_runs_abundances >= data.num_sequences); - data.num_distinct_abundances = distinct_abundances.size(); - data.num_distinct_links = distinct_links_freq.size(); + std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, " + << num_kmers << " kmers" << std::endl; + std::cout << "found " << distinct_abundances.size() << " distinct abundances" << std::endl; + std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; + std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs + << "/" << data.num_sequences << " (" + << (num_sequences_diff_abs * 100.0) / data.num_sequences << "%)" << std::endl; + std::cout << "num_sequences whose kmers all have the same abundance != mfa: " + << (data.num_sequences - num_sequences_diff_abs) << "/" << data.num_sequences << " (" + << ((data.num_sequences - num_sequences_diff_abs) * 100.0) / data.num_sequences + << "%)" << std::endl; + std::cout << "num_sequences whose kmers all have the same abundance == mfa: " + << num_sequences_all_mfa << "/" << (data.num_sequences - num_sequences_diff_abs) + << " (" + << (num_sequences_all_mfa * 100.0) / (data.num_sequences - num_sequences_diff_abs) + << "%)" << std::endl; + + /* computation of lower bound to the number of final abundance runs */ { /* (front_abundance, num_seqs_with_front=front_abundance) */ std::unordered_map front_abundance_freqs; @@ -173,19 +157,45 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& /* (back_abundance, num_seqs_with_back=back_abundance) */ std::unordered_map back_abundance_freqs; + /* (front_abundance, num_edges_outgoing_from) */ + std::unordered_map outdegree; + + /* (back_abundance, num_edges_incoming_in) */ + std::unordered_map indegree; + for (auto const& node : data.nodes) { - auto it_front = front_abundance_freqs.find(node.front); + uint64_t front = node.front; + uint64_t back = node.back; + auto it_front = front_abundance_freqs.find(front); if (it_front == front_abundance_freqs.cend()) { - front_abundance_freqs[node.front] = 1; + front_abundance_freqs[front] = 1; } else { (*it_front).second += 1; } - auto it_back = back_abundance_freqs.find(node.back); + auto it_back = back_abundance_freqs.find(back); if (it_back == back_abundance_freqs.cend()) { - back_abundance_freqs[node.back] = 1; + back_abundance_freqs[back] = 1; } else { (*it_back).second += 1; } + if (front != back) { + { + auto it = indegree.find(back); + if (it != indegree.cend()) { + (*it).second += 1; + } else { + indegree[back] = 1; + } + } + { + auto it = outdegree.find(front); + if (it != outdegree.cend()) { + (*it).second += 1; + } else { + outdegree[front] = 1; + } + } + } } std::vector distinct_abundances_vec; @@ -193,7 +203,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& for (auto x : distinct_abundances) distinct_abundances_vec.push_back(x); std::sort(distinct_abundances_vec.begin(), distinct_abundances_vec.end()); - uint64_t will_appear = 0; + uint64_t num_end_points = 0; for (auto ab : distinct_abundances_vec) { int64_t freq_front = 0; int64_t freq_back = 0; @@ -205,9 +215,6 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& /* abundance only appears for internal kmers */ if (freq_front == 0 and freq_back == 0) continue; - // std::cout << "ab:" << ab << " - freq_front:" << freq_front - // << " - freq_back:" << freq_back; - uint64_t in_count = 0; uint64_t out_count = 0; auto it_in_count = indegree.find(ab); @@ -215,106 +222,33 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& if (it_in_count != indegree.cend()) { in_count = (*it_in_count).second; } if (it_out_count != outdegree.cend()) { out_count = (*it_out_count).second; } - // std::cout << " - indegree:" << in_count << " - outdegree:" << out_count; + // std::cout << "ab:" << ab << " - freq_front:" << freq_front + // << " - freq_back:" << freq_back << " - indegree:" << in_count + // << " - outdegree:" << out_count << std::endl; - /* special case */ + /* special case: abundance will appear as singletion, so count twice */ if (in_count == 0 and out_count == 0) { - // std::cout << " - **2**\n"; - will_appear += 2; // will appear as singleton, so count twice + num_end_points += 2; continue; } + /* if the excess of frequency is odd, then the abundance will appear as end-point */ uint64_t diff = std::abs(freq_front - freq_back); - if (diff % 2 == 0) { - // std::cout << " - 0\n"; - } else { - // std::cout << " - 1\n"; - will_appear += 1; - } + if (diff % 2 == 1) num_end_points += 1; } + uint64_t num_distinct_abundances = distinct_abundances.size(); uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size(); - std::cout << "will_appear = " << will_appear / 2 << std::endl; - std::cout << "computed lower bound = " << (num_runs_abundances_internal + will_appear / 2) - << std::endl; + std::cout << "num_end_points = " << num_end_points / 2 << std::endl; + std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl; + std::cout << "num_runs_abundances_internal = " << num_runs_abundances_internal << std::endl; - // uint64_t cuts = 0; - // for (uint64_t i = 0; i != distinct_abundances_vec.size(); ++i) { - // uint64_t front = distinct_abundances_vec[i]; - // if (i + 1 == distinct_abundances_vec.size()) break; - // uint64_t back = distinct_abundances_vec[i + 1]; - - // std::cout << "trying to glue [" << front << "," << back << "]; "; - - // int64_t freq_forward = 0; // signed - // int64_t freq_bckward = 0; // signed - // auto it_forward = distinct_links_freq.find({front, back}); - // auto it_bckward = distinct_links_freq.find({back, front}); - // if (it_forward != distinct_links_freq.cend()) { freq_forward = (*it_forward).second; - // } if (it_bckward != distinct_links_freq.cend()) { freq_bckward = - // (*it_bckward).second; } - - // std::cout << "freq of link [" << front << "," << back << "] = " << freq_forward << "; - // "; std::cout << "freq of link [" << back << "," << front << "] = " << freq_bckward << - // "; "; - - // /* if the difference in abs value is even, then the link - // disappears and we cannot glue anymore, i.e., we originate a cut */ - // uint64_t diff = std::abs(freq_forward - freq_bckward); - // if (diff % 2 == 0) { - // // std::cout << "NOT GLUEABLE"; - // // cuts += 1; - - // int64_t freq_front = 0; // signed - // int64_t freq_back = 0; // signed - // auto it_front = front_abundance_freqs.find(front); - // auto it_back = back_abundance_freqs.find(front); - // if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second; - // } if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; } - - // bool front_appears = (std::abs(freq_front - freq_back) % 2 != 0); - // // or (freq_front == 1 and freq_back == 1); - - // freq_front = 0; // signed - // freq_back = 0; // signed - // it_front = front_abundance_freqs.find(back); - // it_back = back_abundance_freqs.find(back); - // if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second; - // } if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; } - - // bool back_appears = (std::abs(freq_front - freq_back) % 2 != 0); - // // or (freq_front == 1 and freq_back == 1); - - // if (front_appears and back_appears) { // and link is absent - // std::cout << "NOT GLUEABLE"; - // cuts += 1; - // } - - // } else { - // std::cout << "glueable"; - // } - - // std::cout << '\n'; - // } - // std::cout << "cuts = " << cuts << std::endl; + uint64_t R_lo = + std::max(num_distinct_abundances, num_runs_abundances_internal + 1); + uint64_t R_hi = num_runs_abundances_internal + num_end_points / 2; + std::cout << "R_lo = " << R_lo << std::endl; + std::cout << "R_hi = " << R_hi << std::endl; } - - std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, " - << num_kmers << " kmers" << std::endl; - std::cout << "found " << data.num_distinct_abundances << " distint abundances" << std::endl; - std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; - std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs - << "/" << data.num_sequences << " (" - << (num_sequences_diff_abs * 100.0) / data.num_sequences << "%)" << std::endl; - std::cout << "num_sequences whose kmers all have the same abundance != mfa: " - << (data.num_sequences - num_sequences_diff_abs) << "/" << data.num_sequences << " (" - << ((data.num_sequences - num_sequences_diff_abs) * 100.0) / data.num_sequences - << "%)" << std::endl; - std::cout << "num_sequences whose kmers all have the same abundance == mfa: " - << num_sequences_all_mfa << "/" << (data.num_sequences - num_sequences_diff_abs) - << " (" - << (num_sequences_all_mfa * 100.0) / (data.num_sequences - num_sequences_diff_abs) - << "%)" << std::endl; } permute_data parse_file(std::string const& filename, build_configuration const& build_config) { @@ -603,119 +537,6 @@ int main(int argc, char** argv) { auto data = parse_file(input_filename, build_config); - { - /* R_lo = max(c,r-p+1) - where c = num. distinct abundances - r = num. runs in the abundances - p = num. sequences */ - uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size(); - std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl; - std::cout << "num_runs_abundances_internal = " << num_runs_abundances_internal << std::endl; - std::cout << "num_distinct_links = " << data.num_distinct_links << std::endl; - - uint64_t R_lo = data.num_runs_abundances - data.nodes.size() + 1; - uint64_t R_hi = R_lo; - if (R_lo < data.num_distinct_abundances) { - /* clearly we cannot have less than data.num_distinct_abundances runs */ - R_lo = data.num_distinct_abundances; - // R_hi = data.num_runs_abundances_internal; - R_hi = data.num_distinct_abundances + num_runs_abundances_internal - - data.num_distinct_links; - } - // else { - // We assume there are less than 2^32 sequences and that - // the largest abundance fits into a 32-bit uint. - - /* (front_abundance, num_seqs_with_front=front_abundance) */ - std::unordered_map> front_abundance_freqs; - - /* (back_abundance, num_seqs_with_back=back_abundance) */ - std::unordered_map> back_abundance_freqs; - - uint64_t offset = 0; - for (auto const& node : data.nodes) { - auto it_front = front_abundance_freqs.find(node.front); - if (it_front == front_abundance_freqs.cend()) { - front_abundance_freqs[node.front] = {1, offset}; - } else { - (*it_front).second.first += 1; - (*it_front).second.second = -1; - } - auto it_back = back_abundance_freqs.find(node.back); - if (it_back == back_abundance_freqs.cend()) { - back_abundance_freqs[node.back] = {1, offset}; - } else { - (*it_back).second.first += 1; - (*it_back).second.second = -1; - } - ++offset; - } - - // std::cout << "front_abundance_freqs.size() = " << front_abundance_freqs.size() << - // std::endl; std::cout << "back_abundance_freqs.size() = " << back_abundance_freqs.size() - // << std::endl; - - // for (auto const& p : back_abundance_freqs) { - // std::cout << "back-ab:" << p.first << " - freq:" << p.second.first << std::endl; - // } - // for (auto const& p : front_abundance_freqs) { - // std::cout << "front-ab:" << p.first << " - freq:" << p.second.first << std::endl; - // } - - for (auto const& p : back_abundance_freqs) { - /* if there is only one occurrence of this back abundance - and it does not appear in any front abundance, then it cannot be matched */ - if (p.second.first == 1) { - auto it = front_abundance_freqs.find(p.first); - if (it == front_abundance_freqs.cend()) { R_hi += 1; } - // else { - // if ((*it).second.first == 1 and p.second.second == (*it).second.second) { - // // uint64_t offset = p.second.second; - // // auto node = data.nodes[offset]; - // // std::cout << "node " << node.id << ":[" << node.front << "," << - // // node.back - // // << "]" << std::endl; - // R_hi += 1; - // } - // } - } - // else { - // auto it = front_abundance_freqs.find(p.first); - // if (it != front_abundance_freqs.cend()) { - // // abundance is present both as back and front - - // // and appears an even number of times, will originate a walk - // // with both sides unmatchable - // if ((*it).second.first % 2 == 0 and p.second.first % 2 == 0) { R_hi += 1; } - // } - // } - } - - // for (auto const& p : front_abundance_freqs) { - // if (p.second.first == 1 and - // back_abundance_freqs.find(p.first) == back_abundance_freqs.cend()) { - // R_hi += 1; - // } - // } - - /* - note that we do NOT do the same for front_abundance_freqs, - namely the following piece of code: - - for (auto const& p : front_abundance_freqs) { - if (p.second == 1 and - back_abundance_freqs.find(p.first) == back_abundance_freqs.cend()) { - R_hi += 1; - } - } - - because otherwise we will end up in potentially counting twice the number of cuts. - */ - // } - std::cout << "R_lo = " << R_lo << std::endl; - std::cout << "R_hi = " << R_hi << std::endl; - } - { /* compute cover */ cover c(data.num_sequences, data.num_runs_abundances); From 223b38ab2450fd025700806db0e696694eff9f11 Mon Sep 17 00:00:00 2001 From: jermp Date: Wed, 30 Mar 2022 12:06:53 +0200 Subject: [PATCH 37/40] minor --- src/permute.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/permute.cpp b/src/permute.cpp index 3a84f71..1a2ebdf 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -205,8 +205,8 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t num_end_points = 0; for (auto ab : distinct_abundances_vec) { - int64_t freq_front = 0; - int64_t freq_back = 0; + uint64_t freq_front = 0; + uint64_t freq_back = 0; auto it_front = front_abundance_freqs.find(ab); auto it_back = back_abundance_freqs.find(ab); if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second; } @@ -226,15 +226,15 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& // << " - freq_back:" << freq_back << " - indegree:" << in_count // << " - outdegree:" << out_count << std::endl; - /* special case: abundance will appear as singletion, so count twice */ + /* special case: abundance will appear as singleton, so count twice */ if (in_count == 0 and out_count == 0) { num_end_points += 2; continue; } /* if the excess of frequency is odd, then the abundance will appear as end-point */ - uint64_t diff = std::abs(freq_front - freq_back); - if (diff % 2 == 1) num_end_points += 1; + uint64_t sum = freq_front + freq_back; + if (sum % 2 == 1) num_end_points += 1; } uint64_t num_distinct_abundances = distinct_abundances.size(); From 0ccbe1aca89103fa3275cd350255d311684f1d11 Mon Sep 17 00:00:00 2001 From: jermp Date: Thu, 31 Mar 2022 10:26:53 +0200 Subject: [PATCH 38/40] simplified computation of lower bound --- include/builder/cover.hpp | 6 +-- src/permute.cpp | 102 ++++++++++++-------------------------- 2 files changed, 34 insertions(+), 74 deletions(-) diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index 3dc681d..abe84f3 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -357,11 +357,7 @@ struct cover { int r = rounds.size() - 1; const auto& walks = rounds[r]; uint64_t num_sequences = 0; - for (uint64_t w = 0; w != walks.size(); ++w) { - assert(walks[w].size() == 1); - assert(walks[w].front().sign == true); - num_sequences += visit(w, r, true, out); - } + for (uint64_t w = 0; w != walks.size(); ++w) num_sequences += visit(w, r, true, out); if (num_sequences != m_num_sequences) { std::cerr << "Error: expected to write " << m_num_sequences << " but written " << num_sequences << std::endl; diff --git a/src/permute.cpp b/src/permute.cpp index 1a2ebdf..44904b2 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -151,101 +151,65 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& /* computation of lower bound to the number of final abundance runs */ { - /* (front_abundance, num_seqs_with_front=front_abundance) */ - std::unordered_map front_abundance_freqs; - - /* (back_abundance, num_seqs_with_back=back_abundance) */ - std::unordered_map back_abundance_freqs; - - /* (front_abundance, num_edges_outgoing_from) */ - std::unordered_map outdegree; + struct counts { + uint32_t freq; + uint32_t indegree; + uint32_t outdegree; + }; - /* (back_abundance, num_edges_incoming_in) */ - std::unordered_map indegree; + std::unordered_map endpoint_counts; for (auto const& node : data.nodes) { uint64_t front = node.front; uint64_t back = node.back; - auto it_front = front_abundance_freqs.find(front); - if (it_front == front_abundance_freqs.cend()) { - front_abundance_freqs[front] = 1; - } else { - (*it_front).second += 1; + { + auto it = endpoint_counts.find(front); + if (it == endpoint_counts.cend()) { + endpoint_counts[front] = {1, 0, 0}; + } else { + (*it).second.freq += 1; + } } - auto it_back = back_abundance_freqs.find(back); - if (it_back == back_abundance_freqs.cend()) { - back_abundance_freqs[back] = 1; - } else { - (*it_back).second += 1; + { + auto it = endpoint_counts.find(back); + if (it == endpoint_counts.cend()) { + endpoint_counts[back] = {1, 0, 0}; + } else { + (*it).second.freq += 1; + } } if (front != back) { - { - auto it = indegree.find(back); - if (it != indegree.cend()) { - (*it).second += 1; - } else { - indegree[back] = 1; - } - } - { - auto it = outdegree.find(front); - if (it != outdegree.cend()) { - (*it).second += 1; - } else { - outdegree[front] = 1; - } - } + endpoint_counts[back].indegree += 1; + endpoint_counts[front].outdegree += 1; } } - std::vector distinct_abundances_vec; - distinct_abundances_vec.reserve(distinct_abundances.size()); - for (auto x : distinct_abundances) distinct_abundances_vec.push_back(x); - std::sort(distinct_abundances_vec.begin(), distinct_abundances_vec.end()); - - uint64_t num_end_points = 0; - for (auto ab : distinct_abundances_vec) { - uint64_t freq_front = 0; - uint64_t freq_back = 0; - auto it_front = front_abundance_freqs.find(ab); - auto it_back = back_abundance_freqs.find(ab); - if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second; } - if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; } - - /* abundance only appears for internal kmers */ - if (freq_front == 0 and freq_back == 0) continue; - - uint64_t in_count = 0; - uint64_t out_count = 0; - auto it_in_count = indegree.find(ab); - auto it_out_count = outdegree.find(ab); - if (it_in_count != indegree.cend()) { in_count = (*it_in_count).second; } - if (it_out_count != outdegree.cend()) { out_count = (*it_out_count).second; } - - // std::cout << "ab:" << ab << " - freq_front:" << freq_front - // << " - freq_back:" << freq_back << " - indegree:" << in_count - // << " - outdegree:" << out_count << std::endl; + uint64_t num_endpoints = 0; + for (auto const& p : endpoint_counts) { + uint64_t freq = p.second.freq; + uint64_t indegree = p.second.indegree; + uint64_t outdegree = p.second.outdegree; /* special case: abundance will appear as singleton, so count twice */ - if (in_count == 0 and out_count == 0) { - num_end_points += 2; + if (indegree == 0 and outdegree == 0) { + num_endpoints += 2; continue; } /* if the excess of frequency is odd, then the abundance will appear as end-point */ - uint64_t sum = freq_front + freq_back; - if (sum % 2 == 1) num_end_points += 1; + if (freq % 2 == 1) num_endpoints += 1; } uint64_t num_distinct_abundances = distinct_abundances.size(); uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size(); - std::cout << "num_end_points = " << num_end_points / 2 << std::endl; + uint64_t num_walks = num_endpoints / 2; + std::cout << "(estimated) num_walks = " << num_walks << std::endl; std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl; std::cout << "num_runs_abundances_internal = " << num_runs_abundances_internal << std::endl; uint64_t R_lo = std::max(num_distinct_abundances, num_runs_abundances_internal + 1); - uint64_t R_hi = num_runs_abundances_internal + num_end_points / 2; + uint64_t R_hi = num_runs_abundances_internal + num_walks; std::cout << "R_lo = " << R_lo << std::endl; std::cout << "R_hi = " << R_hi << std::endl; } From 350ee9215c4b4098135e4dea6c4a7b912cf0fd68 Mon Sep 17 00:00:00 2001 From: jermp Date: Sat, 2 Apr 2022 15:12:38 +0200 Subject: [PATCH 39/40] some refactoring --- external/pthash | 2 +- include/abundances.hpp | 109 +++----------------------------------- include/builder/build.cpp | 81 ++++------------------------ include/builder/cover.hpp | 13 ++++- include/ef_sequence.hpp | 9 ++-- include/util.hpp | 6 +-- src/build.cpp | 3 -- src/permute.cpp | 8 +-- 8 files changed, 37 insertions(+), 194 deletions(-) diff --git a/external/pthash b/external/pthash index f0a0fb9..3f1deb0 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit f0a0fb929ecfe1d925d88bc9bf41f7b4eaf0516b +Subproject commit 3f1deb0e1ec132274af8ef55478915209f608b2c diff --git a/include/abundances.hpp b/include/abundances.hpp index 7a5e555..4fd4536 100644 --- a/include/abundances.hpp +++ b/include/abundances.hpp @@ -1,20 +1,17 @@ #pragma once #include -#include // count the distinct abundances +#include // count the distinct abundances with freq information + #include "ef_sequence.hpp" namespace sshash { struct abundances { struct builder { - builder() : m_most_frequent_abundance(0) {} + builder() {} - void init(uint64_t most_frequent_abundance) { - m_most_frequent_abundance = most_frequent_abundance; - m_kmer_id_interval_lengths.push_back(0); - m_abundance_interval_lengths.push_back(0); - } + void init() { m_abundance_interval_lengths.push_back(0); } void eat(uint64_t abundance) { assert(abundance > 0); @@ -26,28 +23,17 @@ struct abundances { } } - void push_kmer_id_interval(uint64_t value, uint64_t length) { - m_kmer_id_interval_values.push_back(value); - m_kmer_id_interval_lengths.push_back(m_kmer_id_interval_lengths.back() + length); - } - void push_abundance_interval(uint64_t value, uint64_t length) { m_abundance_interval_values.push_back(value); m_abundance_interval_lengths.push_back(m_abundance_interval_lengths.back() + length); } - uint64_t num_kmer_id_intervals() const { return m_kmer_id_interval_values.size(); } uint64_t num_abundance_intervals() const { return m_abundance_interval_values.size(); } void finalize(uint64_t num_kmers) { - assert( - std::is_sorted(m_kmer_id_interval_values.begin(), m_kmer_id_interval_values.end())); - assert(std::is_sorted(m_kmer_id_interval_lengths.begin(), - m_kmer_id_interval_lengths.end())); assert(std::is_sorted(m_abundance_interval_lengths.begin(), m_abundance_interval_lengths.end())); - std::cout << "num_kmer_id_intervals " << num_kmer_id_intervals() << std::endl; std::cout << "num_abundance_intervals " << num_abundance_intervals() << std::endl; uint64_t num_distinct_abundances = m_abundances_map.size(); @@ -80,27 +66,10 @@ struct abundances { return x.first < y.first; }); - if (m_kmer_id_interval_values.size() != 0) { // optimize_mfa - /* If this test fails, then we need to change the value of - constants::most_frequent_abundance. */ - if (m_most_frequent_abundance != m_abundances.front().first) { - throw std::runtime_error("the most frequent abundance is not " + - std::to_string(constants::most_frequent_abundance) + - " but " + std::to_string(m_abundances.front().first)); - } - } - uint64_t rest = num_kmers - m_abundances.front().second; std::cout << "kmers that do not have the most frequent ab: " << rest << " (" << (rest * 100.0) / num_kmers << "%)" << std::endl; - if (m_kmer_id_interval_values.size() != 0) { // optimize_mfa - std::cout << "cumulative_kmer_id_interval_lengths " - << m_kmer_id_interval_lengths.back() << '/' << rest << std::endl; - std::cout << "cumulative_abundance_interval_lengths " - << m_abundance_interval_lengths.back() << '/' << rest << std::endl; - } - m_abundance_dictionary_builder.resize(num_distinct_abundances, std::ceil(std::log2(largest_ab + 1))); for (uint64_t id = 0; id != num_distinct_abundances; ++id) { @@ -111,13 +80,6 @@ struct abundances { } void build(abundances& index) { - std::swap(index.m_most_frequent_abundance, m_most_frequent_abundance); - - index.m_kmer_id_interval_values.encode(m_kmer_id_interval_values.begin(), - m_kmer_id_interval_values.size()); - index.m_kmer_id_interval_lengths.encode(m_kmer_id_interval_lengths.begin(), - m_kmer_id_interval_lengths.size()); - uint64_t num_distinct_abundances = m_abundance_dictionary_builder.size(); pthash::compact_vector::builder abundance_interval_values; abundance_interval_values.resize( @@ -147,7 +109,7 @@ struct abundances { m_abundance_dictionary_builder.build(index.m_abundance_dictionary); } - // return the average empirical entropy per abundance + /* return the average empirical entropy per abundance */ double print_info(uint64_t num_kmers) { assert(!m_abundances.empty()); double expected_ab_value = 0.0; @@ -169,15 +131,10 @@ struct abundances { } private: - uint64_t m_most_frequent_abundance; - /* (abundance,frequency) pairs during construction, then (abundance,id) after sorting */ std::unordered_map m_abundances_map; std::vector> m_abundances; // (abundance,frequency) - std::vector m_kmer_id_interval_values; - std::vector m_kmer_id_interval_lengths; - std::vector m_abundance_interval_values; std::vector m_abundance_interval_lengths; @@ -187,48 +144,18 @@ struct abundances { bool empty() const { return m_abundance_dictionary.size() == 0; } uint64_t abundance(uint64_t kmer_id) const { - uint64_t rank = kmer_id; - - if (m_kmer_id_interval_values.size() != 0) { // optimize_mfa - bool is_present = false; - auto [pos, val, prev] = m_kmer_id_interval_values.next_geq_neighbourhood(kmer_id); - if (val == kmer_id) { - is_present = true; - rank = m_kmer_id_interval_lengths.access(pos); - } else { - if (pos > 0) { - rank = m_kmer_id_interval_lengths.access(pos - 1); - uint64_t length = m_kmer_id_interval_lengths.access(pos) - rank; - if (kmer_id < prev + length) { - is_present = true; - assert(kmer_id >= prev); - rank += kmer_id - prev; - } - } - } - if (!is_present) return m_most_frequent_abundance; - } - - uint64_t i = m_abundance_interval_lengths.prev_leq(rank); + uint64_t i = m_abundance_interval_lengths.prev_leq(kmer_id); uint64_t id = m_abundance_interval_values.access(i); uint64_t abundance = m_abundance_dictionary.access(id); - return abundance; } uint64_t num_bits() const { - return sizeof(m_most_frequent_abundance) * 8 + m_kmer_id_interval_values.num_bits() + - m_kmer_id_interval_lengths.num_bits() + m_abundance_interval_values.bytes() * 8 + - m_abundance_interval_lengths.num_bits() + m_abundance_dictionary.bytes() * 8; + return m_abundance_interval_values.bytes() * 8 + m_abundance_interval_lengths.num_bits() + + m_abundance_dictionary.bytes() * 8; } void print_space_breakdown(uint64_t num_kmers) const { - std::cout << " kmer_id_interval_values: " - << static_cast(m_kmer_id_interval_values.num_bits()) / num_kmers - << " [bits/kmer]\n"; - std::cout << " kmer_id_interval_lengths: " - << static_cast(m_kmer_id_interval_lengths.num_bits()) / num_kmers - << " [bits/kmer]\n"; std::cout << " abundance_interval_values: " << static_cast(m_abundance_interval_values.bytes() * 8) / num_kmers << " [bits/kmer]\n"; @@ -242,34 +169,14 @@ struct abundances { template void visit(Visitor& visitor) { - visitor.visit(m_most_frequent_abundance); - visitor.visit(m_kmer_id_interval_values); - visitor.visit(m_kmer_id_interval_lengths); visitor.visit(m_abundance_interval_values); visitor.visit(m_abundance_interval_lengths); visitor.visit(m_abundance_dictionary); } private: - uint64_t m_most_frequent_abundance; - - /***** - We model abundances as two lists of intervals. - Each interval is a pair (value,length). - - First list: kmer_ids list. In this case, a pair (value,length) - represents all kmer_ids = value, value+1, value+2, ..., value+length-1. - - Second list: abundance list. In this case, a pair (value,length) - represents that the abundance [value] repeats for [length] times. - */ - - ef_sequence m_kmer_id_interval_values; - ef_sequence m_kmer_id_interval_lengths; - pthash::compact_vector m_abundance_interval_values; ef_sequence m_abundance_interval_lengths; - /***/ - - /* Abundance dictionary, listing all distinct abundances sorted by decreasing frequency. */ pthash::compact_vector m_abundance_dictionary; }; diff --git a/include/builder/build.cpp b/include/builder/build.cpp index 3abb581..4931b06 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -80,30 +80,11 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b uint64_t seq_len = 0; uint64_t sum_of_abundances = 0; - data.abundances_builder.init(constants::most_frequent_abundance); - - /* intervals of kmer_ids */ - uint64_t kmer_id_value = constants::invalid; - uint64_t kmer_id_length = 1; + data.abundances_builder.init(); /* intervals of abundances */ uint64_t ab_value = constants::invalid; - uint64_t ab_length = 1; - - uint64_t num_sequences_diff_abs = 0; // num sequences whose kmers have different abundances - uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa - - auto process_abundance = [&](uint64_t ab) { - if (ab == ab_value) { - ab_length += 1; - } else { - if (ab_value != constants::invalid) { - data.abundances_builder.push_abundance_interval(ab_value, ab_length); - } - ab_value = ab; - ab_length = 1; - } - }; + uint64_t ab_length = 0; auto parse_header = [&]() { if (sequence.empty()) return; @@ -141,53 +122,23 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b expect(sequence[i + 4], ':'); i += 5; - kmer_id_value = constants::invalid; - kmer_id_length = 1; - - bool kmers_have_all_mfa = true; - bool kmers_have_different_abundances = false; - - for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid; - j != seq_len - k + 1; ++j, ++num_kmers) { + for (uint64_t j = 0; j != seq_len - k + 1; ++j) { uint64_t ab = std::strtoull(sequence.data() + i, nullptr, 10); i = sequence.find_first_of(' ', i) + 1; - if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false; - if (j > 0) { - if (ab != prev_ab) { kmers_have_different_abundances = true; } - } - prev_ab = ab; - data.abundances_builder.eat(ab); sum_of_abundances += ab; - if (build_config.optimize_mfa) { - if (ab != constants::most_frequent_abundance) { - if (kmer_id_value == constants::invalid) { - kmer_id_value = num_kmers; - kmer_id_length = 1; - } else { - if (num_kmers == kmer_id_value + kmer_id_length) kmer_id_length += 1; - } - process_abundance(ab); - } else { - if (kmer_id_value != constants::invalid) { - data.abundances_builder.push_kmer_id_interval(kmer_id_value, - kmer_id_length); - } - kmer_id_value = constants::invalid; - } + if (ab == ab_value) { + ab_length += 1; } else { - process_abundance(ab); + if (ab_value != constants::invalid) { + data.abundances_builder.push_abundance_interval(ab_value, ab_length); + } + ab_value = ab; + ab_length = 1; } } - - if (kmer_id_value != constants::invalid) { - data.abundances_builder.push_kmer_id_interval(kmer_id_value, kmer_id_length); - } - - num_sequences_diff_abs += kmers_have_different_abundances; - num_sequences_all_mfa += kmers_have_all_mfa; }; while (!is.eof()) { @@ -255,18 +206,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b if (build_config.store_abundances) { std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; - std::cout << "num_sequences whose kmers have different abundances: " - << num_sequences_diff_abs << "/" << num_sequences << " (" - << (num_sequences_diff_abs * 100.0) / num_sequences << "%)" << std::endl; - std::cout << "num_sequences whose kmers all have the same abundance != mfa: " - << (num_sequences - num_sequences_diff_abs) << "/" << num_sequences << " (" - << ((num_sequences - num_sequences_diff_abs) * 100.0) / num_sequences << "%)" - << std::endl; - std::cout << "num_sequences whose kmers all have the same abundance == mfa: " - << num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs) - << " (" - << (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs) - << "%)" << std::endl; data.abundances_builder.push_abundance_interval(ab_value, ab_length); data.abundances_builder.finalize(data.num_kmers); } diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index abe84f3..9296e0f 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -8,6 +8,10 @@ namespace sshash { +namespace constants { +constexpr uint64_t most_frequent_abundance = 1; // usual value +} + struct node { // We assume there are less than 2^32 sequences and that // the largest abundance fits into a 32-bit uint. @@ -59,13 +63,20 @@ struct cover { optimize for the most frequent case: push nodes of the form v:[mfa,mfa] to the bottom, and remove them forming a single (usually, long) walk. - Warning: here we are assuming the mfa is also the *smallest* abundance. */ essentials::timer_type timer; timer.start(); std::sort(nodes.begin(), nodes.end(), [](auto const& x, auto const& y) { + if (x.front == constants::most_frequent_abundance and + x.back == constants::most_frequent_abundance) { + return false; + } + if (y.front == constants::most_frequent_abundance and + y.back == constants::most_frequent_abundance) { + return true; + } if (x.front != y.front) return x.front > y.front; if (x.back != y.back) return x.back > y.back; return x.id > y.id; diff --git a/include/ef_sequence.hpp b/include/ef_sequence.hpp index 3d1ee93..6fd17ae 100644 --- a/include/ef_sequence.hpp +++ b/include/ef_sequence.hpp @@ -8,13 +8,13 @@ namespace sshash { template struct ef_sequence { - ef_sequence() {} + ef_sequence() : m_universe(0) {} template void encode(Iterator begin, uint64_t n) { if (n == 0) return; uint64_t u = *(begin + n - 1); - m_universe = u; // universe is last element, "back" + m_universe = u; uint64_t l = uint64_t((n && u / n) ? pthash::util::msb(u / n) : 0); pthash::bit_vector_builder bvb_high_bits(n + (u >> l) + 1); @@ -37,8 +37,8 @@ struct ef_sequence { pthash::bit_vector(&bvb_high_bits).swap(m_high_bits); cv_builder_low_bits.build(m_low_bits); - pthash::darray1(m_high_bits).swap(m_high_bits_d1); - if constexpr (index_zeros) { pthash::darray0(m_high_bits).swap(m_high_bits_d0); } + m_high_bits_d1.build(m_high_bits); + if constexpr (index_zeros) m_high_bits_d0.build(m_high_bits); } struct iterator { @@ -55,7 +55,6 @@ struct ef_sequence { } bool good() const { return m_ef != nullptr; } - bool has_next() const { return m_pos < m_ef->size(); } uint64_t next() { diff --git a/include/util.hpp b/include/util.hpp index 0727dac..6403374 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -17,7 +17,6 @@ constexpr uint64_t hashcode_bits = 64; constexpr double c = 3.0; // for PThash constexpr uint64_t min_l = 6; constexpr uint64_t max_l = 12; -constexpr uint64_t most_frequent_abundance = 1; // usual value static const std::string default_tmp_dirname("."); } // namespace constants @@ -68,7 +67,6 @@ struct build_configuration { , canonical_parsing(false) , store_abundances(false) - , optimize_mfa(false) , verbose(true) {} uint64_t k; // kmer size @@ -80,15 +78,13 @@ struct build_configuration { bool canonical_parsing; bool store_abundances; - bool optimize_mfa; // fma: "most frequent abundance" bool verbose; void print() const { std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l << ", c = " << c << ", canonical_parsing = " << (canonical_parsing ? "true" : "false") - << ", store_abundances = " << (store_abundances ? "true" : "false") - << ", optimize_mfa = " << (optimize_mfa ? "true" : "false") << std::endl; + << ", store_abundances = " << (store_abundances ? "true" : "false") << std::endl; } }; diff --git a/src/build.cpp b/src/build.cpp index 40adc6e..1477c9a 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -43,8 +43,6 @@ int main(int argc, char** argv) { "--canonical-parsing", true); parser.add("store_abundances", "Also store the abundances in compressed format.", "--abundances", true); - parser.add("optimize_mfa", "Turn on space optimization for most frequent abundance (mfa).", - "--optimize-mfa", true); parser.add("output_filename", "Output file name where the data structure will be serialized.", "-o", false); parser.add("check", "Check correctness after construction.", "--check", true); @@ -68,7 +66,6 @@ int main(int argc, char** argv) { if (parser.parsed("c")) build_config.c = parser.get("c"); build_config.canonical_parsing = parser.get("canonical_parsing"); build_config.store_abundances = parser.get("store_abundances"); - build_config.optimize_mfa = parser.get("optimize_mfa"); build_config.verbose = parser.get("verbose"); build_config.print(); diff --git a/src/permute.cpp b/src/permute.cpp index 44904b2..6844e5c 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -15,12 +15,6 @@ struct permute_data { std::vector nodes; }; -struct pair_hash { - inline uint64_t operator()(std::pair const& v) const { - return static_cast(v.first) * 31 + static_cast(v.second); - } -}; - void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) { std::string sequence; uint64_t k = build_config.k; @@ -288,7 +282,7 @@ void reverse_header(std::string const& input, std::string& output, uint64_t k) { } std::reverse(abundances.begin(), abundances.end()); - for (auto abundance : abundances) { output.append(std::to_string(abundance) + " "); } + for (auto abundance : abundances) output.append(std::to_string(abundance) + " "); } void permute_and_write(std::istream& is, std::string const& output_filename, From 2f3672c8caef46cdfc499849177ae6643318d007 Mon Sep 17 00:00:00 2001 From: jermp Date: Sat, 2 Apr 2022 15:44:17 +0200 Subject: [PATCH 40/40] put Example 4 in the README --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index f756267..7f34707 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,28 @@ We see that the canonical dictionary is twice as fast as the regular dictionary for low-hit workloads, even on this tiny example, for only +0.4 bits/k-mer. +### Example 4 + + ./permute ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 -o ecoli_sakai.permuted.fa + +This command re-orders (and possibly reverse-complement) the strings in the collection as to *minimize* the number of runs in the abundances and, hence, optimize the encoding of the abundances. +The result is saved to the file `ecoli_sakai.permuted.fa`. + +In this example for the E.Coli collection (Sakai strain) we reduce the number of runs in the abundances from 5820 to 3723. + +Then use the `build` command as usual to build the permuted collection: + + ./build ecoli_sakai.permuted.fa 31 13 --abundances --verbose + +The index built on the permuted collection +optimizes the storage space for the abundances which results in a 15.1X better space than the empirical entropy of the abundances. + +For reference, the index built on the original collection: + + ./build ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 13 --abundances --verbose + +already achieves a 12.4X better space than the empirical entropy. + Input Files -----------