diff --git a/CMakeLists.txt b/CMakeLists.txt index c54ca24..820c2ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,3 +42,6 @@ target_link_libraries(check z) add_executable(query src/query.cpp) target_link_libraries(query z) + +add_executable(permute src/permute.cpp) +target_link_libraries(permute z) \ No newline at end of file diff --git a/README.md b/README.md index f756267..7f34707 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,28 @@ We see that the canonical dictionary is twice as fast as the regular dictionary for low-hit workloads, even on this tiny example, for only +0.4 bits/k-mer. +### Example 4 + + ./permute ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 -o ecoli_sakai.permuted.fa + +This command re-orders (and possibly reverse-complement) the strings in the collection as to *minimize* the number of runs in the abundances and, hence, optimize the encoding of the abundances. +The result is saved to the file `ecoli_sakai.permuted.fa`. + +In this example for the E.Coli collection (Sakai strain) we reduce the number of runs in the abundances from 5820 to 3723. + +Then use the `build` command as usual to build the permuted collection: + + ./build ecoli_sakai.permuted.fa 31 13 --abundances --verbose + +The index built on the permuted collection +optimizes the storage space for the abundances which results in a 15.1X better space than the empirical entropy of the abundances. + +For reference, the index built on the original collection: + + ./build ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 13 --abundances --verbose + +already achieves a 12.4X better space than the empirical entropy. + Input Files ----------- diff --git a/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz index d5c697e..7607fd9 100644 Binary files a/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz and b/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz differ diff --git a/external/pthash b/external/pthash index f0a0fb9..3f1deb0 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit f0a0fb929ecfe1d925d88bc9bf41f7b4eaf0516b +Subproject commit 3f1deb0e1ec132274af8ef55478915209f608b2c diff --git a/include/abundances.hpp b/include/abundances.hpp index f0d3684..4fd4536 100644 --- a/include/abundances.hpp +++ b/include/abundances.hpp @@ -1,20 +1,17 @@ #pragma once #include -#include // count the distinct abundances +#include // count the distinct abundances with freq information + #include "ef_sequence.hpp" namespace sshash { struct abundances { struct builder { - builder() : m_most_frequent_abundance(0) {} + builder() {} - void init(uint64_t most_frequent_abundance) { - m_most_frequent_abundance = most_frequent_abundance; - m_kmer_id_interval_lengths.push_back(0); - m_abundance_interval_lengths.push_back(0); - } + void init() { m_abundance_interval_lengths.push_back(0); } void eat(uint64_t abundance) { assert(abundance > 0); @@ -26,28 +23,17 @@ struct abundances { } } - void push_kmer_id_interval(uint64_t value, uint64_t length) { - m_kmer_id_interval_values.push_back(value); - m_kmer_id_interval_lengths.push_back(m_kmer_id_interval_lengths.back() + length); - } - void push_abundance_interval(uint64_t value, uint64_t length) { m_abundance_interval_values.push_back(value); m_abundance_interval_lengths.push_back(m_abundance_interval_lengths.back() + length); } - uint64_t num_kmer_id_intervals() const { return m_kmer_id_interval_values.size(); } uint64_t num_abundance_intervals() const { return m_abundance_interval_values.size(); } void finalize(uint64_t num_kmers) { - assert( - std::is_sorted(m_kmer_id_interval_values.begin(), m_kmer_id_interval_values.end())); - assert(std::is_sorted(m_kmer_id_interval_lengths.begin(), - m_kmer_id_interval_lengths.end())); assert(std::is_sorted(m_abundance_interval_lengths.begin(), m_abundance_interval_lengths.end())); - std::cout << "num_kmer_id_intervals " << num_kmer_id_intervals() << std::endl; std::cout << "num_abundance_intervals " << num_abundance_intervals() << std::endl; uint64_t num_distinct_abundances = m_abundances_map.size(); @@ -80,23 +66,9 @@ struct abundances { return x.first < y.first; }); - /* If this test fails, then we need to change the value of - constants::most_frequent_abundance. */ - if (m_most_frequent_abundance != m_abundances.front().first) { - throw std::runtime_error("the most frequent abundance is not " + - std::to_string(constants::most_frequent_abundance) + - " but " + std::to_string(m_abundances.front().first)); - } - uint64_t rest = num_kmers - m_abundances.front().second; std::cout << "kmers that do not have the most frequent ab: " << rest << " (" << (rest * 100.0) / num_kmers << "%)" << std::endl; - if (m_kmer_id_interval_values.size() != 0) { // optimize_mfa - std::cout << "cumulative_kmer_id_interval_lengths " - << m_kmer_id_interval_lengths.back() << '/' << rest << std::endl; - std::cout << "cumulative_abundance_interval_lengths " - << m_abundance_interval_lengths.back() << '/' << rest << std::endl; - } m_abundance_dictionary_builder.resize(num_distinct_abundances, std::ceil(std::log2(largest_ab + 1))); @@ -108,13 +80,6 @@ struct abundances { } void build(abundances& index) { - std::swap(index.m_most_frequent_abundance, m_most_frequent_abundance); - - index.m_kmer_id_interval_values.encode(m_kmer_id_interval_values.begin(), - m_kmer_id_interval_values.size()); - index.m_kmer_id_interval_lengths.encode(m_kmer_id_interval_lengths.begin(), - m_kmer_id_interval_lengths.size()); - uint64_t num_distinct_abundances = m_abundance_dictionary_builder.size(); pthash::compact_vector::builder abundance_interval_values; abundance_interval_values.resize( @@ -144,7 +109,7 @@ struct abundances { m_abundance_dictionary_builder.build(index.m_abundance_dictionary); } - // return the average empirical entropy per abundance + /* return the average empirical entropy per abundance */ double print_info(uint64_t num_kmers) { assert(!m_abundances.empty()); double expected_ab_value = 0.0; @@ -166,15 +131,10 @@ struct abundances { } private: - uint64_t m_most_frequent_abundance; - /* (abundance,frequency) pairs during construction, then (abundance,id) after sorting */ std::unordered_map m_abundances_map; std::vector> m_abundances; // (abundance,frequency) - std::vector m_kmer_id_interval_values; - std::vector m_kmer_id_interval_lengths; - std::vector m_abundance_interval_values; std::vector m_abundance_interval_lengths; @@ -184,48 +144,18 @@ struct abundances { bool empty() const { return m_abundance_dictionary.size() == 0; } uint64_t abundance(uint64_t kmer_id) const { - uint64_t rank = kmer_id; - - if (m_kmer_id_interval_values.size() != 0) { // optimize_mfa - bool is_present = false; - auto [pos, val, prev] = m_kmer_id_interval_values.next_geq_neighbourhood(kmer_id); - if (val == kmer_id) { - is_present = true; - rank = m_kmer_id_interval_lengths.access(pos); - } else { - if (pos > 0) { - rank = m_kmer_id_interval_lengths.access(pos - 1); - uint64_t length = m_kmer_id_interval_lengths.access(pos) - rank; - if (kmer_id < prev + length) { - is_present = true; - assert(kmer_id >= prev); - rank += kmer_id - prev; - } - } - } - if (!is_present) return m_most_frequent_abundance; - } - - uint64_t i = m_abundance_interval_lengths.prev_leq(rank); + uint64_t i = m_abundance_interval_lengths.prev_leq(kmer_id); uint64_t id = m_abundance_interval_values.access(i); uint64_t abundance = m_abundance_dictionary.access(id); - return abundance; } uint64_t num_bits() const { - return sizeof(m_most_frequent_abundance) * 8 + m_kmer_id_interval_values.num_bits() + - m_kmer_id_interval_lengths.num_bits() + m_abundance_interval_values.bytes() * 8 + - m_abundance_interval_lengths.num_bits() + m_abundance_dictionary.bytes() * 8; + return m_abundance_interval_values.bytes() * 8 + m_abundance_interval_lengths.num_bits() + + m_abundance_dictionary.bytes() * 8; } void print_space_breakdown(uint64_t num_kmers) const { - std::cout << " kmer_id_interval_values: " - << static_cast(m_kmer_id_interval_values.num_bits()) / num_kmers - << " [bits/kmer]\n"; - std::cout << " kmer_id_interval_lengths: " - << static_cast(m_kmer_id_interval_lengths.num_bits()) / num_kmers - << " [bits/kmer]\n"; std::cout << " abundance_interval_values: " << static_cast(m_abundance_interval_values.bytes() * 8) / num_kmers << " [bits/kmer]\n"; @@ -239,34 +169,14 @@ struct abundances { template void visit(Visitor& visitor) { - visitor.visit(m_most_frequent_abundance); - visitor.visit(m_kmer_id_interval_values); - visitor.visit(m_kmer_id_interval_lengths); visitor.visit(m_abundance_interval_values); visitor.visit(m_abundance_interval_lengths); visitor.visit(m_abundance_dictionary); } private: - uint64_t m_most_frequent_abundance; - - /***** - We model abundances as two lists of intervals. - Each interval is a pair (value,length). - - First list: kmer_ids list. In this case, a pair (value,length) - represents all kmer_ids = value, value+1, value+2, ..., value+length-1. - - Second list: abundance list. In this case, a pair (value,length) - represents that the abundance [value] repeats for [length] times. - */ - - ef_sequence m_kmer_id_interval_values; - ef_sequence m_kmer_id_interval_lengths; - pthash::compact_vector m_abundance_interval_values; ef_sequence m_abundance_interval_lengths; - /***/ - - /* Abundance dictionary, listing all distinct abundances sorted by decreasing frequency. */ pthash::compact_vector m_abundance_dictionary; }; diff --git a/include/builder/build.cpp b/include/builder/build.cpp index 9dab699..4931b06 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -6,21 +6,13 @@ #include "../info.cpp" #include "../../external/pthash/include/pthash.hpp" #include "../../external/pthash/external/essentials/include/essentials.hpp" - -#include "build_util_types.hpp" +#include "util.hpp" #include // for std::partial_sum #include namespace sshash { -void expect(char got, char expected) { - if (got != expected) { - std::cout << "got '" << got << "' but expected '" << expected << "'" << std::endl; - throw parse_runtime_error(); - } -} - struct parse_data { parse_data() : num_kmers(0) {} uint64_t num_kmers; @@ -88,37 +80,18 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b uint64_t seq_len = 0; uint64_t sum_of_abundances = 0; - data.abundances_builder.init(constants::most_frequent_abundance); - - /* intervals of kmer_ids */ - uint64_t kmer_id_value = constants::invalid; - uint64_t kmer_id_length = 1; + data.abundances_builder.init(); /* intervals of abundances */ uint64_t ab_value = constants::invalid; - uint64_t ab_length = 1; - - uint64_t num_sequences_diff_abs = 0; // num sequences whose kmers have different abundances - uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa - - auto process_abundance = [&](uint64_t ab) { - if (ab == ab_value) { - ab_length += 1; - } else { - if (ab_value != constants::invalid) { - data.abundances_builder.push_abundance_interval(ab_value, ab_length); - } - ab_value = ab; - ab_length = 1; - } - }; + uint64_t ab_length = 0; auto parse_header = [&]() { if (sequence.empty()) return; /* Heder format: - >[seq_id] LN:i:[seq_len] ab:Z:[ab_seq] + >[id] LN:i:[seq_len] ab:Z:[ab_seq] where [ab_seq] is a space-separated sequence of integer counters (the abundances), whose length is equal to [seq_len]-k+1 */ @@ -140,8 +113,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b uint64_t j = sequence.find_first_of(' ', i); if (j == std::string::npos) throw parse_runtime_error(); - char* end; - seq_len = std::strtoull(sequence.data() + i, &end, 10); + seq_len = std::strtoull(sequence.data() + i, nullptr, 10); i = j + 1; expect(sequence[i + 0], 'a'); expect(sequence[i + 1], 'b'); @@ -150,53 +122,23 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b expect(sequence[i + 4], ':'); i += 5; - kmer_id_value = constants::invalid; - kmer_id_length = 1; - - bool kmers_have_all_mfa = true; - bool kmers_have_different_abundances = false; - - for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid; - j != seq_len - k + 1; ++j, ++num_kmers) { - uint64_t ab = std::strtoull(sequence.data() + i, &end, 10); + for (uint64_t j = 0; j != seq_len - k + 1; ++j) { + uint64_t ab = std::strtoull(sequence.data() + i, nullptr, 10); i = sequence.find_first_of(' ', i) + 1; - if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false; - if (j > 0) { - if (ab != prev_ab) { kmers_have_different_abundances = true; } - } - prev_ab = ab; - data.abundances_builder.eat(ab); sum_of_abundances += ab; - if (build_config.optimize_mfa) { - if (ab != constants::most_frequent_abundance) { - if (kmer_id_value == constants::invalid) { - kmer_id_value = num_kmers; - kmer_id_length = 1; - } else { - if (num_kmers == kmer_id_value + kmer_id_length) kmer_id_length += 1; - } - process_abundance(ab); - } else { - if (kmer_id_value != constants::invalid) { - data.abundances_builder.push_kmer_id_interval(kmer_id_value, - kmer_id_length); - } - kmer_id_value = constants::invalid; - } + if (ab == ab_value) { + ab_length += 1; } else { - process_abundance(ab); + if (ab_value != constants::invalid) { + data.abundances_builder.push_abundance_interval(ab_value, ab_length); + } + ab_value = ab; + ab_length = 1; } } - - if (kmer_id_value != constants::invalid) { - data.abundances_builder.push_kmer_id_interval(kmer_id_value, kmer_id_length); - } - - num_sequences_diff_abs += kmers_have_different_abundances; - num_sequences_all_mfa += kmers_have_all_mfa; }; while (!is.eof()) { @@ -264,18 +206,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b if (build_config.store_abundances) { std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; - std::cout << "num_sequences whose kmers have different abundances: " - << num_sequences_diff_abs << "/" << num_sequences << " (" - << (num_sequences_diff_abs * 100.0) / num_sequences << "%)" << std::endl; - std::cout << "num_sequences whose kmers all have the same abundance != mfa: " - << (num_sequences - num_sequences_diff_abs) << "/" << num_sequences << " (" - << ((num_sequences - num_sequences_diff_abs) * 100.0) / num_sequences << "%)" - << std::endl; - std::cout << "num_sequences whose kmers all have the same abundance == mfa: " - << num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs) - << " (" - << (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs) - << "%)" << std::endl; data.abundances_builder.push_abundance_interval(ab_value, ab_length); data.abundances_builder.finalize(data.num_kmers); } diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp new file mode 100644 index 0000000..9296e0f --- /dev/null +++ b/include/builder/cover.hpp @@ -0,0 +1,430 @@ +#pragma once + +#include +#include +#include + +#include "../util.hpp" + +namespace sshash { + +namespace constants { +constexpr uint64_t most_frequent_abundance = 1; // usual value +} + +struct node { + // We assume there are less than 2^32 sequences and that + // the largest abundance fits into a 32-bit uint. + node(uint32_t i, uint32_t f, uint32_t b, bool s) : id(i), front(f), back(b), sign(s) {} + uint32_t id, front, back; + bool sign; // '+' --> forward + // '-' --> backward +}; + +struct cover { + typedef std::vector walk_t; + typedef std::vector walks_t; + + cover(uint64_t num_sequences, uint64_t num_runs_abundances) + : m_num_sequences(num_sequences), m_num_runs_abundances(num_runs_abundances) {} + + void compute(std::vector& nodes) { + assert(nodes.size() == m_num_sequences); + + essentials::timer_type timer; + timer.start(); + + /* (abundance, position of a candidate abundance with front = abundance) */ + std::unordered_map abundance_map; + + /* visited[node.id] = true if node has been visited; false otherwise */ + std::vector visited; + + /* set of unvisited nodes */ + std::unordered_set unvisited_nodes; + + /* map from node id to offset+ into nodes */ + std::vector id_to_offset; + + /* nodes created for next round */ + std::vector tmp_nodes; + + walk_t walk; + walks_t walks_in_round; + + unvisited_nodes.reserve(nodes.size()); // at most + walk.reserve(nodes.size()); // at most + walks_in_round.reserve(2 * nodes.size()); // at most + + std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl; + + { + /* + optimize for the most frequent case: + push nodes of the form v:[mfa,mfa] to the bottom, and remove them + forming a single (usually, long) walk. + */ + + essentials::timer_type timer; + timer.start(); + + std::sort(nodes.begin(), nodes.end(), [](auto const& x, auto const& y) { + if (x.front == constants::most_frequent_abundance and + x.back == constants::most_frequent_abundance) { + return false; + } + if (y.front == constants::most_frequent_abundance and + y.back == constants::most_frequent_abundance) { + return true; + } + if (x.front != y.front) return x.front > y.front; + if (x.back != y.back) return x.back > y.back; + return x.id > y.id; + }); + + uint64_t num_special_nodes = 0; + uint64_t num_nodes = nodes.size(); + while (!nodes.empty()) { + auto v = nodes.back(); + if (v.front == constants::most_frequent_abundance and + v.back == constants::most_frequent_abundance) { + walk.push_back(v); + nodes.pop_back(); + num_special_nodes += 1; + } else { + break; + } + } + std::cout << "num nodes of the form v:[" << constants::most_frequent_abundance << "," + << constants::most_frequent_abundance << "] = " << num_special_nodes << "/" + << num_nodes << "(" << (num_special_nodes * 100.0) / num_nodes << "%)" + << std::endl; + + if (num_special_nodes != 0) { + /* create new nodes for next round */ + assert(!walk.empty()); + uint32_t id = walks_in_round.size(); + uint32_t front = walk.front().front; + uint32_t back = walk.back().back; + tmp_nodes.emplace_back(id, front, back, true); + tmp_nodes.emplace_back(id, back, front, false); + walks_in_round.push_back(walk); + walk.clear(); + } + + /* add all the nodes with backward orientation */ + num_nodes = nodes.size(); + for (uint64_t i = 0; i != num_nodes; ++i) { + auto node = nodes[i]; + assert(node.sign == true); + nodes.emplace_back(node.id, node.back, node.front, false); + } + + timer.stop(); + std::cout << " time: " << timer.elapsed() / 1000000 << " [sec]" << std::endl; + } + + while (true) { + std::cout << "round " << rounds.size() << std::endl; + + uint64_t num_nodes = nodes.size(); + std::cout << " num_nodes " << num_nodes << std::endl; + assert(num_nodes % 2 == 0); + + essentials::timer_type round_timer; // total time of round + round_timer.start(); + + if (rounds.size() == 0) { + /* remember: we removed some nodes but the id-space still spans + [0..m_num_sequences-1] */ + id_to_offset.resize(m_num_sequences); + visited.resize(m_num_sequences); + } else { + id_to_offset.resize(num_nodes / 2); + visited.resize(num_nodes / 2); + } + + /* all nodes unvisited */ + std::fill(visited.begin(), visited.end(), false); + + for (auto const& node : nodes) unvisited_nodes.insert(node.id); + assert(unvisited_nodes.size() == num_nodes / 2); + + std::sort(nodes.begin(), nodes.end(), [](auto const& x, auto const& y) { + if (x.front != y.front) return x.front < y.front; + if (x.back != y.back) return x.back < y.back; + return x.id < y.id; + }); + + /* fill abundance_map */ + { + uint64_t prev_front = constants::invalid; + uint64_t offset = 0; + for (auto const& node : nodes) { + if (node.front != prev_front) abundance_map[node.front] = offset; + offset += 1; + prev_front = node.front; + } + assert(offset == nodes.size()); + } + + /* fill id_to_offset map */ + { + uint64_t offset = 0; + for (auto const& node : nodes) { + if (node.sign) id_to_offset[node.id] = offset; + offset += 1; + } + } + + uint64_t i = 0; // position of an unvisited node in nodes + bool no_more_matches_possible = true; // to stop the algorithm + + /* some statistics */ + uint64_t total_nodes_visited_to_find_matches = 0; + uint64_t total_nodes_visited_to_failure = 0; + uint64_t total_nodes_visited_to_success = 0; + uint64_t total_nodes = 0; + + while (!unvisited_nodes.empty()) { + /* 1. take an unvisited node */ + { + uint32_t unvisited_node_id = *(unvisited_nodes.begin()); + i = id_to_offset[unvisited_node_id]; + } + + /* 2. create a new walk */ + walk.clear(); + while (true) { + total_nodes += 1; + + auto node = nodes[i]; + uint64_t id = node.id; + assert(visited[id] == false); + visited[id] = true; + walk.push_back(node); + + /* node has been visited, so erase it from unvisited_nodes */ + if (unvisited_nodes.find(id) != unvisited_nodes.cend()) { + unvisited_nodes.erase(id); + } + + auto search_match = [&](uint64_t back, uint64_t candidate_i) { + bool no_match_found = false; + uint64_t tmp_total_nodes_visited_to_failure = 0; + uint64_t tmp_total_nodes_visited_to_success = 0; + + while (true) { + total_nodes_visited_to_find_matches += 1; + tmp_total_nodes_visited_to_failure += 1; + tmp_total_nodes_visited_to_success += 1; + + if (candidate_i == num_nodes) { + total_nodes_visited_to_failure += + tmp_total_nodes_visited_to_failure; + break; + } + auto candidate = nodes[candidate_i]; + + /* skip */ + if (candidate.id == id) { + candidate_i += 1; + continue; + } + + /* checked all candidate matches */ + if (candidate.front != back) { + no_match_found = true; + total_nodes_visited_to_failure += + tmp_total_nodes_visited_to_failure; + break; + } + + /* match found */ + if (visited[candidate.id] == false) { + total_nodes_visited_to_success += + tmp_total_nodes_visited_to_success; + break; + } + + candidate_i += 1; + } + assert(candidate_i <= num_nodes); + + /* update candidate position in abundance_map */ + abundance_map[back] = candidate_i; + + if (no_match_found or candidate_i == num_nodes) return false; + + /* valid match was found, then visit it next */ + i = candidate_i; + return true; + }; + + /* 3. search for a match */ + + /* first, try to match back abundance */ + uint64_t back = node.back; + uint64_t candidate_i = abundance_map[back]; + bool found = search_match(back, candidate_i); + + /* if a match is not found and the walk is singleton, + then the only node in the walk could be matched on + front abundance */ + if (!found and walk.size() == 1) { + back = node.front; + candidate_i = abundance_map[back]; + found = search_match(back, candidate_i); + if (found) { + /* change orientation of the node */ + walk[0] = {node.id, node.back, node.front, !node.sign}; + } + } + + if (!found) break; + no_more_matches_possible = false; + } + assert(!walk.empty()); + + /* create new nodes for next round */ + uint32_t id = walks_in_round.size(); + uint32_t front = walk.front().front; + uint32_t back = walk.back().back; + tmp_nodes.emplace_back(id, front, back, true); + tmp_nodes.emplace_back(id, back, front, false); + walks_in_round.push_back(walk); + } + assert(unvisited_nodes.empty()); + + if (total_nodes_visited_to_find_matches > total_nodes) { + std::cout << " avg. num. nodes visited to find a match = " + << static_cast(total_nodes_visited_to_find_matches) / total_nodes + << std::endl; + std::cout << " avg. num. nodes visited to success = " + << static_cast(total_nodes_visited_to_success) / total_nodes + << std::endl; + std::cout << " avg. num. nodes visited to failure = " + << static_cast(total_nodes_visited_to_failure) / total_nodes + << std::endl; + } + + std::cout << " num_walks = " << walks_in_round.size() << std::endl; + + { +#ifndef NDEBUG + std::fill(visited.begin(), visited.end(), false); +#endif + uint64_t num_matches_in_round = 0; + for (auto const& walk : walks_in_round) { + num_matches_in_round += walk.size() - 1; +#ifndef NDEBUG + uint64_t prev_back = walk.front().front; + // std::cout << "=>"; + for (auto const& w : walk) { + if (visited[w.id]) std::cout << "ERROR: duplicate node." << std::endl; + if (w.front != prev_back) { + std::cout << "ERROR: path is broken." << std::endl; + } + prev_back = w.back; + visited[w.id] = true; + // std::cout << w.id << ":[" << w.front << "," << w.back << "," + // << (w.sign ? '+' : '-') << "] "; + } + // std::cout << std::endl; +#endif + } + assert(m_num_runs_abundances > num_matches_in_round); + m_num_runs_abundances -= num_matches_in_round; + std::cout << " num_matches = " << num_matches_in_round << std::endl; + std::cout << " num_runs " << m_num_runs_abundances << std::endl; + + round_timer.stop(); + std::cout << " time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl; + } + + rounds.push_back(walks_in_round); + + nodes.swap(tmp_nodes); + tmp_nodes.clear(); + walks_in_round.clear(); + abundance_map.clear(); + + if (no_more_matches_possible) { + std::cout << "STOP: no more matches possible." << std::endl; + break; + } + } + + timer.stop(); + std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl; + std::cout << "final number of runs R = " << m_num_runs_abundances << std::endl; + + assert(m_num_runs_abundances >= 1); + } + + void save(std::string const& filename) { + std::ofstream out(filename.c_str()); + assert(rounds.size() > 0); + int r = rounds.size() - 1; + const auto& walks = rounds[r]; + uint64_t num_sequences = 0; + for (uint64_t w = 0; w != walks.size(); ++w) num_sequences += visit(w, r, true, out); + if (num_sequences != m_num_sequences) { + std::cerr << "Error: expected to write " << m_num_sequences << " but written " + << num_sequences << std::endl; + throw std::runtime_error("wrong number of sequences written"); + } + out.close(); + } + +private: + uint64_t m_num_sequences; + uint64_t m_num_runs_abundances; + std::vector rounds; + + /* visit walk of index w in round of index r */ + uint64_t visit(int w, int r, bool sign, std::ofstream& out) const { + if (r > 0) { + assert(size_t(w) < rounds[r].size()); + auto const& walk = rounds[r][w]; + uint64_t num_sequences = 0; + for (auto const& node : walk) { + /* + + & + = + + - & - = + + + & - = - + - & + = - + */ + bool new_sign = sign and node.sign; + if (sign == false and node.sign == false) new_sign = true; + num_sequences += visit(node.id, r - 1, new_sign, out); + } + return num_sequences; + } + /* print */ + assert(size_t(w) < rounds[0].size()); + auto const& walk = rounds[0][w]; + + if (sign) { + /* print forward */ + for (uint64_t i = 0; i != walk.size(); ++i) { + auto const& node = walk[i]; + // out << node.id << ":[" << node.front << "," << node.back << "] "; + out << node.id; + out << (node.sign ? " 1\n" : " 0\n"); + } + } else { + /* print backward and change orientation */ + for (int64_t i = walk.size() - 1; i >= 0; --i) { + auto const& node = walk[i]; + // out << node.id << ":[" << node.back << "," << node.front << "] "; + out << node.id; + out << (node.sign ? " 0\n" : " 1\n"); + } + } + + return walk.size(); + } +}; + +} // namespace sshash \ No newline at end of file diff --git a/include/builder/build_util_types.hpp b/include/builder/util.hpp similarity index 96% rename from include/builder/build_util_types.hpp rename to include/builder/util.hpp index 41151c9..725c56e 100644 --- a/include/builder/build_util_types.hpp +++ b/include/builder/util.hpp @@ -12,6 +12,13 @@ struct parse_runtime_error : public std::runtime_error { : std::runtime_error("did you provide an input file with abundance counts?") {} }; +void expect(char got, char expected) { + if (got != expected) { + std::cout << "got '" << got << "' but expected '" << expected << "'" << std::endl; + throw parse_runtime_error(); + } +} + struct compact_string_pool { compact_string_pool() {} diff --git a/include/ef_sequence.hpp b/include/ef_sequence.hpp index 3d1ee93..6fd17ae 100644 --- a/include/ef_sequence.hpp +++ b/include/ef_sequence.hpp @@ -8,13 +8,13 @@ namespace sshash { template struct ef_sequence { - ef_sequence() {} + ef_sequence() : m_universe(0) {} template void encode(Iterator begin, uint64_t n) { if (n == 0) return; uint64_t u = *(begin + n - 1); - m_universe = u; // universe is last element, "back" + m_universe = u; uint64_t l = uint64_t((n && u / n) ? pthash::util::msb(u / n) : 0); pthash::bit_vector_builder bvb_high_bits(n + (u >> l) + 1); @@ -37,8 +37,8 @@ struct ef_sequence { pthash::bit_vector(&bvb_high_bits).swap(m_high_bits); cv_builder_low_bits.build(m_low_bits); - pthash::darray1(m_high_bits).swap(m_high_bits_d1); - if constexpr (index_zeros) { pthash::darray0(m_high_bits).swap(m_high_bits_d0); } + m_high_bits_d1.build(m_high_bits); + if constexpr (index_zeros) m_high_bits_d0.build(m_high_bits); } struct iterator { @@ -55,7 +55,6 @@ struct ef_sequence { } bool good() const { return m_ef != nullptr; } - bool has_next() const { return m_pos < m_ef->size(); } uint64_t next() { diff --git a/include/util.hpp b/include/util.hpp index ff01b2f..6403374 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -17,7 +17,7 @@ constexpr uint64_t hashcode_bits = 64; constexpr double c = 3.0; // for PThash constexpr uint64_t min_l = 6; constexpr uint64_t max_l = 12; -constexpr uint64_t most_frequent_abundance = 1; // usual value +static const std::string default_tmp_dirname("."); } // namespace constants typedef pthash::murmurhash2_64 base_hasher_type; @@ -67,7 +67,6 @@ struct build_configuration { , canonical_parsing(false) , store_abundances(false) - , optimize_mfa(false) , verbose(true) {} uint64_t k; // kmer size @@ -79,15 +78,13 @@ struct build_configuration { bool canonical_parsing; bool store_abundances; - bool optimize_mfa; // fma: "most frequent abundance" bool verbose; void print() const { std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l << ", c = " << c << ", canonical_parsing = " << (canonical_parsing ? "true" : "false") - << ", store_abundances = " << (store_abundances ? "true" : "false") - << ", optimize_mfa = " << (optimize_mfa ? "true" : "false") << std::endl; + << ", store_abundances = " << (store_abundances ? "true" : "false") << std::endl; } }; diff --git a/src/build.cpp b/src/build.cpp index 40adc6e..1477c9a 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -43,8 +43,6 @@ int main(int argc, char** argv) { "--canonical-parsing", true); parser.add("store_abundances", "Also store the abundances in compressed format.", "--abundances", true); - parser.add("optimize_mfa", "Turn on space optimization for most frequent abundance (mfa).", - "--optimize-mfa", true); parser.add("output_filename", "Output file name where the data structure will be serialized.", "-o", false); parser.add("check", "Check correctness after construction.", "--check", true); @@ -68,7 +66,6 @@ int main(int argc, char** argv) { if (parser.parsed("c")) build_config.c = parser.get("c"); build_config.canonical_parsing = parser.get("canonical_parsing"); build_config.store_abundances = parser.get("store_abundances"); - build_config.optimize_mfa = parser.get("optimize_mfa"); build_config.verbose = parser.get("verbose"); build_config.print(); diff --git a/src/permute.cpp b/src/permute.cpp new file mode 100644 index 0000000..6844e5c --- /dev/null +++ b/src/permute.cpp @@ -0,0 +1,536 @@ +#include + +#include "../include/gz/zip_stream.cpp" +#include "../external/pthash/external/essentials/include/essentials.hpp" +#include "../external/pthash/external/cmd_line_parser/include/parser.hpp" +#include "../include/builder/cover.hpp" +#include "../include/builder/util.hpp" + +using namespace sshash; + +struct permute_data { + permute_data() : num_runs_abundances(0), num_sequences(0) {} + uint64_t num_runs_abundances; + uint64_t num_sequences; + std::vector nodes; +}; + +void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) { + std::string sequence; + uint64_t k = build_config.k; + uint64_t num_bases = 0; + uint64_t num_kmers = 0; + uint64_t seq_len = 0; + + uint64_t sum_of_abundances = 0; + uint64_t num_sequences_diff_abs = 0; // num sequences whose kmers have different abundances + uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa + + data.num_runs_abundances = 0; + data.num_sequences = 0; + + /* count number of distinct abundances */ + std::unordered_set distinct_abundances; + + auto parse_header = [&]() { + if (sequence.empty()) return; + + /* + Heder format: + >[id] LN:i:[seq_len] ab:Z:[ab_seq] + where [ab_seq] is a space-separated sequence of integer counters (the abundances), + whose length is equal to [seq_len]-k+1 + */ + + // example header: '>12 LN:i:41 ab:Z:2 2 2 2 2 2 2 2 2 2 2' + + expect(sequence[0], '>'); + uint64_t i = 0; + i = sequence.find_first_of(' ', i); + if (i == std::string::npos) throw parse_runtime_error(); + + i += 1; + expect(sequence[i + 0], 'L'); + expect(sequence[i + 1], 'N'); + expect(sequence[i + 2], ':'); + expect(sequence[i + 3], 'i'); + expect(sequence[i + 4], ':'); + i += 5; + uint64_t j = sequence.find_first_of(' ', i); + if (j == std::string::npos) throw parse_runtime_error(); + + seq_len = std::strtoull(sequence.data() + i, nullptr, 10); + i = j + 1; + expect(sequence[i + 0], 'a'); + expect(sequence[i + 1], 'b'); + expect(sequence[i + 2], ':'); + expect(sequence[i + 3], 'Z'); + expect(sequence[i + 4], ':'); + i += 5; + + bool kmers_have_all_mfa = true; + bool kmers_have_different_abundances = false; + uint64_t front = constants::invalid; + uint64_t back = constants::invalid; + + for (uint64_t j = 0, prev_abundance = constants::invalid; j != seq_len - k + 1; ++j) { + uint64_t abundance = std::strtoull(sequence.data() + i, nullptr, 10); + sum_of_abundances += abundance; + i = sequence.find_first_of(' ', i) + 1; + + distinct_abundances.insert(abundance); + + /* set front and back */ + if (j == 0) front = abundance; + if (j == seq_len - k) back = abundance; + + /* accumulate statistics */ + if (abundance != constants::most_frequent_abundance) kmers_have_all_mfa = false; + if (j > 0 and abundance != prev_abundance) kmers_have_different_abundances = true; + + /* count the number of runs */ + if (abundance != prev_abundance) data.num_runs_abundances += 1; + + prev_abundance = abundance; + } + + num_sequences_diff_abs += kmers_have_different_abundances; + num_sequences_all_mfa += kmers_have_all_mfa; + + constexpr bool sign = true; + data.nodes.emplace_back(data.num_sequences, front, back, sign); + }; + + while (!is.eof()) { + std::getline(is, sequence); // header sequence + parse_header(); + + std::getline(is, sequence); // DNA sequence + if (sequence.size() < k) continue; + + if (++data.num_sequences % 100000 == 0) { + std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, " + << num_kmers << " kmers" << std::endl; + } + + num_bases += sequence.size(); + num_kmers += sequence.size() - k + 1; + + if (seq_len != sequence.size()) { + std::cout << "ERROR: expected a sequence of length " << seq_len + << " but got one of length " << sequence.size() << std::endl; + throw std::runtime_error("file is malformed"); + } + } + + assert(data.nodes.size() == data.num_sequences); + assert(data.num_runs_abundances >= data.num_sequences); + + std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, " + << num_kmers << " kmers" << std::endl; + std::cout << "found " << distinct_abundances.size() << " distinct abundances" << std::endl; + std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; + std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs + << "/" << data.num_sequences << " (" + << (num_sequences_diff_abs * 100.0) / data.num_sequences << "%)" << std::endl; + std::cout << "num_sequences whose kmers all have the same abundance != mfa: " + << (data.num_sequences - num_sequences_diff_abs) << "/" << data.num_sequences << " (" + << ((data.num_sequences - num_sequences_diff_abs) * 100.0) / data.num_sequences + << "%)" << std::endl; + std::cout << "num_sequences whose kmers all have the same abundance == mfa: " + << num_sequences_all_mfa << "/" << (data.num_sequences - num_sequences_diff_abs) + << " (" + << (num_sequences_all_mfa * 100.0) / (data.num_sequences - num_sequences_diff_abs) + << "%)" << std::endl; + + /* computation of lower bound to the number of final abundance runs */ + { + struct counts { + uint32_t freq; + uint32_t indegree; + uint32_t outdegree; + }; + + std::unordered_map endpoint_counts; + + for (auto const& node : data.nodes) { + uint64_t front = node.front; + uint64_t back = node.back; + { + auto it = endpoint_counts.find(front); + if (it == endpoint_counts.cend()) { + endpoint_counts[front] = {1, 0, 0}; + } else { + (*it).second.freq += 1; + } + } + { + auto it = endpoint_counts.find(back); + if (it == endpoint_counts.cend()) { + endpoint_counts[back] = {1, 0, 0}; + } else { + (*it).second.freq += 1; + } + } + if (front != back) { + endpoint_counts[back].indegree += 1; + endpoint_counts[front].outdegree += 1; + } + } + + uint64_t num_endpoints = 0; + for (auto const& p : endpoint_counts) { + uint64_t freq = p.second.freq; + uint64_t indegree = p.second.indegree; + uint64_t outdegree = p.second.outdegree; + + /* special case: abundance will appear as singleton, so count twice */ + if (indegree == 0 and outdegree == 0) { + num_endpoints += 2; + continue; + } + + /* if the excess of frequency is odd, then the abundance will appear as end-point */ + if (freq % 2 == 1) num_endpoints += 1; + } + + uint64_t num_distinct_abundances = distinct_abundances.size(); + uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size(); + uint64_t num_walks = num_endpoints / 2; + std::cout << "(estimated) num_walks = " << num_walks << std::endl; + std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl; + std::cout << "num_runs_abundances_internal = " << num_runs_abundances_internal << std::endl; + + uint64_t R_lo = + std::max(num_distinct_abundances, num_runs_abundances_internal + 1); + uint64_t R_hi = num_runs_abundances_internal + num_walks; + std::cout << "R_lo = " << R_lo << std::endl; + std::cout << "R_hi = " << R_hi << std::endl; + } +} + +permute_data parse_file(std::string const& filename, build_configuration const& build_config) { + std::ifstream is(filename.c_str()); + if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); + std::cout << "reading file '" << filename << "'..." << std::endl; + permute_data data; + if (util::ends_with(filename, ".gz")) { + zip_istream zis(is); + parse_file(zis, data, build_config); + } else { + parse_file(is, data, build_config); + } + is.close(); + return data; +} + +struct lines_iterator { + lines_iterator(uint8_t const* begin, uint8_t const* end) : m_begin(begin), m_end(end) { + advance_to_next(); + } + + void advance_to_next() { + m_header = read_line(); + m_sequence = read_line(); + } + + std::string const& header() const { return m_header; } + std::string const& sequence() const { return m_sequence; } + bool has_next() const { return !m_header.empty() and !m_sequence.empty(); } + +private: + uint8_t const* m_begin; + uint8_t const* m_end; + std::string m_header; + std::string m_sequence; + + std::string read_line() { + uint8_t const* begin = m_begin; + while (m_begin != m_end and *m_begin++ != '\n') + ; + if (begin == m_begin) return std::string(""); + return std::string(reinterpret_cast(begin), m_begin - begin - 1); + } +}; + +void reverse_header(std::string const& input, std::string& output, uint64_t k) { + // Example header: + // >2 LN:i:61 ab:Z:4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 + // Expected output: + // >2 LN:i:61 ab:Z:1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 + + /* skip validation of input string */ + + uint64_t i = 0; + i = input.find_first_of(' ', i); + if (i == std::string::npos) throw parse_runtime_error(); + + i += 6; // skip ' LN:i': + uint64_t j = input.find_first_of(' ', i); + if (j == std::string::npos) throw parse_runtime_error(); + + uint64_t seq_len = std::strtoull(input.data() + i, nullptr, 10); + i = j + 6; // skip ' ab:Z:' + output.append(input.data(), input.data() + i); + + std::vector abundances; + abundances.reserve(seq_len - k + 1); + for (uint64_t j = 0; j != seq_len - k + 1; ++j) { + uint64_t abundance = std::strtoull(input.data() + i, nullptr, 10); + abundances.push_back(abundance); + i = input.find_first_of(' ', i) + 1; + } + + std::reverse(abundances.begin(), abundances.end()); + for (auto abundance : abundances) output.append(std::to_string(abundance) + " "); +} + +void permute_and_write(std::istream& is, std::string const& output_filename, + std::string const& tmp_dirname, pthash::compact_vector const& permutation, + pthash::bit_vector const& signs, uint64_t k) { + constexpr uint64_t limit = 1 * essentials::GB; + std::vector> buffer; // (header, dna) + + std::string run_identifier = + std::to_string(pthash::clock_type::now().time_since_epoch().count()); + + std::string header_sequence; + std::string header_sequence_r; + + std::string dna_sequence; + std::string dna_sequence_rc; + + uint64_t num_sequences = permutation.size(); + uint64_t num_bases = 0; + uint64_t bytes = 0; + uint64_t num_files_to_merge = 0; + + auto get_tmp_output_filename = [&](uint64_t id) { + return tmp_dirname + "/tmp.run" + run_identifier + "." + std::to_string(id); + }; + + auto sort_and_flush = [&]() { + if (buffer.empty()) return; + + std::cout << "sorting buffer..." << std::endl; + std::sort(buffer.begin(), buffer.end(), [&](auto const& p_x, auto const& p_y) { + assert(p_x.first.front() == '>'); + assert(p_y.first.front() == '>'); + uint64_t seq_id_x = std::strtoull(p_x.first.data() + 1, nullptr, 10); + uint64_t seq_id_y = std::strtoull(p_y.first.data() + 1, nullptr, 10); + return permutation[seq_id_x] < permutation[seq_id_y]; + }); + + auto tmp_output_filename = get_tmp_output_filename(num_files_to_merge); + std::cout << "saving to file '" << tmp_output_filename << "'..." << std::endl; + std::ofstream out(tmp_output_filename.c_str()); + if (!out.is_open()) throw std::runtime_error("cannot open file"); + for (auto const& seq : buffer) out << seq.first << '\n' << seq.second << '\n'; + out.close(); + + buffer.clear(); + bytes = 0; + num_files_to_merge += 1; + }; + + for (uint64_t i = 0; i != num_sequences; ++i) { + std::getline(is, header_sequence); + std::getline(is, dna_sequence); + + if (!signs[i]) { + /* compute reverse complement of dna_sequence + and reverse the abundances in header_sequence */ + dna_sequence_rc.resize(dna_sequence.size()); + header_sequence_r.clear(); + util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(), + dna_sequence.size()); + reverse_header(header_sequence, header_sequence_r, k); + dna_sequence.swap(dna_sequence_rc); + header_sequence.swap(header_sequence_r); + } + + uint64_t seq_bytes = header_sequence.size() + dna_sequence.size() + 16; + if (bytes + seq_bytes > limit) sort_and_flush(); + + bytes += seq_bytes; + buffer.emplace_back(header_sequence, dna_sequence); + num_bases += dna_sequence.size(); + + if (i != 0 and i % 100000 == 0) { + std::cout << "read " << i << " sequences, " << num_bases << " bases" << std::endl; + } + } + sort_and_flush(); + + std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases" << std::endl; + + /* merge */ + { + assert(num_files_to_merge > 0); + std::cout << "files to merge = " << num_files_to_merge << std::endl; + + std::vector iterators; + std::vector idx_heap; + iterators.reserve(num_files_to_merge); + idx_heap.reserve(num_files_to_merge); + std::vector> mm_files(num_files_to_merge); + + auto heap_idx_comparator = [&](uint32_t i, uint32_t j) { + assert(iterators[i].header().front() == '>'); + assert(iterators[j].header().front() == '>'); + uint64_t seq_id_x = std::strtoull(iterators[i].header().data() + 1, nullptr, 10); + uint64_t seq_id_y = std::strtoull(iterators[j].header().data() + 1, nullptr, 10); + return permutation[seq_id_x] > permutation[seq_id_y]; + }; + + auto advance_heap_head = [&]() { + auto idx = idx_heap.front(); + iterators[idx].advance_to_next(); + if (iterators[idx].has_next()) { // percolate down the head + uint64_t pos = 0; + uint64_t size = idx_heap.size(); + while (2 * pos + 1 < size) { + uint64_t i = 2 * pos + 1; + if (i + 1 < size and heap_idx_comparator(idx_heap[i], idx_heap[i + 1])) ++i; + if (heap_idx_comparator(idx_heap[i], idx_heap[pos])) break; + std::swap(idx_heap[pos], idx_heap[i]); + pos = i; + } + } else { + std::pop_heap(idx_heap.begin(), idx_heap.end(), heap_idx_comparator); + idx_heap.pop_back(); + } + }; + + /* create the input iterators and make the heap */ + for (uint64_t i = 0; i != num_files_to_merge; ++i) { + auto tmp_output_filename = get_tmp_output_filename(i); + mm_files[i].open(tmp_output_filename, mm::advice::sequential); + iterators.emplace_back(mm_files[i].data(), mm_files[i].data() + mm_files[i].size()); + idx_heap.push_back(i); + } + std::make_heap(idx_heap.begin(), idx_heap.end(), heap_idx_comparator); + + std::ofstream out(output_filename.c_str()); + if (!out.is_open()) throw std::runtime_error("cannot open file"); + + uint64_t num_written_sequences = 0; + while (!idx_heap.empty()) { + auto const& it = iterators[idx_heap.front()]; + out << it.header() << '\n' << it.sequence() << '\n'; + num_written_sequences += 1; + if (num_written_sequences % 100000 == 0) { + std::cout << "written sequences = " << num_written_sequences << "/" << num_sequences + << std::endl; + } + advance_heap_head(); + } + std::cout << "written sequences = " << num_written_sequences << "/" << num_sequences + << std::endl; + out.close(); + assert(num_written_sequences == num_sequences); + + /* remove tmp files */ + for (uint64_t i = 0; i != num_files_to_merge; ++i) { + mm_files[i].close(); + auto tmp_output_filename = get_tmp_output_filename(i); + std::remove(tmp_output_filename.c_str()); + } + } +} + +void permute_and_write(std::string const& input_filename, std::string const& output_filename, + std::string const& tmp_dirname, pthash::compact_vector const& permutation, + pthash::bit_vector const& signs, uint64_t k) { + std::ifstream is(input_filename.c_str()); + if (!is.good()) throw std::runtime_error("error in opening the file '" + input_filename + "'"); + std::cout << "reading file '" << input_filename << "'..." << std::endl; + if (util::ends_with(input_filename, ".gz")) { + zip_istream zis(is); + permute_and_write(zis, output_filename, tmp_dirname, permutation, signs, k); + } else { + permute_and_write(is, output_filename, tmp_dirname, permutation, signs, k); + } + is.close(); +} + +int main(int argc, char** argv) { + cmd_line_parser::parser parser(argc, argv); + + /* mandatory arguments */ + parser.add("input_filename", + "Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:\n" + "\t- without duplicate nor invalid kmers\n" + "\t- one DNA sequence per line\n" + "\t- with also kmers' abundances.\n" + "\tFor example, it could be the de Bruijn graph topology output by BCALM."); + + parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ")."); + + /* optional arguments */ + parser.add("output_filename", "Output file where the permuted collection will be written.", + "-o", false); + parser.add("tmp_dirname", + "Temporary directory used for merging in external memory. Default is directory '" + + constants::default_tmp_dirname + "'.", + "-d", false); + + if (!parser.parse()) return 1; + + auto input_filename = parser.get("input_filename"); + auto k = parser.get("k"); + + build_configuration build_config; + build_config.k = k; + + std::string output_filename = input_filename + ".permuted"; + if (parser.parsed("output_filename")) { + output_filename = parser.get("output_filename"); + } + + std::string tmp_dirname = constants::default_tmp_dirname; + if (parser.parsed("tmp_dirname")) tmp_dirname = parser.get("tmp_dirname"); + + std::string permutation_filename = tmp_dirname + "/tmp.permutation"; + + auto data = parse_file(input_filename, build_config); + + { + /* compute cover */ + cover c(data.num_sequences, data.num_runs_abundances); + assert(data.nodes.size() == data.num_sequences); + c.compute(data.nodes); + c.save(permutation_filename); + std::vector().swap(data.nodes); + } + + pthash::compact_vector permutation; + pthash::bit_vector signs; + { + std::ifstream is(permutation_filename.c_str()); + if (!is.good()) { + throw std::runtime_error("error in opening the file '" + permutation_filename + "'"); + } + pthash::compact_vector::builder cv_builder( + data.num_sequences, + data.num_sequences == 1 ? 1 : std::ceil(std::log2(data.num_sequences))); + pthash::bit_vector_builder bv_builder(data.num_sequences); + for (uint64_t i = 0; i != data.num_sequences; ++i) { + uint64_t position = 0; + bool sign = 0; + is >> position; + is >> sign; + cv_builder.set(position, i); + bv_builder.set(position, sign); + } + is.close(); + cv_builder.build(permutation); + signs.build(&bv_builder); + } + + /* permute and save to output file */ + permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k); + std::remove(permutation_filename.c_str()); + + return 0; +}