From 78b639c64a2ca8925009d3c712c8ae1ab9c2da49 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Sun, 20 Mar 2022 22:28:26 +0100
Subject: [PATCH 01/40] compute runs

---
 include/builder/build.cpp | 166 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)

diff --git a/include/builder/build.cpp b/include/builder/build.cpp
index 9dab699..aa61015 100644
--- a/include/builder/build.cpp
+++ b/include/builder/build.cpp
@@ -113,6 +113,14 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         }
     };
 
+    struct seq_abundance_endpoints {
+        seq_abundance_endpoints(uint32_t seq_id_, uint32_t first_ab_, uint32_t last_ab_)
+            : seq_id(seq_id_), first_ab(first_ab_), last_ab(last_ab_) {}
+        uint32_t seq_id, first_ab, last_ab;
+    };
+
+    std::vector<seq_abundance_endpoints> seq_abundance_endpoints_vec;
+
     auto parse_header = [&]() {
         if (sequence.empty()) return;
 
@@ -156,11 +164,17 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         bool kmers_have_all_mfa = true;
         bool kmers_have_different_abundances = false;
 
+        uint64_t first_ab = constants::invalid;
+        uint64_t last_ab = constants::invalid;
+
         for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid;
              j != seq_len - k + 1; ++j, ++num_kmers) {
             uint64_t ab = std::strtoull(sequence.data() + i, &end, 10);
             i = sequence.find_first_of(' ', i) + 1;
 
+            if (j == 0) { first_ab = ab; }
+            if (j == seq_len - k) { last_ab = ab; }
+
             if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false;
             if (j > 0) {
                 if (ab != prev_ab) { kmers_have_different_abundances = true; }
@@ -197,6 +211,9 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
 
         num_sequences_diff_abs += kmers_have_different_abundances;
         num_sequences_all_mfa += kmers_have_all_mfa;
+
+        seq_abundance_endpoints_vec.emplace_back(num_sequences, first_ab, last_ab);
+        // std::cerr << "seq:" << num_sequences << "[" << first_ab << "," << last_ab << "]\n";
     };
 
     while (!is.eof()) {
@@ -278,6 +295,155 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
                   << "%)" << std::endl;
         data.abundances_builder.push_abundance_interval(ab_value, ab_length);
         data.abundances_builder.finalize(data.num_kmers);
+
+        /* sort on first component */
+        std::sort(seq_abundance_endpoints_vec.begin(), seq_abundance_endpoints_vec.end(),
+                  [](auto const& x, auto const& y) {
+                      if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab;
+                      if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab;
+                      return x.seq_id < y.seq_id;
+                  });
+
+        /* (abundance, num_seqs_with_front_abundance=abundance) */
+        std::unordered_map<uint64_t, uint64_t> abundance_map;
+        for (auto const& x : seq_abundance_endpoints_vec) {
+            // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab << "]\n";
+            auto it = abundance_map.find(x.first_ab);
+            if (it != abundance_map.cend()) {  // found
+                (*it).second += 1;
+            } else {
+                abundance_map[x.first_ab] = 1;
+            }
+        }
+
+        std::cout << "calculating lower bound..." << std::endl;
+        uint64_t R = data.abundances_builder.num_abundance_intervals();
+
+        for (auto const& x : seq_abundance_endpoints_vec) {
+            uint64_t back = x.last_ab;
+            auto it = abundance_map.find(back);
+            if (it != abundance_map.cend()) {  // found
+                if ((*it).second > 0) {        // if it is 0, we cannot find a match
+                    (*it).second -= 1;
+                    // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab
+                    //           << "] got paired";
+                    // std::cerr << " (remaining " << (*it).second << ")\n";
+                    R -= 1;
+                }
+                // else {
+                //     std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab
+                //               << "] NOT paired!\n";
+                // }
+            }
+        }
+        std::cout << "R = " << R << std::endl;
+
+        assert(num_sequences == seq_abundance_endpoints_vec.size());
+
+        // visited array
+        std::vector<bool> visited(num_sequences, false);
+
+        // edges: a map from abundance "x" to offset "t", where offset "t" is the position into
+        // seq_abundance_endpoints_vec of the next egde having first abundance equal to "x"
+        for (auto& p : abundance_map) { p.second = 0; }
+        for (auto const& x : seq_abundance_endpoints_vec) {
+            uint64_t front = x.first_ab;
+            assert(abundance_map.find(front) != abundance_map.cend());  // found
+            abundance_map[front] += 1;
+        }
+        {
+            std::vector<std::pair<uint64_t, uint64_t>> offsets;  // (abundance,offset)
+            offsets.reserve(abundance_map.size());
+            for (auto const& p : abundance_map) { offsets.emplace_back(p.first, p.second); }
+            assert(offsets.size() > 0);
+            std::sort(offsets.begin(), offsets.end(),
+                      [](auto const& x, auto const& y) { return x.first < y.first; });
+            uint64_t offset = 0;
+            for (auto const& p : offsets) {
+                uint64_t ab = p.first;
+                abundance_map[ab] = offset;
+                offset += p.second;
+            }
+        }
+
+        typedef std::vector<seq_abundance_endpoints> walk_t;
+        walk_t walk;
+        std::vector<walk_t> walks;
+
+        while (true) {
+            walk.clear();
+
+            // take an unvisited vertex
+            uint64_t i = 0;
+            for (; i != num_sequences; ++i) {
+                uint64_t seq_id = seq_abundance_endpoints_vec[i].seq_id;
+                if (visited[seq_id] == false) break;
+            }
+
+            if (i == num_sequences) break;  // all visited
+
+            // std::cout << " i = " << i << std::endl;
+
+            while (true) {
+                uint64_t seq_id = seq_abundance_endpoints_vec[i].seq_id;
+                if (visited[seq_id] == true) break;
+
+                visited[seq_id] = true;
+                uint64_t front = seq_abundance_endpoints_vec[i].first_ab;
+                uint64_t back = seq_abundance_endpoints_vec[i].last_ab;
+                walk.emplace_back(seq_id, front, back);
+                // std::cout << "added " << seq_id << " to walk-" << walks.size() << std::endl;
+
+                uint64_t next_offset = abundance_map[back];
+
+                while (visited[seq_id] == true) {
+                    next_offset += 1;
+                    if (next_offset == num_sequences) break;
+                    auto ab_endpoint = seq_abundance_endpoints_vec[next_offset];
+                    if (ab_endpoint.first_ab != back) break;
+                    seq_id = ab_endpoint.seq_id;
+                }
+
+                if (next_offset == num_sequences) {
+                    abundance_map[back] = next_offset - 1;
+                    break;
+                }
+
+                if (seq_abundance_endpoints_vec[next_offset].first_ab != back) {
+                    abundance_map[back] = next_offset - 1;
+                    i = next_offset - 1;
+                } else {
+                    abundance_map[back] = next_offset;
+                    i = next_offset;
+                }
+            }
+
+            if (!walk.empty()) {
+                // add walk to walks
+                walks.push_back(walk);
+                std::cout << "walks.size() = " << walks.size() << std::endl;
+            }
+        }
+
+        uint64_t num_runs = data.abundances_builder.num_abundance_intervals();
+        std::cout << "computed " << walks.size() << " walks" << std::endl;
+        std::fill(visited.begin(), visited.end(), false);
+        for (auto const& walk : walks) {
+            num_runs -= walk.size() - 1;
+            uint64_t prev_back = walk.front().first_ab;
+            for (auto const& w : walk) {
+                if (visited[w.seq_id] == true) {
+                    std::cout << "ERROR: duplicate vertex." << std::endl;
+                }
+                if (w.first_ab != prev_back) { std::cout << "ERROR: path is broken." << std::endl; }
+                prev_back = w.last_ab;
+                visited[w.seq_id] = true;
+                std::cout << w.seq_id << ":[" << w.first_ab << "," << w.last_ab << "] ";
+            }
+            std::cout << std::endl;
+        }
+
+        std::cout << "num_runs " << num_runs << std::endl;
     }
 }
 

From 8d1dc2addf2a082117def7d8be92e9cc6fae8dfe Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Sun, 20 Mar 2022 22:59:46 +0100
Subject: [PATCH 02/40] compute runs with cover

---
 include/builder/build.cpp | 138 ++++---------------------------------
 include/builder/cover.hpp | 140 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+), 124 deletions(-)
 create mode 100644 include/builder/cover.hpp

diff --git a/include/builder/build.cpp b/include/builder/build.cpp
index aa61015..4ee8592 100644
--- a/include/builder/build.cpp
+++ b/include/builder/build.cpp
@@ -8,6 +8,7 @@
 #include "../../external/pthash/external/essentials/include/essentials.hpp"
 
 #include "build_util_types.hpp"
+#include "cover.hpp"
 
 #include <numeric>  // for std::partial_sum
 #include <vector>
@@ -113,13 +114,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         }
     };
 
-    struct seq_abundance_endpoints {
-        seq_abundance_endpoints(uint32_t seq_id_, uint32_t first_ab_, uint32_t last_ab_)
-            : seq_id(seq_id_), first_ab(first_ab_), last_ab(last_ab_) {}
-        uint32_t seq_id, first_ab, last_ab;
-    };
-
-    std::vector<seq_abundance_endpoints> seq_abundance_endpoints_vec;
+    std::vector<vertex> vertices;
 
     auto parse_header = [&]() {
         if (sequence.empty()) return;
@@ -212,7 +207,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         num_sequences_diff_abs += kmers_have_different_abundances;
         num_sequences_all_mfa += kmers_have_all_mfa;
 
-        seq_abundance_endpoints_vec.emplace_back(num_sequences, first_ab, last_ab);
+        vertices.emplace_back(num_sequences, first_ab, last_ab);
         // std::cerr << "seq:" << num_sequences << "[" << first_ab << "," << last_ab << "]\n";
     };
 
@@ -296,17 +291,15 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         data.abundances_builder.push_abundance_interval(ab_value, ab_length);
         data.abundances_builder.finalize(data.num_kmers);
 
-        /* sort on first component */
-        std::sort(seq_abundance_endpoints_vec.begin(), seq_abundance_endpoints_vec.end(),
-                  [](auto const& x, auto const& y) {
-                      if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab;
-                      if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab;
-                      return x.seq_id < y.seq_id;
-                  });
+        std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
+            if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab;
+            if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab;
+            return x.seq_id < y.seq_id;
+        });
 
         /* (abundance, num_seqs_with_front_abundance=abundance) */
         std::unordered_map<uint64_t, uint64_t> abundance_map;
-        for (auto const& x : seq_abundance_endpoints_vec) {
+        for (auto const& x : vertices) {
             // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab << "]\n";
             auto it = abundance_map.find(x.first_ab);
             if (it != abundance_map.cend()) {  // found
@@ -317,9 +310,10 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         }
 
         std::cout << "calculating lower bound..." << std::endl;
-        uint64_t R = data.abundances_builder.num_abundance_intervals();
+        uint64_t num_runs_abundances = data.abundances_builder.num_abundance_intervals();
 
-        for (auto const& x : seq_abundance_endpoints_vec) {
+        uint64_t R = num_runs_abundances;
+        for (auto const& x : vertices) {
             uint64_t back = x.last_ab;
             auto it = abundance_map.find(back);
             if (it != abundance_map.cend()) {  // found
@@ -338,112 +332,8 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         }
         std::cout << "R = " << R << std::endl;
 
-        assert(num_sequences == seq_abundance_endpoints_vec.size());
-
-        // visited array
-        std::vector<bool> visited(num_sequences, false);
-
-        // edges: a map from abundance "x" to offset "t", where offset "t" is the position into
-        // seq_abundance_endpoints_vec of the next egde having first abundance equal to "x"
-        for (auto& p : abundance_map) { p.second = 0; }
-        for (auto const& x : seq_abundance_endpoints_vec) {
-            uint64_t front = x.first_ab;
-            assert(abundance_map.find(front) != abundance_map.cend());  // found
-            abundance_map[front] += 1;
-        }
-        {
-            std::vector<std::pair<uint64_t, uint64_t>> offsets;  // (abundance,offset)
-            offsets.reserve(abundance_map.size());
-            for (auto const& p : abundance_map) { offsets.emplace_back(p.first, p.second); }
-            assert(offsets.size() > 0);
-            std::sort(offsets.begin(), offsets.end(),
-                      [](auto const& x, auto const& y) { return x.first < y.first; });
-            uint64_t offset = 0;
-            for (auto const& p : offsets) {
-                uint64_t ab = p.first;
-                abundance_map[ab] = offset;
-                offset += p.second;
-            }
-        }
-
-        typedef std::vector<seq_abundance_endpoints> walk_t;
-        walk_t walk;
-        std::vector<walk_t> walks;
-
-        while (true) {
-            walk.clear();
-
-            // take an unvisited vertex
-            uint64_t i = 0;
-            for (; i != num_sequences; ++i) {
-                uint64_t seq_id = seq_abundance_endpoints_vec[i].seq_id;
-                if (visited[seq_id] == false) break;
-            }
-
-            if (i == num_sequences) break;  // all visited
-
-            // std::cout << " i = " << i << std::endl;
-
-            while (true) {
-                uint64_t seq_id = seq_abundance_endpoints_vec[i].seq_id;
-                if (visited[seq_id] == true) break;
-
-                visited[seq_id] = true;
-                uint64_t front = seq_abundance_endpoints_vec[i].first_ab;
-                uint64_t back = seq_abundance_endpoints_vec[i].last_ab;
-                walk.emplace_back(seq_id, front, back);
-                // std::cout << "added " << seq_id << " to walk-" << walks.size() << std::endl;
-
-                uint64_t next_offset = abundance_map[back];
-
-                while (visited[seq_id] == true) {
-                    next_offset += 1;
-                    if (next_offset == num_sequences) break;
-                    auto ab_endpoint = seq_abundance_endpoints_vec[next_offset];
-                    if (ab_endpoint.first_ab != back) break;
-                    seq_id = ab_endpoint.seq_id;
-                }
-
-                if (next_offset == num_sequences) {
-                    abundance_map[back] = next_offset - 1;
-                    break;
-                }
-
-                if (seq_abundance_endpoints_vec[next_offset].first_ab != back) {
-                    abundance_map[back] = next_offset - 1;
-                    i = next_offset - 1;
-                } else {
-                    abundance_map[back] = next_offset;
-                    i = next_offset;
-                }
-            }
-
-            if (!walk.empty()) {
-                // add walk to walks
-                walks.push_back(walk);
-                std::cout << "walks.size() = " << walks.size() << std::endl;
-            }
-        }
-
-        uint64_t num_runs = data.abundances_builder.num_abundance_intervals();
-        std::cout << "computed " << walks.size() << " walks" << std::endl;
-        std::fill(visited.begin(), visited.end(), false);
-        for (auto const& walk : walks) {
-            num_runs -= walk.size() - 1;
-            uint64_t prev_back = walk.front().first_ab;
-            for (auto const& w : walk) {
-                if (visited[w.seq_id] == true) {
-                    std::cout << "ERROR: duplicate vertex." << std::endl;
-                }
-                if (w.first_ab != prev_back) { std::cout << "ERROR: path is broken." << std::endl; }
-                prev_back = w.last_ab;
-                visited[w.seq_id] = true;
-                std::cout << w.seq_id << ":[" << w.first_ab << "," << w.last_ab << "] ";
-            }
-            std::cout << std::endl;
-        }
-
-        std::cout << "num_runs " << num_runs << std::endl;
+        cover c;
+        c.compute(vertices, num_runs_abundances);
     }
 }
 
diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
new file mode 100644
index 0000000..ee41929
--- /dev/null
+++ b/include/builder/cover.hpp
@@ -0,0 +1,140 @@
+#pragma once
+
+namespace sshash {
+
+struct vertex {
+    vertex(uint32_t s, uint32_t f, uint32_t b) : seq_id(s), first_ab(f), last_ab(b) {}
+    uint32_t seq_id, first_ab, last_ab;
+};
+
+struct cover {
+    typedef std::vector<vertex> walk_t;
+
+    void compute(std::vector<vertex>& vertices,
+                 uint64_t num_runs_abundances  // TODO: remove from here
+    ) {
+        /* (abundance, num_seqs_with_front_abundance=abundance) */
+        std::unordered_map<uint64_t, uint64_t> abundance_map;
+        std::vector<bool> visited;
+
+        while (true) {
+            uint64_t num_vertices = vertices.size();
+
+            std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
+                if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab;
+                if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab;
+                return x.seq_id < y.seq_id;
+            });
+
+            visited.resize(num_vertices);
+            std::fill(visited.begin(), visited.end(), false);
+
+            for (auto const& x : vertices) {
+                auto it = abundance_map.find(x.first_ab);
+                if (it != abundance_map.cend()) {  // found
+                    (*it).second += 1;
+                } else {
+                    abundance_map[x.first_ab] = 1;
+                }
+            }
+
+            {
+                std::vector<std::pair<uint64_t, uint64_t>> offsets;  // (abundance,offset)
+                offsets.reserve(abundance_map.size());
+                for (auto const& p : abundance_map) { offsets.emplace_back(p.first, p.second); }
+                assert(offsets.size() > 0);
+                std::sort(offsets.begin(), offsets.end(),
+                          [](auto const& x, auto const& y) { return x.first < y.first; });
+                uint64_t offset = 0;
+                for (auto const& p : offsets) {
+                    uint64_t ab = p.first;
+                    abundance_map[ab] = offset;
+                    offset += p.second;
+                }
+            }
+
+            walk_t walk;
+
+            while (true) {
+                walk.clear();
+
+                // take an unvisited vertex
+                uint64_t i = 0;
+                for (; i != num_vertices; ++i) {
+                    uint64_t seq_id = vertices[i].seq_id;
+                    if (visited[seq_id] == false) break;
+                }
+
+                if (i == num_vertices) break;  // all visited
+
+                // std::cout << " i = " << i << std::endl;
+
+                while (true) {
+                    uint64_t seq_id = vertices[i].seq_id;
+                    if (visited[seq_id] == true) break;
+
+                    visited[seq_id] = true;
+                    uint64_t front = vertices[i].first_ab;
+                    uint64_t back = vertices[i].last_ab;
+                    walk.emplace_back(seq_id, front, back);
+                    // std::cout << "added " << seq_id << " to walk-" << walks.size() << std::endl;
+
+                    uint64_t next_offset = abundance_map[back];
+
+                    while (visited[seq_id] == true) {
+                        next_offset += 1;
+                        if (next_offset == num_vertices) break;
+                        auto ab_endpoint = vertices[next_offset];
+                        if (ab_endpoint.first_ab != back) break;
+                        seq_id = ab_endpoint.seq_id;
+                    }
+
+                    if (next_offset == num_vertices) {
+                        abundance_map[back] = next_offset - 1;
+                        break;
+                    }
+
+                    if (vertices[next_offset].first_ab != back) {
+                        abundance_map[back] = next_offset - 1;
+                        i = next_offset - 1;
+                    } else {
+                        abundance_map[back] = next_offset;
+                        i = next_offset;
+                    }
+                }
+
+                if (!walk.empty()) {
+                    // add walk to walks
+                    walks.push_back(walk);
+                    std::cout << "walks.size() = " << walks.size() << std::endl;
+                }
+            }
+
+            break;
+        }
+
+        uint64_t num_runs = num_runs_abundances;
+        std::cout << "computed " << walks.size() << " walks" << std::endl;
+        std::fill(visited.begin(), visited.end(), false);
+        for (auto const& walk : walks) {
+            num_runs -= walk.size() - 1;
+            uint64_t prev_back = walk.front().first_ab;
+            for (auto const& w : walk) {
+                if (visited[w.seq_id] == true) {
+                    std::cout << "ERROR: duplicate vertex." << std::endl;
+                }
+                if (w.first_ab != prev_back) { std::cout << "ERROR: path is broken." << std::endl; }
+                prev_back = w.last_ab;
+                visited[w.seq_id] = true;
+                std::cout << w.seq_id << ":[" << w.first_ab << "," << w.last_ab << "] ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << "num_runs " << num_runs << std::endl;
+    }
+
+private:
+    std::vector<walk_t> walks;
+};
+
+}  // namespace sshash
\ No newline at end of file

From 3d1db5b9768fd0934e38b06270d2d1c5c98aee28 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 21 Mar 2022 10:28:49 +0100
Subject: [PATCH 03/40] compute runs with cover

---
 include/builder/build.cpp |  32 ++++----
 include/builder/cover.hpp | 158 ++++++++++++++++++++++++++------------
 2 files changed, 125 insertions(+), 65 deletions(-)

diff --git a/include/builder/build.cpp b/include/builder/build.cpp
index 4ee8592..f2b7866 100644
--- a/include/builder/build.cpp
+++ b/include/builder/build.cpp
@@ -121,7 +121,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
 
         /*
             Heder format:
-            >[seq_id] LN:i:[seq_len] ab:Z:[ab_seq]
+            >[id] LN:i:[seq_len] ab:Z:[ab_seq]
             where [ab_seq] is a space-separated sequence of integer counters (the abundances),
             whose length is equal to [seq_len]-k+1
         */
@@ -159,16 +159,16 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         bool kmers_have_all_mfa = true;
         bool kmers_have_different_abundances = false;
 
-        uint64_t first_ab = constants::invalid;
-        uint64_t last_ab = constants::invalid;
+        uint64_t front = constants::invalid;
+        uint64_t back = constants::invalid;
 
         for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid;
              j != seq_len - k + 1; ++j, ++num_kmers) {
             uint64_t ab = std::strtoull(sequence.data() + i, &end, 10);
             i = sequence.find_first_of(' ', i) + 1;
 
-            if (j == 0) { first_ab = ab; }
-            if (j == seq_len - k) { last_ab = ab; }
+            if (j == 0) { front = ab; }
+            if (j == seq_len - k) { back = ab; }
 
             if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false;
             if (j > 0) {
@@ -207,8 +207,8 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         num_sequences_diff_abs += kmers_have_different_abundances;
         num_sequences_all_mfa += kmers_have_all_mfa;
 
-        vertices.emplace_back(num_sequences, first_ab, last_ab);
-        // std::cerr << "seq:" << num_sequences << "[" << first_ab << "," << last_ab << "]\n";
+        vertices.emplace_back(num_sequences, front, back);
+        // std::cerr << "seq:" << num_sequences << "[" << front << "," << back << "]\n";
     };
 
     while (!is.eof()) {
@@ -292,20 +292,20 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         data.abundances_builder.finalize(data.num_kmers);
 
         std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
-            if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab;
-            if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab;
-            return x.seq_id < y.seq_id;
+            if (x.front != y.front) return x.front < y.front;
+            if (x.back != y.back) return x.back < y.back;
+            return x.id < y.id;
         });
 
         /* (abundance, num_seqs_with_front_abundance=abundance) */
         std::unordered_map<uint64_t, uint64_t> abundance_map;
         for (auto const& x : vertices) {
-            // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab << "]\n";
-            auto it = abundance_map.find(x.first_ab);
+            // std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back << "]\n";
+            auto it = abundance_map.find(x.front);
             if (it != abundance_map.cend()) {  // found
                 (*it).second += 1;
             } else {
-                abundance_map[x.first_ab] = 1;
+                abundance_map[x.front] = 1;
             }
         }
 
@@ -314,18 +314,18 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
 
         uint64_t R = num_runs_abundances;
         for (auto const& x : vertices) {
-            uint64_t back = x.last_ab;
+            uint64_t back = x.back;
             auto it = abundance_map.find(back);
             if (it != abundance_map.cend()) {  // found
                 if ((*it).second > 0) {        // if it is 0, we cannot find a match
                     (*it).second -= 1;
-                    // std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab
+                    // std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back
                     //           << "] got paired";
                     // std::cerr << " (remaining " << (*it).second << ")\n";
                     R -= 1;
                 }
                 // else {
-                //     std::cerr << "seq:" << x.seq_id << "[" << x.first_ab << "," << x.last_ab
+                //     std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back
                 //               << "] NOT paired!\n";
                 // }
             }
diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index ee41929..2aed435 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -3,38 +3,54 @@
 namespace sshash {
 
 struct vertex {
-    vertex(uint32_t s, uint32_t f, uint32_t b) : seq_id(s), first_ab(f), last_ab(b) {}
-    uint32_t seq_id, first_ab, last_ab;
+    vertex(uint32_t s, uint32_t f, uint32_t b) : id(s), front(f), back(b) {}
+    uint32_t id, front, back;
 };
 
 struct cover {
     typedef std::vector<vertex> walk_t;
+    typedef std::vector<walk_t> walks_t;
+
+    enum color_t {
+        white = 0,  // unvisited
+        gray = 1,   // visited by not merged
+        black = 2   // visited and merged
+    };
 
     void compute(std::vector<vertex>& vertices,
                  uint64_t num_runs_abundances  // TODO: remove from here
     ) {
         /* (abundance, num_seqs_with_front_abundance=abundance) */
         std::unordered_map<uint64_t, uint64_t> abundance_map;
-        std::vector<bool> visited;
+
+        std::vector<color_t> colors;
+        std::vector<vertex> tmp_vertices;
+        uint64_t num_runs = num_runs_abundances;
 
         while (true) {
+            std::cout << "round " << rounds.size() << std::endl;
+
             uint64_t num_vertices = vertices.size();
+            std::cout << "num_vertices " << num_vertices << std::endl;
+            tmp_vertices.clear();
+            abundance_map.clear();
+
+            /* all unvisited */
+            colors.resize(num_vertices);
+            std::fill(colors.begin(), colors.end(), color_t::white);
 
             std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
-                if (x.first_ab != y.first_ab) return x.first_ab < y.first_ab;
-                if (x.last_ab != y.last_ab) return x.last_ab < y.last_ab;
-                return x.seq_id < y.seq_id;
+                if (x.front != y.front) return x.front < y.front;
+                if (x.back != y.back) return x.back < y.back;
+                return x.id < y.id;
             });
 
-            visited.resize(num_vertices);
-            std::fill(visited.begin(), visited.end(), false);
-
             for (auto const& x : vertices) {
-                auto it = abundance_map.find(x.first_ab);
+                auto it = abundance_map.find(x.front);
                 if (it != abundance_map.cend()) {  // found
                     (*it).second += 1;
                 } else {
-                    abundance_map[x.first_ab] = 1;
+                    abundance_map[x.front] = 1;
                 }
             }
 
@@ -54,6 +70,7 @@ struct cover {
             }
 
             walk_t walk;
+            walks_t walks_in_round;
 
             while (true) {
                 walk.clear();
@@ -61,32 +78,34 @@ struct cover {
                 // take an unvisited vertex
                 uint64_t i = 0;
                 for (; i != num_vertices; ++i) {
-                    uint64_t seq_id = vertices[i].seq_id;
-                    if (visited[seq_id] == false) break;
+                    uint64_t id = vertices[i].id;
+                    if (colors[id] == color_t::white) { break; }
                 }
 
                 if (i == num_vertices) break;  // all visited
 
-                // std::cout << " i = " << i << std::endl;
-
+                // create new walk
                 while (true) {
-                    uint64_t seq_id = vertices[i].seq_id;
-                    if (visited[seq_id] == true) break;
+                    uint64_t id = vertices[i].id;
+                    if (colors[id] == color_t::black or colors[id] == color_t::gray) break;
 
-                    visited[seq_id] = true;
-                    uint64_t front = vertices[i].first_ab;
-                    uint64_t back = vertices[i].last_ab;
-                    walk.emplace_back(seq_id, front, back);
-                    // std::cout << "added " << seq_id << " to walk-" << walks.size() << std::endl;
+                    // colors[id] = color_t::gray;
+                    colors[id] = color_t::black;
+                    uint64_t front = vertices[i].front;
+                    uint64_t back = vertices[i].back;
+                    // if (walk.size() > 0) { colors[walk.back().id] = color_t::black; }
+                    walk.emplace_back(id, front, back);
+                    // std::cout << "added " << id << " to walk-" << walks.size() << std::endl;
 
                     uint64_t next_offset = abundance_map[back];
 
-                    while (visited[seq_id] == true) {
+                    while (colors[id] == color_t::black) {
                         next_offset += 1;
                         if (next_offset == num_vertices) break;
-                        auto ab_endpoint = vertices[next_offset];
-                        if (ab_endpoint.first_ab != back) break;
-                        seq_id = ab_endpoint.seq_id;
+                        auto vertex = vertices[next_offset];
+                        if (vertex.front != back) break;
+                        id = vertex.id;
+                        // if (colors[vertex.id] != color_t::white) break;  // visited
                     }
 
                     if (next_offset == num_vertices) {
@@ -94,7 +113,7 @@ struct cover {
                         break;
                     }
 
-                    if (vertices[next_offset].first_ab != back) {
+                    if (vertices[next_offset].front != back) {
                         abundance_map[back] = next_offset - 1;
                         i = next_offset - 1;
                     } else {
@@ -103,38 +122,79 @@ struct cover {
                     }
                 }
 
-                if (!walk.empty()) {
-                    // add walk to walks
-                    walks.push_back(walk);
-                    std::cout << "walks.size() = " << walks.size() << std::endl;
+                if (walk.empty()) { continue; }
+
+                if (walk.size() == 1) {
+                    colors[walk.front().id] = color_t::gray;  // visited but not merged
                 }
+
+                /* create a new vertex for next round */
+                tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front,
+                                          walk.back().back);
+                walks_in_round.push_back(walk);
             }
 
-            break;
-        }
+            std::cout << "num_walks_in_round " << rounds.size() << ": " << walks_in_round.size()
+                      << std::endl;
 
-        uint64_t num_runs = num_runs_abundances;
-        std::cout << "computed " << walks.size() << " walks" << std::endl;
-        std::fill(visited.begin(), visited.end(), false);
-        for (auto const& walk : walks) {
-            num_runs -= walk.size() - 1;
-            uint64_t prev_back = walk.front().first_ab;
-            for (auto const& w : walk) {
-                if (visited[w.seq_id] == true) {
-                    std::cout << "ERROR: duplicate vertex." << std::endl;
+            // if all walks are singletons, then new merging were found
+
+            bool all_singletons = true;
+            for (auto const& walk : walks_in_round) {
+                if (walk.size() > 1) {
+                    all_singletons = false;
+                    break;
                 }
-                if (w.first_ab != prev_back) { std::cout << "ERROR: path is broken." << std::endl; }
-                prev_back = w.last_ab;
-                visited[w.seq_id] = true;
-                std::cout << w.seq_id << ":[" << w.first_ab << "," << w.last_ab << "] ";
             }
-            std::cout << std::endl;
+            if (all_singletons) {
+                std::cout << "all walks are singletons: no new mergings were found" << std::endl;
+                break;
+            }
+
+            {  // print walks
+                std::fill(colors.begin(), colors.end(), color_t::white);
+                for (auto const& walk : walks_in_round) {
+                    num_runs -= walk.size() - 1;
+                    uint64_t prev_back = walk.front().front;
+                    for (auto const& w : walk) {
+                        if (colors[w.id] == color_t::black) {
+                            std::cout << "ERROR: duplicate vertex." << std::endl;
+                        }
+                        if (w.front != prev_back) {
+                            std::cout << "ERROR: path is broken." << std::endl;
+                        }
+                        prev_back = w.back;
+                        colors[w.id] = color_t::black;
+                        std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
+                    }
+                    std::cout << std::endl;
+                }
+                std::cout << "num_runs " << num_runs << std::endl;
+            }
+
+            rounds.push_back(walks_in_round);
+            vertices.swap(tmp_vertices);
         }
-        std::cout << "num_runs " << num_runs << std::endl;
+
+        // uint64_t num_runs = num_runs_abundances;
+        // std::cout << "computed " << walks.size() << " walks" << std::endl;
+        // std::fill(colors.begin(), colors.end(), false);
+        // for (auto const& walk : walks) {
+        //     num_runs -= walk.size() - 1;
+        //     uint64_t prev_back = walk.front().front;
+        //     for (auto const& w : walk) {
+        //         if (colors[w.id] == true) { std::cout << "ERROR: duplicate vertex." <<
+        //         std::endl; } if (w.front != prev_back) { std::cout << "ERROR: path is broken." <<
+        //         std::endl; } prev_back = w.back; colors[w.id] = true; std::cout << w.id << ":["
+        //         << w.front << "," << w.back << "] ";
+        //     }
+        //     std::cout << std::endl;
+        // }
+        // std::cout << "num_runs " << num_runs << std::endl;
     }
 
 private:
-    std::vector<walk_t> walks;
+    std::vector<walks_t> rounds;
 };
 
 }  // namespace sshash
\ No newline at end of file

From 2c33fa99f9050c85cc7559c70ec407199c27fceb Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 21 Mar 2022 12:58:47 +0100
Subject: [PATCH 04/40] compute runs with cover

---
 include/builder/cover.hpp | 123 +++++++++++++++++++++++++-------------
 1 file changed, 83 insertions(+), 40 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 2aed435..279ad97 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -75,87 +75,118 @@ struct cover {
             while (true) {
                 walk.clear();
 
-                // take an unvisited vertex
+                /* take an unvisited vertex */
                 uint64_t i = 0;
                 for (; i != num_vertices; ++i) {
                     uint64_t id = vertices[i].id;
-                    if (colors[id] == color_t::white) { break; }
+                    if (colors[id] == color_t::white) break;
                 }
 
                 if (i == num_vertices) break;  // all visited
 
-                // create new walk
+                /* create a new walk */
                 while (true) {
                     uint64_t id = vertices[i].id;
-                    if (colors[id] == color_t::black or colors[id] == color_t::gray) break;
 
-                    // colors[id] = color_t::gray;
-                    colors[id] = color_t::black;
+                    // std::cout << "visiting vertex " << id << ":[" << vertices[i].front << ","
+                    //           << vertices[i].back << "]" << std::endl;
+                    // std::cout << "at offset " << i << std::endl;
+
+                    if (colors[id] == color_t::black) break;
+
+                    colors[id] = color_t::gray;
                     uint64_t front = vertices[i].front;
                     uint64_t back = vertices[i].back;
-                    // if (walk.size() > 0) { colors[walk.back().id] = color_t::black; }
+                    if (walk.size() > 0) {
+                        assert(walk.back().id != id);
+                        colors[walk.back().id] = color_t::black;
+                        colors[id] = color_t::black;
+                    }
                     walk.emplace_back(id, front, back);
                     // std::cout << "added " << id << " to walk-" << walks.size() << std::endl;
 
-                    uint64_t next_offset = abundance_map[back];
+                    uint64_t offset = abundance_map[back];
 
-                    while (colors[id] == color_t::black) {
-                        next_offset += 1;
-                        if (next_offset == num_vertices) break;
-                        auto vertex = vertices[next_offset];
-                        if (vertex.front != back) break;
-                        id = vertex.id;
-                        // if (colors[vertex.id] != color_t::white) break;  // visited
-                    }
+                    /* search for a match */
+                    bool no_match_found = false;
+                    while (true) {
+                        if (offset == num_vertices) break;
+                        auto vertex = vertices[offset];
+                        // std::cout << "  vertex " << vertex.id << ":[" << vertex.front << ","
+                        //           << vertex.back << "]" << std::endl;
 
-                    if (next_offset == num_vertices) {
-                        abundance_map[back] = next_offset - 1;
-                        break;
-                    }
+                        /* skip */
+                        if (vertex.id == id) {
+                            offset += 1;
+                            continue;
+                        }
+
+                        /* checked all candidate matches */
+                        if (vertex.front != back) {
+                            // std::cout << "  MISfound match " << vertex.id << ":[" << vertex.front
+                            //           << "," << vertex.back << "]" << std::endl;
+                            // std::cout << "  at offset " << offset << std::endl;
+                            // std::cout << "NO MATCHES FOUND" << std::endl;
+                            no_match_found = true;
+                            break;
+                        }
 
-                    if (vertices[next_offset].front != back) {
-                        abundance_map[back] = next_offset - 1;
-                        i = next_offset - 1;
-                    } else {
-                        abundance_map[back] = next_offset;
-                        i = next_offset;
+                        /* match found */
+                        if (colors[vertex.id] != color_t::black) {
+                            // std::cout << "  found match " << vertex.id << ":[" << vertex.front
+                            //           << "," << vertex.back << "]" << std::endl;
+                            // std::cout << "  at offset " << offset << std::endl;
+                            break;
+                        }
+                        offset += 1;
                     }
+
+                    assert(offset <= num_vertices);
+                    if (no_match_found or offset == num_vertices) break;
+
+                    /* valid match was found, then visit it next */
+                    i = offset;
                 }
 
                 if (walk.empty()) { continue; }
 
                 if (walk.size() == 1) {
                     colors[walk.front().id] = color_t::gray;  // visited but not merged
+                    continue;
                 }
 
+                /* invariant: all vertices belonging to a walk are all black */
+                assert(std::all_of(walk.begin(), walk.end(), [&](vertex const& v) {
+                    return colors[v.id] == color_t::black;
+                }));
+
                 /* create a new vertex for next round */
                 tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front,
                                           walk.back().back);
                 walks_in_round.push_back(walk);
             }
 
+            /* add all gray vertices (singleton walks) */
+            for (auto const& v : vertices) {
+                if (colors[v.id] == color_t::gray) {
+                    tmp_vertices.emplace_back(walks_in_round.size(), v.front, v.back);
+                    walk_t walk;
+                    walk.push_back(v);
+                    walks_in_round.push_back(walk);
+                }
+            }
+
             std::cout << "num_walks_in_round " << rounds.size() << ": " << walks_in_round.size()
                       << std::endl;
 
-            // if all walks are singletons, then new merging were found
-
             bool all_singletons = true;
-            for (auto const& walk : walks_in_round) {
-                if (walk.size() > 1) {
-                    all_singletons = false;
-                    break;
-                }
-            }
-            if (all_singletons) {
-                std::cout << "all walks are singletons: no new mergings were found" << std::endl;
-                break;
-            }
-
-            {  // print walks
+            {
                 std::fill(colors.begin(), colors.end(), color_t::white);
                 for (auto const& walk : walks_in_round) {
+                    if (walk.size() > 1) all_singletons = false;
                     num_runs -= walk.size() - 1;
                     uint64_t prev_back = walk.front().front;
+                    std::cout << "=>";
                     for (auto const& w : walk) {
                         if (colors[w.id] == color_t::black) {
                             std::cout << "ERROR: duplicate vertex." << std::endl;
@@ -170,12 +201,24 @@ struct cover {
                     std::cout << std::endl;
                 }
                 std::cout << "num_runs " << num_runs << std::endl;
+
+                std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
+                for (auto const& v : tmp_vertices) {
+                    std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";
+                }
+            }
+
+            if (all_singletons) {
+                std::cout << "all walks are singletons: no new mergings were found" << std::endl;
+                break;
             }
 
             rounds.push_back(walks_in_round);
             vertices.swap(tmp_vertices);
         }
 
+        // TODO: form final walks and check that all vertex id are present
+
         // uint64_t num_runs = num_runs_abundances;
         // std::cout << "computed " << walks.size() << " walks" << std::endl;
         // std::fill(colors.begin(), colors.end(), false);

From ad95e9faaef02a061711359f9b5cd9af3bfddacd Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 21 Mar 2022 13:14:12 +0100
Subject: [PATCH 05/40] compute runs with cover

---
 include/builder/cover.hpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 279ad97..98e26a1 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -17,15 +17,17 @@ struct cover {
         black = 2   // visited and merged
     };
 
+    cover() : num_sequences(0) {}
+
     void compute(std::vector<vertex>& vertices,
                  uint64_t num_runs_abundances  // TODO: remove from here
     ) {
         /* (abundance, num_seqs_with_front_abundance=abundance) */
         std::unordered_map<uint64_t, uint64_t> abundance_map;
-
         std::vector<color_t> colors;
         std::vector<vertex> tmp_vertices;
         uint64_t num_runs = num_runs_abundances;
+        num_sequences = vertices.size();
 
         while (true) {
             std::cout << "round " << rounds.size() << std::endl;
@@ -148,10 +150,10 @@ struct cover {
                     i = offset;
                 }
 
-                if (walk.empty()) { continue; }
+                assert(!walk.empty());
 
                 if (walk.size() == 1) {
-                    colors[walk.front().id] = color_t::gray;  // visited but not merged
+                    assert(colors[walk.front().id] == color_t::gray);  // visited but not merged
                     continue;
                 }
 
@@ -237,6 +239,7 @@ struct cover {
     }
 
 private:
+    uint64_t num_sequences;
     std::vector<walks_t> rounds;
 };
 

From e94747c8011e75b59dbb6cd105224098f9143411 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 21 Mar 2022 13:18:17 +0100
Subject: [PATCH 06/40] compute runs with cover

---
 include/builder/cover.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 98e26a1..cb28ec1 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -89,13 +89,12 @@ struct cover {
                 /* create a new walk */
                 while (true) {
                     uint64_t id = vertices[i].id;
+                    assert(colors[id] != color_t::black);
 
                     // std::cout << "visiting vertex " << id << ":[" << vertices[i].front << ","
                     //           << vertices[i].back << "]" << std::endl;
                     // std::cout << "at offset " << i << std::endl;
 
-                    if (colors[id] == color_t::black) break;
-
                     colors[id] = color_t::gray;
                     uint64_t front = vertices[i].front;
                     uint64_t back = vertices[i].back;

From 64a236e678ad9867eccea6b66cb7f950156fb785 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 21 Mar 2022 17:19:12 +0100
Subject: [PATCH 07/40] compute runs with cover

---
 include/builder/cover.hpp | 88 ++++++++++++++++++++++-----------------
 1 file changed, 50 insertions(+), 38 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index cb28ec1..b36c2b1 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -11,20 +11,28 @@ struct cover {
     typedef std::vector<vertex> walk_t;
     typedef std::vector<walk_t> walks_t;
 
-    enum color_t {
+    enum color {
         white = 0,  // unvisited
         gray = 1,   // visited by not merged
         black = 2   // visited and merged
     };
 
+    struct range {
+        range() {}
+        range(uint64_t b, uint64_t e, uint64_t p) : begin(b), end(e), position(p) {}
+        uint64_t begin;
+        uint64_t end;
+        uint64_t position;
+    };
+
     cover() : num_sequences(0) {}
 
     void compute(std::vector<vertex>& vertices,
                  uint64_t num_runs_abundances  // TODO: remove from here
     ) {
         /* (abundance, num_seqs_with_front_abundance=abundance) */
-        std::unordered_map<uint64_t, uint64_t> abundance_map;
-        std::vector<color_t> colors;
+        std::unordered_map<uint64_t, range> abundance_map;
+        std::vector<color> colors;
         std::vector<vertex> tmp_vertices;
         uint64_t num_runs = num_runs_abundances;
         num_sequences = vertices.size();
@@ -39,7 +47,7 @@ struct cover {
 
             /* all unvisited */
             colors.resize(num_vertices);
-            std::fill(colors.begin(), colors.end(), color_t::white);
+            std::fill(colors.begin(), colors.end(), color::white);
 
             std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
                 if (x.front != y.front) return x.front < y.front;
@@ -50,23 +58,28 @@ struct cover {
             for (auto const& x : vertices) {
                 auto it = abundance_map.find(x.front);
                 if (it != abundance_map.cend()) {  // found
-                    (*it).second += 1;
+                    (*it).second.begin += 1;
                 } else {
-                    abundance_map[x.front] = 1;
+                    abundance_map[x.front] = {1, 0, 0};
                 }
             }
 
             {
                 std::vector<std::pair<uint64_t, uint64_t>> offsets;  // (abundance,offset)
                 offsets.reserve(abundance_map.size());
-                for (auto const& p : abundance_map) { offsets.emplace_back(p.first, p.second); }
+                for (auto const& p : abundance_map) {
+                    offsets.emplace_back(p.first, p.second.begin);
+                }
                 assert(offsets.size() > 0);
                 std::sort(offsets.begin(), offsets.end(),
                           [](auto const& x, auto const& y) { return x.first < y.first; });
                 uint64_t offset = 0;
                 for (auto const& p : offsets) {
                     uint64_t ab = p.first;
-                    abundance_map[ab] = offset;
+                    auto& r = abundance_map[ab];
+                    r.end = r.begin;
+                    r.begin = offset;
+                    r.position = 0;
                     offset += p.second;
                 }
             }
@@ -81,7 +94,7 @@ struct cover {
                 uint64_t i = 0;
                 for (; i != num_vertices; ++i) {
                     uint64_t id = vertices[i].id;
-                    if (colors[id] == color_t::white) break;
+                    if (colors[id] == color::white) break;
                 }
 
                 if (i == num_vertices) break;  // all visited
@@ -89,32 +102,35 @@ struct cover {
                 /* create a new walk */
                 while (true) {
                     uint64_t id = vertices[i].id;
-                    assert(colors[id] != color_t::black);
+                    assert(colors[id] != color::black);
 
-                    // std::cout << "visiting vertex " << id << ":[" << vertices[i].front << ","
-                    //           << vertices[i].back << "]" << std::endl;
-                    // std::cout << "at offset " << i << std::endl;
-
-                    colors[id] = color_t::gray;
+                    colors[id] = color::gray;
                     uint64_t front = vertices[i].front;
                     uint64_t back = vertices[i].back;
                     if (walk.size() > 0) {
                         assert(walk.back().id != id);
-                        colors[walk.back().id] = color_t::black;
-                        colors[id] = color_t::black;
+                        colors[walk.back().id] = color::black;
+                        colors[id] = color::black;
                     }
                     walk.emplace_back(id, front, back);
-                    // std::cout << "added " << id << " to walk-" << walks.size() << std::endl;
 
-                    uint64_t offset = abundance_map[back];
+                    uint64_t offset = 0;
+                    bool no_match_found = false;
+
+                    auto it = abundance_map.find(back);
+
+                    /* abundance back is not found, so no match is possible */
+                    if (it == abundance_map.cend()) {
+                        no_match_found = true;
+                        break;
+                    }
+                    offset = (*it).second.begin;
 
                     /* search for a match */
-                    bool no_match_found = false;
                     while (true) {
+                        // std::cout << "offset " << offset << "/" << num_vertices << std::endl;
                         if (offset == num_vertices) break;
                         auto vertex = vertices[offset];
-                        // std::cout << "  vertex " << vertex.id << ":[" << vertex.front << ","
-                        //           << vertex.back << "]" << std::endl;
 
                         /* skip */
                         if (vertex.id == id) {
@@ -124,21 +140,18 @@ struct cover {
 
                         /* checked all candidate matches */
                         if (vertex.front != back) {
-                            // std::cout << "  MISfound match " << vertex.id << ":[" << vertex.front
-                            //           << "," << vertex.back << "]" << std::endl;
-                            // std::cout << "  at offset " << offset << std::endl;
-                            // std::cout << "NO MATCHES FOUND" << std::endl;
                             no_match_found = true;
                             break;
                         }
 
                         /* match found */
-                        if (colors[vertex.id] != color_t::black) {
-                            // std::cout << "  found match " << vertex.id << ":[" << vertex.front
-                            //           << "," << vertex.back << "]" << std::endl;
-                            // std::cout << "  at offset " << offset << std::endl;
+                        if (colors[vertex.id] != color::black) {
+                            uint64_t& position = (*it).second.position;
+                            assert((*it).second.begin + position <= offset);
+                            position += 1;
                             break;
                         }
+
                         offset += 1;
                     }
 
@@ -152,14 +165,13 @@ struct cover {
                 assert(!walk.empty());
 
                 if (walk.size() == 1) {
-                    assert(colors[walk.front().id] == color_t::gray);  // visited but not merged
+                    assert(colors[walk.front().id] == color::gray);  // visited but not merged
                     continue;
                 }
 
                 /* invariant: all vertices belonging to a walk are all black */
-                assert(std::all_of(walk.begin(), walk.end(), [&](vertex const& v) {
-                    return colors[v.id] == color_t::black;
-                }));
+                assert(std::all_of(walk.begin(), walk.end(),
+                                   [&](vertex const& v) { return colors[v.id] == color::black; }));
 
                 /* create a new vertex for next round */
                 tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front,
@@ -169,7 +181,7 @@ struct cover {
 
             /* add all gray vertices (singleton walks) */
             for (auto const& v : vertices) {
-                if (colors[v.id] == color_t::gray) {
+                if (colors[v.id] == color::gray) {
                     tmp_vertices.emplace_back(walks_in_round.size(), v.front, v.back);
                     walk_t walk;
                     walk.push_back(v);
@@ -182,21 +194,21 @@ struct cover {
 
             bool all_singletons = true;
             {
-                std::fill(colors.begin(), colors.end(), color_t::white);
+                std::fill(colors.begin(), colors.end(), color::white);
                 for (auto const& walk : walks_in_round) {
                     if (walk.size() > 1) all_singletons = false;
                     num_runs -= walk.size() - 1;
                     uint64_t prev_back = walk.front().front;
                     std::cout << "=>";
                     for (auto const& w : walk) {
-                        if (colors[w.id] == color_t::black) {
+                        if (colors[w.id] == color::black) {
                             std::cout << "ERROR: duplicate vertex." << std::endl;
                         }
                         if (w.front != prev_back) {
                             std::cout << "ERROR: path is broken." << std::endl;
                         }
                         prev_back = w.back;
-                        colors[w.id] = color_t::black;
+                        colors[w.id] = color::black;
                         std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
                     }
                     std::cout << std::endl;

From fc863c67d7ddd9f02fe0187e24221d1560b318aa Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 21 Mar 2022 17:49:15 +0100
Subject: [PATCH 08/40] compute runs with cover

---
 include/builder/cover.hpp | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index b36c2b1..09ff91a 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -30,11 +30,16 @@ struct cover {
     void compute(std::vector<vertex>& vertices,
                  uint64_t num_runs_abundances  // TODO: remove from here
     ) {
+        essentials::timer_type timer;
+
+        timer.start();
+
         /* (abundance, num_seqs_with_front_abundance=abundance) */
         std::unordered_map<uint64_t, range> abundance_map;
         std::vector<color> colors;
         std::vector<vertex> tmp_vertices;
         uint64_t num_runs = num_runs_abundances;
+        std::cout << "initial number of runs = " << num_runs << std::endl;
         num_sequences = vertices.size();
 
         while (true) {
@@ -194,12 +199,13 @@ struct cover {
 
             bool all_singletons = true;
             {
+                uint64_t num_mergings_in_round = 0;
                 std::fill(colors.begin(), colors.end(), color::white);
                 for (auto const& walk : walks_in_round) {
                     if (walk.size() > 1) all_singletons = false;
-                    num_runs -= walk.size() - 1;
+                    num_mergings_in_round += walk.size() - 1;
                     uint64_t prev_back = walk.front().front;
-                    std::cout << "=>";
+                    // std::cout << "=>";
                     for (auto const& w : walk) {
                         if (colors[w.id] == color::black) {
                             std::cout << "ERROR: duplicate vertex." << std::endl;
@@ -209,16 +215,18 @@ struct cover {
                         }
                         prev_back = w.back;
                         colors[w.id] = color::black;
-                        std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
+                        // std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
                     }
-                    std::cout << std::endl;
+                    // std::cout << std::endl;
                 }
+                num_runs -= num_mergings_in_round;
+                std::cout << "num_mergings_in_round = " << num_mergings_in_round << std::endl;
                 std::cout << "num_runs " << num_runs << std::endl;
 
-                std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
-                for (auto const& v : tmp_vertices) {
-                    std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";
-                }
+                // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
+                // for (auto const& v : tmp_vertices) {
+                //     std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";
+                // }
             }
 
             if (all_singletons) {
@@ -230,6 +238,10 @@ struct cover {
             vertices.swap(tmp_vertices);
         }
 
+        timer.stop();
+
+        std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl;
+
         // TODO: form final walks and check that all vertex id are present
 
         // uint64_t num_runs = num_runs_abundances;

From 89234867c9d92016903884f7dbd61f7cbd72f871 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 21 Mar 2022 18:08:35 +0100
Subject: [PATCH 09/40] compute runs with cover

---
 include/builder/build.cpp |  3 +--
 include/builder/cover.hpp | 20 ++++++++++++--------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/include/builder/build.cpp b/include/builder/build.cpp
index f2b7866..19aa713 100644
--- a/include/builder/build.cpp
+++ b/include/builder/build.cpp
@@ -309,7 +309,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
             }
         }
 
-        std::cout << "calculating lower bound..." << std::endl;
         uint64_t num_runs_abundances = data.abundances_builder.num_abundance_intervals();
 
         uint64_t R = num_runs_abundances;
@@ -330,7 +329,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
                 // }
             }
         }
-        std::cout << "R = " << R << std::endl;
+        std::cout << "computed lower bound: R = " << R << std::endl;
 
         cover c;
         c.compute(vertices, num_runs_abundances);
diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 09ff91a..93923c5 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -46,7 +46,7 @@ struct cover {
             std::cout << "round " << rounds.size() << std::endl;
 
             uint64_t num_vertices = vertices.size();
-            std::cout << "num_vertices " << num_vertices << std::endl;
+            std::cout << "  num_vertices " << num_vertices << std::endl;
             tmp_vertices.clear();
             abundance_map.clear();
 
@@ -129,7 +129,7 @@ struct cover {
                         no_match_found = true;
                         break;
                     }
-                    offset = (*it).second.begin;
+                    offset = (*it).second.begin + (*it).second.position;  // skip to position
 
                     /* search for a match */
                     while (true) {
@@ -194,16 +194,18 @@ struct cover {
                 }
             }
 
-            std::cout << "num_walks_in_round " << rounds.size() << ": " << walks_in_round.size()
-                      << std::endl;
+            std::cout << "  num_walks = " << walks_in_round.size() << std::endl;
 
             bool all_singletons = true;
             {
-                uint64_t num_mergings_in_round = 0;
+#ifndef NDEBUG
                 std::fill(colors.begin(), colors.end(), color::white);
+#endif
+                uint64_t num_mergings_in_round = 0;
                 for (auto const& walk : walks_in_round) {
                     if (walk.size() > 1) all_singletons = false;
                     num_mergings_in_round += walk.size() - 1;
+#ifndef NDEBUG
                     uint64_t prev_back = walk.front().front;
                     // std::cout << "=>";
                     for (auto const& w : walk) {
@@ -218,10 +220,11 @@ struct cover {
                         // std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
                     }
                     // std::cout << std::endl;
+#endif
                 }
                 num_runs -= num_mergings_in_round;
-                std::cout << "num_mergings_in_round = " << num_mergings_in_round << std::endl;
-                std::cout << "num_runs " << num_runs << std::endl;
+                std::cout << "  num_mergings = " << num_mergings_in_round << std::endl;
+                std::cout << "  num_runs " << num_runs << std::endl;
 
                 // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
                 // for (auto const& v : tmp_vertices) {
@@ -230,7 +233,8 @@ struct cover {
             }
 
             if (all_singletons) {
-                std::cout << "all walks are singletons: no new mergings were found" << std::endl;
+                std::cout << "STOP: all walks are singletons --> no new mergings were found"
+                          << std::endl;
                 break;
             }
 

From a05d53b735aa8c88b8a9facbea6131c05a155c56 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 21 Mar 2022 21:59:44 +0100
Subject: [PATCH 10/40] optimize for the most frequent case

---
 include/builder/cover.hpp | 81 +++++++++++++++++++++++++++++++--------
 1 file changed, 65 insertions(+), 16 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 93923c5..2ec3d25 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -14,7 +14,8 @@ struct cover {
     enum color {
         white = 0,  // unvisited
         gray = 1,   // visited by not merged
-        black = 2   // visited and merged
+        black = 2,  // visited and merged
+        invalid = -1
     };
 
     struct range {
@@ -42,17 +43,64 @@ struct cover {
         std::cout << "initial number of runs = " << num_runs << std::endl;
         num_sequences = vertices.size();
 
+        walk_t walk;
+        walks_t walks_in_round;
+
+        {
+            /*
+                optimize for the most frequent case:
+                push vertices of the form v:[mfa,mfa] to the bottom, and remove them
+                forming a single (usually, long) walk.
+                Warning: here we are assuming the mfa is also the smallest abundance.
+            */
+
+            std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
+                if (x.front != y.front) return x.front > y.front;
+                if (x.back != y.back) return x.back > y.back;
+                return x.id > y.id;
+            });
+
+            uint64_t count = 0;
+            uint64_t total = vertices.size();
+            while (!vertices.empty()) {
+                auto v = vertices.back();
+                if (v.front == constants::most_frequent_abundance and
+                    v.back == constants::most_frequent_abundance) {
+                    walk.push_back(v);
+                    vertices.pop_back();
+                    count += 1;
+                } else {
+                    break;
+                }
+            }
+            std::cout << "num vertices of the form v:[" << constants::most_frequent_abundance << ","
+                      << constants::most_frequent_abundance << "] = " << count << "/" << total
+                      << "(" << (count * 100.0) / total << "%)" << std::endl;
+
+            /* create a new vertex for next round */
+            tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front, walk.back().back);
+            walks_in_round.push_back(walk);
+
+            walk.clear();
+        }
+
         while (true) {
             std::cout << "round " << rounds.size() << std::endl;
 
             uint64_t num_vertices = vertices.size();
             std::cout << "  num_vertices " << num_vertices << std::endl;
-            tmp_vertices.clear();
-            abundance_map.clear();
 
             /* all unvisited */
-            colors.resize(num_vertices);
-            std::fill(colors.begin(), colors.end(), color::white);
+            if (rounds.size() == 0) {
+                /* remember: we removed some vertices but the id-space still spans
+                   [0..num_sequences-1] */
+                colors.resize(num_sequences);
+                std::fill(colors.begin(), colors.end(), color::invalid);
+                for (auto const& v : vertices) colors[v.id] = color::white;
+            } else {
+                colors.resize(num_vertices);
+                std::fill(colors.begin(), colors.end(), color::white);
+            }
 
             std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
                 if (x.front != y.front) return x.front < y.front;
@@ -89,9 +137,6 @@ struct cover {
                 }
             }
 
-            walk_t walk;
-            walks_t walks_in_round;
-
             while (true) {
                 walk.clear();
 
@@ -106,18 +151,18 @@ struct cover {
 
                 /* create a new walk */
                 while (true) {
-                    uint64_t id = vertices[i].id;
+                    auto vertex = vertices[i];
+                    uint64_t id = vertex.id;
                     assert(colors[id] != color::black);
 
                     colors[id] = color::gray;
-                    uint64_t front = vertices[i].front;
-                    uint64_t back = vertices[i].back;
+                    uint64_t back = vertex.back;
                     if (walk.size() > 0) {
                         assert(walk.back().id != id);
                         colors[walk.back().id] = color::black;
                         colors[id] = color::black;
                     }
-                    walk.emplace_back(id, front, back);
+                    walk.push_back(vertex);
 
                     uint64_t offset = 0;
                     bool no_match_found = false;
@@ -135,22 +180,22 @@ struct cover {
                     while (true) {
                         // std::cout << "offset " << offset << "/" << num_vertices << std::endl;
                         if (offset == num_vertices) break;
-                        auto vertex = vertices[offset];
+                        auto candidate = vertices[offset];
 
                         /* skip */
-                        if (vertex.id == id) {
+                        if (candidate.id == id) {
                             offset += 1;
                             continue;
                         }
 
                         /* checked all candidate matches */
-                        if (vertex.front != back) {
+                        if (candidate.front != back) {
                             no_match_found = true;
                             break;
                         }
 
                         /* match found */
-                        if (colors[vertex.id] != color::black) {
+                        if (colors[candidate.id] != color::black) {
                             uint64_t& position = (*it).second.position;
                             assert((*it).second.begin + position <= offset);
                             position += 1;
@@ -239,7 +284,11 @@ struct cover {
             }
 
             rounds.push_back(walks_in_round);
+
             vertices.swap(tmp_vertices);
+            tmp_vertices.clear();
+            walks_in_round.clear();
+            abundance_map.clear();
         }
 
         timer.stop();

From db92ebefa9c07c2bca5ea09c88b67c2e4734c2e2 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Tue, 22 Mar 2022 10:42:47 +0100
Subject: [PATCH 11/40] simplification

---
 include/builder/cover.hpp | 36 ++++++++++--------------------------
 1 file changed, 10 insertions(+), 26 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 2ec3d25..ba010c9 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -108,34 +108,19 @@ struct cover {
                 return x.id < y.id;
             });
 
+            uint64_t prev_front = vertices.front().front;
+            uint64_t prev_offset = 0;
+            uint64_t offset = 0;
             for (auto const& x : vertices) {
-                auto it = abundance_map.find(x.front);
-                if (it != abundance_map.cend()) {  // found
-                    (*it).second.begin += 1;
-                } else {
-                    abundance_map[x.front] = {1, 0, 0};
-                }
-            }
-
-            {
-                std::vector<std::pair<uint64_t, uint64_t>> offsets;  // (abundance,offset)
-                offsets.reserve(abundance_map.size());
-                for (auto const& p : abundance_map) {
-                    offsets.emplace_back(p.first, p.second.begin);
-                }
-                assert(offsets.size() > 0);
-                std::sort(offsets.begin(), offsets.end(),
-                          [](auto const& x, auto const& y) { return x.first < y.first; });
-                uint64_t offset = 0;
-                for (auto const& p : offsets) {
-                    uint64_t ab = p.first;
-                    auto& r = abundance_map[ab];
-                    r.end = r.begin;
-                    r.begin = offset;
-                    r.position = 0;
-                    offset += p.second;
+                if (x.front != prev_front) {
+                    abundance_map[prev_front] = {prev_offset, offset, 0};
+                    prev_offset = offset;
                 }
+                offset += 1;
+                prev_front = x.front;
             }
+            abundance_map[prev_front] = {prev_offset, offset, 0};
+            assert(offset == vertices.size());
 
             while (true) {
                 walk.clear();
@@ -178,7 +163,6 @@ struct cover {
 
                     /* search for a match */
                     while (true) {
-                        // std::cout << "offset " << offset << "/" << num_vertices << std::endl;
                         if (offset == num_vertices) break;
                         auto candidate = vertices[offset];
 

From c21dc1c4f9b0084dca43f0a7878adb038ee23f49 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Tue, 22 Mar 2022 11:48:36 +0100
Subject: [PATCH 12/40] re-construction of cover and saving to file

---
 include/builder/build.cpp |  2 ++
 include/builder/cover.hpp | 63 +++++++++++++++++++++++----------------
 2 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/include/builder/build.cpp b/include/builder/build.cpp
index 19aa713..9bf8ea4 100644
--- a/include/builder/build.cpp
+++ b/include/builder/build.cpp
@@ -333,6 +333,8 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
 
         cover c;
         c.compute(vertices, num_runs_abundances);
+        std::string output_cover_filename("test_cover.txt");
+        c.save(output_cover_filename);
     }
 }
 
diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index ba010c9..c353ff0 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -28,6 +28,8 @@ struct cover {
 
     cover() : num_sequences(0) {}
 
+    uint64_t num_vertices() const { return num_sequences; }
+
     void compute(std::vector<vertex>& vertices,
                  uint64_t num_runs_abundances  // TODO: remove from here
     ) {
@@ -236,7 +238,7 @@ struct cover {
                     num_mergings_in_round += walk.size() - 1;
 #ifndef NDEBUG
                     uint64_t prev_back = walk.front().front;
-                    // std::cout << "=>";
+                    std::cout << "=>";
                     for (auto const& w : walk) {
                         if (colors[w.id] == color::black) {
                             std::cout << "ERROR: duplicate vertex." << std::endl;
@@ -246,9 +248,9 @@ struct cover {
                         }
                         prev_back = w.back;
                         colors[w.id] = color::black;
-                        // std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
+                        std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
                     }
-                    // std::cout << std::endl;
+                    std::cout << std::endl;
 #endif
                 }
                 num_runs -= num_mergings_in_round;
@@ -261,46 +263,55 @@ struct cover {
                 // }
             }
 
-            if (all_singletons) {
-                std::cout << "STOP: all walks are singletons --> no new mergings were found"
-                          << std::endl;
-                break;
-            }
-
             rounds.push_back(walks_in_round);
 
             vertices.swap(tmp_vertices);
             tmp_vertices.clear();
             walks_in_round.clear();
             abundance_map.clear();
+
+            if (all_singletons) {
+                std::cout << "STOP: all walks are singletons --> no new mergings were found"
+                          << std::endl;
+                break;
+            }
         }
 
         timer.stop();
 
         std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl;
+    }
 
-        // TODO: form final walks and check that all vertex id are present
-
-        // uint64_t num_runs = num_runs_abundances;
-        // std::cout << "computed " << walks.size() << " walks" << std::endl;
-        // std::fill(colors.begin(), colors.end(), false);
-        // for (auto const& walk : walks) {
-        //     num_runs -= walk.size() - 1;
-        //     uint64_t prev_back = walk.front().front;
-        //     for (auto const& w : walk) {
-        //         if (colors[w.id] == true) { std::cout << "ERROR: duplicate vertex." <<
-        //         std::endl; } if (w.front != prev_back) { std::cout << "ERROR: path is broken." <<
-        //         std::endl; } prev_back = w.back; colors[w.id] = true; std::cout << w.id << ":["
-        //         << w.front << "," << w.back << "] ";
-        //     }
-        //     std::cout << std::endl;
-        // }
-        // std::cout << "num_runs " << num_runs << std::endl;
+    void save(std::string const& filename) {
+        std::ofstream out(filename.c_str());
+        int r = rounds.size() - 1;
+        const auto& walks = rounds[r];
+        for (auto const& walk : walks) {
+            for (auto const& vertex : walk) visit(vertex.id, r, out);
+        }
+        out.close();
     }
 
 private:
     uint64_t num_sequences;
     std::vector<walks_t> rounds;
+
+    /* visit walk of index w in round of index r */
+    void visit(int w, int r, std::ofstream& out) const {
+        if (r > 0) {
+            assert(size_t(w) < rounds[r].size());
+            auto const& walk = rounds[r][w];
+            for (auto const& vertex : walk) { visit(vertex.id, r - 1, out); }
+        } else {  // print
+            assert(size_t(w) < rounds[0].size());
+            auto const& walk = rounds[0][w];
+            for (auto const& vertex : walk) {
+                // out << vertex.id << ":[" << vertex.front << "," << vertex.back << "] ";
+                out << vertex.id << '\n';
+            }
+            // out << '\n';
+        }
+    }
 };
 
 }  // namespace sshash
\ No newline at end of file

From 9773ce4a9a96c4b55275f57bb3a48186ff646a3e Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Tue, 22 Mar 2022 14:57:58 +0100
Subject: [PATCH 13/40] now all included into a new executable 'permute'

---
 CMakeLists.txt                                |   3 +
 include/builder/build.cpp                     |  67 +----
 include/builder/cover.hpp                     |  21 +-
 .../{build_util_types.hpp => util.hpp}        |   7 +
 src/permute.cpp                               | 255 ++++++++++++++++++
 5 files changed, 281 insertions(+), 72 deletions(-)
 rename include/builder/{build_util_types.hpp => util.hpp} (96%)
 create mode 100644 src/permute.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c54ca24..820c2ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,3 +42,6 @@ target_link_libraries(check z)
 
 add_executable(query src/query.cpp)
 target_link_libraries(query z)
+
+add_executable(permute src/permute.cpp)
+target_link_libraries(permute z)
\ No newline at end of file
diff --git a/include/builder/build.cpp b/include/builder/build.cpp
index 9bf8ea4..af4a359 100644
--- a/include/builder/build.cpp
+++ b/include/builder/build.cpp
@@ -6,22 +6,13 @@
 #include "../info.cpp"
 #include "../../external/pthash/include/pthash.hpp"
 #include "../../external/pthash/external/essentials/include/essentials.hpp"
-
-#include "build_util_types.hpp"
-#include "cover.hpp"
+#include "util.hpp"
 
 #include <numeric>  // for std::partial_sum
 #include <vector>
 
 namespace sshash {
 
-void expect(char got, char expected) {
-    if (got != expected) {
-        std::cout << "got '" << got << "' but expected '" << expected << "'" << std::endl;
-        throw parse_runtime_error();
-    }
-}
-
 struct parse_data {
     parse_data() : num_kmers(0) {}
     uint64_t num_kmers;
@@ -114,8 +105,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         }
     };
 
-    std::vector<vertex> vertices;
-
     auto parse_header = [&]() {
         if (sequence.empty()) return;
 
@@ -159,17 +148,11 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         bool kmers_have_all_mfa = true;
         bool kmers_have_different_abundances = false;
 
-        uint64_t front = constants::invalid;
-        uint64_t back = constants::invalid;
-
         for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid;
              j != seq_len - k + 1; ++j, ++num_kmers) {
             uint64_t ab = std::strtoull(sequence.data() + i, &end, 10);
             i = sequence.find_first_of(' ', i) + 1;
 
-            if (j == 0) { front = ab; }
-            if (j == seq_len - k) { back = ab; }
-
             if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false;
             if (j > 0) {
                 if (ab != prev_ab) { kmers_have_different_abundances = true; }
@@ -206,9 +189,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
 
         num_sequences_diff_abs += kmers_have_different_abundances;
         num_sequences_all_mfa += kmers_have_all_mfa;
-
-        vertices.emplace_back(num_sequences, front, back);
-        // std::cerr << "seq:" << num_sequences << "[" << front << "," << back << "]\n";
     };
 
     while (!is.eof()) {
@@ -290,51 +270,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
                   << "%)" << std::endl;
         data.abundances_builder.push_abundance_interval(ab_value, ab_length);
         data.abundances_builder.finalize(data.num_kmers);
-
-        std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
-            if (x.front != y.front) return x.front < y.front;
-            if (x.back != y.back) return x.back < y.back;
-            return x.id < y.id;
-        });
-
-        /* (abundance, num_seqs_with_front_abundance=abundance) */
-        std::unordered_map<uint64_t, uint64_t> abundance_map;
-        for (auto const& x : vertices) {
-            // std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back << "]\n";
-            auto it = abundance_map.find(x.front);
-            if (it != abundance_map.cend()) {  // found
-                (*it).second += 1;
-            } else {
-                abundance_map[x.front] = 1;
-            }
-        }
-
-        uint64_t num_runs_abundances = data.abundances_builder.num_abundance_intervals();
-
-        uint64_t R = num_runs_abundances;
-        for (auto const& x : vertices) {
-            uint64_t back = x.back;
-            auto it = abundance_map.find(back);
-            if (it != abundance_map.cend()) {  // found
-                if ((*it).second > 0) {        // if it is 0, we cannot find a match
-                    (*it).second -= 1;
-                    // std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back
-                    //           << "] got paired";
-                    // std::cerr << " (remaining " << (*it).second << ")\n";
-                    R -= 1;
-                }
-                // else {
-                //     std::cerr << "seq:" << x.id << "[" << x.front << "," << x.back
-                //               << "] NOT paired!\n";
-                // }
-            }
-        }
-        std::cout << "computed lower bound: R = " << R << std::endl;
-
-        cover c;
-        c.compute(vertices, num_runs_abundances);
-        std::string output_cover_filename("test_cover.txt");
-        c.save(output_cover_filename);
     }
 }
 
diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index c353ff0..f4618db 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -1,8 +1,15 @@
 #pragma once
 
+#include <unordered_map>
+#include <vector>
+
+#include "../util.hpp"
+
 namespace sshash {
 
 struct vertex {
+    // We assume there are less than 2^32 sequences and that
+    // the largest abundance fits into a 32-bit uint.
     vertex(uint32_t s, uint32_t f, uint32_t b) : id(s), front(f), back(b) {}
     uint32_t id, front, back;
 };
@@ -47,6 +54,8 @@ struct cover {
 
         walk_t walk;
         walks_t walks_in_round;
+        walk.reserve(vertices.size());            // at most
+        walks_in_round.reserve(vertices.size());  // at most
 
         {
             /*
@@ -113,13 +122,13 @@ struct cover {
             uint64_t prev_front = vertices.front().front;
             uint64_t prev_offset = 0;
             uint64_t offset = 0;
-            for (auto const& x : vertices) {
-                if (x.front != prev_front) {
+            for (auto const& vertex : vertices) {
+                if (vertex.front != prev_front) {
                     abundance_map[prev_front] = {prev_offset, offset, 0};
                     prev_offset = offset;
                 }
                 offset += 1;
-                prev_front = x.front;
+                prev_front = vertex.front;
             }
             abundance_map[prev_front] = {prev_offset, offset, 0};
             assert(offset == vertices.size());
@@ -238,7 +247,7 @@ struct cover {
                     num_mergings_in_round += walk.size() - 1;
 #ifndef NDEBUG
                     uint64_t prev_back = walk.front().front;
-                    std::cout << "=>";
+                    // std::cout << "=>";
                     for (auto const& w : walk) {
                         if (colors[w.id] == color::black) {
                             std::cout << "ERROR: duplicate vertex." << std::endl;
@@ -248,9 +257,9 @@ struct cover {
                         }
                         prev_back = w.back;
                         colors[w.id] = color::black;
-                        std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
+                        // std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
                     }
-                    std::cout << std::endl;
+                    // std::cout << std::endl;
 #endif
                 }
                 num_runs -= num_mergings_in_round;
diff --git a/include/builder/build_util_types.hpp b/include/builder/util.hpp
similarity index 96%
rename from include/builder/build_util_types.hpp
rename to include/builder/util.hpp
index 41151c9..725c56e 100644
--- a/include/builder/build_util_types.hpp
+++ b/include/builder/util.hpp
@@ -12,6 +12,13 @@ struct parse_runtime_error : public std::runtime_error {
         : std::runtime_error("did you provide an input file with abundance counts?") {}
 };
 
+void expect(char got, char expected) {
+    if (got != expected) {
+        std::cout << "got '" << got << "' but expected '" << expected << "'" << std::endl;
+        throw parse_runtime_error();
+    }
+}
+
 struct compact_string_pool {
     compact_string_pool() {}
 
diff --git a/src/permute.cpp b/src/permute.cpp
new file mode 100644
index 0000000..ac843d0
--- /dev/null
+++ b/src/permute.cpp
@@ -0,0 +1,255 @@
+#include <iostream>
+
+#include "../include/gz/zip_stream.cpp"
+#include "../external/pthash/external/essentials/include/essentials.hpp"
+#include "../external/pthash/external/cmd_line_parser/include/parser.hpp"
+#include "../include/builder/cover.hpp"
+#include "../include/builder/util.hpp"
+#include "../include/abundances.hpp"
+
+using namespace sshash;
+
+struct permute_data {
+    permute_data() : num_runs_abundances(0) {}
+    uint64_t num_runs_abundances;
+    std::vector<vertex> vertices;
+};
+
+void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) {
+    std::string sequence;
+    uint64_t k = build_config.k;
+    uint64_t num_sequences = 0;
+    uint64_t num_bases = 0;
+    uint64_t num_kmers = 0;
+
+    uint64_t seq_len = 0;
+    uint64_t sum_of_abundances = 0;
+    abundances::builder abundances_builder;
+    abundances_builder.init(constants::most_frequent_abundance);
+
+    /* intervals of abundances */
+    uint64_t ab_value = constants::invalid;
+    uint64_t ab_length = 1;
+
+    uint64_t num_sequences_diff_abs = 0;  // num sequences whose kmers have different abundances
+    uint64_t num_sequences_all_mfa = 0;   // num sequences whose kmers have same abundance == mfa
+
+    auto process_abundance = [&](uint64_t ab) {
+        if (ab == ab_value) {
+            ab_length += 1;
+        } else {
+            if (ab_value != constants::invalid) {
+                abundances_builder.push_abundance_interval(ab_value, ab_length);
+            }
+            ab_value = ab;
+            ab_length = 1;
+        }
+    };
+
+    auto parse_header = [&]() {
+        if (sequence.empty()) return;
+
+        /*
+            Heder format:
+            >[id] LN:i:[seq_len] ab:Z:[ab_seq]
+            where [ab_seq] is a space-separated sequence of integer counters (the abundances),
+            whose length is equal to [seq_len]-k+1
+        */
+
+        // example header: '>12 LN:i:41 ab:Z:2 2 2 2 2 2 2 2 2 2 2'
+
+        expect(sequence[0], '>');
+        uint64_t i = 0;
+        i = sequence.find_first_of(' ', i);
+        if (i == std::string::npos) throw parse_runtime_error();
+
+        i += 1;
+        expect(sequence[i + 0], 'L');
+        expect(sequence[i + 1], 'N');
+        expect(sequence[i + 2], ':');
+        expect(sequence[i + 3], 'i');
+        expect(sequence[i + 4], ':');
+        i += 5;
+        uint64_t j = sequence.find_first_of(' ', i);
+        if (j == std::string::npos) throw parse_runtime_error();
+
+        char* end;
+        seq_len = std::strtoull(sequence.data() + i, &end, 10);
+        i = j + 1;
+        expect(sequence[i + 0], 'a');
+        expect(sequence[i + 1], 'b');
+        expect(sequence[i + 2], ':');
+        expect(sequence[i + 3], 'Z');
+        expect(sequence[i + 4], ':');
+        i += 5;
+
+        bool kmers_have_all_mfa = true;
+        bool kmers_have_different_abundances = false;
+        uint64_t front = constants::invalid;
+        uint64_t back = constants::invalid;
+
+        for (uint64_t j = 0, kmer_id = num_kmers, prev_ab = constants::invalid;
+             j != seq_len - k + 1; ++j, ++kmer_id) {
+            uint64_t ab = std::strtoull(sequence.data() + i, &end, 10);
+            i = sequence.find_first_of(' ', i) + 1;
+
+            if (j == 0) front = ab;
+            if (j == seq_len - k) back = ab;
+
+            if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false;
+            if (j > 0) {
+                if (ab != prev_ab) kmers_have_different_abundances = true;
+            }
+            prev_ab = ab;
+
+            abundances_builder.eat(ab);
+            sum_of_abundances += ab;
+
+            process_abundance(ab);
+        }
+
+        num_sequences_diff_abs += kmers_have_different_abundances;
+        num_sequences_all_mfa += kmers_have_all_mfa;
+
+        data.vertices.emplace_back(num_sequences, front, back);
+    };
+
+    while (!is.eof()) {
+        std::getline(is, sequence);  // header sequence
+        parse_header();
+
+        std::getline(is, sequence);  // DNA sequence
+        if (sequence.size() < k) continue;
+
+        if (++num_sequences % 100000 == 0) {
+            std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, "
+                      << num_kmers << " kmers" << std::endl;
+        }
+
+        num_bases += sequence.size();
+        num_kmers += sequence.size() - k + 1;
+
+        if (seq_len != sequence.size()) {
+            std::cout << "ERROR: expected a sequence of length " << seq_len
+                      << " but got one of length " << sequence.size() << std::endl;
+            throw std::runtime_error("file is malformed");
+        }
+    }
+
+    std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " << num_kmers
+              << " kmers" << std::endl;
+    std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
+
+    std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs
+              << "/" << num_sequences << " (" << (num_sequences_diff_abs * 100.0) / num_sequences
+              << "%)" << std::endl;
+    std::cout << "num_sequences whose kmers all have the same abundance != mfa: "
+              << (num_sequences - num_sequences_diff_abs) << "/" << num_sequences << " ("
+              << ((num_sequences - num_sequences_diff_abs) * 100.0) / num_sequences << "%)"
+              << std::endl;
+    std::cout << "num_sequences whose kmers all have the same abundance == mfa: "
+              << num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs) << " ("
+              << (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs) << "%)"
+              << std::endl;
+
+    abundances_builder.push_abundance_interval(ab_value, ab_length);
+    abundances_builder.finalize(num_kmers);
+
+    data.num_runs_abundances = abundances_builder.num_abundance_intervals();
+}
+
+permute_data parse_file(std::string const& filename, build_configuration const& build_config) {
+    std::ifstream is(filename.c_str());
+    if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'");
+    std::cout << "reading file '" << filename << "'..." << std::endl;
+    permute_data data;
+    if (util::ends_with(filename, ".gz")) {
+        zip_istream zis(is);
+        parse_file(zis, data, build_config);
+    } else {
+        parse_file(is, data, build_config);
+    }
+    is.close();
+    return data;
+}
+
+int main(int argc, char** argv) {
+    cmd_line_parser::parser parser(argc, argv);
+
+    /* mandatory arguments */
+    parser.add("input_filename",
+               "Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:\n"
+               "\t- without duplicate nor invalid kmers\n"
+               "\t- one DNA sequence per line\n"
+               "\t- with also kmers' abundances.\n"
+               "\tFor example, it could be the de Bruijn graph topology output by BCALM.");
+
+    parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ").");
+
+    /* optional arguments */
+    parser.add("output_filename", "Output file where the permuted collection will be written.",
+               "-o", false);
+    parser.add("verbose", "Verbose output during construction.", "--verbose", true);
+
+    if (!parser.parse()) return 1;
+
+    auto input_filename = parser.get<std::string>("input_filename");
+    auto k = parser.get<uint64_t>("k");
+
+    build_configuration build_config;
+    build_config.k = k;
+    build_config.verbose = parser.get<bool>("verbose");
+
+    std::string output_filename = input_filename + ".permuted";
+    if (parser.parsed("output_filename")) {
+        output_filename = parser.get<std::string>("output_filename");
+    }
+
+    auto data = parse_file(input_filename, build_config);
+    uint64_t num_runs_abundances = data.num_runs_abundances;
+
+    {
+        std::sort(data.vertices.begin(), data.vertices.end(), [](auto const& x, auto const& y) {
+            /* sort on front */
+            if (x.front != y.front) return x.front < y.front;
+            if (x.back != y.back) return x.back < y.back;
+            return x.id < y.id;
+        });
+
+        /* (abundance, num_seqs_with_front=abundance) */
+        // We assume there are less than 2^32 distinct abundances and that
+        // the largest abundance fits into a 32-bit uint.
+        std::unordered_map<uint32_t, uint32_t> abundance_map;
+
+        uint64_t prev_front = data.vertices.front().front;
+        uint64_t count = 0;
+        for (auto const& vertex : data.vertices) {
+            if (vertex.front != prev_front) {
+                abundance_map[prev_front] = count;
+                count = 0;
+            }
+            count += 1;
+            prev_front = vertex.front;
+        }
+        abundance_map[prev_front] = count;
+
+        uint64_t R = num_runs_abundances;
+        for (auto const& vertex : data.vertices) {
+            uint64_t back = vertex.back;
+            auto it = abundance_map.find(back);
+            if (it != abundance_map.cend()) {  // found
+                if ((*it).second > 0) {        // if it is 0, we cannot find a match
+                    (*it).second -= 1;
+                    R -= 1;
+                }
+            }
+        }
+        std::cout << "computed lower bound: R = " << R << std::endl;
+    }
+
+    cover c;
+    c.compute(data.vertices, num_runs_abundances);
+    c.save(output_filename);
+
+    return 0;
+}

From bf6bb5d80869b6a312c02a5a66392c7fe8125f2d Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Tue, 22 Mar 2022 15:23:08 +0100
Subject: [PATCH 14/40] do not use abundances::builder in src/permute

---
 src/permute.cpp | 59 ++++++++++++++++---------------------------------
 1 file changed, 19 insertions(+), 40 deletions(-)

diff --git a/src/permute.cpp b/src/permute.cpp
index ac843d0..b4a1d69 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -5,7 +5,6 @@
 #include "../external/pthash/external/cmd_line_parser/include/parser.hpp"
 #include "../include/builder/cover.hpp"
 #include "../include/builder/util.hpp"
-#include "../include/abundances.hpp"
 
 using namespace sshash;
 
@@ -21,30 +20,13 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
     uint64_t num_sequences = 0;
     uint64_t num_bases = 0;
     uint64_t num_kmers = 0;
-
     uint64_t seq_len = 0;
-    uint64_t sum_of_abundances = 0;
-    abundances::builder abundances_builder;
-    abundances_builder.init(constants::most_frequent_abundance);
-
-    /* intervals of abundances */
-    uint64_t ab_value = constants::invalid;
-    uint64_t ab_length = 1;
 
+    uint64_t prev_abundance = constants::invalid;
+    uint64_t sum_of_abundances = 0;
     uint64_t num_sequences_diff_abs = 0;  // num sequences whose kmers have different abundances
     uint64_t num_sequences_all_mfa = 0;   // num sequences whose kmers have same abundance == mfa
-
-    auto process_abundance = [&](uint64_t ab) {
-        if (ab == ab_value) {
-            ab_length += 1;
-        } else {
-            if (ab_value != constants::invalid) {
-                abundances_builder.push_abundance_interval(ab_value, ab_length);
-            }
-            ab_value = ab;
-            ab_length = 1;
-        }
-    };
+    data.num_runs_abundances = 0;
 
     auto parse_header = [&]() {
         if (sequence.empty()) return;
@@ -88,24 +70,25 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         uint64_t front = constants::invalid;
         uint64_t back = constants::invalid;
 
-        for (uint64_t j = 0, kmer_id = num_kmers, prev_ab = constants::invalid;
-             j != seq_len - k + 1; ++j, ++kmer_id) {
-            uint64_t ab = std::strtoull(sequence.data() + i, &end, 10);
+        for (uint64_t j = 0, kmer_id = num_kmers; j != seq_len - k + 1; ++j, ++kmer_id) {
+            uint64_t abundance = std::strtoull(sequence.data() + i, &end, 10);
+            sum_of_abundances += abundance;
             i = sequence.find_first_of(' ', i) + 1;
 
-            if (j == 0) front = ab;
-            if (j == seq_len - k) back = ab;
+            /* set front and back */
+            if (j == 0) front = abundance;
+            if (j == seq_len - k) back = abundance;
 
-            if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false;
-            if (j > 0) {
-                if (ab != prev_ab) kmers_have_different_abundances = true;
-            }
-            prev_ab = ab;
+            /* accumulate statistics */
+            if (abundance != constants::most_frequent_abundance) kmers_have_all_mfa = false;
+            if (j > 0 and abundance != prev_abundance) kmers_have_different_abundances = true;
 
-            abundances_builder.eat(ab);
-            sum_of_abundances += ab;
+            /* count the number of runs */
+            if (prev_abundance != constants::invalid and abundance != prev_abundance) {
+                data.num_runs_abundances += 1;
+            }
 
-            process_abundance(ab);
+            prev_abundance = abundance;
         }
 
         num_sequences_diff_abs += kmers_have_different_abundances;
@@ -136,10 +119,11 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         }
     }
 
+    data.num_runs_abundances += 1;
+
     std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " << num_kmers
               << " kmers" << std::endl;
     std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
-
     std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs
               << "/" << num_sequences << " (" << (num_sequences_diff_abs * 100.0) / num_sequences
               << "%)" << std::endl;
@@ -151,11 +135,6 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
               << num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs) << " ("
               << (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs) << "%)"
               << std::endl;
-
-    abundances_builder.push_abundance_interval(ab_value, ab_length);
-    abundances_builder.finalize(num_kmers);
-
-    data.num_runs_abundances = abundances_builder.num_abundance_intervals();
 }
 
 permute_data parse_file(std::string const& filename, build_configuration const& build_config) {

From ae23a269fb2546b9503b28a292b0fe09fd3abaf1 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Wed, 23 Mar 2022 18:34:30 +0100
Subject: [PATCH 15/40] minor fixes

---
 include/builder/cover.hpp | 41 +++++++++++++++++++++------------------
 src/permute.cpp           | 20 ++++++++++++-------
 2 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index f4618db..48e55ca 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -119,19 +119,21 @@ struct cover {
                 return x.id < y.id;
             });
 
-            uint64_t prev_front = vertices.front().front;
-            uint64_t prev_offset = 0;
-            uint64_t offset = 0;
-            for (auto const& vertex : vertices) {
-                if (vertex.front != prev_front) {
-                    abundance_map[prev_front] = {prev_offset, offset, 0};
-                    prev_offset = offset;
+            if (num_vertices > 0) {
+                uint64_t prev_front = vertices.front().front;
+                uint64_t prev_offset = 0;
+                uint64_t offset = 0;
+                for (auto const& vertex : vertices) {
+                    if (vertex.front != prev_front) {
+                        abundance_map[prev_front] = {prev_offset, offset, 0};
+                        prev_offset = offset;
+                    }
+                    offset += 1;
+                    prev_front = vertex.front;
                 }
-                offset += 1;
-                prev_front = vertex.front;
+                abundance_map[prev_front] = {prev_offset, offset, 0};
+                assert(offset == vertices.size());
             }
-            abundance_map[prev_front] = {prev_offset, offset, 0};
-            assert(offset == vertices.size());
 
             while (true) {
                 walk.clear();
@@ -247,7 +249,7 @@ struct cover {
                     num_mergings_in_round += walk.size() - 1;
 #ifndef NDEBUG
                     uint64_t prev_back = walk.front().front;
-                    // std::cout << "=>";
+                    std::cout << "=>";
                     for (auto const& w : walk) {
                         if (colors[w.id] == color::black) {
                             std::cout << "ERROR: duplicate vertex." << std::endl;
@@ -257,19 +259,19 @@ struct cover {
                         }
                         prev_back = w.back;
                         colors[w.id] = color::black;
-                        // std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
+                        std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
                     }
-                    // std::cout << std::endl;
+                    std::cout << std::endl;
 #endif
                 }
                 num_runs -= num_mergings_in_round;
                 std::cout << "  num_mergings = " << num_mergings_in_round << std::endl;
                 std::cout << "  num_runs " << num_runs << std::endl;
 
-                // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
-                // for (auto const& v : tmp_vertices) {
-                //     std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";
-                // }
+                std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
+                for (auto const& v : tmp_vertices) {
+                    std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";
+                }
             }
 
             rounds.push_back(walks_in_round);
@@ -279,7 +281,7 @@ struct cover {
             walks_in_round.clear();
             abundance_map.clear();
 
-            if (all_singletons) {
+            if (all_singletons and rounds.size() > 1) {
                 std::cout << "STOP: all walks are singletons --> no new mergings were found"
                           << std::endl;
                 break;
@@ -293,6 +295,7 @@ struct cover {
 
     void save(std::string const& filename) {
         std::ofstream out(filename.c_str());
+        assert(rounds.size() > 0);
         int r = rounds.size() - 1;
         const auto& walks = rounds[r];
         for (auto const& walk : walks) {
diff --git a/src/permute.cpp b/src/permute.cpp
index b4a1d69..3898ccc 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -22,7 +22,6 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
     uint64_t num_kmers = 0;
     uint64_t seq_len = 0;
 
-    uint64_t prev_abundance = constants::invalid;
     uint64_t sum_of_abundances = 0;
     uint64_t num_sequences_diff_abs = 0;  // num sequences whose kmers have different abundances
     uint64_t num_sequences_all_mfa = 0;   // num sequences whose kmers have same abundance == mfa
@@ -70,7 +69,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         uint64_t front = constants::invalid;
         uint64_t back = constants::invalid;
 
-        for (uint64_t j = 0, kmer_id = num_kmers; j != seq_len - k + 1; ++j, ++kmer_id) {
+        for (uint64_t j = 0, prev_abundance = constants::invalid; j != seq_len - k + 1; ++j) {
             uint64_t abundance = std::strtoull(sequence.data() + i, &end, 10);
             sum_of_abundances += abundance;
             i = sequence.find_first_of(' ', i) + 1;
@@ -84,9 +83,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
             if (j > 0 and abundance != prev_abundance) kmers_have_different_abundances = true;
 
             /* count the number of runs */
-            if (prev_abundance != constants::invalid and abundance != prev_abundance) {
-                data.num_runs_abundances += 1;
-            }
+            if (abundance != prev_abundance) data.num_runs_abundances += 1;
 
             prev_abundance = abundance;
         }
@@ -94,6 +91,10 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         num_sequences_diff_abs += kmers_have_different_abundances;
         num_sequences_all_mfa += kmers_have_all_mfa;
 
+        std::cout << "num_runs_abundances in seq " << num_sequences << " = "
+                  << data.num_runs_abundances << std::endl;
+        std::cout << "vertex " << num_sequences << ":[" << front << "," << back << "]" << std::endl;
+
         data.vertices.emplace_back(num_sequences, front, back);
     };
 
@@ -119,7 +120,8 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         }
     }
 
-    data.num_runs_abundances += 1;
+    assert(data.vertices.size() == num_sequences);
+    assert(data.num_runs_abundances >= num_sequences);
 
     std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " << num_kmers
               << " kmers" << std::endl;
@@ -188,6 +190,10 @@ int main(int argc, char** argv) {
     uint64_t num_runs_abundances = data.num_runs_abundances;
 
     {
+        std::cout
+            << "The trivial lower bound assumes we are able to concatenate all sequences : R' = "
+            << num_runs_abundances - data.vertices.size() + 1 << std::endl;
+
         std::sort(data.vertices.begin(), data.vertices.end(), [](auto const& x, auto const& y) {
             /* sort on front */
             if (x.front != y.front) return x.front < y.front;
@@ -223,7 +229,7 @@ int main(int argc, char** argv) {
                 }
             }
         }
-        std::cout << "computed lower bound: R = " << R << std::endl;
+        std::cout << "Computed lower bound: R = " << R << std::endl;
     }
 
     cover c;

From a47a74a5eb66b4395ec87eb16db85bbe36cac7ca Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Thu, 24 Mar 2022 10:51:01 +0100
Subject: [PATCH 16/40] minor fixes

---
 include/builder/cover.hpp | 58 ++++++++++++++++++++-----------------
 src/permute.cpp           | 60 +++++++++++++++++++++------------------
 2 files changed, 64 insertions(+), 54 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 48e55ca..f35dd08 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -33,13 +33,12 @@ struct cover {
         uint64_t position;
     };
 
-    cover() : num_sequences(0) {}
+    cover(uint64_t num_sequences, uint64_t num_runs_abundances)
+        : m_num_sequences(num_sequences), m_num_runs_abundances(num_runs_abundances) {}
 
-    uint64_t num_vertices() const { return num_sequences; }
+    void compute(std::vector<vertex>& vertices) {
+        assert(vertices.size() == m_num_sequences);
 
-    void compute(std::vector<vertex>& vertices,
-                 uint64_t num_runs_abundances  // TODO: remove from here
-    ) {
         essentials::timer_type timer;
 
         timer.start();
@@ -48,9 +47,7 @@ struct cover {
         std::unordered_map<uint64_t, range> abundance_map;
         std::vector<color> colors;
         std::vector<vertex> tmp_vertices;
-        uint64_t num_runs = num_runs_abundances;
-        std::cout << "initial number of runs = " << num_runs << std::endl;
-        num_sequences = vertices.size();
+        std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl;
 
         walk_t walk;
         walks_t walks_in_round;
@@ -71,28 +68,33 @@ struct cover {
                 return x.id > y.id;
             });
 
-            uint64_t count = 0;
-            uint64_t total = vertices.size();
+            uint64_t num_special_vertices = 0;
+            uint64_t num_vertices = vertices.size();
             while (!vertices.empty()) {
                 auto v = vertices.back();
                 if (v.front == constants::most_frequent_abundance and
                     v.back == constants::most_frequent_abundance) {
                     walk.push_back(v);
                     vertices.pop_back();
-                    count += 1;
+                    num_special_vertices += 1;
                 } else {
                     break;
                 }
             }
             std::cout << "num vertices of the form v:[" << constants::most_frequent_abundance << ","
-                      << constants::most_frequent_abundance << "] = " << count << "/" << total
-                      << "(" << (count * 100.0) / total << "%)" << std::endl;
+                      << constants::most_frequent_abundance << "] = " << num_special_vertices << "/"
+                      << num_vertices << "(" << (num_special_vertices * 100.0) / num_vertices
+                      << "%)" << std::endl;
 
-            /* create a new vertex for next round */
-            tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front, walk.back().back);
-            walks_in_round.push_back(walk);
+            if (num_special_vertices != 0) {
+                /* create a new vertex for next round */
+                assert(!walk.empty());
+                tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front,
+                                          walk.back().back);
+                walks_in_round.push_back(walk);
 
-            walk.clear();
+                walk.clear();
+            }
         }
 
         while (true) {
@@ -104,8 +106,8 @@ struct cover {
             /* all unvisited */
             if (rounds.size() == 0) {
                 /* remember: we removed some vertices but the id-space still spans
-                   [0..num_sequences-1] */
-                colors.resize(num_sequences);
+                   [0..m_num_sequences-1] */
+                colors.resize(m_num_sequences);
                 std::fill(colors.begin(), colors.end(), color::invalid);
                 for (auto const& v : vertices) colors[v.id] = color::white;
             } else {
@@ -264,14 +266,15 @@ struct cover {
                     std::cout << std::endl;
 #endif
                 }
-                num_runs -= num_mergings_in_round;
+                assert(m_num_runs_abundances > num_mergings_in_round);
+                m_num_runs_abundances -= num_mergings_in_round;
                 std::cout << "  num_mergings = " << num_mergings_in_round << std::endl;
-                std::cout << "  num_runs " << num_runs << std::endl;
+                std::cout << "  num_runs " << m_num_runs_abundances << std::endl;
 
-                std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
-                for (auto const& v : tmp_vertices) {
-                    std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";
-                }
+                // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
+                // for (auto const& v : tmp_vertices) {
+                //     std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";
+                // }
             }
 
             rounds.push_back(walks_in_round);
@@ -291,6 +294,8 @@ struct cover {
         timer.stop();
 
         std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl;
+
+        assert(m_num_runs_abundances >= 1);
     }
 
     void save(std::string const& filename) {
@@ -305,7 +310,8 @@ struct cover {
     }
 
 private:
-    uint64_t num_sequences;
+    uint64_t m_num_sequences;
+    uint64_t m_num_runs_abundances;
     std::vector<walks_t> rounds;
 
     /* visit walk of index w in round of index r */
diff --git a/src/permute.cpp b/src/permute.cpp
index 3898ccc..3f0f3ce 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -9,15 +9,15 @@
 using namespace sshash;
 
 struct permute_data {
-    permute_data() : num_runs_abundances(0) {}
+    permute_data() : num_runs_abundances(0), num_sequences(0) {}
     uint64_t num_runs_abundances;
+    uint64_t num_sequences;
     std::vector<vertex> vertices;
 };
 
 void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) {
     std::string sequence;
     uint64_t k = build_config.k;
-    uint64_t num_sequences = 0;
     uint64_t num_bases = 0;
     uint64_t num_kmers = 0;
     uint64_t seq_len = 0;
@@ -26,6 +26,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
     uint64_t num_sequences_diff_abs = 0;  // num sequences whose kmers have different abundances
     uint64_t num_sequences_all_mfa = 0;   // num sequences whose kmers have same abundance == mfa
     data.num_runs_abundances = 0;
+    data.num_sequences = 0;
 
     auto parse_header = [&]() {
         if (sequence.empty()) return;
@@ -91,11 +92,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         num_sequences_diff_abs += kmers_have_different_abundances;
         num_sequences_all_mfa += kmers_have_all_mfa;
 
-        std::cout << "num_runs_abundances in seq " << num_sequences << " = "
-                  << data.num_runs_abundances << std::endl;
-        std::cout << "vertex " << num_sequences << ":[" << front << "," << back << "]" << std::endl;
-
-        data.vertices.emplace_back(num_sequences, front, back);
+        data.vertices.emplace_back(data.num_sequences, front, back);
     };
 
     while (!is.eof()) {
@@ -105,8 +102,8 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         std::getline(is, sequence);  // DNA sequence
         if (sequence.size() < k) continue;
 
-        if (++num_sequences % 100000 == 0) {
-            std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, "
+        if (++data.num_sequences % 100000 == 0) {
+            std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, "
                       << num_kmers << " kmers" << std::endl;
         }
 
@@ -120,23 +117,24 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         }
     }
 
-    assert(data.vertices.size() == num_sequences);
-    assert(data.num_runs_abundances >= num_sequences);
+    assert(data.vertices.size() == data.num_sequences);
+    assert(data.num_runs_abundances >= data.num_sequences);
 
-    std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " << num_kmers
-              << " kmers" << std::endl;
+    std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, "
+              << num_kmers << " kmers" << std::endl;
     std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
     std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs
-              << "/" << num_sequences << " (" << (num_sequences_diff_abs * 100.0) / num_sequences
-              << "%)" << std::endl;
+              << "/" << data.num_sequences << " ("
+              << (num_sequences_diff_abs * 100.0) / data.num_sequences << "%)" << std::endl;
     std::cout << "num_sequences whose kmers all have the same abundance != mfa: "
-              << (num_sequences - num_sequences_diff_abs) << "/" << num_sequences << " ("
-              << ((num_sequences - num_sequences_diff_abs) * 100.0) / num_sequences << "%)"
-              << std::endl;
+              << (data.num_sequences - num_sequences_diff_abs) << "/" << data.num_sequences << " ("
+              << ((data.num_sequences - num_sequences_diff_abs) * 100.0) / data.num_sequences
+              << "%)" << std::endl;
     std::cout << "num_sequences whose kmers all have the same abundance == mfa: "
-              << num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs) << " ("
-              << (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs) << "%)"
-              << std::endl;
+              << num_sequences_all_mfa << "/" << (data.num_sequences - num_sequences_diff_abs)
+              << " ("
+              << (num_sequences_all_mfa * 100.0) / (data.num_sequences - num_sequences_diff_abs)
+              << "%)" << std::endl;
 }
 
 permute_data parse_file(std::string const& filename, build_configuration const& build_config) {
@@ -187,12 +185,12 @@ int main(int argc, char** argv) {
     }
 
     auto data = parse_file(input_filename, build_config);
-    uint64_t num_runs_abundances = data.num_runs_abundances;
 
     {
-        std::cout
-            << "The trivial lower bound assumes we are able to concatenate all sequences : R' = "
-            << num_runs_abundances - data.vertices.size() + 1 << std::endl;
+        uint64_t R_lower = data.num_runs_abundances - data.vertices.size() + 1;
+        std::cout << "The trivial lower bound (too optimistic) assumes we are able to concatenate "
+                     "all sequences : R_lower = "
+                  << R_lower << std::endl;
 
         std::sort(data.vertices.begin(), data.vertices.end(), [](auto const& x, auto const& y) {
             /* sort on front */
@@ -218,12 +216,17 @@ int main(int argc, char** argv) {
         }
         abundance_map[prev_front] = count;
 
-        uint64_t R = num_runs_abundances;
+        uint64_t R = data.num_runs_abundances;
         for (auto const& vertex : data.vertices) {
             uint64_t back = vertex.back;
             auto it = abundance_map.find(back);
             if (it != abundance_map.cend()) {  // found
                 if ((*it).second > 0) {        // if it is 0, we cannot find a match
+
+                    /* We clearly cannot create more mergings than num_sequences - 1,
+                       thus R cannot be lower than R_lower. */
+                    if (R == R_lower) break;
+
                     (*it).second -= 1;
                     R -= 1;
                 }
@@ -232,8 +235,9 @@ int main(int argc, char** argv) {
         std::cout << "Computed lower bound: R = " << R << std::endl;
     }
 
-    cover c;
-    c.compute(data.vertices, num_runs_abundances);
+    cover c(data.num_sequences, data.num_runs_abundances);
+    assert(data.vertices.size() == data.num_sequences);
+    c.compute(data.vertices);
     c.save(output_filename);
 
     return 0;

From d337a4dd6f4cd0aafcd7c317f20fbba592b61468 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Thu, 24 Mar 2022 12:06:36 +0100
Subject: [PATCH 17/40] faster

---
 include/builder/cover.hpp | 55 ++++++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index f35dd08..c0a16d9 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -29,7 +29,7 @@ struct cover {
         range() {}
         range(uint64_t b, uint64_t e, uint64_t p) : begin(b), end(e), position(p) {}
         uint64_t begin;
-        uint64_t end;
+        uint64_t end;  // TODO: currently unused
         uint64_t position;
     };
 
@@ -62,6 +62,9 @@ struct cover {
                 Warning: here we are assuming the mfa is also the smallest abundance.
             */
 
+            essentials::timer_type timer;
+            timer.start();
+
             std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
                 if (x.front != y.front) return x.front > y.front;
                 if (x.back != y.back) return x.back > y.back;
@@ -95,6 +98,9 @@ struct cover {
 
                 walk.clear();
             }
+
+            timer.stop();
+            std::cout << "  time: " << timer.elapsed() / 1000000 << " [sec]" << std::endl;
         }
 
         while (true) {
@@ -103,6 +109,9 @@ struct cover {
             uint64_t num_vertices = vertices.size();
             std::cout << "  num_vertices " << num_vertices << std::endl;
 
+            essentials::timer_type round_timer;
+            round_timer.start();
+
             /* all unvisited */
             if (rounds.size() == 0) {
                 /* remember: we removed some vertices but the id-space still spans
@@ -137,26 +146,37 @@ struct cover {
                 assert(offset == vertices.size());
             }
 
+            uint64_t i = 0;  // offset of unvisited vertex
+
             while (true) {
                 walk.clear();
 
-                /* take an unvisited vertex */
-                uint64_t i = 0;
+                /* 1. take an unvisited vertex */
+
+                // try to visit forward
                 for (; i != num_vertices; ++i) {
                     uint64_t id = vertices[i].id;
                     if (colors[id] == color::white) break;
                 }
 
+                // if a vertex is not found, try to visit backward
+                if (i == num_vertices) {
+                    i = 0;
+                    for (; i != num_vertices; ++i) {
+                        uint64_t id = vertices[i].id;
+                        if (colors[id] == color::white) break;
+                    }
+                }
+
                 if (i == num_vertices) break;  // all visited
 
-                /* create a new walk */
+                /* 2. create a new walk */
                 while (true) {
                     auto vertex = vertices[i];
                     uint64_t id = vertex.id;
                     assert(colors[id] != color::black);
 
                     colors[id] = color::gray;
-                    uint64_t back = vertex.back;
                     if (walk.size() > 0) {
                         assert(walk.back().id != id);
                         colors[walk.back().id] = color::black;
@@ -164,19 +184,17 @@ struct cover {
                     }
                     walk.push_back(vertex);
 
-                    uint64_t offset = 0;
-                    bool no_match_found = false;
-
+                    uint64_t back = vertex.back;
                     auto it = abundance_map.find(back);
 
                     /* abundance back is not found, so no match is possible */
-                    if (it == abundance_map.cend()) {
-                        no_match_found = true;
-                        break;
-                    }
-                    offset = (*it).second.begin + (*it).second.position;  // skip to position
+                    if (it == abundance_map.cend()) break;
+
+                    bool no_match_found = false;
+                    uint64_t offset =
+                        (*it).second.begin + (*it).second.position;  // skip to position
 
-                    /* search for a match */
+                    /* 3. search for a match */
                     while (true) {
                         if (offset == num_vertices) break;
                         auto candidate = vertices[offset];
@@ -251,7 +269,7 @@ struct cover {
                     num_mergings_in_round += walk.size() - 1;
 #ifndef NDEBUG
                     uint64_t prev_back = walk.front().front;
-                    std::cout << "=>";
+                    // std::cout << "=>";
                     for (auto const& w : walk) {
                         if (colors[w.id] == color::black) {
                             std::cout << "ERROR: duplicate vertex." << std::endl;
@@ -261,9 +279,9 @@ struct cover {
                         }
                         prev_back = w.back;
                         colors[w.id] = color::black;
-                        std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
+                        // std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
                     }
-                    std::cout << std::endl;
+                    // std::cout << std::endl;
 #endif
                 }
                 assert(m_num_runs_abundances > num_mergings_in_round);
@@ -271,6 +289,8 @@ struct cover {
                 std::cout << "  num_mergings = " << num_mergings_in_round << std::endl;
                 std::cout << "  num_runs " << m_num_runs_abundances << std::endl;
 
+                round_timer.stop();
+                std::cout << "  time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl;
                 // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
                 // for (auto const& v : tmp_vertices) {
                 //     std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";
@@ -292,7 +312,6 @@ struct cover {
         }
 
         timer.stop();
-
         std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl;
 
         assert(m_num_runs_abundances >= 1);

From 38c10e6674d283fa0916e9e66ae9e161f698743e Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Thu, 24 Mar 2022 13:57:04 +0100
Subject: [PATCH 18/40] less code

---
 include/builder/cover.hpp | 45 +++++++++++++--------------------------
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index c0a16d9..91cd25e 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -25,14 +25,6 @@ struct cover {
         invalid = -1
     };
 
-    struct range {
-        range() {}
-        range(uint64_t b, uint64_t e, uint64_t p) : begin(b), end(e), position(p) {}
-        uint64_t begin;
-        uint64_t end;  // TODO: currently unused
-        uint64_t position;
-    };
-
     cover(uint64_t num_sequences, uint64_t num_runs_abundances)
         : m_num_sequences(num_sequences), m_num_runs_abundances(num_runs_abundances) {}
 
@@ -43,8 +35,8 @@ struct cover {
 
         timer.start();
 
-        /* (abundance, num_seqs_with_front_abundance=abundance) */
-        std::unordered_map<uint64_t, range> abundance_map;
+        /* (abundance, position of a candidate abundance with front = abundance) */
+        std::unordered_map<uint64_t, uint64_t> abundance_map;
         std::vector<color> colors;
         std::vector<vertex> tmp_vertices;
         std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl;
@@ -131,22 +123,17 @@ struct cover {
             });
 
             if (num_vertices > 0) {
-                uint64_t prev_front = vertices.front().front;
-                uint64_t prev_offset = 0;
+                uint64_t prev_front = constants::invalid;
                 uint64_t offset = 0;
                 for (auto const& vertex : vertices) {
-                    if (vertex.front != prev_front) {
-                        abundance_map[prev_front] = {prev_offset, offset, 0};
-                        prev_offset = offset;
-                    }
+                    if (vertex.front != prev_front) abundance_map[vertex.front] = offset;
                     offset += 1;
                     prev_front = vertex.front;
                 }
-                abundance_map[prev_front] = {prev_offset, offset, 0};
                 assert(offset == vertices.size());
             }
 
-            uint64_t i = 0;  // offset of unvisited vertex
+            uint64_t i = 0;  // position of unvisited vertex in vertices
 
             while (true) {
                 walk.clear();
@@ -191,17 +178,16 @@ struct cover {
                     if (it == abundance_map.cend()) break;
 
                     bool no_match_found = false;
-                    uint64_t offset =
-                        (*it).second.begin + (*it).second.position;  // skip to position
+                    uint64_t candidate_i = (*it).second;  // candidate position
 
                     /* 3. search for a match */
                     while (true) {
-                        if (offset == num_vertices) break;
-                        auto candidate = vertices[offset];
+                        if (candidate_i == num_vertices) break;
+                        auto candidate = vertices[candidate_i];
 
                         /* skip */
                         if (candidate.id == id) {
-                            offset += 1;
+                            candidate_i += 1;
                             continue;
                         }
 
@@ -213,20 +199,19 @@ struct cover {
 
                         /* match found */
                         if (colors[candidate.id] != color::black) {
-                            uint64_t& position = (*it).second.position;
-                            assert((*it).second.begin + position <= offset);
-                            position += 1;
+                            assert((*it).second <= candidate_i);
+                            (*it).second += 1;
                             break;
                         }
 
-                        offset += 1;
+                        candidate_i += 1;
                     }
 
-                    assert(offset <= num_vertices);
-                    if (no_match_found or offset == num_vertices) break;
+                    assert(candidate_i <= num_vertices);
+                    if (no_match_found or candidate_i == num_vertices) break;
 
                     /* valid match was found, then visit it next */
-                    i = offset;
+                    i = candidate_i;
                 }
 
                 assert(!walk.empty());

From 8ea1d329663e43efa6bddae736f652455f9fe156 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Thu, 24 Mar 2022 19:10:14 +0100
Subject: [PATCH 19/40] minor

---
 include/builder/cover.hpp |  3 ++-
 src/permute.cpp           | 10 +++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 91cd25e..c2487c8 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -36,7 +36,7 @@ struct cover {
         timer.start();
 
         /* (abundance, position of a candidate abundance with front = abundance) */
-        std::unordered_map<uint64_t, uint64_t> abundance_map;
+        std::unordered_map<uint32_t, uint32_t> abundance_map;
         std::vector<color> colors;
         std::vector<vertex> tmp_vertices;
         std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl;
@@ -298,6 +298,7 @@ struct cover {
 
         timer.stop();
         std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl;
+        std::cout << "final number of runs R = " << m_num_runs_abundances << std::endl;
 
         assert(m_num_runs_abundances >= 1);
     }
diff --git a/src/permute.cpp b/src/permute.cpp
index 3f0f3ce..3aa554f 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -168,7 +168,7 @@ int main(int argc, char** argv) {
     /* optional arguments */
     parser.add("output_filename", "Output file where the permuted collection will be written.",
                "-o", false);
-    parser.add("verbose", "Verbose output during construction.", "--verbose", true);
+    // parser.add("verbose", "Verbose output during construction.", "--verbose", true);
 
     if (!parser.parse()) return 1;
 
@@ -177,7 +177,7 @@ int main(int argc, char** argv) {
 
     build_configuration build_config;
     build_config.k = k;
-    build_config.verbose = parser.get<bool>("verbose");
+    // build_config.verbose = parser.get<bool>("verbose");
 
     std::string output_filename = input_filename + ".permuted";
     if (parser.parsed("output_filename")) {
@@ -189,7 +189,7 @@ int main(int argc, char** argv) {
     {
         uint64_t R_lower = data.num_runs_abundances - data.vertices.size() + 1;
         std::cout << "The trivial lower bound (too optimistic) assumes we are able to concatenate "
-                     "all sequences : R_lower = "
+                     "all sequences : R_lo = "
                   << R_lower << std::endl;
 
         std::sort(data.vertices.begin(), data.vertices.end(), [](auto const& x, auto const& y) {
@@ -200,7 +200,7 @@ int main(int argc, char** argv) {
         });
 
         /* (abundance, num_seqs_with_front=abundance) */
-        // We assume there are less than 2^32 distinct abundances and that
+        // We assume there are less than 2^32 sequences and that
         // the largest abundance fits into a 32-bit uint.
         std::unordered_map<uint32_t, uint32_t> abundance_map;
 
@@ -232,7 +232,7 @@ int main(int argc, char** argv) {
                 }
             }
         }
-        std::cout << "Computed lower bound: R = " << R << std::endl;
+        std::cout << "Computed lower bound: R_hi = " << R << std::endl;
     }
 
     cover c(data.num_sequences, data.num_runs_abundances);

From 6fa2c7fc7d877bd017c946924d377e079d020ce5 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Thu, 24 Mar 2022 22:11:04 +0100
Subject: [PATCH 20/40] more

---
 ..._sakai.BA000007.3.k31_ust.abundances.fa.gz | Bin 1683039 -> 1683050 bytes
 include/builder/cover.hpp                     |  31 ++++--
 src/permute.cpp                               |  96 ++++++++++++++++--
 3 files changed, 109 insertions(+), 18 deletions(-)

diff --git a/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz
index d5c697e272b4a2befcf49083ee7d86f985620251..7607fd9af8191997568ea53cfb7e6f2a02033bc5 100644
GIT binary patch
delta 98
zcmV~$yA6T>06@VgDvAiG_}kz)0rLP#Nn;#<;`L5K$2KJNOyS)$HuKp0o>5tS+gI7v
q?@xy;WGR8HWGx#BWh*<0WG}HCB#~4yIm$`SlFLP|a+CYOqv-!by&Vkz

delta 87
zcmV~$xe<Z@006-ekWU`vrv&mFOQH{g%c;bSZkQpb3U}{wz0P%CHsm3Jgc8Y9Vo4;G
cOmcZiA*EDO%UeFuNGqN6GRP>?!TkOG0pkc7CIA2c

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index c2487c8..f0ca73f 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -308,8 +308,14 @@ struct cover {
         assert(rounds.size() > 0);
         int r = rounds.size() - 1;
         const auto& walks = rounds[r];
+        uint64_t num_sequences = 0;
         for (auto const& walk : walks) {
-            for (auto const& vertex : walk) visit(vertex.id, r, out);
+            for (auto const& vertex : walk) num_sequences += visit(vertex.id, r, out);
+        }
+        if (num_sequences != m_num_sequences) {
+            std::cerr << "Error: expected to write " << m_num_sequences << " but written "
+                      << num_sequences << std::endl;
+            throw std::runtime_error("wrong number of sequences written");
         }
         out.close();
     }
@@ -320,20 +326,23 @@ struct cover {
     std::vector<walks_t> rounds;
 
     /* visit walk of index w in round of index r */
-    void visit(int w, int r, std::ofstream& out) const {
+    uint64_t visit(int w, int r, std::ofstream& out) const {
         if (r > 0) {
             assert(size_t(w) < rounds[r].size());
             auto const& walk = rounds[r][w];
-            for (auto const& vertex : walk) { visit(vertex.id, r - 1, out); }
-        } else {  // print
-            assert(size_t(w) < rounds[0].size());
-            auto const& walk = rounds[0][w];
-            for (auto const& vertex : walk) {
-                // out << vertex.id << ":[" << vertex.front << "," << vertex.back << "] ";
-                out << vertex.id << '\n';
-            }
-            // out << '\n';
+            uint64_t num_sequences = 0;
+            for (auto const& vertex : walk) num_sequences += visit(vertex.id, r - 1, out);
+            return num_sequences;
+        }
+        /* print */
+        assert(size_t(w) < rounds[0].size());
+        auto const& walk = rounds[0][w];
+        for (auto const& vertex : walk) {
+            // out << vertex.id << ":[" << vertex.front << "," << vertex.back << "] ";
+            out << vertex.id << '\n';
         }
+        // out << '\n';
+        return walk.size();
     }
 };
 
diff --git a/src/permute.cpp b/src/permute.cpp
index 3aa554f..6fd3e7d 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -55,8 +55,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         uint64_t j = sequence.find_first_of(' ', i);
         if (j == std::string::npos) throw parse_runtime_error();
 
-        char* end;
-        seq_len = std::strtoull(sequence.data() + i, &end, 10);
+        seq_len = std::strtoull(sequence.data() + i, nullptr, 10);
         i = j + 1;
         expect(sequence[i + 0], 'a');
         expect(sequence[i + 1], 'b');
@@ -71,7 +70,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         uint64_t back = constants::invalid;
 
         for (uint64_t j = 0, prev_abundance = constants::invalid; j != seq_len - k + 1; ++j) {
-            uint64_t abundance = std::strtoull(sequence.data() + i, &end, 10);
+            uint64_t abundance = std::strtoull(sequence.data() + i, nullptr, 10);
             sum_of_abundances += abundance;
             i = sequence.find_first_of(' ', i) + 1;
 
@@ -152,6 +151,62 @@ permute_data parse_file(std::string const& filename, build_configuration const&
     return data;
 }
 
+void permute_and_write(std::istream& is, std::string const& output_filename,
+                       pthash::compact_vector const& permutation) {
+    // std::vector<std::string> filenames;
+    // constexpr uint64_t bytes = 2 * essentials::GB;
+    std::vector<std::pair<std::string, std::string>> buffer;  // (header, dna)
+
+    std::string header_sequence;
+    std::string dna_sequence;
+    uint64_t num_sequences = permutation.size();
+    uint64_t num_bases = 0;
+
+    for (uint64_t i = 0; i != num_sequences; ++i) {
+        std::getline(is, header_sequence);
+        std::getline(is, dna_sequence);
+        buffer.emplace_back(header_sequence, dna_sequence);
+
+        num_bases += dna_sequence.size();
+
+        // check for bytes used and flush
+
+        if (i != 0 and i % 100000 == 0) {
+            std::cout << "read " << i << " sequences, " << num_bases << " bases" << std::endl;
+        }
+    }
+
+    std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases" << std::endl;
+
+    std::cout << "sorting..." << std::endl;
+    std::sort(buffer.begin(), buffer.end(), [&](auto const& p_x, auto const& p_y) {
+        assert(p_x.first.front() == '>');
+        assert(p_y.first.front() == '>');
+        uint64_t seq_id_x = std::strtoull(p_x.first.data() + 1, nullptr, 10);
+        uint64_t seq_id_y = std::strtoull(p_y.first.data() + 1, nullptr, 10);
+        return permutation[seq_id_x] < permutation[seq_id_y];
+    });
+
+    std::cout << "saving to file '" << output_filename << "'..." << std::endl;
+    std::ofstream out(output_filename.c_str());
+    for (auto const& seq : buffer) out << seq.first << '\n' << seq.second << '\n';
+    out.close();
+}
+
+void permute_and_write(std::string const& input_filename, std::string const& output_filename,
+                       pthash::compact_vector const& permutation) {
+    std::ifstream is(input_filename.c_str());
+    if (!is.good()) throw std::runtime_error("error in opening the file '" + input_filename + "'");
+    std::cout << "reading file '" << input_filename << "'..." << std::endl;
+    if (util::ends_with(input_filename, ".gz")) {
+        zip_istream zis(is);
+        permute_and_write(zis, output_filename, permutation);
+    } else {
+        permute_and_write(is, output_filename, permutation);
+    }
+    is.close();
+}
+
 int main(int argc, char** argv) {
     cmd_line_parser::parser parser(argc, argv);
 
@@ -184,6 +239,8 @@ int main(int argc, char** argv) {
         output_filename = parser.get<std::string>("output_filename");
     }
 
+    std::string permutation_filename = input_filename + ".permutation";
+
     auto data = parse_file(input_filename, build_config);
 
     {
@@ -235,10 +292,35 @@ int main(int argc, char** argv) {
         std::cout << "Computed lower bound: R_hi = " << R << std::endl;
     }
 
-    cover c(data.num_sequences, data.num_runs_abundances);
-    assert(data.vertices.size() == data.num_sequences);
-    c.compute(data.vertices);
-    c.save(output_filename);
+    {
+        /* compute cover */
+        cover c(data.num_sequences, data.num_runs_abundances);
+        assert(data.vertices.size() == data.num_sequences);
+        c.compute(data.vertices);
+        c.save(permutation_filename);
+        std::vector<vertex>().swap(data.vertices);
+    }
+
+    /* permute */
+    pthash::compact_vector permutation;
+    {
+        std::ifstream is(permutation_filename.c_str());
+        if (!is.good()) {
+            throw std::runtime_error("error in opening the file '" + permutation_filename + "'");
+        }
+        pthash::compact_vector::builder cv_builder(data.num_sequences,
+                                                   std::ceil(std::log2(data.num_sequences)));
+        for (uint64_t i = 0; i != data.num_sequences; ++i) {
+            uint64_t position = 0;
+            is >> position;
+            cv_builder.set(position, i);
+        }
+        is.close();
+        cv_builder.build(permutation);
+    }
+
+    permute_and_write(input_filename, output_filename, permutation);
+    std::remove(permutation_filename.c_str());
 
     return 0;
 }

From 61e60f4d399e12fb30aebe8c0997415720ec2f1d Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Fri, 25 Mar 2022 10:40:12 +0100
Subject: [PATCH 21/40] faster identification of candidate vertex for matching

---
 include/builder/cover.hpp | 82 +++++++++++++++++++++++++++++++++++----
 1 file changed, 75 insertions(+), 7 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index f0ca73f..62836c1 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -101,9 +101,16 @@ struct cover {
             uint64_t num_vertices = vertices.size();
             std::cout << "  num_vertices " << num_vertices << std::endl;
 
-            essentials::timer_type round_timer;
+            essentials::timer_type init_timer;  // colors init + sorting + filling abundance_map +
+                                                // add all gray vertices
+            essentials::timer_type unvisited_timer;  // taking unvisited vertex
+            essentials::timer_type walk_timer;       // creating walks
+
+            essentials::timer_type round_timer;  // total time of round
             round_timer.start();
 
+            init_timer.start();
+
             /* all unvisited */
             if (rounds.size() == 0) {
                 /* remember: we removed some vertices but the id-space still spans
@@ -133,11 +140,17 @@ struct cover {
                 assert(offset == vertices.size());
             }
 
+            init_timer.stop();
+
             uint64_t i = 0;  // position of unvisited vertex in vertices
 
-            while (true) {
-                walk.clear();
+            uint64_t total_vertices_visited_to_find_matches = 0;
+            uint64_t total_vertices_visited_to_failure = 0;
+            uint64_t total_vertices_visited_to_success = 0;
+            uint64_t total_vertices = 0;
 
+            while (true) {
+                unvisited_timer.start();
                 /* 1. take an unvisited vertex */
 
                 // try to visit forward
@@ -155,10 +168,16 @@ struct cover {
                     }
                 }
 
+                unvisited_timer.stop();
+
                 if (i == num_vertices) break;  // all visited
 
                 /* 2. create a new walk */
+                walk_timer.start();
+                walk.clear();
                 while (true) {
+                    total_vertices += 1;
+
                     auto vertex = vertices[i];
                     uint64_t id = vertex.id;
                     assert(colors[id] != color::black);
@@ -181,8 +200,18 @@ struct cover {
                     uint64_t candidate_i = (*it).second;  // candidate position
 
                     /* 3. search for a match */
+                    uint64_t tmp_total_vertices_visited_to_failure = 0;
+                    uint64_t tmp_total_vertices_visited_to_success = 0;
                     while (true) {
-                        if (candidate_i == num_vertices) break;
+                        total_vertices_visited_to_find_matches += 1;
+                        tmp_total_vertices_visited_to_failure += 1;
+                        tmp_total_vertices_visited_to_success += 1;
+
+                        if (candidate_i == num_vertices) {
+                            total_vertices_visited_to_failure +=
+                                tmp_total_vertices_visited_to_failure;
+                            break;
+                        }
                         auto candidate = vertices[candidate_i];
 
                         /* skip */
@@ -194,13 +223,15 @@ struct cover {
                         /* checked all candidate matches */
                         if (candidate.front != back) {
                             no_match_found = true;
+                            total_vertices_visited_to_failure +=
+                                tmp_total_vertices_visited_to_failure;
                             break;
                         }
 
                         /* match found */
                         if (colors[candidate.id] != color::black) {
-                            assert((*it).second <= candidate_i);
-                            (*it).second += 1;
+                            total_vertices_visited_to_success +=
+                                tmp_total_vertices_visited_to_success;
                             break;
                         }
 
@@ -208,13 +239,20 @@ struct cover {
                     }
 
                     assert(candidate_i <= num_vertices);
+
+                    constexpr uint64_t num_optimized_rounds_for_speed = 2;
+                    if (rounds.size() <= num_optimized_rounds_for_speed) {
+                        /* update candidate position in abundance_map */
+                        (*it).second = candidate_i;
+                    }
+
                     if (no_match_found or candidate_i == num_vertices) break;
 
                     /* valid match was found, then visit it next */
                     i = candidate_i;
                 }
-
                 assert(!walk.empty());
+                walk_timer.stop();
 
                 if (walk.size() == 1) {
                     assert(colors[walk.front().id] == color::gray);  // visited but not merged
@@ -231,6 +269,28 @@ struct cover {
                 walks_in_round.push_back(walk);
             }
 
+            if (total_vertices_visited_to_find_matches > total_vertices) {
+                // std::cout << "  total_vertices " << total_vertices << std::endl;
+                // std::cout << "  total_vertices_visited_to_find_matches "
+                //           << total_vertices_visited_to_find_matches << std::endl;
+                // std::cout << "  total_vertices_visited_to_success  "
+                //           << total_vertices_visited_to_success << std::endl;
+                // std::cout << "  total_vertices_visited_to_failure  "
+                //           << total_vertices_visited_to_failure << std::endl;
+                std::cout << "  avg. num. vertices visited to find a match = "
+                          << static_cast<double>(total_vertices_visited_to_find_matches) /
+                                 total_vertices
+                          << std::endl;
+                std::cout << "  avg. num. vertices visited to success = "
+                          << static_cast<double>(total_vertices_visited_to_success) / total_vertices
+                          << std::endl;
+                std::cout << "  avg. num. vertices visited to failure = "
+                          << static_cast<double>(total_vertices_visited_to_failure) / total_vertices
+                          << std::endl;
+            }
+
+            init_timer.start();
+
             /* add all gray vertices (singleton walks) */
             for (auto const& v : vertices) {
                 if (colors[v.id] == color::gray) {
@@ -274,8 +334,16 @@ struct cover {
                 std::cout << "  num_mergings = " << num_mergings_in_round << std::endl;
                 std::cout << "  num_runs " << m_num_runs_abundances << std::endl;
 
+                init_timer.stop();
+
                 round_timer.stop();
                 std::cout << "  time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl;
+                std::cout << "    init: " << init_timer.elapsed() / 1000000 << " [sec]"
+                          << std::endl;
+                std::cout << "    walk: " << walk_timer.elapsed() / 1000000 << " [sec]"
+                          << std::endl;
+                std::cout << "    unvisited: " << unvisited_timer.elapsed() / 1000000 << " [sec]"
+                          << std::endl;
                 // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
                 // for (auto const& v : tmp_vertices) {
                 //     std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";

From 1f7615f5908029ebcb368bd23a3f27606237987c Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Fri, 25 Mar 2022 12:06:40 +0100
Subject: [PATCH 22/40] faster identification of unvisited vertex

---
 include/builder/cover.hpp | 78 ++++++++++++---------------------------
 1 file changed, 23 insertions(+), 55 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 62836c1..5038653 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "../util.hpp"
@@ -39,12 +40,15 @@ struct cover {
         std::unordered_map<uint32_t, uint32_t> abundance_map;
         std::vector<color> colors;
         std::vector<vertex> tmp_vertices;
-        std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl;
-
+        std::unordered_set<uint32_t> unvisited_vertices;
         walk_t walk;
         walks_t walks_in_round;
-        walk.reserve(vertices.size());            // at most
-        walks_in_round.reserve(vertices.size());  // at most
+
+        unvisited_vertices.reserve(vertices.size());  // at most
+        walk.reserve(vertices.size());                // at most
+        walks_in_round.reserve(vertices.size());      // at most
+
+        std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl;
 
         {
             /*
@@ -101,16 +105,9 @@ struct cover {
             uint64_t num_vertices = vertices.size();
             std::cout << "  num_vertices " << num_vertices << std::endl;
 
-            essentials::timer_type init_timer;  // colors init + sorting + filling abundance_map +
-                                                // add all gray vertices
-            essentials::timer_type unvisited_timer;  // taking unvisited vertex
-            essentials::timer_type walk_timer;       // creating walks
-
             essentials::timer_type round_timer;  // total time of round
             round_timer.start();
 
-            init_timer.start();
-
             /* all unvisited */
             if (rounds.size() == 0) {
                 /* remember: we removed some vertices but the id-space still spans
@@ -123,6 +120,8 @@ struct cover {
                 std::fill(colors.begin(), colors.end(), color::white);
             }
 
+            for (uint64_t i = 0; i != num_vertices; ++i) unvisited_vertices.insert(i);
+
             std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
                 if (x.front != y.front) return x.front < y.front;
                 if (x.back != y.back) return x.back < y.back;
@@ -140,54 +139,39 @@ struct cover {
                 assert(offset == vertices.size());
             }
 
-            init_timer.stop();
-
-            uint64_t i = 0;  // position of unvisited vertex in vertices
+            uint64_t i = 0;  // position of an unvisited vertex in vertices
 
             uint64_t total_vertices_visited_to_find_matches = 0;
             uint64_t total_vertices_visited_to_failure = 0;
             uint64_t total_vertices_visited_to_success = 0;
             uint64_t total_vertices = 0;
 
-            while (true) {
-                unvisited_timer.start();
+            while (!unvisited_vertices.empty()) {
                 /* 1. take an unvisited vertex */
-
-                // try to visit forward
-                for (; i != num_vertices; ++i) {
-                    uint64_t id = vertices[i].id;
-                    if (colors[id] == color::white) break;
-                }
-
-                // if a vertex is not found, try to visit backward
-                if (i == num_vertices) {
-                    i = 0;
-                    for (; i != num_vertices; ++i) {
-                        uint64_t id = vertices[i].id;
-                        if (colors[id] == color::white) break;
-                    }
-                }
-
-                unvisited_timer.stop();
-
-                if (i == num_vertices) break;  // all visited
+                i = *(unvisited_vertices.begin());
 
                 /* 2. create a new walk */
-                walk_timer.start();
                 walk.clear();
                 while (true) {
                     total_vertices += 1;
 
                     auto vertex = vertices[i];
                     uint64_t id = vertex.id;
-                    assert(colors[id] != color::black);
 
+                    assert(colors[id] != color::black);
                     colors[id] = color::gray;
+
+                    /* vertex has been visited, so erase it from unvisited_vertices */
+                    if (unvisited_vertices.find(i) != unvisited_vertices.cend()) {
+                        unvisited_vertices.erase(i);
+                    }
+
                     if (walk.size() > 0) {
                         assert(walk.back().id != id);
                         colors[walk.back().id] = color::black;
                         colors[id] = color::black;
                     }
+
                     walk.push_back(vertex);
 
                     uint64_t back = vertex.back;
@@ -252,7 +236,6 @@ struct cover {
                     i = candidate_i;
                 }
                 assert(!walk.empty());
-                walk_timer.stop();
 
                 if (walk.size() == 1) {
                     assert(colors[walk.front().id] == color::gray);  // visited but not merged
@@ -268,15 +251,9 @@ struct cover {
                                           walk.back().back);
                 walks_in_round.push_back(walk);
             }
+            assert(unvisited_vertices.empty());
 
             if (total_vertices_visited_to_find_matches > total_vertices) {
-                // std::cout << "  total_vertices " << total_vertices << std::endl;
-                // std::cout << "  total_vertices_visited_to_find_matches "
-                //           << total_vertices_visited_to_find_matches << std::endl;
-                // std::cout << "  total_vertices_visited_to_success  "
-                //           << total_vertices_visited_to_success << std::endl;
-                // std::cout << "  total_vertices_visited_to_failure  "
-                //           << total_vertices_visited_to_failure << std::endl;
                 std::cout << "  avg. num. vertices visited to find a match = "
                           << static_cast<double>(total_vertices_visited_to_find_matches) /
                                  total_vertices
@@ -289,8 +266,6 @@ struct cover {
                           << std::endl;
             }
 
-            init_timer.start();
-
             /* add all gray vertices (singleton walks) */
             for (auto const& v : vertices) {
                 if (colors[v.id] == color::gray) {
@@ -334,16 +309,9 @@ struct cover {
                 std::cout << "  num_mergings = " << num_mergings_in_round << std::endl;
                 std::cout << "  num_runs " << m_num_runs_abundances << std::endl;
 
-                init_timer.stop();
-
                 round_timer.stop();
                 std::cout << "  time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl;
-                std::cout << "    init: " << init_timer.elapsed() / 1000000 << " [sec]"
-                          << std::endl;
-                std::cout << "    walk: " << walk_timer.elapsed() / 1000000 << " [sec]"
-                          << std::endl;
-                std::cout << "    unvisited: " << unvisited_timer.elapsed() / 1000000 << " [sec]"
-                          << std::endl;
+
                 // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
                 // for (auto const& v : tmp_vertices) {
                 //     std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";

From 5ac0615e196708f76f0e702e71b3d2162e905199 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Fri, 25 Mar 2022 15:11:05 +0100
Subject: [PATCH 23/40] permuting in external memory

---
 include/util.hpp |   1 +
 src/permute.cpp  | 185 ++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 160 insertions(+), 26 deletions(-)

diff --git a/include/util.hpp b/include/util.hpp
index ff01b2f..0727dac 100644
--- a/include/util.hpp
+++ b/include/util.hpp
@@ -18,6 +18,7 @@ constexpr double c = 3.0;  // for PThash
 constexpr uint64_t min_l = 6;
 constexpr uint64_t max_l = 12;
 constexpr uint64_t most_frequent_abundance = 1;  // usual value
+static const std::string default_tmp_dirname(".");
 }  // namespace constants
 
 typedef pthash::murmurhash2_64 base_hasher_type;
diff --git a/src/permute.cpp b/src/permute.cpp
index 6fd3e7d..ac202e9 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -151,58 +151,186 @@ permute_data parse_file(std::string const& filename, build_configuration const&
     return data;
 }
 
+struct lines_iterator {
+    lines_iterator(uint8_t const* begin, uint8_t const* end) : m_begin(begin), m_end(end) {
+        advance_to_next();
+    }
+
+    void advance_to_next() {
+        m_header = read_line();
+        m_sequence = read_line();
+    }
+
+    std::string const& header() const { return m_header; }
+    std::string const& sequence() const { return m_sequence; }
+    bool has_next() const { return !m_header.empty() and !m_sequence.empty(); }
+
+private:
+    uint8_t const* m_begin;
+    uint8_t const* m_end;
+
+    std::string m_header;
+    std::string m_sequence;
+
+    std::string read_line() {
+        uint8_t const* begin = m_begin;
+        while (m_begin != m_end and *m_begin++ != '\n')
+            ;
+        if (begin == m_begin) return "";
+        return std::string(reinterpret_cast<const char*>(begin), m_begin - begin - 1);
+    }
+};
+
 void permute_and_write(std::istream& is, std::string const& output_filename,
-                       pthash::compact_vector const& permutation) {
-    // std::vector<std::string> filenames;
-    // constexpr uint64_t bytes = 2 * essentials::GB;
+                       std::string const& tmp_dirname, pthash::compact_vector const& permutation) {
+    std::vector<std::string> filenames;
+    constexpr uint64_t limit = 1 * essentials::GB;
     std::vector<std::pair<std::string, std::string>> buffer;  // (header, dna)
 
+    std::string run_identifier =
+        std::to_string(pthash::clock_type::now().time_since_epoch().count());
+
     std::string header_sequence;
     std::string dna_sequence;
     uint64_t num_sequences = permutation.size();
     uint64_t num_bases = 0;
+    uint64_t bytes = 0;
+
+    auto sort_and_flush = [&]() {
+        if (buffer.empty()) return;
+
+        std::cout << "sorting buffer..." << std::endl;
+        std::sort(buffer.begin(), buffer.end(), [&](auto const& p_x, auto const& p_y) {
+            assert(p_x.first.front() == '>');
+            assert(p_y.first.front() == '>');
+            uint64_t seq_id_x = std::strtoull(p_x.first.data() + 1, nullptr, 10);
+            uint64_t seq_id_y = std::strtoull(p_y.first.data() + 1, nullptr, 10);
+            return permutation[seq_id_x] < permutation[seq_id_y];
+        });
+
+        std::string next_tmp_output_filename =
+            tmp_dirname + "/tmp.run" + run_identifier + "." + std::to_string(filenames.size());
+        std::cout << "saving to file '" << next_tmp_output_filename << "'..." << std::endl;
+        std::ofstream out(next_tmp_output_filename.c_str());
+        if (!out.is_open()) throw std::runtime_error("cannot open file");
+        for (auto const& seq : buffer) out << seq.first << '\n' << seq.second << '\n';
+        out.close();
+
+        filenames.push_back(next_tmp_output_filename);
+
+        buffer.clear();
+        bytes = 0;
+    };
 
     for (uint64_t i = 0; i != num_sequences; ++i) {
         std::getline(is, header_sequence);
         std::getline(is, dna_sequence);
-        buffer.emplace_back(header_sequence, dna_sequence);
 
-        num_bases += dna_sequence.size();
+        uint64_t seq_bytes = header_sequence.size() + dna_sequence.size() + 16;
+        if (bytes + seq_bytes > limit) sort_and_flush();
 
-        // check for bytes used and flush
+        bytes += seq_bytes;
+        buffer.emplace_back(header_sequence, dna_sequence);
+        num_bases += dna_sequence.size();
 
         if (i != 0 and i % 100000 == 0) {
             std::cout << "read " << i << " sequences, " << num_bases << " bases" << std::endl;
         }
     }
+    sort_and_flush();
 
     std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases" << std::endl;
 
-    std::cout << "sorting..." << std::endl;
-    std::sort(buffer.begin(), buffer.end(), [&](auto const& p_x, auto const& p_y) {
-        assert(p_x.first.front() == '>');
-        assert(p_y.first.front() == '>');
-        uint64_t seq_id_x = std::strtoull(p_x.first.data() + 1, nullptr, 10);
-        uint64_t seq_id_y = std::strtoull(p_y.first.data() + 1, nullptr, 10);
-        return permutation[seq_id_x] < permutation[seq_id_y];
-    });
-
-    std::cout << "saving to file '" << output_filename << "'..." << std::endl;
-    std::ofstream out(output_filename.c_str());
-    for (auto const& seq : buffer) out << seq.first << '\n' << seq.second << '\n';
-    out.close();
+    /* merge */
+    {
+        uint64_t num_files_to_merge = filenames.size();
+        assert(num_files_to_merge > 0);
+        std::cout << "files to merge = " << num_files_to_merge << std::endl;
+
+        std::ofstream out(output_filename.c_str());
+        if (!out.is_open()) throw std::runtime_error("cannot open file");
+
+        //  iterators and heap
+        std::vector<lines_iterator> iterators;
+        std::vector<uint32_t> idx_heap;
+        iterators.reserve(num_files_to_merge);
+        idx_heap.reserve(num_files_to_merge);
+
+        // heap functions
+        auto heap_idx_comparator = [&](uint32_t i, uint32_t j) {
+            assert(iterators[i].header().front() == '>');
+            assert(iterators[j].header().front() == '>');
+            uint64_t seq_id_x = std::strtoull(iterators[i].header().data() + 1, nullptr, 10);
+            uint64_t seq_id_y = std::strtoull(iterators[j].header().data() + 1, nullptr, 10);
+            return permutation[seq_id_x] < permutation[seq_id_y];
+        };
+        auto advance_heap_head = [&]() {
+            auto idx = idx_heap.front();
+            iterators[idx].advance_to_next();
+            if (iterators[idx].has_next()) {
+                // percolate down the head
+                uint64_t pos = 0;
+                uint64_t size = idx_heap.size();
+                while (2 * pos + 1 < size) {
+                    uint64_t i = 2 * pos + 1;
+                    if (i + 1 < size and heap_idx_comparator(idx_heap[i], idx_heap[i + 1])) ++i;
+                    if (heap_idx_comparator(idx_heap[i], idx_heap[pos])) break;
+                    std::swap(idx_heap[pos], idx_heap[i]);
+                    pos = i;
+                }
+            } else {
+                std::pop_heap(idx_heap.begin(), idx_heap.end(), heap_idx_comparator);
+                idx_heap.pop_back();
+            }
+        };
+
+        std::vector<mm::file_source<uint8_t>> mm_files(num_files_to_merge);
+
+        // create the input iterators and the heap
+        for (uint64_t i = 0; i != num_files_to_merge; ++i) {
+            auto const& filename = filenames[i];
+            mm_files[i].open(filename, mm::advice::sequential);
+            iterators.emplace_back(mm_files[i].data(), mm_files[i].data() + mm_files[i].size());
+            idx_heap.push_back(i);
+        }
+        std::make_heap(idx_heap.begin(), idx_heap.end(), heap_idx_comparator);
+
+        uint64_t num_written_sequences = 0;
+        while (!idx_heap.empty()) {
+            auto const& it = iterators[idx_heap.front()];
+            out << it.header() << '\n';
+            out << it.sequence() << '\n';
+            num_written_sequences += 1;
+            if (num_written_sequences % 100000 == 0) {
+                std::cout << "written sequences = " << num_written_sequences << "/" << num_sequences
+                          << std::endl;
+            }
+            advance_heap_head();
+        }
+        std::cout << "written sequences = " << num_written_sequences << "/" << num_sequences
+                  << std::endl;
+        out.close();
+        assert(num_written_sequences == num_sequences);
+
+        /* remove tmp files */
+        for (uint64_t i = 0; i != filenames.size(); ++i) {
+            mm_files[i].close();
+            auto const& filename = filenames[i];
+            std::remove(filename.c_str());
+        }
+    }
 }
 
 void permute_and_write(std::string const& input_filename, std::string const& output_filename,
-                       pthash::compact_vector const& permutation) {
+                       std::string const& tmp_dirname, pthash::compact_vector const& permutation) {
     std::ifstream is(input_filename.c_str());
     if (!is.good()) throw std::runtime_error("error in opening the file '" + input_filename + "'");
     std::cout << "reading file '" << input_filename << "'..." << std::endl;
     if (util::ends_with(input_filename, ".gz")) {
         zip_istream zis(is);
-        permute_and_write(zis, output_filename, permutation);
+        permute_and_write(zis, output_filename, tmp_dirname, permutation);
     } else {
-        permute_and_write(is, output_filename, permutation);
+        permute_and_write(is, output_filename, tmp_dirname, permutation);
     }
     is.close();
 }
@@ -223,7 +351,10 @@ int main(int argc, char** argv) {
     /* optional arguments */
     parser.add("output_filename", "Output file where the permuted collection will be written.",
                "-o", false);
-    // parser.add("verbose", "Verbose output during construction.", "--verbose", true);
+    parser.add("tmp_dirname",
+               "Temporary directory used for merging in external memory. Default is directory '" +
+                   constants::default_tmp_dirname + "'.",
+               "-d", false);
 
     if (!parser.parse()) return 1;
 
@@ -232,14 +363,16 @@ int main(int argc, char** argv) {
 
     build_configuration build_config;
     build_config.k = k;
-    // build_config.verbose = parser.get<bool>("verbose");
 
     std::string output_filename = input_filename + ".permuted";
     if (parser.parsed("output_filename")) {
         output_filename = parser.get<std::string>("output_filename");
     }
 
-    std::string permutation_filename = input_filename + ".permutation";
+    std::string tmp_dirname = constants::default_tmp_dirname;
+    if (parser.parsed("tmp_dirname")) tmp_dirname = parser.get<std::string>("tmp_dirname");
+
+    std::string permutation_filename = tmp_dirname + "/tmp.permutation";
 
     auto data = parse_file(input_filename, build_config);
 
@@ -319,7 +452,7 @@ int main(int argc, char** argv) {
         cv_builder.build(permutation);
     }
 
-    permute_and_write(input_filename, output_filename, permutation);
+    permute_and_write(input_filename, output_filename, tmp_dirname, permutation);
     std::remove(permutation_filename.c_str());
 
     return 0;

From 906ed741c0cefd3be896725664d044758bc4920f Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Fri, 25 Mar 2022 17:11:22 +0100
Subject: [PATCH 24/40] fix in heap comparator

---
 src/permute.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/permute.cpp b/src/permute.cpp
index ac202e9..451420c 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -250,7 +250,7 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
         std::ofstream out(output_filename.c_str());
         if (!out.is_open()) throw std::runtime_error("cannot open file");
 
-        //  iterators and heap
+        // iterators and heap
         std::vector<lines_iterator> iterators;
         std::vector<uint32_t> idx_heap;
         iterators.reserve(num_files_to_merge);
@@ -262,7 +262,7 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
             assert(iterators[j].header().front() == '>');
             uint64_t seq_id_x = std::strtoull(iterators[i].header().data() + 1, nullptr, 10);
             uint64_t seq_id_y = std::strtoull(iterators[j].header().data() + 1, nullptr, 10);
-            return permutation[seq_id_x] < permutation[seq_id_y];
+            return permutation[seq_id_x] > permutation[seq_id_y];
         };
         auto advance_heap_head = [&]() {
             auto idx = idx_heap.front();

From e078799431af4ac69278aa9dd7d62cd9eac373a9 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Fri, 25 Mar 2022 18:52:05 +0100
Subject: [PATCH 25/40] minor

---
 src/permute.cpp | 50 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/src/permute.cpp b/src/permute.cpp
index 451420c..ca247e8 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -168,7 +168,6 @@ struct lines_iterator {
 private:
     uint8_t const* m_begin;
     uint8_t const* m_end;
-
     std::string m_header;
     std::string m_sequence;
 
@@ -176,14 +175,13 @@ struct lines_iterator {
         uint8_t const* begin = m_begin;
         while (m_begin != m_end and *m_begin++ != '\n')
             ;
-        if (begin == m_begin) return "";
+        if (begin == m_begin) return std::string("");
         return std::string(reinterpret_cast<const char*>(begin), m_begin - begin - 1);
     }
 };
 
 void permute_and_write(std::istream& is, std::string const& output_filename,
                        std::string const& tmp_dirname, pthash::compact_vector const& permutation) {
-    std::vector<std::string> filenames;
     constexpr uint64_t limit = 1 * essentials::GB;
     std::vector<std::pair<std::string, std::string>> buffer;  // (header, dna)
 
@@ -195,6 +193,11 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
     uint64_t num_sequences = permutation.size();
     uint64_t num_bases = 0;
     uint64_t bytes = 0;
+    uint64_t num_files_to_merge = 0;
+
+    auto get_tmp_output_filename = [&](uint64_t id) {
+        return tmp_dirname + "/tmp.run" + run_identifier + "." + std::to_string(id);
+    };
 
     auto sort_and_flush = [&]() {
         if (buffer.empty()) return;
@@ -208,18 +211,16 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
             return permutation[seq_id_x] < permutation[seq_id_y];
         });
 
-        std::string next_tmp_output_filename =
-            tmp_dirname + "/tmp.run" + run_identifier + "." + std::to_string(filenames.size());
-        std::cout << "saving to file '" << next_tmp_output_filename << "'..." << std::endl;
-        std::ofstream out(next_tmp_output_filename.c_str());
+        auto tmp_output_filename = get_tmp_output_filename(num_files_to_merge);
+        std::cout << "saving to file '" << tmp_output_filename << "'..." << std::endl;
+        std::ofstream out(tmp_output_filename.c_str());
         if (!out.is_open()) throw std::runtime_error("cannot open file");
         for (auto const& seq : buffer) out << seq.first << '\n' << seq.second << '\n';
         out.close();
 
-        filenames.push_back(next_tmp_output_filename);
-
         buffer.clear();
         bytes = 0;
+        num_files_to_merge += 1;
     };
 
     for (uint64_t i = 0; i != num_sequences; ++i) {
@@ -243,20 +244,15 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
 
     /* merge */
     {
-        uint64_t num_files_to_merge = filenames.size();
         assert(num_files_to_merge > 0);
         std::cout << "files to merge = " << num_files_to_merge << std::endl;
 
-        std::ofstream out(output_filename.c_str());
-        if (!out.is_open()) throw std::runtime_error("cannot open file");
-
-        // iterators and heap
         std::vector<lines_iterator> iterators;
         std::vector<uint32_t> idx_heap;
         iterators.reserve(num_files_to_merge);
         idx_heap.reserve(num_files_to_merge);
+        std::vector<mm::file_source<uint8_t>> mm_files(num_files_to_merge);
 
-        // heap functions
         auto heap_idx_comparator = [&](uint32_t i, uint32_t j) {
             assert(iterators[i].header().front() == '>');
             assert(iterators[j].header().front() == '>');
@@ -264,11 +260,11 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
             uint64_t seq_id_y = std::strtoull(iterators[j].header().data() + 1, nullptr, 10);
             return permutation[seq_id_x] > permutation[seq_id_y];
         };
+
         auto advance_heap_head = [&]() {
             auto idx = idx_heap.front();
             iterators[idx].advance_to_next();
-            if (iterators[idx].has_next()) {
-                // percolate down the head
+            if (iterators[idx].has_next()) {  // percolate down the head
                 uint64_t pos = 0;
                 uint64_t size = idx_heap.size();
                 while (2 * pos + 1 < size) {
@@ -284,22 +280,22 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
             }
         };
 
-        std::vector<mm::file_source<uint8_t>> mm_files(num_files_to_merge);
-
-        // create the input iterators and the heap
+        /* create the input iterators and make the heap */
         for (uint64_t i = 0; i != num_files_to_merge; ++i) {
-            auto const& filename = filenames[i];
-            mm_files[i].open(filename, mm::advice::sequential);
+            auto tmp_output_filename = get_tmp_output_filename(i);
+            mm_files[i].open(tmp_output_filename, mm::advice::sequential);
             iterators.emplace_back(mm_files[i].data(), mm_files[i].data() + mm_files[i].size());
             idx_heap.push_back(i);
         }
         std::make_heap(idx_heap.begin(), idx_heap.end(), heap_idx_comparator);
 
+        std::ofstream out(output_filename.c_str());
+        if (!out.is_open()) throw std::runtime_error("cannot open file");
+
         uint64_t num_written_sequences = 0;
         while (!idx_heap.empty()) {
             auto const& it = iterators[idx_heap.front()];
-            out << it.header() << '\n';
-            out << it.sequence() << '\n';
+            out << it.header() << '\n' << it.sequence() << '\n';
             num_written_sequences += 1;
             if (num_written_sequences % 100000 == 0) {
                 std::cout << "written sequences = " << num_written_sequences << "/" << num_sequences
@@ -313,10 +309,10 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
         assert(num_written_sequences == num_sequences);
 
         /* remove tmp files */
-        for (uint64_t i = 0; i != filenames.size(); ++i) {
+        for (uint64_t i = 0; i != num_files_to_merge; ++i) {
             mm_files[i].close();
-            auto const& filename = filenames[i];
-            std::remove(filename.c_str());
+            auto tmp_output_filename = get_tmp_output_filename(i);
+            std::remove(tmp_output_filename.c_str());
         }
     }
 }

From d498c470206e2e9ffb1b6ca2c80382137ff93adc Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Sat, 26 Mar 2022 13:34:19 +0100
Subject: [PATCH 26/40] take into account the orientation of the vertices

---
 include/builder/cover.hpp | 223 +++++++++++++++++++++++++-------------
 src/permute.cpp           |   8 +-
 2 files changed, 150 insertions(+), 81 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 5038653..313955e 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -11,8 +11,10 @@ namespace sshash {
 struct vertex {
     // We assume there are less than 2^32 sequences and that
     // the largest abundance fits into a 32-bit uint.
-    vertex(uint32_t s, uint32_t f, uint32_t b) : id(s), front(f), back(b) {}
+    vertex(uint32_t i, uint32_t f, uint32_t b, bool s) : id(i), front(f), back(b), sign(s) {}
     uint32_t id, front, back;
+    bool sign;  // '+' --> forward
+                // '-' --> backward
 };
 
 struct cover {
@@ -41,12 +43,14 @@ struct cover {
         std::vector<color> colors;
         std::vector<vertex> tmp_vertices;
         std::unordered_set<uint32_t> unvisited_vertices;
+        /* map from vertex id to (offset+,offset-)*/
+        std::vector<std::pair<uint32_t, uint32_t>> id_to_offsets;
         walk_t walk;
         walks_t walks_in_round;
 
         unvisited_vertices.reserve(vertices.size());  // at most
         walk.reserve(vertices.size());                // at most
-        walks_in_round.reserve(vertices.size());      // at most
+        walks_in_round.reserve(2 * vertices.size());  // at most
 
         std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl;
 
@@ -86,15 +90,25 @@ struct cover {
                       << "%)" << std::endl;
 
             if (num_special_vertices != 0) {
-                /* create a new vertex for next round */
+                /* create new vertices for next round */
                 assert(!walk.empty());
-                tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front,
-                                          walk.back().back);
+                uint32_t id = walks_in_round.size();
+                uint32_t front = walk.front().front;
+                uint32_t back = walk.back().back;
+                tmp_vertices.emplace_back(id, front, back, true);
+                tmp_vertices.emplace_back(id, back, front, false);
                 walks_in_round.push_back(walk);
-
                 walk.clear();
             }
 
+            /* now  add all the vertices with backward orientation */
+            num_vertices = vertices.size();
+            for (uint64_t i = 0; i != num_vertices; ++i) {
+                auto vertex = vertices[i];
+                assert(vertex.sign == true);
+                vertices.emplace_back(vertex.id, vertex.back, vertex.front, false);
+            }
+
             timer.stop();
             std::cout << "  time: " << timer.elapsed() / 1000000 << " [sec]" << std::endl;
         }
@@ -104,6 +118,7 @@ struct cover {
 
             uint64_t num_vertices = vertices.size();
             std::cout << "  num_vertices " << num_vertices << std::endl;
+            assert(num_vertices % 2 == 0);
 
             essentials::timer_type round_timer;  // total time of round
             round_timer.start();
@@ -112,15 +127,19 @@ struct cover {
             if (rounds.size() == 0) {
                 /* remember: we removed some vertices but the id-space still spans
                    [0..m_num_sequences-1] */
+                id_to_offsets.resize(m_num_sequences);
                 colors.resize(m_num_sequences);
                 std::fill(colors.begin(), colors.end(), color::invalid);
-                for (auto const& v : vertices) colors[v.id] = color::white;
+                for (auto const& vertex : vertices) colors[vertex.id] = color::white;
             } else {
-                colors.resize(num_vertices);
+                id_to_offsets.resize(num_vertices / 2);
+                colors.resize(num_vertices / 2);
                 std::fill(colors.begin(), colors.end(), color::white);
             }
 
-            for (uint64_t i = 0; i != num_vertices; ++i) unvisited_vertices.insert(i);
+            for (auto const& vertex : vertices) unvisited_vertices.insert(vertex.id);
+            std::cout << "  num_unvisited_vertices " << unvisited_vertices.size() << std::endl;
+            assert(unvisited_vertices.size() == num_vertices / 2);
 
             std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
                 if (x.front != y.front) return x.front < y.front;
@@ -128,7 +147,8 @@ struct cover {
                 return x.id < y.id;
             });
 
-            if (num_vertices > 0) {
+            // if (num_vertices > 0)
+            {
                 uint64_t prev_front = constants::invalid;
                 uint64_t offset = 0;
                 for (auto const& vertex : vertices) {
@@ -139,16 +159,33 @@ struct cover {
                 assert(offset == vertices.size());
             }
 
+            /* fill id_to_offsets map */
+            {
+                uint64_t offset = 0;
+                for (auto const& vertex : vertices) {
+                    if (vertex.sign) {
+                        id_to_offsets[vertex.id].first = offset;
+                    } else {
+                        id_to_offsets[vertex.id].second = offset;
+                    }
+                    offset += 1;
+                }
+            }
+
             uint64_t i = 0;  // position of an unvisited vertex in vertices
 
             uint64_t total_vertices_visited_to_find_matches = 0;
             uint64_t total_vertices_visited_to_failure = 0;
             uint64_t total_vertices_visited_to_success = 0;
             uint64_t total_vertices = 0;
+            bool no_more_matches_possible = true;
 
             while (!unvisited_vertices.empty()) {
                 /* 1. take an unvisited vertex */
-                i = *(unvisited_vertices.begin());
+                {
+                    uint32_t unvisited_vertex_id = *(unvisited_vertices.begin());
+                    i = id_to_offsets[unvisited_vertex_id].first;
+                }
 
                 /* 2. create a new walk */
                 walk.clear();
@@ -158,12 +195,14 @@ struct cover {
                     auto vertex = vertices[i];
                     uint64_t id = vertex.id;
 
+                    // std::cout << "  visiting vertex " << id << " at offset " << i << std::endl;
+
                     assert(colors[id] != color::black);
                     colors[id] = color::gray;
 
                     /* vertex has been visited, so erase it from unvisited_vertices */
-                    if (unvisited_vertices.find(i) != unvisited_vertices.cend()) {
-                        unvisited_vertices.erase(i);
+                    if (unvisited_vertices.find(id) != unvisited_vertices.cend()) {
+                        unvisited_vertices.erase(id);
                     }
 
                     if (walk.size() > 0) {
@@ -174,66 +213,80 @@ struct cover {
 
                     walk.push_back(vertex);
 
-                    uint64_t back = vertex.back;
-                    auto it = abundance_map.find(back);
-
-                    /* abundance back is not found, so no match is possible */
-                    if (it == abundance_map.cend()) break;
-
-                    bool no_match_found = false;
-                    uint64_t candidate_i = (*it).second;  // candidate position
-
-                    /* 3. search for a match */
-                    uint64_t tmp_total_vertices_visited_to_failure = 0;
-                    uint64_t tmp_total_vertices_visited_to_success = 0;
-                    while (true) {
-                        total_vertices_visited_to_find_matches += 1;
-                        tmp_total_vertices_visited_to_failure += 1;
-                        tmp_total_vertices_visited_to_success += 1;
-
-                        if (candidate_i == num_vertices) {
-                            total_vertices_visited_to_failure +=
-                                tmp_total_vertices_visited_to_failure;
-                            break;
-                        }
-                        auto candidate = vertices[candidate_i];
+                    auto search_match = [&](uint64_t back, uint64_t candidate_i) {
+                        bool no_match_found = false;
+                        uint64_t tmp_total_vertices_visited_to_failure = 0;
+                        uint64_t tmp_total_vertices_visited_to_success = 0;
+
+                        while (true) {
+                            total_vertices_visited_to_find_matches += 1;
+                            tmp_total_vertices_visited_to_failure += 1;
+                            tmp_total_vertices_visited_to_success += 1;
+
+                            if (candidate_i == num_vertices) {
+                                total_vertices_visited_to_failure +=
+                                    tmp_total_vertices_visited_to_failure;
+                                break;
+                            }
+                            auto candidate = vertices[candidate_i];
+
+                            /* skip */
+                            if (candidate.id == id) {
+                                candidate_i += 1;
+                                continue;
+                            }
+
+                            /* checked all candidate matches */
+                            if (candidate.front != back) {
+                                no_match_found = true;
+                                total_vertices_visited_to_failure +=
+                                    tmp_total_vertices_visited_to_failure;
+                                break;
+                            }
+
+                            /* match found */
+                            if (colors[candidate.id] != color::black) {
+                                total_vertices_visited_to_success +=
+                                    tmp_total_vertices_visited_to_success;
+                                break;
+                            }
 
-                        /* skip */
-                        if (candidate.id == id) {
                             candidate_i += 1;
-                            continue;
                         }
 
-                        /* checked all candidate matches */
-                        if (candidate.front != back) {
-                            no_match_found = true;
-                            total_vertices_visited_to_failure +=
-                                tmp_total_vertices_visited_to_failure;
-                            break;
-                        }
+                        assert(candidate_i <= num_vertices);
 
-                        /* match found */
-                        if (colors[candidate.id] != color::black) {
-                            total_vertices_visited_to_success +=
-                                tmp_total_vertices_visited_to_success;
-                            break;
+                        constexpr uint64_t num_optimized_rounds_for_speed = 2;
+                        if (rounds.size() <= num_optimized_rounds_for_speed) {
+                            /* update candidate position in abundance_map */
+                            abundance_map[back] = candidate_i;
                         }
 
-                        candidate_i += 1;
-                    }
+                        if (no_match_found or candidate_i == num_vertices) return false;
 
-                    assert(candidate_i <= num_vertices);
+                        /* valid match was found, then visit it next */
+                        i = candidate_i;
+                        return true;
+                    };
 
-                    constexpr uint64_t num_optimized_rounds_for_speed = 2;
-                    if (rounds.size() <= num_optimized_rounds_for_speed) {
-                        /* update candidate position in abundance_map */
-                        (*it).second = candidate_i;
+                    /* 3. search for a match */
+                    uint64_t back = vertex.back;
+                    uint64_t candidate_i = abundance_map[back];
+                    bool found = search_match(back, candidate_i);
+
+                    if (!found and walk.size() == 1) {
+                        back = vertex.front;
+                        candidate_i = abundance_map[back];
+                        found = search_match(back, candidate_i);
+                        if (found) {
+                            /* change orientation of the vertex */
+                            walk[0] = {vertex.id, vertex.back, vertex.front, !vertex.sign};
+                        }
                     }
 
-                    if (no_match_found or candidate_i == num_vertices) break;
+                    if (!found) break;
 
-                    /* valid match was found, then visit it next */
-                    i = candidate_i;
+                    no_more_matches_possible = false;
                 }
                 assert(!walk.empty());
 
@@ -246,9 +299,12 @@ struct cover {
                 assert(std::all_of(walk.begin(), walk.end(),
                                    [&](vertex const& v) { return colors[v.id] == color::black; }));
 
-                /* create a new vertex for next round */
-                tmp_vertices.emplace_back(walks_in_round.size(), walk.front().front,
-                                          walk.back().back);
+                /* create new vertices for next round */
+                uint32_t id = walks_in_round.size();
+                uint32_t front = walk.front().front;
+                uint32_t back = walk.back().back;
+                tmp_vertices.emplace_back(id, front, back, true);
+                tmp_vertices.emplace_back(id, back, front, false);
                 walks_in_round.push_back(walk);
             }
             assert(unvisited_vertices.empty());
@@ -269,27 +325,38 @@ struct cover {
             /* add all gray vertices (singleton walks) */
             for (auto const& v : vertices) {
                 if (colors[v.id] == color::gray) {
-                    tmp_vertices.emplace_back(walks_in_round.size(), v.front, v.back);
+                    uint32_t id = walks_in_round.size();
+                    uint32_t front = v.front;
+                    uint32_t back = v.back;
+
+                    if (v.sign) {
+                        tmp_vertices.emplace_back(id, front, back, true);
+                        tmp_vertices.emplace_back(id, back, front, false);
+                    } else {
+                        tmp_vertices.emplace_back(id, front, back, false);
+                        tmp_vertices.emplace_back(id, back, front, true);
+                    }
+
                     walk_t walk;
                     walk.push_back(v);
                     walks_in_round.push_back(walk);
+
+                    colors[v.id] = color::black;
                 }
             }
 
             std::cout << "  num_walks = " << walks_in_round.size() << std::endl;
 
-            bool all_singletons = true;
             {
 #ifndef NDEBUG
                 std::fill(colors.begin(), colors.end(), color::white);
 #endif
                 uint64_t num_mergings_in_round = 0;
                 for (auto const& walk : walks_in_round) {
-                    if (walk.size() > 1) all_singletons = false;
                     num_mergings_in_round += walk.size() - 1;
 #ifndef NDEBUG
                     uint64_t prev_back = walk.front().front;
-                    // std::cout << "=>";
+                    std::cout << "=>";
                     for (auto const& w : walk) {
                         if (colors[w.id] == color::black) {
                             std::cout << "ERROR: duplicate vertex." << std::endl;
@@ -299,9 +366,10 @@ struct cover {
                         }
                         prev_back = w.back;
                         colors[w.id] = color::black;
-                        // std::cout << w.id << ":[" << w.front << "," << w.back << "] ";
+                        std::cout << w.id << ":[" << w.front << "," << w.back << ","
+                                  << (w.sign ? '+' : '-') << "] ";
                     }
-                    // std::cout << std::endl;
+                    std::cout << std::endl;
 #endif
                 }
                 assert(m_num_runs_abundances > num_mergings_in_round);
@@ -314,7 +382,8 @@ struct cover {
 
                 // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
                 // for (auto const& v : tmp_vertices) {
-                //     std::cout << v.id << ":[" << v.front << "," << v.back << "]\n";
+                //     std::cout << v.id << ":[" << v.front << "," << v.back << ","
+                //               << (v.sign ? '+' : '-') << "]\n";
                 // }
             }
 
@@ -325,9 +394,8 @@ struct cover {
             walks_in_round.clear();
             abundance_map.clear();
 
-            if (all_singletons and rounds.size() > 1) {
-                std::cout << "STOP: all walks are singletons --> no new mergings were found"
-                          << std::endl;
+            if (no_more_matches_possible) {
+                std::cout << "STOP: no more matches possible." << std::endl;
                 break;
             }
         }
@@ -345,9 +413,7 @@ struct cover {
         int r = rounds.size() - 1;
         const auto& walks = rounds[r];
         uint64_t num_sequences = 0;
-        for (auto const& walk : walks) {
-            for (auto const& vertex : walk) num_sequences += visit(vertex.id, r, out);
-        }
+        for (uint64_t w = 0; w != walks.size(); ++w) num_sequences += visit(w, r, out);
         if (num_sequences != m_num_sequences) {
             std::cerr << "Error: expected to write " << m_num_sequences << " but written "
                       << num_sequences << std::endl;
@@ -375,7 +441,8 @@ struct cover {
         auto const& walk = rounds[0][w];
         for (auto const& vertex : walk) {
             // out << vertex.id << ":[" << vertex.front << "," << vertex.back << "] ";
-            out << vertex.id << '\n';
+            out << vertex.id;
+            out << (vertex.sign ? " +\n" : " -\n");
         }
         // out << '\n';
         return walk.size();
diff --git a/src/permute.cpp b/src/permute.cpp
index ca247e8..0c041d7 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -91,7 +91,8 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         num_sequences_diff_abs += kmers_have_different_abundances;
         num_sequences_all_mfa += kmers_have_all_mfa;
 
-        data.vertices.emplace_back(data.num_sequences, front, back);
+        constexpr bool sign = true;
+        data.vertices.emplace_back(data.num_sequences, front, back, sign);
     };
 
     while (!is.eof()) {
@@ -437,8 +438,9 @@ int main(int argc, char** argv) {
         if (!is.good()) {
             throw std::runtime_error("error in opening the file '" + permutation_filename + "'");
         }
-        pthash::compact_vector::builder cv_builder(data.num_sequences,
-                                                   std::ceil(std::log2(data.num_sequences)));
+        pthash::compact_vector::builder cv_builder(
+            data.num_sequences,
+            data.num_sequences == 1 ? 1 : std::ceil(std::log2(data.num_sequences)));
         for (uint64_t i = 0; i != data.num_sequences; ++i) {
             uint64_t position = 0;
             is >> position;

From 485a19c9f70b144e218db1ec0f0a8793a8b942ed Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Sat, 26 Mar 2022 14:09:11 +0100
Subject: [PATCH 27/40] take into account the orientation of the vertices

---
 include/builder/cover.hpp | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 313955e..98d4b39 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -43,8 +43,8 @@ struct cover {
         std::vector<color> colors;
         std::vector<vertex> tmp_vertices;
         std::unordered_set<uint32_t> unvisited_vertices;
-        /* map from vertex id to (offset+,offset-)*/
-        std::vector<std::pair<uint32_t, uint32_t>> id_to_offsets;
+        /* map from vertex id to offset+ into vertices */
+        std::vector<uint32_t> id_to_offset;
         walk_t walk;
         walks_t walks_in_round;
 
@@ -127,12 +127,12 @@ struct cover {
             if (rounds.size() == 0) {
                 /* remember: we removed some vertices but the id-space still spans
                    [0..m_num_sequences-1] */
-                id_to_offsets.resize(m_num_sequences);
+                id_to_offset.resize(m_num_sequences);
                 colors.resize(m_num_sequences);
                 std::fill(colors.begin(), colors.end(), color::invalid);
                 for (auto const& vertex : vertices) colors[vertex.id] = color::white;
             } else {
-                id_to_offsets.resize(num_vertices / 2);
+                id_to_offset.resize(num_vertices / 2);
                 colors.resize(num_vertices / 2);
                 std::fill(colors.begin(), colors.end(), color::white);
             }
@@ -147,7 +147,7 @@ struct cover {
                 return x.id < y.id;
             });
 
-            // if (num_vertices > 0)
+            /* fill abundance_map */
             {
                 uint64_t prev_front = constants::invalid;
                 uint64_t offset = 0;
@@ -159,15 +159,11 @@ struct cover {
                 assert(offset == vertices.size());
             }
 
-            /* fill id_to_offsets map */
+            /* fill id_to_offset map */
             {
                 uint64_t offset = 0;
                 for (auto const& vertex : vertices) {
-                    if (vertex.sign) {
-                        id_to_offsets[vertex.id].first = offset;
-                    } else {
-                        id_to_offsets[vertex.id].second = offset;
-                    }
+                    if (vertex.sign) id_to_offset[vertex.id] = offset;
                     offset += 1;
                 }
             }
@@ -184,7 +180,7 @@ struct cover {
                 /* 1. take an unvisited vertex */
                 {
                     uint32_t unvisited_vertex_id = *(unvisited_vertices.begin());
-                    i = id_to_offsets[unvisited_vertex_id].first;
+                    i = id_to_offset[unvisited_vertex_id];
                 }
 
                 /* 2. create a new walk */
@@ -195,8 +191,6 @@ struct cover {
                     auto vertex = vertices[i];
                     uint64_t id = vertex.id;
 
-                    // std::cout << "  visiting vertex " << id << " at offset " << i << std::endl;
-
                     assert(colors[id] != color::black);
                     colors[id] = color::gray;
 
@@ -256,11 +250,8 @@ struct cover {
 
                         assert(candidate_i <= num_vertices);
 
-                        constexpr uint64_t num_optimized_rounds_for_speed = 2;
-                        if (rounds.size() <= num_optimized_rounds_for_speed) {
-                            /* update candidate position in abundance_map */
-                            abundance_map[back] = candidate_i;
-                        }
+                        /* update candidate position in abundance_map */
+                        abundance_map[back] = candidate_i;
 
                         if (no_match_found or candidate_i == num_vertices) return false;
 

From 7c8d4c35a1d06ee8d76d9037c0eca239ff17a65e Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Sat, 26 Mar 2022 18:47:39 +0100
Subject: [PATCH 28/40] take into account the orientation of the vertices: less
 code; refactored some names

---
 include/builder/cover.hpp | 311 +++++++++++++++++---------------------
 src/permute.cpp           |  28 ++--
 2 files changed, 149 insertions(+), 190 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 98d4b39..7c06655 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -8,105 +8,105 @@
 
 namespace sshash {
 
-struct vertex {
+struct node {
     // We assume there are less than 2^32 sequences and that
     // the largest abundance fits into a 32-bit uint.
-    vertex(uint32_t i, uint32_t f, uint32_t b, bool s) : id(i), front(f), back(b), sign(s) {}
+    node(uint32_t i, uint32_t f, uint32_t b, bool s) : id(i), front(f), back(b), sign(s) {}
     uint32_t id, front, back;
     bool sign;  // '+' --> forward
                 // '-' --> backward
 };
 
 struct cover {
-    typedef std::vector<vertex> walk_t;
+    typedef std::vector<node> walk_t;
     typedef std::vector<walk_t> walks_t;
 
-    enum color {
-        white = 0,  // unvisited
-        gray = 1,   // visited by not merged
-        black = 2,  // visited and merged
-        invalid = -1
-    };
-
     cover(uint64_t num_sequences, uint64_t num_runs_abundances)
         : m_num_sequences(num_sequences), m_num_runs_abundances(num_runs_abundances) {}
 
-    void compute(std::vector<vertex>& vertices) {
-        assert(vertices.size() == m_num_sequences);
+    void compute(std::vector<node>& nodes) {
+        assert(nodes.size() == m_num_sequences);
 
         essentials::timer_type timer;
-
         timer.start();
 
         /* (abundance, position of a candidate abundance with front = abundance) */
         std::unordered_map<uint32_t, uint32_t> abundance_map;
-        std::vector<color> colors;
-        std::vector<vertex> tmp_vertices;
-        std::unordered_set<uint32_t> unvisited_vertices;
-        /* map from vertex id to offset+ into vertices */
+
+        /* visited[node.id] = true if node has been visited; false otherwise */
+        std::vector<bool> visited;
+
+        /* set of unvisited nodes */
+        std::unordered_set<uint32_t> unvisited_nodes;
+
+        /* map from node id to offset+ into nodes */
         std::vector<uint32_t> id_to_offset;
+
+        /* nodes created for next round */
+        std::vector<node> tmp_nodes;
+
         walk_t walk;
         walks_t walks_in_round;
 
-        unvisited_vertices.reserve(vertices.size());  // at most
-        walk.reserve(vertices.size());                // at most
-        walks_in_round.reserve(2 * vertices.size());  // at most
+        unvisited_nodes.reserve(nodes.size());     // at most
+        walk.reserve(nodes.size());                // at most
+        walks_in_round.reserve(2 * nodes.size());  // at most
 
         std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl;
 
         {
             /*
                 optimize for the most frequent case:
-                push vertices of the form v:[mfa,mfa] to the bottom, and remove them
+                push nodes of the form v:[mfa,mfa] to the bottom, and remove them
                 forming a single (usually, long) walk.
-                Warning: here we are assuming the mfa is also the smallest abundance.
+                Warning: here we are assuming the mfa is also the *smallest* abundance.
             */
 
             essentials::timer_type timer;
             timer.start();
 
-            std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
+            std::sort(nodes.begin(), nodes.end(), [](auto const& x, auto const& y) {
                 if (x.front != y.front) return x.front > y.front;
                 if (x.back != y.back) return x.back > y.back;
                 return x.id > y.id;
             });
 
-            uint64_t num_special_vertices = 0;
-            uint64_t num_vertices = vertices.size();
-            while (!vertices.empty()) {
-                auto v = vertices.back();
+            uint64_t num_special_nodes = 0;
+            uint64_t num_nodes = nodes.size();
+            while (!nodes.empty()) {
+                auto v = nodes.back();
                 if (v.front == constants::most_frequent_abundance and
                     v.back == constants::most_frequent_abundance) {
                     walk.push_back(v);
-                    vertices.pop_back();
-                    num_special_vertices += 1;
+                    nodes.pop_back();
+                    num_special_nodes += 1;
                 } else {
                     break;
                 }
             }
-            std::cout << "num vertices of the form v:[" << constants::most_frequent_abundance << ","
-                      << constants::most_frequent_abundance << "] = " << num_special_vertices << "/"
-                      << num_vertices << "(" << (num_special_vertices * 100.0) / num_vertices
-                      << "%)" << std::endl;
+            std::cout << "num nodes of the form v:[" << constants::most_frequent_abundance << ","
+                      << constants::most_frequent_abundance << "] = " << num_special_nodes << "/"
+                      << num_nodes << "(" << (num_special_nodes * 100.0) / num_nodes << "%)"
+                      << std::endl;
 
-            if (num_special_vertices != 0) {
-                /* create new vertices for next round */
+            if (num_special_nodes != 0) {
+                /* create new nodes for next round */
                 assert(!walk.empty());
                 uint32_t id = walks_in_round.size();
                 uint32_t front = walk.front().front;
                 uint32_t back = walk.back().back;
-                tmp_vertices.emplace_back(id, front, back, true);
-                tmp_vertices.emplace_back(id, back, front, false);
+                tmp_nodes.emplace_back(id, front, back, true);
+                tmp_nodes.emplace_back(id, back, front, false);
                 walks_in_round.push_back(walk);
                 walk.clear();
             }
 
-            /* now  add all the vertices with backward orientation */
-            num_vertices = vertices.size();
-            for (uint64_t i = 0; i != num_vertices; ++i) {
-                auto vertex = vertices[i];
-                assert(vertex.sign == true);
-                vertices.emplace_back(vertex.id, vertex.back, vertex.front, false);
+            /* add all the nodes with backward orientation */
+            num_nodes = nodes.size();
+            for (uint64_t i = 0; i != num_nodes; ++i) {
+                auto node = nodes[i];
+                assert(node.sign == true);
+                nodes.emplace_back(node.id, node.back, node.front, false);
             }
 
             timer.stop();
@@ -116,32 +116,30 @@ struct cover {
         while (true) {
             std::cout << "round " << rounds.size() << std::endl;
 
-            uint64_t num_vertices = vertices.size();
-            std::cout << "  num_vertices " << num_vertices << std::endl;
-            assert(num_vertices % 2 == 0);
+            uint64_t num_nodes = nodes.size();
+            std::cout << "  num_nodes " << num_nodes << std::endl;
+            assert(num_nodes % 2 == 0);
 
             essentials::timer_type round_timer;  // total time of round
             round_timer.start();
 
-            /* all unvisited */
             if (rounds.size() == 0) {
-                /* remember: we removed some vertices but the id-space still spans
+                /* remember: we removed some nodes but the id-space still spans
                    [0..m_num_sequences-1] */
                 id_to_offset.resize(m_num_sequences);
-                colors.resize(m_num_sequences);
-                std::fill(colors.begin(), colors.end(), color::invalid);
-                for (auto const& vertex : vertices) colors[vertex.id] = color::white;
+                visited.resize(m_num_sequences);
             } else {
-                id_to_offset.resize(num_vertices / 2);
-                colors.resize(num_vertices / 2);
-                std::fill(colors.begin(), colors.end(), color::white);
+                id_to_offset.resize(num_nodes / 2);
+                visited.resize(num_nodes / 2);
             }
 
-            for (auto const& vertex : vertices) unvisited_vertices.insert(vertex.id);
-            std::cout << "  num_unvisited_vertices " << unvisited_vertices.size() << std::endl;
-            assert(unvisited_vertices.size() == num_vertices / 2);
+            /* all nodes unvisited */
+            std::fill(visited.begin(), visited.end(), false);
 
-            std::sort(vertices.begin(), vertices.end(), [](auto const& x, auto const& y) {
+            for (auto const& node : nodes) unvisited_nodes.insert(node.id);
+            assert(unvisited_nodes.size() == num_nodes / 2);
+
+            std::sort(nodes.begin(), nodes.end(), [](auto const& x, auto const& y) {
                 if (x.front != y.front) return x.front < y.front;
                 if (x.back != y.back) return x.back < y.back;
                 return x.id < y.id;
@@ -151,78 +149,71 @@ struct cover {
             {
                 uint64_t prev_front = constants::invalid;
                 uint64_t offset = 0;
-                for (auto const& vertex : vertices) {
-                    if (vertex.front != prev_front) abundance_map[vertex.front] = offset;
+                for (auto const& node : nodes) {
+                    if (node.front != prev_front) abundance_map[node.front] = offset;
                     offset += 1;
-                    prev_front = vertex.front;
+                    prev_front = node.front;
                 }
-                assert(offset == vertices.size());
+                assert(offset == nodes.size());
             }
 
             /* fill id_to_offset map */
             {
                 uint64_t offset = 0;
-                for (auto const& vertex : vertices) {
-                    if (vertex.sign) id_to_offset[vertex.id] = offset;
+                for (auto const& node : nodes) {
+                    if (node.sign) id_to_offset[node.id] = offset;
                     offset += 1;
                 }
             }
 
-            uint64_t i = 0;  // position of an unvisited vertex in vertices
+            uint64_t i = 0;                        // position of an unvisited node in nodes
+            bool no_more_matches_possible = true;  // to stop the algorithm
 
-            uint64_t total_vertices_visited_to_find_matches = 0;
-            uint64_t total_vertices_visited_to_failure = 0;
-            uint64_t total_vertices_visited_to_success = 0;
-            uint64_t total_vertices = 0;
-            bool no_more_matches_possible = true;
+            /* some statistics */
+            uint64_t total_nodes_visited_to_find_matches = 0;
+            uint64_t total_nodes_visited_to_failure = 0;
+            uint64_t total_nodes_visited_to_success = 0;
+            uint64_t total_nodes = 0;
 
-            while (!unvisited_vertices.empty()) {
-                /* 1. take an unvisited vertex */
+            while (!unvisited_nodes.empty()) {
+                /* 1. take an unvisited node */
                 {
-                    uint32_t unvisited_vertex_id = *(unvisited_vertices.begin());
-                    i = id_to_offset[unvisited_vertex_id];
+                    uint32_t unvisited_node_id = *(unvisited_nodes.begin());
+                    i = id_to_offset[unvisited_node_id];
                 }
 
                 /* 2. create a new walk */
                 walk.clear();
                 while (true) {
-                    total_vertices += 1;
-
-                    auto vertex = vertices[i];
-                    uint64_t id = vertex.id;
-
-                    assert(colors[id] != color::black);
-                    colors[id] = color::gray;
+                    total_nodes += 1;
 
-                    /* vertex has been visited, so erase it from unvisited_vertices */
-                    if (unvisited_vertices.find(id) != unvisited_vertices.cend()) {
-                        unvisited_vertices.erase(id);
-                    }
+                    auto node = nodes[i];
+                    uint64_t id = node.id;
+                    assert(visited[id] == false);
+                    visited[id] = true;
+                    walk.push_back(node);
 
-                    if (walk.size() > 0) {
-                        assert(walk.back().id != id);
-                        colors[walk.back().id] = color::black;
-                        colors[id] = color::black;
+                    /* node has been visited, so erase it from unvisited_nodes */
+                    if (unvisited_nodes.find(id) != unvisited_nodes.cend()) {
+                        unvisited_nodes.erase(id);
                     }
 
-                    walk.push_back(vertex);
-
                     auto search_match = [&](uint64_t back, uint64_t candidate_i) {
                         bool no_match_found = false;
-                        uint64_t tmp_total_vertices_visited_to_failure = 0;
-                        uint64_t tmp_total_vertices_visited_to_success = 0;
+                        uint64_t tmp_total_nodes_visited_to_failure = 0;
+                        uint64_t tmp_total_nodes_visited_to_success = 0;
 
                         while (true) {
-                            total_vertices_visited_to_find_matches += 1;
-                            tmp_total_vertices_visited_to_failure += 1;
-                            tmp_total_vertices_visited_to_success += 1;
+                            total_nodes_visited_to_find_matches += 1;
+                            tmp_total_nodes_visited_to_failure += 1;
+                            tmp_total_nodes_visited_to_success += 1;
 
-                            if (candidate_i == num_vertices) {
-                                total_vertices_visited_to_failure +=
-                                    tmp_total_vertices_visited_to_failure;
+                            if (candidate_i == num_nodes) {
+                                total_nodes_visited_to_failure +=
+                                    tmp_total_nodes_visited_to_failure;
                                 break;
                             }
-                            auto candidate = vertices[candidate_i];
+                            auto candidate = nodes[candidate_i];
 
                             /* skip */
                             if (candidate.id == id) {
@@ -233,27 +224,26 @@ struct cover {
                             /* checked all candidate matches */
                             if (candidate.front != back) {
                                 no_match_found = true;
-                                total_vertices_visited_to_failure +=
-                                    tmp_total_vertices_visited_to_failure;
+                                total_nodes_visited_to_failure +=
+                                    tmp_total_nodes_visited_to_failure;
                                 break;
                             }
 
                             /* match found */
-                            if (colors[candidate.id] != color::black) {
-                                total_vertices_visited_to_success +=
-                                    tmp_total_vertices_visited_to_success;
+                            if (visited[candidate.id] == false) {
+                                total_nodes_visited_to_success +=
+                                    tmp_total_nodes_visited_to_success;
                                 break;
                             }
 
                             candidate_i += 1;
                         }
-
-                        assert(candidate_i <= num_vertices);
+                        assert(candidate_i <= num_nodes);
 
                         /* update candidate position in abundance_map */
                         abundance_map[back] = candidate_i;
 
-                        if (no_match_found or candidate_i == num_vertices) return false;
+                        if (no_match_found or candidate_i == num_nodes) return false;
 
                         /* valid match was found, then visit it next */
                         i = candidate_i;
@@ -261,118 +251,87 @@ struct cover {
                     };
 
                     /* 3. search for a match */
-                    uint64_t back = vertex.back;
+
+                    /* first, try to match back abundance */
+                    uint64_t back = node.back;
                     uint64_t candidate_i = abundance_map[back];
                     bool found = search_match(back, candidate_i);
 
+                    /* if a match is not found and the walk is singleton,
+                       then the only node in the walk could be matched on
+                       front abundance */
                     if (!found and walk.size() == 1) {
-                        back = vertex.front;
+                        back = node.front;
                         candidate_i = abundance_map[back];
                         found = search_match(back, candidate_i);
                         if (found) {
-                            /* change orientation of the vertex */
-                            walk[0] = {vertex.id, vertex.back, vertex.front, !vertex.sign};
+                            /* change orientation of the node */
+                            walk[0] = {node.id, node.back, node.front, !node.sign};
                         }
                     }
 
                     if (!found) break;
-
                     no_more_matches_possible = false;
                 }
                 assert(!walk.empty());
 
-                if (walk.size() == 1) {
-                    assert(colors[walk.front().id] == color::gray);  // visited but not merged
-                    continue;
-                }
-
-                /* invariant: all vertices belonging to a walk are all black */
-                assert(std::all_of(walk.begin(), walk.end(),
-                                   [&](vertex const& v) { return colors[v.id] == color::black; }));
-
-                /* create new vertices for next round */
+                /* create new nodes for next round */
                 uint32_t id = walks_in_round.size();
                 uint32_t front = walk.front().front;
                 uint32_t back = walk.back().back;
-                tmp_vertices.emplace_back(id, front, back, true);
-                tmp_vertices.emplace_back(id, back, front, false);
+                tmp_nodes.emplace_back(id, front, back, true);
+                tmp_nodes.emplace_back(id, back, front, false);
                 walks_in_round.push_back(walk);
             }
-            assert(unvisited_vertices.empty());
+            assert(unvisited_nodes.empty());
 
-            if (total_vertices_visited_to_find_matches > total_vertices) {
-                std::cout << "  avg. num. vertices visited to find a match = "
-                          << static_cast<double>(total_vertices_visited_to_find_matches) /
-                                 total_vertices
+            if (total_nodes_visited_to_find_matches > total_nodes) {
+                std::cout << "  avg. num. nodes visited to find a match = "
+                          << static_cast<double>(total_nodes_visited_to_find_matches) / total_nodes
                           << std::endl;
-                std::cout << "  avg. num. vertices visited to success = "
-                          << static_cast<double>(total_vertices_visited_to_success) / total_vertices
+                std::cout << "  avg. num. nodes visited to success = "
+                          << static_cast<double>(total_nodes_visited_to_success) / total_nodes
                           << std::endl;
-                std::cout << "  avg. num. vertices visited to failure = "
-                          << static_cast<double>(total_vertices_visited_to_failure) / total_vertices
+                std::cout << "  avg. num. nodes visited to failure = "
+                          << static_cast<double>(total_nodes_visited_to_failure) / total_nodes
                           << std::endl;
             }
 
-            /* add all gray vertices (singleton walks) */
-            for (auto const& v : vertices) {
-                if (colors[v.id] == color::gray) {
-                    uint32_t id = walks_in_round.size();
-                    uint32_t front = v.front;
-                    uint32_t back = v.back;
-
-                    if (v.sign) {
-                        tmp_vertices.emplace_back(id, front, back, true);
-                        tmp_vertices.emplace_back(id, back, front, false);
-                    } else {
-                        tmp_vertices.emplace_back(id, front, back, false);
-                        tmp_vertices.emplace_back(id, back, front, true);
-                    }
-
-                    walk_t walk;
-                    walk.push_back(v);
-                    walks_in_round.push_back(walk);
-
-                    colors[v.id] = color::black;
-                }
-            }
-
             std::cout << "  num_walks = " << walks_in_round.size() << std::endl;
 
             {
 #ifndef NDEBUG
-                std::fill(colors.begin(), colors.end(), color::white);
+                std::fill(visited.begin(), visited.end(), false);
 #endif
-                uint64_t num_mergings_in_round = 0;
+                uint64_t num_matches_in_round = 0;
                 for (auto const& walk : walks_in_round) {
-                    num_mergings_in_round += walk.size() - 1;
+                    num_matches_in_round += walk.size() - 1;
 #ifndef NDEBUG
                     uint64_t prev_back = walk.front().front;
                     std::cout << "=>";
                     for (auto const& w : walk) {
-                        if (colors[w.id] == color::black) {
-                            std::cout << "ERROR: duplicate vertex." << std::endl;
-                        }
+                        if (visited[w.id]) std::cout << "ERROR: duplicate node." << std::endl;
                         if (w.front != prev_back) {
                             std::cout << "ERROR: path is broken." << std::endl;
                         }
                         prev_back = w.back;
-                        colors[w.id] = color::black;
+                        visited[w.id] = true;
                         std::cout << w.id << ":[" << w.front << "," << w.back << ","
                                   << (w.sign ? '+' : '-') << "] ";
                     }
                     std::cout << std::endl;
 #endif
                 }
-                assert(m_num_runs_abundances > num_mergings_in_round);
-                m_num_runs_abundances -= num_mergings_in_round;
-                std::cout << "  num_mergings = " << num_mergings_in_round << std::endl;
+                assert(m_num_runs_abundances > num_matches_in_round);
+                m_num_runs_abundances -= num_matches_in_round;
+                std::cout << "  num_matches = " << num_matches_in_round << std::endl;
                 std::cout << "  num_runs " << m_num_runs_abundances << std::endl;
 
                 round_timer.stop();
                 std::cout << "  time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl;
 
-                // std::cout << "created vertices in round " << rounds.size() << ":" << std::endl;
-                // for (auto const& v : tmp_vertices) {
+                // std::cout << "created nodes in round " << rounds.size() << ":" << std::endl;
+                // for (auto const& v : tmp_nodes) {
                 //     std::cout << v.id << ":[" << v.front << "," << v.back << ","
                 //               << (v.sign ? '+' : '-') << "]\n";
                 // }
@@ -380,8 +339,8 @@ struct cover {
 
             rounds.push_back(walks_in_round);
 
-            vertices.swap(tmp_vertices);
-            tmp_vertices.clear();
+            nodes.swap(tmp_nodes);
+            tmp_nodes.clear();
             walks_in_round.clear();
             abundance_map.clear();
 
@@ -424,16 +383,16 @@ struct cover {
             assert(size_t(w) < rounds[r].size());
             auto const& walk = rounds[r][w];
             uint64_t num_sequences = 0;
-            for (auto const& vertex : walk) num_sequences += visit(vertex.id, r - 1, out);
+            for (auto const& node : walk) num_sequences += visit(node.id, r - 1, out);
             return num_sequences;
         }
         /* print */
         assert(size_t(w) < rounds[0].size());
         auto const& walk = rounds[0][w];
-        for (auto const& vertex : walk) {
-            // out << vertex.id << ":[" << vertex.front << "," << vertex.back << "] ";
-            out << vertex.id;
-            out << (vertex.sign ? " +\n" : " -\n");
+        for (auto const& node : walk) {
+            // out << node.id << ":[" << node.front << "," << node.back << "] ";
+            out << node.id;
+            out << (node.sign ? " +\n" : " -\n");
         }
         // out << '\n';
         return walk.size();
diff --git a/src/permute.cpp b/src/permute.cpp
index 0c041d7..b1c00cc 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -12,7 +12,7 @@ struct permute_data {
     permute_data() : num_runs_abundances(0), num_sequences(0) {}
     uint64_t num_runs_abundances;
     uint64_t num_sequences;
-    std::vector<vertex> vertices;
+    std::vector<node> nodes;
 };
 
 void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) {
@@ -92,7 +92,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         num_sequences_all_mfa += kmers_have_all_mfa;
 
         constexpr bool sign = true;
-        data.vertices.emplace_back(data.num_sequences, front, back, sign);
+        data.nodes.emplace_back(data.num_sequences, front, back, sign);
     };
 
     while (!is.eof()) {
@@ -117,7 +117,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         }
     }
 
-    assert(data.vertices.size() == data.num_sequences);
+    assert(data.nodes.size() == data.num_sequences);
     assert(data.num_runs_abundances >= data.num_sequences);
 
     std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, "
@@ -374,12 +374,12 @@ int main(int argc, char** argv) {
     auto data = parse_file(input_filename, build_config);
 
     {
-        uint64_t R_lower = data.num_runs_abundances - data.vertices.size() + 1;
+        uint64_t R_lower = data.num_runs_abundances - data.nodes.size() + 1;
         std::cout << "The trivial lower bound (too optimistic) assumes we are able to concatenate "
                      "all sequences : R_lo = "
                   << R_lower << std::endl;
 
-        std::sort(data.vertices.begin(), data.vertices.end(), [](auto const& x, auto const& y) {
+        std::sort(data.nodes.begin(), data.nodes.end(), [](auto const& x, auto const& y) {
             /* sort on front */
             if (x.front != y.front) return x.front < y.front;
             if (x.back != y.back) return x.back < y.back;
@@ -391,21 +391,21 @@ int main(int argc, char** argv) {
         // the largest abundance fits into a 32-bit uint.
         std::unordered_map<uint32_t, uint32_t> abundance_map;
 
-        uint64_t prev_front = data.vertices.front().front;
+        uint64_t prev_front = data.nodes.front().front;
         uint64_t count = 0;
-        for (auto const& vertex : data.vertices) {
-            if (vertex.front != prev_front) {
+        for (auto const& node : data.nodes) {
+            if (node.front != prev_front) {
                 abundance_map[prev_front] = count;
                 count = 0;
             }
             count += 1;
-            prev_front = vertex.front;
+            prev_front = node.front;
         }
         abundance_map[prev_front] = count;
 
         uint64_t R = data.num_runs_abundances;
-        for (auto const& vertex : data.vertices) {
-            uint64_t back = vertex.back;
+        for (auto const& node : data.nodes) {
+            uint64_t back = node.back;
             auto it = abundance_map.find(back);
             if (it != abundance_map.cend()) {  // found
                 if ((*it).second > 0) {        // if it is 0, we cannot find a match
@@ -425,10 +425,10 @@ int main(int argc, char** argv) {
     {
         /* compute cover */
         cover c(data.num_sequences, data.num_runs_abundances);
-        assert(data.vertices.size() == data.num_sequences);
-        c.compute(data.vertices);
+        assert(data.nodes.size() == data.num_sequences);
+        c.compute(data.nodes);
         c.save(permutation_filename);
-        std::vector<vertex>().swap(data.vertices);
+        std::vector<node>().swap(data.nodes);
     }
 
     /* permute */

From a9c8ca73548993f427a2f234174ff79b4b6cd164 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Sun, 27 Mar 2022 12:14:55 +0200
Subject: [PATCH 29/40] take into account the orientation of the vertices: done

---
 include/abundances.hpp    | 15 ++++----
 include/builder/cover.hpp | 43 ++++++++++++++++++-----
 src/permute.cpp           | 73 +++++++++++++++++++++++++++++++++++----
 3 files changed, 110 insertions(+), 21 deletions(-)

diff --git a/include/abundances.hpp b/include/abundances.hpp
index f0d3684..7a5e555 100644
--- a/include/abundances.hpp
+++ b/include/abundances.hpp
@@ -80,17 +80,20 @@ struct abundances {
                 return x.first < y.first;
             });
 
-            /* If this test fails, then we need to change the value of
-                constants::most_frequent_abundance. */
-            if (m_most_frequent_abundance != m_abundances.front().first) {
-                throw std::runtime_error("the most frequent abundance is not " +
-                                         std::to_string(constants::most_frequent_abundance) +
-                                         " but " + std::to_string(m_abundances.front().first));
+            if (m_kmer_id_interval_values.size() != 0) {  // optimize_mfa
+                /* If this test fails, then we need to change the value of
+                    constants::most_frequent_abundance. */
+                if (m_most_frequent_abundance != m_abundances.front().first) {
+                    throw std::runtime_error("the most frequent abundance is not " +
+                                             std::to_string(constants::most_frequent_abundance) +
+                                             " but " + std::to_string(m_abundances.front().first));
+                }
             }
 
             uint64_t rest = num_kmers - m_abundances.front().second;
             std::cout << "kmers that do not have the most frequent ab: " << rest << " ("
                       << (rest * 100.0) / num_kmers << "%)" << std::endl;
+
             if (m_kmer_id_interval_values.size() != 0) {  // optimize_mfa
                 std::cout << "cumulative_kmer_id_interval_lengths "
                           << m_kmer_id_interval_lengths.back() << '/' << rest << std::endl;
diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 7c06655..b664c99 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -363,7 +363,11 @@ struct cover {
         int r = rounds.size() - 1;
         const auto& walks = rounds[r];
         uint64_t num_sequences = 0;
-        for (uint64_t w = 0; w != walks.size(); ++w) num_sequences += visit(w, r, out);
+        for (uint64_t w = 0; w != walks.size(); ++w) {
+            assert(walks[w].size() == 1);
+            assert(walks[w].front().sign == true);
+            num_sequences += visit(w, r, true, out);
+        }
         if (num_sequences != m_num_sequences) {
             std::cerr << "Error: expected to write " << m_num_sequences << " but written "
                       << num_sequences << std::endl;
@@ -378,23 +382,46 @@ struct cover {
     std::vector<walks_t> rounds;
 
     /* visit walk of index w in round of index r */
-    uint64_t visit(int w, int r, std::ofstream& out) const {
+    uint64_t visit(int w, int r, bool sign, std::ofstream& out) const {
         if (r > 0) {
             assert(size_t(w) < rounds[r].size());
             auto const& walk = rounds[r][w];
             uint64_t num_sequences = 0;
-            for (auto const& node : walk) num_sequences += visit(node.id, r - 1, out);
+            for (auto const& node : walk) {
+                /*
+                    + & + = +
+                    - & - = +
+                    + & - = -
+                    - & + = -
+                */
+                bool new_sign = sign * node.sign;
+                if (sign == false and node.sign == false) new_sign = true;
+                num_sequences += visit(node.id, r - 1, new_sign, out);
+            }
             return num_sequences;
         }
         /* print */
         assert(size_t(w) < rounds[0].size());
         auto const& walk = rounds[0][w];
-        for (auto const& node : walk) {
-            // out << node.id << ":[" << node.front << "," << node.back << "] ";
-            out << node.id;
-            out << (node.sign ? " +\n" : " -\n");
+
+        if (sign) {
+            /* print forward */
+            for (uint64_t i = 0; i != walk.size(); ++i) {
+                auto const& node = walk[i];
+                // out << node.id << ":[" << node.front << "," << node.back << "] ";
+                out << node.id;
+                out << (node.sign ? " 1\n" : " 0\n");
+            }
+        } else {
+            /* print backward and change orientation */
+            for (int64_t i = walk.size() - 1; i >= 0; --i) {
+                auto const& node = walk[i];
+                // out << node.id << ":[" << node.back << "," << node.front << "] ";
+                out << node.id;
+                out << (node.sign ? " 0\n" : " 1\n");
+            }
         }
-        // out << '\n';
+
         return walk.size();
     }
 };
diff --git a/src/permute.cpp b/src/permute.cpp
index b1c00cc..6764eae 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -181,8 +181,41 @@ struct lines_iterator {
     }
 };
 
+void reverse_header(std::string const& input, std::string& output, uint64_t k) {
+    // Example header:
+    // >2 LN:i:61 ab:Z:4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1
+    // Expected output:
+    // >2 LN:i:61 ab:Z:1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4
+
+    /* skip validation of input string */
+
+    uint64_t i = 0;
+    i = input.find_first_of(' ', i);
+    if (i == std::string::npos) throw parse_runtime_error();
+
+    i += 6;  // skip ' LN:i':
+    uint64_t j = input.find_first_of(' ', i);
+    if (j == std::string::npos) throw parse_runtime_error();
+
+    uint64_t seq_len = std::strtoull(input.data() + i, nullptr, 10);
+    i = j + 6;  // skip ' ab:Z:'
+    output.append(input.data(), input.data() + i);
+
+    std::vector<uint32_t> abundances;
+    abundances.reserve(seq_len - k + 1);
+    for (uint64_t j = 0; j != seq_len - k + 1; ++j) {
+        uint64_t abundance = std::strtoull(input.data() + i, nullptr, 10);
+        abundances.push_back(abundance);
+        i = input.find_first_of(' ', i) + 1;
+    }
+
+    std::reverse(abundances.begin(), abundances.end());
+    for (auto abundance : abundances) { output.append(std::to_string(abundance) + " "); }
+}
+
 void permute_and_write(std::istream& is, std::string const& output_filename,
-                       std::string const& tmp_dirname, pthash::compact_vector const& permutation) {
+                       std::string const& tmp_dirname, pthash::compact_vector const& permutation,
+                       pthash::bit_vector const& signs, uint64_t k) {
     constexpr uint64_t limit = 1 * essentials::GB;
     std::vector<std::pair<std::string, std::string>> buffer;  // (header, dna)
 
@@ -228,6 +261,25 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
         std::getline(is, header_sequence);
         std::getline(is, dna_sequence);
 
+        if (!signs[i]) {
+            /* compute reverse complement of dna_sequence
+               and reverse the abundances in header_sequence */
+            std::string dna_sequence_rc(dna_sequence.size(), 0);
+            std::string header_sequence_r;
+
+            util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(),
+                                             dna_sequence.size());
+            reverse_header(header_sequence, header_sequence_r, k);
+
+            // std::cout << "header_sequence '" << header_sequence << "'" << std::endl;
+            // std::cout << "dna_sequence '" << dna_sequence << "'" << std::endl;
+            // std::cout << "header_sequence_r '" << header_sequence_r << "'" << std::endl;
+            // std::cout << "dna_sequence_rc '" << dna_sequence_rc << "'" << std::endl;
+
+            dna_sequence.swap(dna_sequence_rc);
+            header_sequence.swap(header_sequence_r);
+        }
+
         uint64_t seq_bytes = header_sequence.size() + dna_sequence.size() + 16;
         if (bytes + seq_bytes > limit) sort_and_flush();
 
@@ -319,15 +371,16 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
 }
 
 void permute_and_write(std::string const& input_filename, std::string const& output_filename,
-                       std::string const& tmp_dirname, pthash::compact_vector const& permutation) {
+                       std::string const& tmp_dirname, pthash::compact_vector const& permutation,
+                       pthash::bit_vector const& signs, uint64_t k) {
     std::ifstream is(input_filename.c_str());
     if (!is.good()) throw std::runtime_error("error in opening the file '" + input_filename + "'");
     std::cout << "reading file '" << input_filename << "'..." << std::endl;
     if (util::ends_with(input_filename, ".gz")) {
         zip_istream zis(is);
-        permute_and_write(zis, output_filename, tmp_dirname, permutation);
+        permute_and_write(zis, output_filename, tmp_dirname, permutation, signs, k);
     } else {
-        permute_and_write(is, output_filename, tmp_dirname, permutation);
+        permute_and_write(is, output_filename, tmp_dirname, permutation, signs, k);
     }
     is.close();
 }
@@ -431,8 +484,8 @@ int main(int argc, char** argv) {
         std::vector<node>().swap(data.nodes);
     }
 
-    /* permute */
     pthash::compact_vector permutation;
+    pthash::bit_vector signs;
     {
         std::ifstream is(permutation_filename.c_str());
         if (!is.good()) {
@@ -441,17 +494,23 @@ int main(int argc, char** argv) {
         pthash::compact_vector::builder cv_builder(
             data.num_sequences,
             data.num_sequences == 1 ? 1 : std::ceil(std::log2(data.num_sequences)));
+        pthash::bit_vector_builder bv_builder(data.num_sequences);
         for (uint64_t i = 0; i != data.num_sequences; ++i) {
             uint64_t position = 0;
+            bool sign = 0;
             is >> position;
+            is >> sign;
             cv_builder.set(position, i);
+            bv_builder.set(position, sign);
         }
         is.close();
         cv_builder.build(permutation);
+        signs.build(&bv_builder);
     }
 
-    permute_and_write(input_filename, output_filename, tmp_dirname, permutation);
-    std::remove(permutation_filename.c_str());
+    /* permute */
+    permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k);
+    // std::remove(permutation_filename.c_str());
 
     return 0;
 }

From 0f8dd889a1938a60d1268789db33c4ffde196858 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Sun, 27 Mar 2022 12:39:19 +0200
Subject: [PATCH 30/40] take into account the orientation of the vertices: done

---
 src/permute.cpp | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/permute.cpp b/src/permute.cpp
index 6764eae..28016b9 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -223,7 +223,11 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
         std::to_string(pthash::clock_type::now().time_since_epoch().count());
 
     std::string header_sequence;
+    std::string header_sequence_r;
+
     std::string dna_sequence;
+    std::string dna_sequence_rc;
+
     uint64_t num_sequences = permutation.size();
     uint64_t num_bases = 0;
     uint64_t bytes = 0;
@@ -264,18 +268,11 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
         if (!signs[i]) {
             /* compute reverse complement of dna_sequence
                and reverse the abundances in header_sequence */
-            std::string dna_sequence_rc(dna_sequence.size(), 0);
-            std::string header_sequence_r;
-
+            dna_sequence_rc.resize(dna_sequence.size());
+            header_sequence_r.clear();
             util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(),
                                              dna_sequence.size());
             reverse_header(header_sequence, header_sequence_r, k);
-
-            // std::cout << "header_sequence '" << header_sequence << "'" << std::endl;
-            // std::cout << "dna_sequence '" << dna_sequence << "'" << std::endl;
-            // std::cout << "header_sequence_r '" << header_sequence_r << "'" << std::endl;
-            // std::cout << "dna_sequence_rc '" << dna_sequence_rc << "'" << std::endl;
-
             dna_sequence.swap(dna_sequence_rc);
             header_sequence.swap(header_sequence_r);
         }
@@ -508,9 +505,9 @@ int main(int argc, char** argv) {
         signs.build(&bv_builder);
     }
 
-    /* permute */
+    /* permute and save to output file */
     permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k);
-    // std::remove(permutation_filename.c_str());
+    std::remove(permutation_filename.c_str());
 
     return 0;
 }

From 8d8e02fb9a3ff1ad405b4f7d64ae63d967f0b3d9 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Sun, 27 Mar 2022 19:31:58 +0200
Subject: [PATCH 31/40] another strategy to compute R_hi

---
 include/builder/cover.hpp |  2 +-
 src/permute.cpp           | 39 ++++++++++++++++++++++-----------------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index b664c99..8780d5f 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -394,7 +394,7 @@ struct cover {
                     + & - = -
                     - & + = -
                 */
-                bool new_sign = sign * node.sign;
+                bool new_sign = sign and node.sign;
                 if (sign == false and node.sign == false) new_sign = true;
                 num_sequences += visit(node.id, r - 1, new_sign, out);
             }
diff --git a/src/permute.cpp b/src/permute.cpp
index 28016b9..8aea66b 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -424,10 +424,10 @@ int main(int argc, char** argv) {
     auto data = parse_file(input_filename, build_config);
 
     {
-        uint64_t R_lower = data.num_runs_abundances - data.nodes.size() + 1;
+        uint64_t R_lo = data.num_runs_abundances - data.nodes.size() + 1;
         std::cout << "The trivial lower bound (too optimistic) assumes we are able to concatenate "
                      "all sequences : R_lo = "
-                  << R_lower << std::endl;
+                  << R_lo << std::endl;
 
         std::sort(data.nodes.begin(), data.nodes.end(), [](auto const& x, auto const& y) {
             /* sort on front */
@@ -436,11 +436,14 @@ int main(int argc, char** argv) {
             return x.id < y.id;
         });
 
-        /* (abundance, num_seqs_with_front=abundance) */
+        /* (back_abundance, num_seqs_with_front=back_abundance) */
         // We assume there are less than 2^32 sequences and that
         // the largest abundance fits into a 32-bit uint.
         std::unordered_map<uint32_t, uint32_t> abundance_map;
 
+        /* (back_abundance, num_seqs_with_back=back_abundance) */
+        std::unordered_map<uint32_t, uint32_t> back_abundance_freqs;
+
         uint64_t prev_front = data.nodes.front().front;
         uint64_t count = 0;
         for (auto const& node : data.nodes) {
@@ -452,24 +455,26 @@ int main(int argc, char** argv) {
             prev_front = node.front;
         }
         abundance_map[prev_front] = count;
+        // for (auto const& p : abundance_map) {
+        //     std::cout << "ab:" << p.first << " - potential matches: " << p.second << std::endl;
+        // }
 
-        uint64_t R = data.num_runs_abundances;
         for (auto const& node : data.nodes) {
-            uint64_t back = node.back;
-            auto it = abundance_map.find(back);
-            if (it != abundance_map.cend()) {  // found
-                if ((*it).second > 0) {        // if it is 0, we cannot find a match
-
-                    /* We clearly cannot create more mergings than num_sequences - 1,
-                       thus R cannot be lower than R_lower. */
-                    if (R == R_lower) break;
-
-                    (*it).second -= 1;
-                    R -= 1;
-                }
+            auto it = back_abundance_freqs.find(node.back);
+            if (it == back_abundance_freqs.cend()) {
+                back_abundance_freqs[node.back] = 1;
+            } else {
+                (*it).second += 1;
+            }
+        }
+        uint64_t R_hi = R_lo;
+        for (auto const& p : back_abundance_freqs) {
+            // std::cout << "ab:" << p.first << " - freq:" << p.second << std::endl;
+            if (p.second == 1 and abundance_map.find(p.first) == abundance_map.cend()) {
+                R_hi += 1;
             }
         }
-        std::cout << "Computed lower bound: R_hi = " << R << std::endl;
+        std::cout << "Computed lower bound: R_hi = " << R_hi << std::endl;
     }
 
     {

From 4c3ac4f88b9cb692250116b631192edb8208a891 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 28 Mar 2022 11:06:25 +0200
Subject: [PATCH 32/40] another strategy to compute R_hi

---
 src/permute.cpp | 58 +++++++++++++++++++++++++------------------------
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/src/permute.cpp b/src/permute.cpp
index 8aea66b..28e6b33 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -429,52 +429,54 @@ int main(int argc, char** argv) {
                      "all sequences : R_lo = "
                   << R_lo << std::endl;
 
-        std::sort(data.nodes.begin(), data.nodes.end(), [](auto const& x, auto const& y) {
-            /* sort on front */
-            if (x.front != y.front) return x.front < y.front;
-            if (x.back != y.back) return x.back < y.back;
-            return x.id < y.id;
-        });
-
-        /* (back_abundance, num_seqs_with_front=back_abundance) */
         // We assume there are less than 2^32 sequences and that
         // the largest abundance fits into a 32-bit uint.
-        std::unordered_map<uint32_t, uint32_t> abundance_map;
+
+        /* (front_abundance, num_seqs_with_front=front_abundance) */
+        std::unordered_map<uint32_t, uint32_t> front_abundance_freqs;
 
         /* (back_abundance, num_seqs_with_back=back_abundance) */
         std::unordered_map<uint32_t, uint32_t> back_abundance_freqs;
 
-        uint64_t prev_front = data.nodes.front().front;
-        uint64_t count = 0;
         for (auto const& node : data.nodes) {
-            if (node.front != prev_front) {
-                abundance_map[prev_front] = count;
-                count = 0;
+            auto it_front = front_abundance_freqs.find(node.front);
+            if (it_front == front_abundance_freqs.cend()) {
+                front_abundance_freqs[node.front] = 1;
+            } else {
+                (*it_front).second += 1;
             }
-            count += 1;
-            prev_front = node.front;
-        }
-        abundance_map[prev_front] = count;
-        // for (auto const& p : abundance_map) {
-        //     std::cout << "ab:" << p.first << " - potential matches: " << p.second << std::endl;
-        // }
-
-        for (auto const& node : data.nodes) {
-            auto it = back_abundance_freqs.find(node.back);
-            if (it == back_abundance_freqs.cend()) {
+            auto it_back = back_abundance_freqs.find(node.back);
+            if (it_back == back_abundance_freqs.cend()) {
                 back_abundance_freqs[node.back] = 1;
             } else {
-                (*it).second += 1;
+                (*it_back).second += 1;
             }
         }
+
         uint64_t R_hi = R_lo;
         for (auto const& p : back_abundance_freqs) {
-            // std::cout << "ab:" << p.first << " - freq:" << p.second << std::endl;
-            if (p.second == 1 and abundance_map.find(p.first) == abundance_map.cend()) {
+            /* if there is only one occurrence of this back abundance
+               and it does not appear in any front abundance, then it cannot be matched */
+            if (p.second == 1 and
+                front_abundance_freqs.find(p.first) == front_abundance_freqs.cend()) {
                 R_hi += 1;
             }
         }
         std::cout << "Computed lower bound: R_hi = " << R_hi << std::endl;
+
+        /*
+        note that we do NOT do the same for front_abundance_freqs,
+        namely the following piece of code:
+
+            for (auto const& p : front_abundance_freqs) {
+                if (p.second == 1 and
+                    back_abundance_freqs.find(p.first) == back_abundance_freqs.cend()) {
+                    R_hi += 1;
+                }
+            }
+
+        because otherwise we will end up in potentially counting twice the number of cuts.
+        */
     }
 
     {

From 7662a6b068b4501e41b978a3211fcdcb2a26eeef Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 28 Mar 2022 11:15:09 +0200
Subject: [PATCH 33/40] minor

---
 include/builder/build.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/builder/build.cpp b/include/builder/build.cpp
index af4a359..3abb581 100644
--- a/include/builder/build.cpp
+++ b/include/builder/build.cpp
@@ -132,8 +132,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         uint64_t j = sequence.find_first_of(' ', i);
         if (j == std::string::npos) throw parse_runtime_error();
 
-        char* end;
-        seq_len = std::strtoull(sequence.data() + i, &end, 10);
+        seq_len = std::strtoull(sequence.data() + i, nullptr, 10);
         i = j + 1;
         expect(sequence[i + 0], 'a');
         expect(sequence[i + 1], 'b');
@@ -150,7 +149,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
 
         for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid;
              j != seq_len - k + 1; ++j, ++num_kmers) {
-            uint64_t ab = std::strtoull(sequence.data() + i, &end, 10);
+            uint64_t ab = std::strtoull(sequence.data() + i, nullptr, 10);
             i = sequence.find_first_of(' ', i) + 1;
 
             if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false;

From e6a10945c5f1f544fa30b99d51d373729d7b7573 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Mon, 28 Mar 2022 14:00:10 +0200
Subject: [PATCH 34/40] refined R_lo

---
 src/permute.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/permute.cpp b/src/permute.cpp
index 28e6b33..edcb85e 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -12,6 +12,7 @@ struct permute_data {
     permute_data() : num_runs_abundances(0), num_sequences(0) {}
     uint64_t num_runs_abundances;
     uint64_t num_sequences;
+    uint64_t num_distinct_abundances;
     std::vector<node> nodes;
 };
 
@@ -27,6 +28,9 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
     uint64_t num_sequences_all_mfa = 0;   // num sequences whose kmers have same abundance == mfa
     data.num_runs_abundances = 0;
     data.num_sequences = 0;
+    data.num_distinct_abundances = 0;
+
+    std::unordered_set<uint32_t> distinct_abundances;  // count number of distinct abundances
 
     auto parse_header = [&]() {
         if (sequence.empty()) return;
@@ -74,6 +78,8 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
             sum_of_abundances += abundance;
             i = sequence.find_first_of(' ', i) + 1;
 
+            distinct_abundances.insert(abundance);
+
             /* set front and back */
             if (j == 0) front = abundance;
             if (j == seq_len - k) back = abundance;
@@ -119,9 +125,11 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
 
     assert(data.nodes.size() == data.num_sequences);
     assert(data.num_runs_abundances >= data.num_sequences);
+    data.num_distinct_abundances = distinct_abundances.size();
 
     std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, "
               << num_kmers << " kmers" << std::endl;
+    std::cout << "found " << data.num_distinct_abundances << " distint abundances" << std::endl;
     std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
     std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs
               << "/" << data.num_sequences << " ("
@@ -424,10 +432,16 @@ int main(int argc, char** argv) {
     auto data = parse_file(input_filename, build_config);
 
     {
+        /* R_lo = max(c,r-p+1)
+           where c = num. distinct abundances
+                 r = num. runs in the abundances
+                 p = num. sequences */
         uint64_t R_lo = data.num_runs_abundances - data.nodes.size() + 1;
-        std::cout << "The trivial lower bound (too optimistic) assumes we are able to concatenate "
-                     "all sequences : R_lo = "
-                  << R_lo << std::endl;
+        if (R_lo < data.num_distinct_abundances) {
+            /* clearly we cannot have less than data.num_distinct_abundances runs */
+            R_lo = data.num_distinct_abundances;
+        }
+        std::cout << "R_lo = " << R_lo << std::endl;
 
         // We assume there are less than 2^32 sequences and that
         // the largest abundance fits into a 32-bit uint.
@@ -462,7 +476,7 @@ int main(int argc, char** argv) {
                 R_hi += 1;
             }
         }
-        std::cout << "Computed lower bound: R_hi = " << R_hi << std::endl;
+        std::cout << "R_hi = " << R_hi << std::endl;
 
         /*
         note that we do NOT do the same for front_abundance_freqs,

From 98168097e7a8a13da1f51932c3af1e181383d739 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Tue, 29 Mar 2022 18:18:51 +0200
Subject: [PATCH 35/40] refined computation of lower bound: commit for
 reference

---
 include/builder/cover.hpp |  32 ++---
 src/permute.cpp           | 249 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 253 insertions(+), 28 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 8780d5f..acf30c0 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -306,21 +306,23 @@ struct cover {
                 uint64_t num_matches_in_round = 0;
                 for (auto const& walk : walks_in_round) {
                     num_matches_in_round += walk.size() - 1;
-#ifndef NDEBUG
-                    uint64_t prev_back = walk.front().front;
-                    std::cout << "=>";
-                    for (auto const& w : walk) {
-                        if (visited[w.id]) std::cout << "ERROR: duplicate node." << std::endl;
-                        if (w.front != prev_back) {
-                            std::cout << "ERROR: path is broken." << std::endl;
-                        }
-                        prev_back = w.back;
-                        visited[w.id] = true;
-                        std::cout << w.id << ":[" << w.front << "," << w.back << ","
-                                  << (w.sign ? '+' : '-') << "] ";
-                    }
-                    std::cout << std::endl;
-#endif
+                    // #ifndef NDEBUG
+                    // if (no_more_matches_possible) {
+                    //     // uint64_t prev_back = walk.front().front;
+                    //     std::cout << "=>";
+                    //     for (auto const& w : walk) {
+                    //         // if (visited[w.id]) std::cout << "ERROR: duplicate node." <<
+                    //         // std::endl; if (w.front != prev_back) {
+                    //         //     std::cout << "ERROR: path is broken." << std::endl;
+                    //         // }
+                    //         // prev_back = w.back;
+                    //         // visited[w.id] = true;
+                    //         std::cout << w.id << ":[" << w.front << "," << w.back << ","
+                    //                   << (w.sign ? '+' : '-') << "] ";
+                    //     }
+                    //     std::cout << std::endl;
+                    //     // #endif
+                    // }
                 }
                 assert(m_num_runs_abundances > num_matches_in_round);
                 m_num_runs_abundances -= num_matches_in_round;
diff --git a/src/permute.cpp b/src/permute.cpp
index edcb85e..41c21a7 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -13,9 +13,16 @@ struct permute_data {
     uint64_t num_runs_abundances;
     uint64_t num_sequences;
     uint64_t num_distinct_abundances;
+    uint64_t num_distinct_links;
     std::vector<node> nodes;
 };
 
+struct pair_hash {
+    inline uint64_t operator()(std::pair<uint32_t, uint32_t> const& v) const {
+        return static_cast<uint64_t>(v.first) * 31 + static_cast<uint64_t>(v.second);
+    }
+};
+
 void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) {
     std::string sequence;
     uint64_t k = build_config.k;
@@ -32,6 +39,10 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
 
     std::unordered_set<uint32_t> distinct_abundances;  // count number of distinct abundances
 
+    std::unordered_map<std::pair<uint32_t, uint32_t>, uint32_t, pair_hash> distinct_links_freq;
+    std::unordered_map<uint32_t, uint32_t> indegree;
+    std::unordered_map<uint32_t, uint32_t> outdegree;
+
     auto parse_header = [&]() {
         if (sequence.empty()) return;
 
@@ -99,6 +110,33 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
 
         constexpr bool sign = true;
         data.nodes.emplace_back(data.num_sequences, front, back, sign);
+
+        if (front != back) {
+            {
+                auto it = indegree.find(back);
+                if (it != indegree.cend()) {
+                    (*it).second += 1;
+                } else {
+                    indegree[back] = 1;
+                }
+            }
+            {
+                auto it = outdegree.find(front);
+                if (it != outdegree.cend()) {
+                    (*it).second += 1;
+                } else {
+                    outdegree[front] = 1;
+                }
+            }
+            {
+                auto it = distinct_links_freq.find({front, back});
+                if (it != distinct_links_freq.cend()) {
+                    (*it).second += 1;
+                } else {
+                    distinct_links_freq[{front, back}] = 1;
+                }
+            }
+        }
     };
 
     while (!is.eof()) {
@@ -126,6 +164,140 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
     assert(data.nodes.size() == data.num_sequences);
     assert(data.num_runs_abundances >= data.num_sequences);
     data.num_distinct_abundances = distinct_abundances.size();
+    data.num_distinct_links = distinct_links_freq.size();
+
+    {
+        /* (front_abundance, num_seqs_with_front=front_abundance) */
+        std::unordered_map<uint32_t, uint32_t> front_abundance_freqs;
+
+        /* (back_abundance, num_seqs_with_back=back_abundance) */
+        std::unordered_map<uint32_t, uint32_t> back_abundance_freqs;
+
+        for (auto const& node : data.nodes) {
+            auto it_front = front_abundance_freqs.find(node.front);
+            if (it_front == front_abundance_freqs.cend()) {
+                front_abundance_freqs[node.front] = 1;
+            } else {
+                (*it_front).second += 1;
+            }
+            auto it_back = back_abundance_freqs.find(node.back);
+            if (it_back == back_abundance_freqs.cend()) {
+                back_abundance_freqs[node.back] = 1;
+            } else {
+                (*it_back).second += 1;
+            }
+        }
+
+        std::vector<uint32_t> distinct_abundances_vec;
+        distinct_abundances_vec.reserve(distinct_abundances.size());
+        for (auto x : distinct_abundances) distinct_abundances_vec.push_back(x);
+        std::sort(distinct_abundances_vec.begin(), distinct_abundances_vec.end());
+
+        uint64_t will_appear = 0;
+        for (auto ab : distinct_abundances_vec) {
+            int64_t freq_front = 0;
+            int64_t freq_back = 0;
+            auto it_front = front_abundance_freqs.find(ab);
+            auto it_back = back_abundance_freqs.find(ab);
+            if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second; }
+            if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; }
+
+            /* abundance only appears for internal kmers */
+            if (freq_front == 0 and freq_back == 0) continue;
+
+            // std::cout << "ab:" << ab << " - freq_front:" << freq_front
+            //           << " - freq_back:" << freq_back;
+
+            uint64_t in_count = 0;
+            uint64_t out_count = 0;
+            auto it_in_count = indegree.find(ab);
+            auto it_out_count = outdegree.find(ab);
+            if (it_in_count != indegree.cend()) { in_count = (*it_in_count).second; }
+            if (it_out_count != outdegree.cend()) { out_count = (*it_out_count).second; }
+
+            // std::cout << " - indegree:" << in_count << " - outdegree:" << out_count;
+
+            /* special case */
+            if (in_count == 0 and out_count == 0) {
+                // std::cout << " - **2**\n";
+                will_appear += 2;  // will appear as singleton, so count twice
+                continue;
+            }
+
+            uint64_t diff = std::abs(freq_front - freq_back);
+            if (diff % 2 == 0) {
+                // std::cout << " - 0\n";
+            } else {
+                // std::cout << " - 1\n";
+                will_appear += 1;
+            }
+        }
+
+        uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size();
+        std::cout << "will_appear = " << will_appear / 2 << std::endl;
+        std::cout << "computed lower bound = " << (num_runs_abundances_internal + will_appear / 2)
+                  << std::endl;
+
+        // uint64_t cuts = 0;
+        // for (uint64_t i = 0; i != distinct_abundances_vec.size(); ++i) {
+        //     uint64_t front = distinct_abundances_vec[i];
+        //     if (i + 1 == distinct_abundances_vec.size()) break;
+        //     uint64_t back = distinct_abundances_vec[i + 1];
+
+        //     std::cout << "trying to glue [" << front << "," << back << "]; ";
+
+        //     int64_t freq_forward = 0;  // signed
+        //     int64_t freq_bckward = 0;  // signed
+        //     auto it_forward = distinct_links_freq.find({front, back});
+        //     auto it_bckward = distinct_links_freq.find({back, front});
+        //     if (it_forward != distinct_links_freq.cend()) { freq_forward = (*it_forward).second;
+        //     } if (it_bckward != distinct_links_freq.cend()) { freq_bckward =
+        //     (*it_bckward).second; }
+
+        //     std::cout << "freq of link [" << front << "," << back << "] = " << freq_forward << ";
+        //     "; std::cout << "freq of link [" << back << "," << front << "] = " << freq_bckward <<
+        //     "; ";
+
+        //     /* if the difference in abs value is even, then the link
+        //     disappears and we cannot glue anymore, i.e., we originate a cut */
+        //     uint64_t diff = std::abs(freq_forward - freq_bckward);
+        //     if (diff % 2 == 0) {
+        //         // std::cout << "NOT GLUEABLE";
+        //         // cuts += 1;
+
+        //         int64_t freq_front = 0;  // signed
+        //         int64_t freq_back = 0;   // signed
+        //         auto it_front = front_abundance_freqs.find(front);
+        //         auto it_back = back_abundance_freqs.find(front);
+        //         if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second;
+        //         } if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; }
+
+        //         bool front_appears = (std::abs(freq_front - freq_back) % 2 != 0);
+        //         // or (freq_front == 1 and freq_back == 1);
+
+        //         freq_front = 0;  // signed
+        //         freq_back = 0;   // signed
+        //         it_front = front_abundance_freqs.find(back);
+        //         it_back = back_abundance_freqs.find(back);
+        //         if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second;
+        //         } if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; }
+
+        //         bool back_appears = (std::abs(freq_front - freq_back) % 2 != 0);
+        //         // or (freq_front == 1 and freq_back == 1);
+
+        //         if (front_appears and back_appears) {  // and link is absent
+        //             std::cout << "NOT GLUEABLE";
+        //             cuts += 1;
+        //         }
+
+        //     } else {
+        //         std::cout << "glueable";
+        //     }
+
+        //     std::cout << '\n';
+        // }
+        // std::cout << "cuts = " << cuts << std::endl;
+    }
 
     std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, "
               << num_kmers << " kmers" << std::endl;
@@ -436,47 +608,95 @@ int main(int argc, char** argv) {
            where c = num. distinct abundances
                  r = num. runs in the abundances
                  p = num. sequences */
+        uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size();
+        std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl;
+        std::cout << "num_runs_abundances_internal = " << num_runs_abundances_internal << std::endl;
+        std::cout << "num_distinct_links = " << data.num_distinct_links << std::endl;
+
         uint64_t R_lo = data.num_runs_abundances - data.nodes.size() + 1;
+        uint64_t R_hi = R_lo;
         if (R_lo < data.num_distinct_abundances) {
             /* clearly we cannot have less than data.num_distinct_abundances runs */
             R_lo = data.num_distinct_abundances;
+            // R_hi = data.num_runs_abundances_internal;
+            R_hi = data.num_distinct_abundances + num_runs_abundances_internal -
+                   data.num_distinct_links;
         }
-        std::cout << "R_lo = " << R_lo << std::endl;
-
+        // else {
         // We assume there are less than 2^32 sequences and that
         // the largest abundance fits into a 32-bit uint.
 
         /* (front_abundance, num_seqs_with_front=front_abundance) */
-        std::unordered_map<uint32_t, uint32_t> front_abundance_freqs;
+        std::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>> front_abundance_freqs;
 
         /* (back_abundance, num_seqs_with_back=back_abundance) */
-        std::unordered_map<uint32_t, uint32_t> back_abundance_freqs;
+        std::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>> back_abundance_freqs;
 
+        uint64_t offset = 0;
         for (auto const& node : data.nodes) {
             auto it_front = front_abundance_freqs.find(node.front);
             if (it_front == front_abundance_freqs.cend()) {
-                front_abundance_freqs[node.front] = 1;
+                front_abundance_freqs[node.front] = {1, offset};
             } else {
-                (*it_front).second += 1;
+                (*it_front).second.first += 1;
+                (*it_front).second.second = -1;
             }
             auto it_back = back_abundance_freqs.find(node.back);
             if (it_back == back_abundance_freqs.cend()) {
-                back_abundance_freqs[node.back] = 1;
+                back_abundance_freqs[node.back] = {1, offset};
             } else {
-                (*it_back).second += 1;
+                (*it_back).second.first += 1;
+                (*it_back).second.second = -1;
             }
+            ++offset;
         }
 
-        uint64_t R_hi = R_lo;
+        // std::cout << "front_abundance_freqs.size() = " << front_abundance_freqs.size() <<
+        // std::endl; std::cout << "back_abundance_freqs.size() = " << back_abundance_freqs.size()
+        // << std::endl;
+
+        // for (auto const& p : back_abundance_freqs) {
+        //     std::cout << "back-ab:" << p.first << " - freq:" << p.second.first << std::endl;
+        // }
+        // for (auto const& p : front_abundance_freqs) {
+        //     std::cout << "front-ab:" << p.first << " - freq:" << p.second.first << std::endl;
+        // }
+
         for (auto const& p : back_abundance_freqs) {
             /* if there is only one occurrence of this back abundance
                and it does not appear in any front abundance, then it cannot be matched */
-            if (p.second == 1 and
-                front_abundance_freqs.find(p.first) == front_abundance_freqs.cend()) {
-                R_hi += 1;
+            if (p.second.first == 1) {
+                auto it = front_abundance_freqs.find(p.first);
+                if (it == front_abundance_freqs.cend()) { R_hi += 1; }
+                // else {
+                //     if ((*it).second.first == 1 and p.second.second == (*it).second.second) {
+                //         // uint64_t offset = p.second.second;
+                //         // auto node = data.nodes[offset];
+                //         // std::cout << "node " << node.id << ":[" << node.front << "," <<
+                //         // node.back
+                //         //           << "]" << std::endl;
+                //         R_hi += 1;
+                //     }
+                // }
             }
+            // else {
+            //     auto it = front_abundance_freqs.find(p.first);
+            //     if (it != front_abundance_freqs.cend()) {
+            //         // abundance is present both as back and front
+
+            //         // and appears an even number of times, will originate a walk
+            //         //   with both sides unmatchable
+            //         if ((*it).second.first % 2 == 0 and p.second.first % 2 == 0) { R_hi += 1; }
+            //     }
+            // }
         }
-        std::cout << "R_hi = " << R_hi << std::endl;
+
+        // for (auto const& p : front_abundance_freqs) {
+        //     if (p.second.first == 1 and
+        //         back_abundance_freqs.find(p.first) == back_abundance_freqs.cend()) {
+        //         R_hi += 1;
+        //     }
+        // }
 
         /*
         note that we do NOT do the same for front_abundance_freqs,
@@ -491,6 +711,9 @@ int main(int argc, char** argv) {
 
         because otherwise we will end up in potentially counting twice the number of cuts.
         */
+        // }
+        std::cout << "R_lo = " << R_lo << std::endl;
+        std::cout << "R_hi = " << R_hi << std::endl;
     }
 
     {

From 10a4b71585478f917b3dc0bf38c1a93203c0d07b Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Tue, 29 Mar 2022 18:56:52 +0200
Subject: [PATCH 36/40] refined computation of lower bound: cleaned code

---
 include/builder/cover.hpp |  38 ++---
 src/permute.cpp           | 315 ++++++++------------------------------
 2 files changed, 83 insertions(+), 270 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index acf30c0..3dc681d 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -306,23 +306,21 @@ struct cover {
                 uint64_t num_matches_in_round = 0;
                 for (auto const& walk : walks_in_round) {
                     num_matches_in_round += walk.size() - 1;
-                    // #ifndef NDEBUG
-                    // if (no_more_matches_possible) {
-                    //     // uint64_t prev_back = walk.front().front;
-                    //     std::cout << "=>";
-                    //     for (auto const& w : walk) {
-                    //         // if (visited[w.id]) std::cout << "ERROR: duplicate node." <<
-                    //         // std::endl; if (w.front != prev_back) {
-                    //         //     std::cout << "ERROR: path is broken." << std::endl;
-                    //         // }
-                    //         // prev_back = w.back;
-                    //         // visited[w.id] = true;
-                    //         std::cout << w.id << ":[" << w.front << "," << w.back << ","
-                    //                   << (w.sign ? '+' : '-') << "] ";
-                    //     }
-                    //     std::cout << std::endl;
-                    //     // #endif
-                    // }
+#ifndef NDEBUG
+                    uint64_t prev_back = walk.front().front;
+                    // std::cout << "=>";
+                    for (auto const& w : walk) {
+                        if (visited[w.id]) std::cout << "ERROR: duplicate node." << std::endl;
+                        if (w.front != prev_back) {
+                            std::cout << "ERROR: path is broken." << std::endl;
+                        }
+                        prev_back = w.back;
+                        visited[w.id] = true;
+                        // std::cout << w.id << ":[" << w.front << "," << w.back << ","
+                        //           << (w.sign ? '+' : '-') << "] ";
+                    }
+                    // std::cout << std::endl;
+#endif
                 }
                 assert(m_num_runs_abundances > num_matches_in_round);
                 m_num_runs_abundances -= num_matches_in_round;
@@ -331,12 +329,6 @@ struct cover {
 
                 round_timer.stop();
                 std::cout << "  time: " << round_timer.elapsed() / 1000000 << " [sec]" << std::endl;
-
-                // std::cout << "created nodes in round " << rounds.size() << ":" << std::endl;
-                // for (auto const& v : tmp_nodes) {
-                //     std::cout << v.id << ":[" << v.front << "," << v.back << ","
-                //               << (v.sign ? '+' : '-') << "]\n";
-                // }
             }
 
             rounds.push_back(walks_in_round);
diff --git a/src/permute.cpp b/src/permute.cpp
index 41c21a7..3a84f71 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -12,8 +12,6 @@ struct permute_data {
     permute_data() : num_runs_abundances(0), num_sequences(0) {}
     uint64_t num_runs_abundances;
     uint64_t num_sequences;
-    uint64_t num_distinct_abundances;
-    uint64_t num_distinct_links;
     std::vector<node> nodes;
 };
 
@@ -33,15 +31,12 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
     uint64_t sum_of_abundances = 0;
     uint64_t num_sequences_diff_abs = 0;  // num sequences whose kmers have different abundances
     uint64_t num_sequences_all_mfa = 0;   // num sequences whose kmers have same abundance == mfa
+
     data.num_runs_abundances = 0;
     data.num_sequences = 0;
-    data.num_distinct_abundances = 0;
-
-    std::unordered_set<uint32_t> distinct_abundances;  // count number of distinct abundances
 
-    std::unordered_map<std::pair<uint32_t, uint32_t>, uint32_t, pair_hash> distinct_links_freq;
-    std::unordered_map<uint32_t, uint32_t> indegree;
-    std::unordered_map<uint32_t, uint32_t> outdegree;
+    /* count number of distinct abundances */
+    std::unordered_set<uint32_t> distinct_abundances;
 
     auto parse_header = [&]() {
         if (sequence.empty()) return;
@@ -110,33 +105,6 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
 
         constexpr bool sign = true;
         data.nodes.emplace_back(data.num_sequences, front, back, sign);
-
-        if (front != back) {
-            {
-                auto it = indegree.find(back);
-                if (it != indegree.cend()) {
-                    (*it).second += 1;
-                } else {
-                    indegree[back] = 1;
-                }
-            }
-            {
-                auto it = outdegree.find(front);
-                if (it != outdegree.cend()) {
-                    (*it).second += 1;
-                } else {
-                    outdegree[front] = 1;
-                }
-            }
-            {
-                auto it = distinct_links_freq.find({front, back});
-                if (it != distinct_links_freq.cend()) {
-                    (*it).second += 1;
-                } else {
-                    distinct_links_freq[{front, back}] = 1;
-                }
-            }
-        }
     };
 
     while (!is.eof()) {
@@ -163,9 +131,25 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
 
     assert(data.nodes.size() == data.num_sequences);
     assert(data.num_runs_abundances >= data.num_sequences);
-    data.num_distinct_abundances = distinct_abundances.size();
-    data.num_distinct_links = distinct_links_freq.size();
 
+    std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, "
+              << num_kmers << " kmers" << std::endl;
+    std::cout << "found " << distinct_abundances.size() << " distinct abundances" << std::endl;
+    std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
+    std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs
+              << "/" << data.num_sequences << " ("
+              << (num_sequences_diff_abs * 100.0) / data.num_sequences << "%)" << std::endl;
+    std::cout << "num_sequences whose kmers all have the same abundance != mfa: "
+              << (data.num_sequences - num_sequences_diff_abs) << "/" << data.num_sequences << " ("
+              << ((data.num_sequences - num_sequences_diff_abs) * 100.0) / data.num_sequences
+              << "%)" << std::endl;
+    std::cout << "num_sequences whose kmers all have the same abundance == mfa: "
+              << num_sequences_all_mfa << "/" << (data.num_sequences - num_sequences_diff_abs)
+              << " ("
+              << (num_sequences_all_mfa * 100.0) / (data.num_sequences - num_sequences_diff_abs)
+              << "%)" << std::endl;
+
+    /* computation of lower bound to the number of final abundance runs */
     {
         /* (front_abundance, num_seqs_with_front=front_abundance) */
         std::unordered_map<uint32_t, uint32_t> front_abundance_freqs;
@@ -173,19 +157,45 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         /* (back_abundance, num_seqs_with_back=back_abundance) */
         std::unordered_map<uint32_t, uint32_t> back_abundance_freqs;
 
+        /* (front_abundance, num_edges_outgoing_from) */
+        std::unordered_map<uint32_t, uint32_t> outdegree;
+
+        /* (back_abundance, num_edges_incoming_in) */
+        std::unordered_map<uint32_t, uint32_t> indegree;
+
         for (auto const& node : data.nodes) {
-            auto it_front = front_abundance_freqs.find(node.front);
+            uint64_t front = node.front;
+            uint64_t back = node.back;
+            auto it_front = front_abundance_freqs.find(front);
             if (it_front == front_abundance_freqs.cend()) {
-                front_abundance_freqs[node.front] = 1;
+                front_abundance_freqs[front] = 1;
             } else {
                 (*it_front).second += 1;
             }
-            auto it_back = back_abundance_freqs.find(node.back);
+            auto it_back = back_abundance_freqs.find(back);
             if (it_back == back_abundance_freqs.cend()) {
-                back_abundance_freqs[node.back] = 1;
+                back_abundance_freqs[back] = 1;
             } else {
                 (*it_back).second += 1;
             }
+            if (front != back) {
+                {
+                    auto it = indegree.find(back);
+                    if (it != indegree.cend()) {
+                        (*it).second += 1;
+                    } else {
+                        indegree[back] = 1;
+                    }
+                }
+                {
+                    auto it = outdegree.find(front);
+                    if (it != outdegree.cend()) {
+                        (*it).second += 1;
+                    } else {
+                        outdegree[front] = 1;
+                    }
+                }
+            }
         }
 
         std::vector<uint32_t> distinct_abundances_vec;
@@ -193,7 +203,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         for (auto x : distinct_abundances) distinct_abundances_vec.push_back(x);
         std::sort(distinct_abundances_vec.begin(), distinct_abundances_vec.end());
 
-        uint64_t will_appear = 0;
+        uint64_t num_end_points = 0;
         for (auto ab : distinct_abundances_vec) {
             int64_t freq_front = 0;
             int64_t freq_back = 0;
@@ -205,9 +215,6 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
             /* abundance only appears for internal kmers */
             if (freq_front == 0 and freq_back == 0) continue;
 
-            // std::cout << "ab:" << ab << " - freq_front:" << freq_front
-            //           << " - freq_back:" << freq_back;
-
             uint64_t in_count = 0;
             uint64_t out_count = 0;
             auto it_in_count = indegree.find(ab);
@@ -215,106 +222,33 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
             if (it_in_count != indegree.cend()) { in_count = (*it_in_count).second; }
             if (it_out_count != outdegree.cend()) { out_count = (*it_out_count).second; }
 
-            // std::cout << " - indegree:" << in_count << " - outdegree:" << out_count;
+            // std::cout << "ab:" << ab << " - freq_front:" << freq_front
+            //           << " - freq_back:" << freq_back << " - indegree:" << in_count
+            //           << " - outdegree:" << out_count << std::endl;
 
-            /* special case */
+            /* special case: abundance will appear as singletion, so count twice */
             if (in_count == 0 and out_count == 0) {
-                // std::cout << " - **2**\n";
-                will_appear += 2;  // will appear as singleton, so count twice
+                num_end_points += 2;
                 continue;
             }
 
+            /* if the excess of frequency is odd, then the abundance will appear as end-point */
             uint64_t diff = std::abs(freq_front - freq_back);
-            if (diff % 2 == 0) {
-                // std::cout << " - 0\n";
-            } else {
-                // std::cout << " - 1\n";
-                will_appear += 1;
-            }
+            if (diff % 2 == 1) num_end_points += 1;
         }
 
+        uint64_t num_distinct_abundances = distinct_abundances.size();
         uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size();
-        std::cout << "will_appear = " << will_appear / 2 << std::endl;
-        std::cout << "computed lower bound = " << (num_runs_abundances_internal + will_appear / 2)
-                  << std::endl;
+        std::cout << "num_end_points = " << num_end_points / 2 << std::endl;
+        std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl;
+        std::cout << "num_runs_abundances_internal = " << num_runs_abundances_internal << std::endl;
 
-        // uint64_t cuts = 0;
-        // for (uint64_t i = 0; i != distinct_abundances_vec.size(); ++i) {
-        //     uint64_t front = distinct_abundances_vec[i];
-        //     if (i + 1 == distinct_abundances_vec.size()) break;
-        //     uint64_t back = distinct_abundances_vec[i + 1];
-
-        //     std::cout << "trying to glue [" << front << "," << back << "]; ";
-
-        //     int64_t freq_forward = 0;  // signed
-        //     int64_t freq_bckward = 0;  // signed
-        //     auto it_forward = distinct_links_freq.find({front, back});
-        //     auto it_bckward = distinct_links_freq.find({back, front});
-        //     if (it_forward != distinct_links_freq.cend()) { freq_forward = (*it_forward).second;
-        //     } if (it_bckward != distinct_links_freq.cend()) { freq_bckward =
-        //     (*it_bckward).second; }
-
-        //     std::cout << "freq of link [" << front << "," << back << "] = " << freq_forward << ";
-        //     "; std::cout << "freq of link [" << back << "," << front << "] = " << freq_bckward <<
-        //     "; ";
-
-        //     /* if the difference in abs value is even, then the link
-        //     disappears and we cannot glue anymore, i.e., we originate a cut */
-        //     uint64_t diff = std::abs(freq_forward - freq_bckward);
-        //     if (diff % 2 == 0) {
-        //         // std::cout << "NOT GLUEABLE";
-        //         // cuts += 1;
-
-        //         int64_t freq_front = 0;  // signed
-        //         int64_t freq_back = 0;   // signed
-        //         auto it_front = front_abundance_freqs.find(front);
-        //         auto it_back = back_abundance_freqs.find(front);
-        //         if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second;
-        //         } if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; }
-
-        //         bool front_appears = (std::abs(freq_front - freq_back) % 2 != 0);
-        //         // or (freq_front == 1 and freq_back == 1);
-
-        //         freq_front = 0;  // signed
-        //         freq_back = 0;   // signed
-        //         it_front = front_abundance_freqs.find(back);
-        //         it_back = back_abundance_freqs.find(back);
-        //         if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second;
-        //         } if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; }
-
-        //         bool back_appears = (std::abs(freq_front - freq_back) % 2 != 0);
-        //         // or (freq_front == 1 and freq_back == 1);
-
-        //         if (front_appears and back_appears) {  // and link is absent
-        //             std::cout << "NOT GLUEABLE";
-        //             cuts += 1;
-        //         }
-
-        //     } else {
-        //         std::cout << "glueable";
-        //     }
-
-        //     std::cout << '\n';
-        // }
-        // std::cout << "cuts = " << cuts << std::endl;
+        uint64_t R_lo =
+            std::max<uint64_t>(num_distinct_abundances, num_runs_abundances_internal + 1);
+        uint64_t R_hi = num_runs_abundances_internal + num_end_points / 2;
+        std::cout << "R_lo = " << R_lo << std::endl;
+        std::cout << "R_hi = " << R_hi << std::endl;
     }
-
-    std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, "
-              << num_kmers << " kmers" << std::endl;
-    std::cout << "found " << data.num_distinct_abundances << " distint abundances" << std::endl;
-    std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
-    std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs
-              << "/" << data.num_sequences << " ("
-              << (num_sequences_diff_abs * 100.0) / data.num_sequences << "%)" << std::endl;
-    std::cout << "num_sequences whose kmers all have the same abundance != mfa: "
-              << (data.num_sequences - num_sequences_diff_abs) << "/" << data.num_sequences << " ("
-              << ((data.num_sequences - num_sequences_diff_abs) * 100.0) / data.num_sequences
-              << "%)" << std::endl;
-    std::cout << "num_sequences whose kmers all have the same abundance == mfa: "
-              << num_sequences_all_mfa << "/" << (data.num_sequences - num_sequences_diff_abs)
-              << " ("
-              << (num_sequences_all_mfa * 100.0) / (data.num_sequences - num_sequences_diff_abs)
-              << "%)" << std::endl;
 }
 
 permute_data parse_file(std::string const& filename, build_configuration const& build_config) {
@@ -603,119 +537,6 @@ int main(int argc, char** argv) {
 
     auto data = parse_file(input_filename, build_config);
 
-    {
-        /* R_lo = max(c,r-p+1)
-           where c = num. distinct abundances
-                 r = num. runs in the abundances
-                 p = num. sequences */
-        uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size();
-        std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl;
-        std::cout << "num_runs_abundances_internal = " << num_runs_abundances_internal << std::endl;
-        std::cout << "num_distinct_links = " << data.num_distinct_links << std::endl;
-
-        uint64_t R_lo = data.num_runs_abundances - data.nodes.size() + 1;
-        uint64_t R_hi = R_lo;
-        if (R_lo < data.num_distinct_abundances) {
-            /* clearly we cannot have less than data.num_distinct_abundances runs */
-            R_lo = data.num_distinct_abundances;
-            // R_hi = data.num_runs_abundances_internal;
-            R_hi = data.num_distinct_abundances + num_runs_abundances_internal -
-                   data.num_distinct_links;
-        }
-        // else {
-        // We assume there are less than 2^32 sequences and that
-        // the largest abundance fits into a 32-bit uint.
-
-        /* (front_abundance, num_seqs_with_front=front_abundance) */
-        std::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>> front_abundance_freqs;
-
-        /* (back_abundance, num_seqs_with_back=back_abundance) */
-        std::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>> back_abundance_freqs;
-
-        uint64_t offset = 0;
-        for (auto const& node : data.nodes) {
-            auto it_front = front_abundance_freqs.find(node.front);
-            if (it_front == front_abundance_freqs.cend()) {
-                front_abundance_freqs[node.front] = {1, offset};
-            } else {
-                (*it_front).second.first += 1;
-                (*it_front).second.second = -1;
-            }
-            auto it_back = back_abundance_freqs.find(node.back);
-            if (it_back == back_abundance_freqs.cend()) {
-                back_abundance_freqs[node.back] = {1, offset};
-            } else {
-                (*it_back).second.first += 1;
-                (*it_back).second.second = -1;
-            }
-            ++offset;
-        }
-
-        // std::cout << "front_abundance_freqs.size() = " << front_abundance_freqs.size() <<
-        // std::endl; std::cout << "back_abundance_freqs.size() = " << back_abundance_freqs.size()
-        // << std::endl;
-
-        // for (auto const& p : back_abundance_freqs) {
-        //     std::cout << "back-ab:" << p.first << " - freq:" << p.second.first << std::endl;
-        // }
-        // for (auto const& p : front_abundance_freqs) {
-        //     std::cout << "front-ab:" << p.first << " - freq:" << p.second.first << std::endl;
-        // }
-
-        for (auto const& p : back_abundance_freqs) {
-            /* if there is only one occurrence of this back abundance
-               and it does not appear in any front abundance, then it cannot be matched */
-            if (p.second.first == 1) {
-                auto it = front_abundance_freqs.find(p.first);
-                if (it == front_abundance_freqs.cend()) { R_hi += 1; }
-                // else {
-                //     if ((*it).second.first == 1 and p.second.second == (*it).second.second) {
-                //         // uint64_t offset = p.second.second;
-                //         // auto node = data.nodes[offset];
-                //         // std::cout << "node " << node.id << ":[" << node.front << "," <<
-                //         // node.back
-                //         //           << "]" << std::endl;
-                //         R_hi += 1;
-                //     }
-                // }
-            }
-            // else {
-            //     auto it = front_abundance_freqs.find(p.first);
-            //     if (it != front_abundance_freqs.cend()) {
-            //         // abundance is present both as back and front
-
-            //         // and appears an even number of times, will originate a walk
-            //         //   with both sides unmatchable
-            //         if ((*it).second.first % 2 == 0 and p.second.first % 2 == 0) { R_hi += 1; }
-            //     }
-            // }
-        }
-
-        // for (auto const& p : front_abundance_freqs) {
-        //     if (p.second.first == 1 and
-        //         back_abundance_freqs.find(p.first) == back_abundance_freqs.cend()) {
-        //         R_hi += 1;
-        //     }
-        // }
-
-        /*
-        note that we do NOT do the same for front_abundance_freqs,
-        namely the following piece of code:
-
-            for (auto const& p : front_abundance_freqs) {
-                if (p.second == 1 and
-                    back_abundance_freqs.find(p.first) == back_abundance_freqs.cend()) {
-                    R_hi += 1;
-                }
-            }
-
-        because otherwise we will end up in potentially counting twice the number of cuts.
-        */
-        // }
-        std::cout << "R_lo = " << R_lo << std::endl;
-        std::cout << "R_hi = " << R_hi << std::endl;
-    }
-
     {
         /* compute cover */
         cover c(data.num_sequences, data.num_runs_abundances);

From 223b38ab2450fd025700806db0e696694eff9f11 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Wed, 30 Mar 2022 12:06:53 +0200
Subject: [PATCH 37/40] minor

---
 src/permute.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/permute.cpp b/src/permute.cpp
index 3a84f71..1a2ebdf 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -205,8 +205,8 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
 
         uint64_t num_end_points = 0;
         for (auto ab : distinct_abundances_vec) {
-            int64_t freq_front = 0;
-            int64_t freq_back = 0;
+            uint64_t freq_front = 0;
+            uint64_t freq_back = 0;
             auto it_front = front_abundance_freqs.find(ab);
             auto it_back = back_abundance_freqs.find(ab);
             if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second; }
@@ -226,15 +226,15 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
             //           << " - freq_back:" << freq_back << " - indegree:" << in_count
             //           << " - outdegree:" << out_count << std::endl;
 
-            /* special case: abundance will appear as singletion, so count twice */
+            /* special case: abundance will appear as singleton, so count twice */
             if (in_count == 0 and out_count == 0) {
                 num_end_points += 2;
                 continue;
             }
 
             /* if the excess of frequency is odd, then the abundance will appear as end-point */
-            uint64_t diff = std::abs(freq_front - freq_back);
-            if (diff % 2 == 1) num_end_points += 1;
+            uint64_t sum = freq_front + freq_back;
+            if (sum % 2 == 1) num_end_points += 1;
         }
 
         uint64_t num_distinct_abundances = distinct_abundances.size();

From 0ccbe1aca89103fa3275cd350255d311684f1d11 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Thu, 31 Mar 2022 10:26:53 +0200
Subject: [PATCH 38/40] simplified computation of lower bound

---
 include/builder/cover.hpp |   6 +--
 src/permute.cpp           | 102 ++++++++++++--------------------------
 2 files changed, 34 insertions(+), 74 deletions(-)

diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index 3dc681d..abe84f3 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -357,11 +357,7 @@ struct cover {
         int r = rounds.size() - 1;
         const auto& walks = rounds[r];
         uint64_t num_sequences = 0;
-        for (uint64_t w = 0; w != walks.size(); ++w) {
-            assert(walks[w].size() == 1);
-            assert(walks[w].front().sign == true);
-            num_sequences += visit(w, r, true, out);
-        }
+        for (uint64_t w = 0; w != walks.size(); ++w) num_sequences += visit(w, r, true, out);
         if (num_sequences != m_num_sequences) {
             std::cerr << "Error: expected to write " << m_num_sequences << " but written "
                       << num_sequences << std::endl;
diff --git a/src/permute.cpp b/src/permute.cpp
index 1a2ebdf..44904b2 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -151,101 +151,65 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
 
     /* computation of lower bound to the number of final abundance runs */
     {
-        /* (front_abundance, num_seqs_with_front=front_abundance) */
-        std::unordered_map<uint32_t, uint32_t> front_abundance_freqs;
-
-        /* (back_abundance, num_seqs_with_back=back_abundance) */
-        std::unordered_map<uint32_t, uint32_t> back_abundance_freqs;
-
-        /* (front_abundance, num_edges_outgoing_from) */
-        std::unordered_map<uint32_t, uint32_t> outdegree;
+        struct counts {
+            uint32_t freq;
+            uint32_t indegree;
+            uint32_t outdegree;
+        };
 
-        /* (back_abundance, num_edges_incoming_in) */
-        std::unordered_map<uint32_t, uint32_t> indegree;
+        std::unordered_map<uint32_t, counts> endpoint_counts;
 
         for (auto const& node : data.nodes) {
             uint64_t front = node.front;
             uint64_t back = node.back;
-            auto it_front = front_abundance_freqs.find(front);
-            if (it_front == front_abundance_freqs.cend()) {
-                front_abundance_freqs[front] = 1;
-            } else {
-                (*it_front).second += 1;
+            {
+                auto it = endpoint_counts.find(front);
+                if (it == endpoint_counts.cend()) {
+                    endpoint_counts[front] = {1, 0, 0};
+                } else {
+                    (*it).second.freq += 1;
+                }
             }
-            auto it_back = back_abundance_freqs.find(back);
-            if (it_back == back_abundance_freqs.cend()) {
-                back_abundance_freqs[back] = 1;
-            } else {
-                (*it_back).second += 1;
+            {
+                auto it = endpoint_counts.find(back);
+                if (it == endpoint_counts.cend()) {
+                    endpoint_counts[back] = {1, 0, 0};
+                } else {
+                    (*it).second.freq += 1;
+                }
             }
             if (front != back) {
-                {
-                    auto it = indegree.find(back);
-                    if (it != indegree.cend()) {
-                        (*it).second += 1;
-                    } else {
-                        indegree[back] = 1;
-                    }
-                }
-                {
-                    auto it = outdegree.find(front);
-                    if (it != outdegree.cend()) {
-                        (*it).second += 1;
-                    } else {
-                        outdegree[front] = 1;
-                    }
-                }
+                endpoint_counts[back].indegree += 1;
+                endpoint_counts[front].outdegree += 1;
             }
         }
 
-        std::vector<uint32_t> distinct_abundances_vec;
-        distinct_abundances_vec.reserve(distinct_abundances.size());
-        for (auto x : distinct_abundances) distinct_abundances_vec.push_back(x);
-        std::sort(distinct_abundances_vec.begin(), distinct_abundances_vec.end());
-
-        uint64_t num_end_points = 0;
-        for (auto ab : distinct_abundances_vec) {
-            uint64_t freq_front = 0;
-            uint64_t freq_back = 0;
-            auto it_front = front_abundance_freqs.find(ab);
-            auto it_back = back_abundance_freqs.find(ab);
-            if (it_front != front_abundance_freqs.cend()) { freq_front = (*it_front).second; }
-            if (it_back != back_abundance_freqs.cend()) { freq_back = (*it_back).second; }
-
-            /* abundance only appears for internal kmers */
-            if (freq_front == 0 and freq_back == 0) continue;
-
-            uint64_t in_count = 0;
-            uint64_t out_count = 0;
-            auto it_in_count = indegree.find(ab);
-            auto it_out_count = outdegree.find(ab);
-            if (it_in_count != indegree.cend()) { in_count = (*it_in_count).second; }
-            if (it_out_count != outdegree.cend()) { out_count = (*it_out_count).second; }
-
-            // std::cout << "ab:" << ab << " - freq_front:" << freq_front
-            //           << " - freq_back:" << freq_back << " - indegree:" << in_count
-            //           << " - outdegree:" << out_count << std::endl;
+        uint64_t num_endpoints = 0;
+        for (auto const& p : endpoint_counts) {
+            uint64_t freq = p.second.freq;
+            uint64_t indegree = p.second.indegree;
+            uint64_t outdegree = p.second.outdegree;
 
             /* special case: abundance will appear as singleton, so count twice */
-            if (in_count == 0 and out_count == 0) {
-                num_end_points += 2;
+            if (indegree == 0 and outdegree == 0) {
+                num_endpoints += 2;
                 continue;
             }
 
             /* if the excess of frequency is odd, then the abundance will appear as end-point */
-            uint64_t sum = freq_front + freq_back;
-            if (sum % 2 == 1) num_end_points += 1;
+            if (freq % 2 == 1) num_endpoints += 1;
         }
 
         uint64_t num_distinct_abundances = distinct_abundances.size();
         uint64_t num_runs_abundances_internal = data.num_runs_abundances - data.nodes.size();
-        std::cout << "num_end_points = " << num_end_points / 2 << std::endl;
+        uint64_t num_walks = num_endpoints / 2;
+        std::cout << "(estimated) num_walks = " << num_walks << std::endl;
         std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl;
         std::cout << "num_runs_abundances_internal = " << num_runs_abundances_internal << std::endl;
 
         uint64_t R_lo =
             std::max<uint64_t>(num_distinct_abundances, num_runs_abundances_internal + 1);
-        uint64_t R_hi = num_runs_abundances_internal + num_end_points / 2;
+        uint64_t R_hi = num_runs_abundances_internal + num_walks;
         std::cout << "R_lo = " << R_lo << std::endl;
         std::cout << "R_hi = " << R_hi << std::endl;
     }

From 350ee9215c4b4098135e4dea6c4a7b912cf0fd68 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Sat, 2 Apr 2022 15:12:38 +0200
Subject: [PATCH 39/40] some refactoring

---
 external/pthash           |   2 +-
 include/abundances.hpp    | 109 +++-----------------------------------
 include/builder/build.cpp |  81 ++++------------------------
 include/builder/cover.hpp |  13 ++++-
 include/ef_sequence.hpp   |   9 ++--
 include/util.hpp          |   6 +--
 src/build.cpp             |   3 --
 src/permute.cpp           |   8 +--
 8 files changed, 37 insertions(+), 194 deletions(-)

diff --git a/external/pthash b/external/pthash
index f0a0fb9..3f1deb0 160000
--- a/external/pthash
+++ b/external/pthash
@@ -1 +1 @@
-Subproject commit f0a0fb929ecfe1d925d88bc9bf41f7b4eaf0516b
+Subproject commit 3f1deb0e1ec132274af8ef55478915209f608b2c
diff --git a/include/abundances.hpp b/include/abundances.hpp
index 7a5e555..4fd4536 100644
--- a/include/abundances.hpp
+++ b/include/abundances.hpp
@@ -1,20 +1,17 @@
 #pragma once
 
 #include <vector>
-#include <unordered_map>  // count the distinct abundances
+#include <unordered_map>  // count the distinct abundances with freq information
+
 #include "ef_sequence.hpp"
 
 namespace sshash {
 
 struct abundances {
     struct builder {
-        builder() : m_most_frequent_abundance(0) {}
+        builder() {}
 
-        void init(uint64_t most_frequent_abundance) {
-            m_most_frequent_abundance = most_frequent_abundance;
-            m_kmer_id_interval_lengths.push_back(0);
-            m_abundance_interval_lengths.push_back(0);
-        }
+        void init() { m_abundance_interval_lengths.push_back(0); }
 
         void eat(uint64_t abundance) {
             assert(abundance > 0);
@@ -26,28 +23,17 @@ struct abundances {
             }
         }
 
-        void push_kmer_id_interval(uint64_t value, uint64_t length) {
-            m_kmer_id_interval_values.push_back(value);
-            m_kmer_id_interval_lengths.push_back(m_kmer_id_interval_lengths.back() + length);
-        }
-
         void push_abundance_interval(uint64_t value, uint64_t length) {
             m_abundance_interval_values.push_back(value);
             m_abundance_interval_lengths.push_back(m_abundance_interval_lengths.back() + length);
         }
 
-        uint64_t num_kmer_id_intervals() const { return m_kmer_id_interval_values.size(); }
         uint64_t num_abundance_intervals() const { return m_abundance_interval_values.size(); }
 
         void finalize(uint64_t num_kmers) {
-            assert(
-                std::is_sorted(m_kmer_id_interval_values.begin(), m_kmer_id_interval_values.end()));
-            assert(std::is_sorted(m_kmer_id_interval_lengths.begin(),
-                                  m_kmer_id_interval_lengths.end()));
             assert(std::is_sorted(m_abundance_interval_lengths.begin(),
                                   m_abundance_interval_lengths.end()));
 
-            std::cout << "num_kmer_id_intervals " << num_kmer_id_intervals() << std::endl;
             std::cout << "num_abundance_intervals " << num_abundance_intervals() << std::endl;
 
             uint64_t num_distinct_abundances = m_abundances_map.size();
@@ -80,27 +66,10 @@ struct abundances {
                 return x.first < y.first;
             });
 
-            if (m_kmer_id_interval_values.size() != 0) {  // optimize_mfa
-                /* If this test fails, then we need to change the value of
-                    constants::most_frequent_abundance. */
-                if (m_most_frequent_abundance != m_abundances.front().first) {
-                    throw std::runtime_error("the most frequent abundance is not " +
-                                             std::to_string(constants::most_frequent_abundance) +
-                                             " but " + std::to_string(m_abundances.front().first));
-                }
-            }
-
             uint64_t rest = num_kmers - m_abundances.front().second;
             std::cout << "kmers that do not have the most frequent ab: " << rest << " ("
                       << (rest * 100.0) / num_kmers << "%)" << std::endl;
 
-            if (m_kmer_id_interval_values.size() != 0) {  // optimize_mfa
-                std::cout << "cumulative_kmer_id_interval_lengths "
-                          << m_kmer_id_interval_lengths.back() << '/' << rest << std::endl;
-                std::cout << "cumulative_abundance_interval_lengths "
-                          << m_abundance_interval_lengths.back() << '/' << rest << std::endl;
-            }
-
             m_abundance_dictionary_builder.resize(num_distinct_abundances,
                                                   std::ceil(std::log2(largest_ab + 1)));
             for (uint64_t id = 0; id != num_distinct_abundances; ++id) {
@@ -111,13 +80,6 @@ struct abundances {
         }
 
         void build(abundances& index) {
-            std::swap(index.m_most_frequent_abundance, m_most_frequent_abundance);
-
-            index.m_kmer_id_interval_values.encode(m_kmer_id_interval_values.begin(),
-                                                   m_kmer_id_interval_values.size());
-            index.m_kmer_id_interval_lengths.encode(m_kmer_id_interval_lengths.begin(),
-                                                    m_kmer_id_interval_lengths.size());
-
             uint64_t num_distinct_abundances = m_abundance_dictionary_builder.size();
             pthash::compact_vector::builder abundance_interval_values;
             abundance_interval_values.resize(
@@ -147,7 +109,7 @@ struct abundances {
             m_abundance_dictionary_builder.build(index.m_abundance_dictionary);
         }
 
-        // return the average empirical entropy per abundance
+        /* return the average empirical entropy per abundance */
         double print_info(uint64_t num_kmers) {
             assert(!m_abundances.empty());
             double expected_ab_value = 0.0;
@@ -169,15 +131,10 @@ struct abundances {
         }
 
     private:
-        uint64_t m_most_frequent_abundance;
-
         /* (abundance,frequency) pairs during construction, then (abundance,id) after sorting */
         std::unordered_map<uint64_t, uint64_t> m_abundances_map;
         std::vector<std::pair<uint64_t, uint64_t>> m_abundances;  // (abundance,frequency)
 
-        std::vector<uint64_t> m_kmer_id_interval_values;
-        std::vector<uint64_t> m_kmer_id_interval_lengths;
-
         std::vector<uint64_t> m_abundance_interval_values;
         std::vector<uint64_t> m_abundance_interval_lengths;
 
@@ -187,48 +144,18 @@ struct abundances {
     bool empty() const { return m_abundance_dictionary.size() == 0; }
 
     uint64_t abundance(uint64_t kmer_id) const {
-        uint64_t rank = kmer_id;
-
-        if (m_kmer_id_interval_values.size() != 0) {  // optimize_mfa
-            bool is_present = false;
-            auto [pos, val, prev] = m_kmer_id_interval_values.next_geq_neighbourhood(kmer_id);
-            if (val == kmer_id) {
-                is_present = true;
-                rank = m_kmer_id_interval_lengths.access(pos);
-            } else {
-                if (pos > 0) {
-                    rank = m_kmer_id_interval_lengths.access(pos - 1);
-                    uint64_t length = m_kmer_id_interval_lengths.access(pos) - rank;
-                    if (kmer_id < prev + length) {
-                        is_present = true;
-                        assert(kmer_id >= prev);
-                        rank += kmer_id - prev;
-                    }
-                }
-            }
-            if (!is_present) return m_most_frequent_abundance;
-        }
-
-        uint64_t i = m_abundance_interval_lengths.prev_leq(rank);
+        uint64_t i = m_abundance_interval_lengths.prev_leq(kmer_id);
         uint64_t id = m_abundance_interval_values.access(i);
         uint64_t abundance = m_abundance_dictionary.access(id);
-
         return abundance;
     }
 
     uint64_t num_bits() const {
-        return sizeof(m_most_frequent_abundance) * 8 + m_kmer_id_interval_values.num_bits() +
-               m_kmer_id_interval_lengths.num_bits() + m_abundance_interval_values.bytes() * 8 +
-               m_abundance_interval_lengths.num_bits() + m_abundance_dictionary.bytes() * 8;
+        return m_abundance_interval_values.bytes() * 8 + m_abundance_interval_lengths.num_bits() +
+               m_abundance_dictionary.bytes() * 8;
     }
 
     void print_space_breakdown(uint64_t num_kmers) const {
-        std::cout << "    kmer_id_interval_values: "
-                  << static_cast<double>(m_kmer_id_interval_values.num_bits()) / num_kmers
-                  << " [bits/kmer]\n";
-        std::cout << "    kmer_id_interval_lengths: "
-                  << static_cast<double>(m_kmer_id_interval_lengths.num_bits()) / num_kmers
-                  << " [bits/kmer]\n";
         std::cout << "    abundance_interval_values: "
                   << static_cast<double>(m_abundance_interval_values.bytes() * 8) / num_kmers
                   << " [bits/kmer]\n";
@@ -242,34 +169,14 @@ struct abundances {
 
     template <typename Visitor>
     void visit(Visitor& visitor) {
-        visitor.visit(m_most_frequent_abundance);
-        visitor.visit(m_kmer_id_interval_values);
-        visitor.visit(m_kmer_id_interval_lengths);
         visitor.visit(m_abundance_interval_values);
         visitor.visit(m_abundance_interval_lengths);
         visitor.visit(m_abundance_dictionary);
     }
 
 private:
-    uint64_t m_most_frequent_abundance;
-
-    /*****
-       We model abundances as two lists of intervals.
-       Each interval is a pair (value,length).
-       - First list: kmer_ids list. In this case, a pair (value,length)
-       represents all kmer_ids = value, value+1, value+2, ..., value+length-1.
-       - Second list: abundance list. In this case, a pair (value,length)
-       represents that the abundance [value] repeats for [length] times.
-    */
-
-    ef_sequence<true> m_kmer_id_interval_values;
-    ef_sequence<false> m_kmer_id_interval_lengths;
-
     pthash::compact_vector m_abundance_interval_values;
     ef_sequence<true> m_abundance_interval_lengths;
-    /***/
-
-    /* Abundance dictionary, listing all distinct abundances sorted by decreasing frequency. */
     pthash::compact_vector m_abundance_dictionary;
 };
 
diff --git a/include/builder/build.cpp b/include/builder/build.cpp
index 3abb581..4931b06 100644
--- a/include/builder/build.cpp
+++ b/include/builder/build.cpp
@@ -80,30 +80,11 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
 
     uint64_t seq_len = 0;
     uint64_t sum_of_abundances = 0;
-    data.abundances_builder.init(constants::most_frequent_abundance);
-
-    /* intervals of kmer_ids */
-    uint64_t kmer_id_value = constants::invalid;
-    uint64_t kmer_id_length = 1;
+    data.abundances_builder.init();
 
     /* intervals of abundances */
     uint64_t ab_value = constants::invalid;
-    uint64_t ab_length = 1;
-
-    uint64_t num_sequences_diff_abs = 0;  // num sequences whose kmers have different abundances
-    uint64_t num_sequences_all_mfa = 0;   // num sequences whose kmers have same abundance == mfa
-
-    auto process_abundance = [&](uint64_t ab) {
-        if (ab == ab_value) {
-            ab_length += 1;
-        } else {
-            if (ab_value != constants::invalid) {
-                data.abundances_builder.push_abundance_interval(ab_value, ab_length);
-            }
-            ab_value = ab;
-            ab_length = 1;
-        }
-    };
+    uint64_t ab_length = 0;
 
     auto parse_header = [&]() {
         if (sequence.empty()) return;
@@ -141,53 +122,23 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         expect(sequence[i + 4], ':');
         i += 5;
 
-        kmer_id_value = constants::invalid;
-        kmer_id_length = 1;
-
-        bool kmers_have_all_mfa = true;
-        bool kmers_have_different_abundances = false;
-
-        for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid;
-             j != seq_len - k + 1; ++j, ++num_kmers) {
+        for (uint64_t j = 0; j != seq_len - k + 1; ++j) {
             uint64_t ab = std::strtoull(sequence.data() + i, nullptr, 10);
             i = sequence.find_first_of(' ', i) + 1;
 
-            if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false;
-            if (j > 0) {
-                if (ab != prev_ab) { kmers_have_different_abundances = true; }
-            }
-            prev_ab = ab;
-
             data.abundances_builder.eat(ab);
             sum_of_abundances += ab;
 
-            if (build_config.optimize_mfa) {
-                if (ab != constants::most_frequent_abundance) {
-                    if (kmer_id_value == constants::invalid) {
-                        kmer_id_value = num_kmers;
-                        kmer_id_length = 1;
-                    } else {
-                        if (num_kmers == kmer_id_value + kmer_id_length) kmer_id_length += 1;
-                    }
-                    process_abundance(ab);
-                } else {
-                    if (kmer_id_value != constants::invalid) {
-                        data.abundances_builder.push_kmer_id_interval(kmer_id_value,
-                                                                      kmer_id_length);
-                    }
-                    kmer_id_value = constants::invalid;
-                }
+            if (ab == ab_value) {
+                ab_length += 1;
             } else {
-                process_abundance(ab);
+                if (ab_value != constants::invalid) {
+                    data.abundances_builder.push_abundance_interval(ab_value, ab_length);
+                }
+                ab_value = ab;
+                ab_length = 1;
             }
         }
-
-        if (kmer_id_value != constants::invalid) {
-            data.abundances_builder.push_kmer_id_interval(kmer_id_value, kmer_id_length);
-        }
-
-        num_sequences_diff_abs += kmers_have_different_abundances;
-        num_sequences_all_mfa += kmers_have_all_mfa;
     };
 
     while (!is.eof()) {
@@ -255,18 +206,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
 
     if (build_config.store_abundances) {
         std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
-        std::cout << "num_sequences whose kmers have different abundances: "
-                  << num_sequences_diff_abs << "/" << num_sequences << " ("
-                  << (num_sequences_diff_abs * 100.0) / num_sequences << "%)" << std::endl;
-        std::cout << "num_sequences whose kmers all have the same abundance != mfa: "
-                  << (num_sequences - num_sequences_diff_abs) << "/" << num_sequences << " ("
-                  << ((num_sequences - num_sequences_diff_abs) * 100.0) / num_sequences << "%)"
-                  << std::endl;
-        std::cout << "num_sequences whose kmers all have the same abundance == mfa: "
-                  << num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs)
-                  << " ("
-                  << (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs)
-                  << "%)" << std::endl;
         data.abundances_builder.push_abundance_interval(ab_value, ab_length);
         data.abundances_builder.finalize(data.num_kmers);
     }
diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index abe84f3..9296e0f 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -8,6 +8,10 @@
 
 namespace sshash {
 
+namespace constants {
+constexpr uint64_t most_frequent_abundance = 1;  // usual value
+}
+
 struct node {
     // We assume there are less than 2^32 sequences and that
     // the largest abundance fits into a 32-bit uint.
@@ -59,13 +63,20 @@ struct cover {
                 optimize for the most frequent case:
                 push nodes of the form v:[mfa,mfa] to the bottom, and remove them
                 forming a single (usually, long) walk.
-                Warning: here we are assuming the mfa is also the *smallest* abundance.
             */
 
             essentials::timer_type timer;
             timer.start();
 
             std::sort(nodes.begin(), nodes.end(), [](auto const& x, auto const& y) {
+                if (x.front == constants::most_frequent_abundance and
+                    x.back == constants::most_frequent_abundance) {
+                    return false;
+                }
+                if (y.front == constants::most_frequent_abundance and
+                    y.back == constants::most_frequent_abundance) {
+                    return true;
+                }
                 if (x.front != y.front) return x.front > y.front;
                 if (x.back != y.back) return x.back > y.back;
                 return x.id > y.id;
diff --git a/include/ef_sequence.hpp b/include/ef_sequence.hpp
index 3d1ee93..6fd17ae 100644
--- a/include/ef_sequence.hpp
+++ b/include/ef_sequence.hpp
@@ -8,13 +8,13 @@ namespace sshash {
 
 template <bool index_zeros = false>
 struct ef_sequence {
-    ef_sequence() {}
+    ef_sequence() : m_universe(0) {}
 
     template <typename Iterator>
     void encode(Iterator begin, uint64_t n) {
         if (n == 0) return;
         uint64_t u = *(begin + n - 1);
-        m_universe = u;  // universe is last element, "back"
+        m_universe = u;
 
         uint64_t l = uint64_t((n && u / n) ? pthash::util::msb(u / n) : 0);
         pthash::bit_vector_builder bvb_high_bits(n + (u >> l) + 1);
@@ -37,8 +37,8 @@ struct ef_sequence {
 
         pthash::bit_vector(&bvb_high_bits).swap(m_high_bits);
         cv_builder_low_bits.build(m_low_bits);
-        pthash::darray1(m_high_bits).swap(m_high_bits_d1);
-        if constexpr (index_zeros) { pthash::darray0(m_high_bits).swap(m_high_bits_d0); }
+        m_high_bits_d1.build(m_high_bits);
+        if constexpr (index_zeros) m_high_bits_d0.build(m_high_bits);
     }
 
     struct iterator {
@@ -55,7 +55,6 @@ struct ef_sequence {
         }
 
         bool good() const { return m_ef != nullptr; }
-
         bool has_next() const { return m_pos < m_ef->size(); }
 
         uint64_t next() {
diff --git a/include/util.hpp b/include/util.hpp
index 0727dac..6403374 100644
--- a/include/util.hpp
+++ b/include/util.hpp
@@ -17,7 +17,6 @@ constexpr uint64_t hashcode_bits = 64;
 constexpr double c = 3.0;  // for PThash
 constexpr uint64_t min_l = 6;
 constexpr uint64_t max_l = 12;
-constexpr uint64_t most_frequent_abundance = 1;  // usual value
 static const std::string default_tmp_dirname(".");
 }  // namespace constants
 
@@ -68,7 +67,6 @@ struct build_configuration {
 
         , canonical_parsing(false)
         , store_abundances(false)
-        , optimize_mfa(false)
         , verbose(true) {}
 
     uint64_t k;  // kmer size
@@ -80,15 +78,13 @@ struct build_configuration {
 
     bool canonical_parsing;
     bool store_abundances;
-    bool optimize_mfa;  // fma: "most frequent abundance"
     bool verbose;
 
     void print() const {
         std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l
                   << ", c = " << c
                   << ", canonical_parsing = " << (canonical_parsing ? "true" : "false")
-                  << ", store_abundances = " << (store_abundances ? "true" : "false")
-                  << ", optimize_mfa = " << (optimize_mfa ? "true" : "false") << std::endl;
+                  << ", store_abundances = " << (store_abundances ? "true" : "false") << std::endl;
     }
 };
 
diff --git a/src/build.cpp b/src/build.cpp
index 40adc6e..1477c9a 100644
--- a/src/build.cpp
+++ b/src/build.cpp
@@ -43,8 +43,6 @@ int main(int argc, char** argv) {
                "--canonical-parsing", true);
     parser.add("store_abundances", "Also store the abundances in compressed format.",
                "--abundances", true);
-    parser.add("optimize_mfa", "Turn on space optimization for most frequent abundance (mfa).",
-               "--optimize-mfa", true);
     parser.add("output_filename", "Output file name where the data structure will be serialized.",
                "-o", false);
     parser.add("check", "Check correctness after construction.", "--check", true);
@@ -68,7 +66,6 @@ int main(int argc, char** argv) {
     if (parser.parsed("c")) build_config.c = parser.get<double>("c");
     build_config.canonical_parsing = parser.get<bool>("canonical_parsing");
     build_config.store_abundances = parser.get<bool>("store_abundances");
-    build_config.optimize_mfa = parser.get<bool>("optimize_mfa");
     build_config.verbose = parser.get<bool>("verbose");
     build_config.print();
 
diff --git a/src/permute.cpp b/src/permute.cpp
index 44904b2..6844e5c 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -15,12 +15,6 @@ struct permute_data {
     std::vector<node> nodes;
 };
 
-struct pair_hash {
-    inline uint64_t operator()(std::pair<uint32_t, uint32_t> const& v) const {
-        return static_cast<uint64_t>(v.first) * 31 + static_cast<uint64_t>(v.second);
-    }
-};
-
 void parse_file(std::istream& is, permute_data& data, build_configuration const& build_config) {
     std::string sequence;
     uint64_t k = build_config.k;
@@ -288,7 +282,7 @@ void reverse_header(std::string const& input, std::string& output, uint64_t k) {
     }
 
     std::reverse(abundances.begin(), abundances.end());
-    for (auto abundance : abundances) { output.append(std::to_string(abundance) + " "); }
+    for (auto abundance : abundances) output.append(std::to_string(abundance) + " ");
 }
 
 void permute_and_write(std::istream& is, std::string const& output_filename,

From 2f3672c8caef46cdfc499849177ae6643318d007 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Sat, 2 Apr 2022 15:44:17 +0200
Subject: [PATCH 40/40] put Example 4 in the README

---
 README.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/README.md b/README.md
index f756267..7f34707 100644
--- a/README.md
+++ b/README.md
@@ -218,6 +218,28 @@ We see that the canonical dictionary is twice as fast as the regular dictionary
 for low-hit workloads,
 even on this tiny example, for only +0.4 bits/k-mer.
 
+### Example 4
+
+	./permute ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 -o ecoli_sakai.permuted.fa
+
+This command re-orders (and possibly reverse-complement) the strings in the collection as to *minimize* the number of runs in the abundances and, hence, optimize the encoding of the abundances.
+The result is saved to the file `ecoli_sakai.permuted.fa`.
+
+In this example for the E.Coli collection (Sakai strain) we reduce the number of runs in the abundances from 5820 to 3723.
+
+Then use the `build` command as usual to build the permuted collection:
+
+	./build ecoli_sakai.permuted.fa 31 13 --abundances --verbose
+	
+The index built on the permuted collection
+optimizes the storage space for the abundances which results in a 15.1X better space than the empirical entropy of the abundances.
+
+For reference, the index built on the original collection:
+
+	./build ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 13 --abundances --verbose
+
+already achieves a 12.4X better space than the empirical entropy.
+
 Input Files
 -----------