Merge pull request #4 from jermp/abundances_dev

Permute and reverse-complement the strings to minimize the number of abundance runs
jermp · Apr 2, 2022 · 562ff63 · 562ff63
2 parents 63d1252 + 2f3672c
commit 562ff63
Show file tree

Hide file tree

Showing 12 changed files with 1,027 additions and 196 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -42,3 +42,6 @@ target_link_libraries(check z)
 
 add_executable(query src/query.cpp)
 target_link_libraries(query z)
+
+add_executable(permute src/permute.cpp)
+target_link_libraries(permute z)
diff --git a/README.md b/README.md
@@ -218,6 +218,28 @@ We see that the canonical dictionary is twice as fast as the regular dictionary
 for low-hit workloads,
 even on this tiny example, for only +0.4 bits/k-mer.
 
+### Example 4
+
+	./permute ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 -o ecoli_sakai.permuted.fa
+
+This command re-orders (and possibly reverse-complement) the strings in the collection as to *minimize* the number of runs in the abundances and, hence, optimize the encoding of the abundances.
+The result is saved to the file `ecoli_sakai.permuted.fa`.
+
+In this example for the E.Coli collection (Sakai strain) we reduce the number of runs in the abundances from 5820 to 3723.
+
+Then use the `build` command as usual to build the permuted collection:
+
+	./build ecoli_sakai.permuted.fa 31 13 --abundances --verbose
+
+The index built on the permuted collection
+optimizes the storage space for the abundances which results in a 15.1X better space than the empirical entropy of the abundances.
+
+For reference, the index built on the original collection:
+
+	./build ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 13 --abundances --verbose
+
+already achieves a 12.4X better space than the empirical entropy.
+
 Input Files
 -----------
 

diff --git a/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz
diff --git a/external/pthash b/external/pthash
diff --git a/include/abundances.hpp b/include/abundances.hpp
@@ -1,20 +1,17 @@
 #pragma once
 
 #include <vector>
-#include <unordered_map>  // count the distinct abundances
+#include <unordered_map>  // count the distinct abundances with freq information
+
 #include "ef_sequence.hpp"
 
 namespace sshash {
 
 struct abundances {
     struct builder {
-        builder() : m_most_frequent_abundance(0) {}
+        builder() {}
 
-        void init(uint64_t most_frequent_abundance) {
-            m_most_frequent_abundance = most_frequent_abundance;
-            m_kmer_id_interval_lengths.push_back(0);
-            m_abundance_interval_lengths.push_back(0);
-        }
+        void init() { m_abundance_interval_lengths.push_back(0); }
 
         void eat(uint64_t abundance) {
             assert(abundance > 0);
@@ -26,28 +23,17 @@ struct abundances {
             }
         }
 
-        void push_kmer_id_interval(uint64_t value, uint64_t length) {
-            m_kmer_id_interval_values.push_back(value);
-            m_kmer_id_interval_lengths.push_back(m_kmer_id_interval_lengths.back() + length);
-        }
-
         void push_abundance_interval(uint64_t value, uint64_t length) {
             m_abundance_interval_values.push_back(value);
             m_abundance_interval_lengths.push_back(m_abundance_interval_lengths.back() + length);
         }
 
-        uint64_t num_kmer_id_intervals() const { return m_kmer_id_interval_values.size(); }
         uint64_t num_abundance_intervals() const { return m_abundance_interval_values.size(); }
 
         void finalize(uint64_t num_kmers) {
-            assert(
-                std::is_sorted(m_kmer_id_interval_values.begin(), m_kmer_id_interval_values.end()));
-            assert(std::is_sorted(m_kmer_id_interval_lengths.begin(),
-                                  m_kmer_id_interval_lengths.end()));
             assert(std::is_sorted(m_abundance_interval_lengths.begin(),
                                   m_abundance_interval_lengths.end()));
 
-            std::cout << "num_kmer_id_intervals " << num_kmer_id_intervals() << std::endl;
             std::cout << "num_abundance_intervals " << num_abundance_intervals() << std::endl;
 
             uint64_t num_distinct_abundances = m_abundances_map.size();
@@ -80,23 +66,9 @@ struct abundances {
                 return x.first < y.first;
             });
 
-            /* If this test fails, then we need to change the value of
-                constants::most_frequent_abundance. */
-            if (m_most_frequent_abundance != m_abundances.front().first) {
-                throw std::runtime_error("the most frequent abundance is not " +
-                                         std::to_string(constants::most_frequent_abundance) +
-                                         " but " + std::to_string(m_abundances.front().first));
-            }
-
             uint64_t rest = num_kmers - m_abundances.front().second;
             std::cout << "kmers that do not have the most frequent ab: " << rest << " ("
                       << (rest * 100.0) / num_kmers << "%)" << std::endl;
-            if (m_kmer_id_interval_values.size() != 0) {  // optimize_mfa
-                std::cout << "cumulative_kmer_id_interval_lengths "
-                          << m_kmer_id_interval_lengths.back() << '/' << rest << std::endl;
-                std::cout << "cumulative_abundance_interval_lengths "
-                          << m_abundance_interval_lengths.back() << '/' << rest << std::endl;
-            }
 
             m_abundance_dictionary_builder.resize(num_distinct_abundances,
                                                   std::ceil(std::log2(largest_ab + 1)));
@@ -108,13 +80,6 @@ struct abundances {
         }
 
         void build(abundances& index) {
-            std::swap(index.m_most_frequent_abundance, m_most_frequent_abundance);
-
-            index.m_kmer_id_interval_values.encode(m_kmer_id_interval_values.begin(),
-                                                   m_kmer_id_interval_values.size());
-            index.m_kmer_id_interval_lengths.encode(m_kmer_id_interval_lengths.begin(),
-                                                    m_kmer_id_interval_lengths.size());
-
             uint64_t num_distinct_abundances = m_abundance_dictionary_builder.size();
             pthash::compact_vector::builder abundance_interval_values;
             abundance_interval_values.resize(
@@ -144,7 +109,7 @@ struct abundances {
             m_abundance_dictionary_builder.build(index.m_abundance_dictionary);
         }
 
-        // return the average empirical entropy per abundance
+        /* return the average empirical entropy per abundance */
         double print_info(uint64_t num_kmers) {
             assert(!m_abundances.empty());
             double expected_ab_value = 0.0;
@@ -166,15 +131,10 @@ struct abundances {
         }
 
     private:
-        uint64_t m_most_frequent_abundance;
-
         /* (abundance,frequency) pairs during construction, then (abundance,id) after sorting */
         std::unordered_map<uint64_t, uint64_t> m_abundances_map;
         std::vector<std::pair<uint64_t, uint64_t>> m_abundances;  // (abundance,frequency)
 
-        std::vector<uint64_t> m_kmer_id_interval_values;
-        std::vector<uint64_t> m_kmer_id_interval_lengths;
-
         std::vector<uint64_t> m_abundance_interval_values;
         std::vector<uint64_t> m_abundance_interval_lengths;
 
@@ -184,48 +144,18 @@ struct abundances {
     bool empty() const { return m_abundance_dictionary.size() == 0; }
 
     uint64_t abundance(uint64_t kmer_id) const {
-        uint64_t rank = kmer_id;
-
-        if (m_kmer_id_interval_values.size() != 0) {  // optimize_mfa
-            bool is_present = false;
-            auto [pos, val, prev] = m_kmer_id_interval_values.next_geq_neighbourhood(kmer_id);
-            if (val == kmer_id) {
-                is_present = true;
-                rank = m_kmer_id_interval_lengths.access(pos);
-            } else {
-                if (pos > 0) {
-                    rank = m_kmer_id_interval_lengths.access(pos - 1);
-                    uint64_t length = m_kmer_id_interval_lengths.access(pos) - rank;
-                    if (kmer_id < prev + length) {
-                        is_present = true;
-                        assert(kmer_id >= prev);
-                        rank += kmer_id - prev;
-                    }
-                }
-            }
-            if (!is_present) return m_most_frequent_abundance;
-        }
-
-        uint64_t i = m_abundance_interval_lengths.prev_leq(rank);
+        uint64_t i = m_abundance_interval_lengths.prev_leq(kmer_id);
         uint64_t id = m_abundance_interval_values.access(i);
         uint64_t abundance = m_abundance_dictionary.access(id);
-
         return abundance;
     }
 
     uint64_t num_bits() const {
-        return sizeof(m_most_frequent_abundance) * 8 + m_kmer_id_interval_values.num_bits() +
-               m_kmer_id_interval_lengths.num_bits() + m_abundance_interval_values.bytes() * 8 +
-               m_abundance_interval_lengths.num_bits() + m_abundance_dictionary.bytes() * 8;
+        return m_abundance_interval_values.bytes() * 8 + m_abundance_interval_lengths.num_bits() +
+               m_abundance_dictionary.bytes() * 8;
     }
 
     void print_space_breakdown(uint64_t num_kmers) const {
-        std::cout << "    kmer_id_interval_values: "
-                  << static_cast<double>(m_kmer_id_interval_values.num_bits()) / num_kmers
-                  << " [bits/kmer]\n";
-        std::cout << "    kmer_id_interval_lengths: "
-                  << static_cast<double>(m_kmer_id_interval_lengths.num_bits()) / num_kmers
-                  << " [bits/kmer]\n";
         std::cout << "    abundance_interval_values: "
                   << static_cast<double>(m_abundance_interval_values.bytes() * 8) / num_kmers
                   << " [bits/kmer]\n";
@@ -239,34 +169,14 @@ struct abundances {
 
     template <typename Visitor>
     void visit(Visitor& visitor) {
-        visitor.visit(m_most_frequent_abundance);
-        visitor.visit(m_kmer_id_interval_values);
-        visitor.visit(m_kmer_id_interval_lengths);
         visitor.visit(m_abundance_interval_values);
         visitor.visit(m_abundance_interval_lengths);
         visitor.visit(m_abundance_dictionary);
     }
 
 private:
-    uint64_t m_most_frequent_abundance;
-
-    /*****
-       We model abundances as two lists of intervals.
-       Each interval is a pair (value,length).
-       - First list: kmer_ids list. In this case, a pair (value,length)
-       represents all kmer_ids = value, value+1, value+2, ..., value+length-1.
-       - Second list: abundance list. In this case, a pair (value,length)
-       represents that the abundance [value] repeats for [length] times.
-    */
-
-    ef_sequence<true> m_kmer_id_interval_values;
-    ef_sequence<false> m_kmer_id_interval_lengths;
-
     pthash::compact_vector m_abundance_interval_values;
     ef_sequence<true> m_abundance_interval_lengths;
-    /***/
-
-    /* Abundance dictionary, listing all distinct abundances sorted by decreasing frequency. */
     pthash::compact_vector m_abundance_dictionary;
 };
 

diff --git a/include/builder/build.cpp b/include/builder/build.cpp
@@ -6,21 +6,13 @@
 #include "../info.cpp"
 #include "../../external/pthash/include/pthash.hpp"
 #include "../../external/pthash/external/essentials/include/essentials.hpp"
-
-#include "build_util_types.hpp"
+#include "util.hpp"
 
 #include <numeric>  // for std::partial_sum
 #include <vector>
 
 namespace sshash {
 
-void expect(char got, char expected) {
-    if (got != expected) {
-        std::cout << "got '" << got << "' but expected '" << expected << "'" << std::endl;
-        throw parse_runtime_error();
-    }
-}
-
 struct parse_data {
     parse_data() : num_kmers(0) {}
     uint64_t num_kmers;
@@ -88,37 +80,18 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
 
     uint64_t seq_len = 0;
     uint64_t sum_of_abundances = 0;
-    data.abundances_builder.init(constants::most_frequent_abundance);
-
-    /* intervals of kmer_ids */
-    uint64_t kmer_id_value = constants::invalid;
-    uint64_t kmer_id_length = 1;
+    data.abundances_builder.init();
 
     /* intervals of abundances */
     uint64_t ab_value = constants::invalid;
-    uint64_t ab_length = 1;
-
-    uint64_t num_sequences_diff_abs = 0;  // num sequences whose kmers have different abundances
-    uint64_t num_sequences_all_mfa = 0;   // num sequences whose kmers have same abundance == mfa
-
-    auto process_abundance = [&](uint64_t ab) {
-        if (ab == ab_value) {
-            ab_length += 1;
-        } else {
-            if (ab_value != constants::invalid) {
-                data.abundances_builder.push_abundance_interval(ab_value, ab_length);
-            }
-            ab_value = ab;
-            ab_length = 1;
-        }
-    };
+    uint64_t ab_length = 0;
 
     auto parse_header = [&]() {
         if (sequence.empty()) return;
 
         /*
             Heder format:
-            >[seq_id] LN:i:[seq_len] ab:Z:[ab_seq]
+            >[id] LN:i:[seq_len] ab:Z:[ab_seq]
             where [ab_seq] is a space-separated sequence of integer counters (the abundances),
             whose length is equal to [seq_len]-k+1
         */
@@ -140,8 +113,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         uint64_t j = sequence.find_first_of(' ', i);
         if (j == std::string::npos) throw parse_runtime_error();
 
-        char* end;
-        seq_len = std::strtoull(sequence.data() + i, &end, 10);
+        seq_len = std::strtoull(sequence.data() + i, nullptr, 10);
         i = j + 1;
         expect(sequence[i + 0], 'a');
         expect(sequence[i + 1], 'b');
@@ -150,53 +122,23 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         expect(sequence[i + 4], ':');
         i += 5;
 
-        kmer_id_value = constants::invalid;
-        kmer_id_length = 1;
-
-        bool kmers_have_all_mfa = true;
-        bool kmers_have_different_abundances = false;
-
-        for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid;
-             j != seq_len - k + 1; ++j, ++num_kmers) {
-            uint64_t ab = std::strtoull(sequence.data() + i, &end, 10);
+        for (uint64_t j = 0; j != seq_len - k + 1; ++j) {
+            uint64_t ab = std::strtoull(sequence.data() + i, nullptr, 10);
             i = sequence.find_first_of(' ', i) + 1;
 
-            if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false;
-            if (j > 0) {
-                if (ab != prev_ab) { kmers_have_different_abundances = true; }
-            }
-            prev_ab = ab;
-
             data.abundances_builder.eat(ab);
             sum_of_abundances += ab;
 
-            if (build_config.optimize_mfa) {
-                if (ab != constants::most_frequent_abundance) {
-                    if (kmer_id_value == constants::invalid) {
-                        kmer_id_value = num_kmers;
-                        kmer_id_length = 1;
-                    } else {
-                        if (num_kmers == kmer_id_value + kmer_id_length) kmer_id_length += 1;
-                    }
-                    process_abundance(ab);
-                } else {
-                    if (kmer_id_value != constants::invalid) {
-                        data.abundances_builder.push_kmer_id_interval(kmer_id_value,
-                                                                      kmer_id_length);
-                    }
-                    kmer_id_value = constants::invalid;
-                }
+            if (ab == ab_value) {
+                ab_length += 1;
             } else {
-                process_abundance(ab);
+                if (ab_value != constants::invalid) {
+                    data.abundances_builder.push_abundance_interval(ab_value, ab_length);
+                }
+                ab_value = ab;
+                ab_length = 1;
             }
         }
-
-        if (kmer_id_value != constants::invalid) {
-            data.abundances_builder.push_kmer_id_interval(kmer_id_value, kmer_id_length);
-        }
-
-        num_sequences_diff_abs += kmers_have_different_abundances;
-        num_sequences_all_mfa += kmers_have_all_mfa;
     };
 
     while (!is.eof()) {
@@ -264,18 +206,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
 
     if (build_config.store_abundances) {
         std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
-        std::cout << "num_sequences whose kmers have different abundances: "
-                  << num_sequences_diff_abs << "/" << num_sequences << " ("
-                  << (num_sequences_diff_abs * 100.0) / num_sequences << "%)" << std::endl;
-        std::cout << "num_sequences whose kmers all have the same abundance != mfa: "
-                  << (num_sequences - num_sequences_diff_abs) << "/" << num_sequences << " ("
-                  << ((num_sequences - num_sequences_diff_abs) * 100.0) / num_sequences << "%)"
-                  << std::endl;
-        std::cout << "num_sequences whose kmers all have the same abundance == mfa: "
-                  << num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs)
-                  << " ("
-                  << (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs)
-                  << "%)" << std::endl;
         data.abundances_builder.push_abundance_interval(ab_value, ab_length);
         data.abundances_builder.finalize(data.num_kmers);
     }
+1 −1		README.md
+2 −9		include/encoders/darray.hpp
+4 −5		include/encoders/ef_sequence.hpp