Skip to content

Commit

Permalink
Merge pull request #4 from jermp/abundances_dev
Browse files Browse the repository at this point in the history
Permute and reverse-complement the strings to minimize the number of abundance runs
  • Loading branch information
jermp authored Apr 2, 2022
2 parents 63d1252 + 2f3672c commit 562ff63
Show file tree
Hide file tree
Showing 12 changed files with 1,027 additions and 196 deletions.
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,6 @@ target_link_libraries(check z)

add_executable(query src/query.cpp)
target_link_libraries(query z)

add_executable(permute src/permute.cpp)
target_link_libraries(permute z)
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,28 @@ We see that the canonical dictionary is twice as fast as the regular dictionary
for low-hit workloads,
even on this tiny example, for only +0.4 bits/k-mer.

### Example 4

./permute ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 -o ecoli_sakai.permuted.fa

This command re-orders (and possibly reverse-complement) the strings in the collection as to *minimize* the number of runs in the abundances and, hence, optimize the encoding of the abundances.
The result is saved to the file `ecoli_sakai.permuted.fa`.

In this example for the E.Coli collection (Sakai strain) we reduce the number of runs in the abundances from 5820 to 3723.

Then use the `build` command as usual to build the permuted collection:

./build ecoli_sakai.permuted.fa 31 13 --abundances --verbose

The index built on the permuted collection
optimizes the storage space for the abundances which results in a 15.1X better space than the empirical entropy of the abundances.

For reference, the index built on the original collection:

./build ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 13 --abundances --verbose

already achieves a 12.4X better space than the empirical entropy.

Input Files
-----------

Expand Down
Binary file not shown.
2 changes: 1 addition & 1 deletion external/pthash
106 changes: 8 additions & 98 deletions include/abundances.hpp
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
#pragma once

#include <vector>
#include <unordered_map> // count the distinct abundances
#include <unordered_map> // count the distinct abundances with freq information

#include "ef_sequence.hpp"

namespace sshash {

struct abundances {
struct builder {
builder() : m_most_frequent_abundance(0) {}
builder() {}

void init(uint64_t most_frequent_abundance) {
m_most_frequent_abundance = most_frequent_abundance;
m_kmer_id_interval_lengths.push_back(0);
m_abundance_interval_lengths.push_back(0);
}
void init() { m_abundance_interval_lengths.push_back(0); }

void eat(uint64_t abundance) {
assert(abundance > 0);
Expand All @@ -26,28 +23,17 @@ struct abundances {
}
}

void push_kmer_id_interval(uint64_t value, uint64_t length) {
m_kmer_id_interval_values.push_back(value);
m_kmer_id_interval_lengths.push_back(m_kmer_id_interval_lengths.back() + length);
}

void push_abundance_interval(uint64_t value, uint64_t length) {
m_abundance_interval_values.push_back(value);
m_abundance_interval_lengths.push_back(m_abundance_interval_lengths.back() + length);
}

uint64_t num_kmer_id_intervals() const { return m_kmer_id_interval_values.size(); }
uint64_t num_abundance_intervals() const { return m_abundance_interval_values.size(); }

void finalize(uint64_t num_kmers) {
assert(
std::is_sorted(m_kmer_id_interval_values.begin(), m_kmer_id_interval_values.end()));
assert(std::is_sorted(m_kmer_id_interval_lengths.begin(),
m_kmer_id_interval_lengths.end()));
assert(std::is_sorted(m_abundance_interval_lengths.begin(),
m_abundance_interval_lengths.end()));

std::cout << "num_kmer_id_intervals " << num_kmer_id_intervals() << std::endl;
std::cout << "num_abundance_intervals " << num_abundance_intervals() << std::endl;

uint64_t num_distinct_abundances = m_abundances_map.size();
Expand Down Expand Up @@ -80,23 +66,9 @@ struct abundances {
return x.first < y.first;
});

/* If this test fails, then we need to change the value of
constants::most_frequent_abundance. */
if (m_most_frequent_abundance != m_abundances.front().first) {
throw std::runtime_error("the most frequent abundance is not " +
std::to_string(constants::most_frequent_abundance) +
" but " + std::to_string(m_abundances.front().first));
}

uint64_t rest = num_kmers - m_abundances.front().second;
std::cout << "kmers that do not have the most frequent ab: " << rest << " ("
<< (rest * 100.0) / num_kmers << "%)" << std::endl;
if (m_kmer_id_interval_values.size() != 0) { // optimize_mfa
std::cout << "cumulative_kmer_id_interval_lengths "
<< m_kmer_id_interval_lengths.back() << '/' << rest << std::endl;
std::cout << "cumulative_abundance_interval_lengths "
<< m_abundance_interval_lengths.back() << '/' << rest << std::endl;
}

m_abundance_dictionary_builder.resize(num_distinct_abundances,
std::ceil(std::log2(largest_ab + 1)));
Expand All @@ -108,13 +80,6 @@ struct abundances {
}

void build(abundances& index) {
std::swap(index.m_most_frequent_abundance, m_most_frequent_abundance);

index.m_kmer_id_interval_values.encode(m_kmer_id_interval_values.begin(),
m_kmer_id_interval_values.size());
index.m_kmer_id_interval_lengths.encode(m_kmer_id_interval_lengths.begin(),
m_kmer_id_interval_lengths.size());

uint64_t num_distinct_abundances = m_abundance_dictionary_builder.size();
pthash::compact_vector::builder abundance_interval_values;
abundance_interval_values.resize(
Expand Down Expand Up @@ -144,7 +109,7 @@ struct abundances {
m_abundance_dictionary_builder.build(index.m_abundance_dictionary);
}

// return the average empirical entropy per abundance
/* return the average empirical entropy per abundance */
double print_info(uint64_t num_kmers) {
assert(!m_abundances.empty());
double expected_ab_value = 0.0;
Expand All @@ -166,15 +131,10 @@ struct abundances {
}

private:
uint64_t m_most_frequent_abundance;

/* (abundance,frequency) pairs during construction, then (abundance,id) after sorting */
std::unordered_map<uint64_t, uint64_t> m_abundances_map;
std::vector<std::pair<uint64_t, uint64_t>> m_abundances; // (abundance,frequency)

std::vector<uint64_t> m_kmer_id_interval_values;
std::vector<uint64_t> m_kmer_id_interval_lengths;

std::vector<uint64_t> m_abundance_interval_values;
std::vector<uint64_t> m_abundance_interval_lengths;

Expand All @@ -184,48 +144,18 @@ struct abundances {
bool empty() const { return m_abundance_dictionary.size() == 0; }

uint64_t abundance(uint64_t kmer_id) const {
uint64_t rank = kmer_id;

if (m_kmer_id_interval_values.size() != 0) { // optimize_mfa
bool is_present = false;
auto [pos, val, prev] = m_kmer_id_interval_values.next_geq_neighbourhood(kmer_id);
if (val == kmer_id) {
is_present = true;
rank = m_kmer_id_interval_lengths.access(pos);
} else {
if (pos > 0) {
rank = m_kmer_id_interval_lengths.access(pos - 1);
uint64_t length = m_kmer_id_interval_lengths.access(pos) - rank;
if (kmer_id < prev + length) {
is_present = true;
assert(kmer_id >= prev);
rank += kmer_id - prev;
}
}
}
if (!is_present) return m_most_frequent_abundance;
}

uint64_t i = m_abundance_interval_lengths.prev_leq(rank);
uint64_t i = m_abundance_interval_lengths.prev_leq(kmer_id);
uint64_t id = m_abundance_interval_values.access(i);
uint64_t abundance = m_abundance_dictionary.access(id);

return abundance;
}

uint64_t num_bits() const {
return sizeof(m_most_frequent_abundance) * 8 + m_kmer_id_interval_values.num_bits() +
m_kmer_id_interval_lengths.num_bits() + m_abundance_interval_values.bytes() * 8 +
m_abundance_interval_lengths.num_bits() + m_abundance_dictionary.bytes() * 8;
return m_abundance_interval_values.bytes() * 8 + m_abundance_interval_lengths.num_bits() +
m_abundance_dictionary.bytes() * 8;
}

void print_space_breakdown(uint64_t num_kmers) const {
std::cout << " kmer_id_interval_values: "
<< static_cast<double>(m_kmer_id_interval_values.num_bits()) / num_kmers
<< " [bits/kmer]\n";
std::cout << " kmer_id_interval_lengths: "
<< static_cast<double>(m_kmer_id_interval_lengths.num_bits()) / num_kmers
<< " [bits/kmer]\n";
std::cout << " abundance_interval_values: "
<< static_cast<double>(m_abundance_interval_values.bytes() * 8) / num_kmers
<< " [bits/kmer]\n";
Expand All @@ -239,34 +169,14 @@ struct abundances {

template <typename Visitor>
void visit(Visitor& visitor) {
visitor.visit(m_most_frequent_abundance);
visitor.visit(m_kmer_id_interval_values);
visitor.visit(m_kmer_id_interval_lengths);
visitor.visit(m_abundance_interval_values);
visitor.visit(m_abundance_interval_lengths);
visitor.visit(m_abundance_dictionary);
}

private:
uint64_t m_most_frequent_abundance;

/*****
We model abundances as two lists of intervals.
Each interval is a pair (value,length).
- First list: kmer_ids list. In this case, a pair (value,length)
represents all kmer_ids = value, value+1, value+2, ..., value+length-1.
- Second list: abundance list. In this case, a pair (value,length)
represents that the abundance [value] repeats for [length] times.
*/

ef_sequence<true> m_kmer_id_interval_values;
ef_sequence<false> m_kmer_id_interval_lengths;

pthash::compact_vector m_abundance_interval_values;
ef_sequence<true> m_abundance_interval_lengths;
/***/

/* Abundance dictionary, listing all distinct abundances sorted by decreasing frequency. */
pthash::compact_vector m_abundance_dictionary;
};

Expand Down
98 changes: 14 additions & 84 deletions include/builder/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,13 @@
#include "../info.cpp"
#include "../../external/pthash/include/pthash.hpp"
#include "../../external/pthash/external/essentials/include/essentials.hpp"

#include "build_util_types.hpp"
#include "util.hpp"

#include <numeric> // for std::partial_sum
#include <vector>

namespace sshash {

void expect(char got, char expected) {
if (got != expected) {
std::cout << "got '" << got << "' but expected '" << expected << "'" << std::endl;
throw parse_runtime_error();
}
}

struct parse_data {
parse_data() : num_kmers(0) {}
uint64_t num_kmers;
Expand Down Expand Up @@ -88,37 +80,18 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b

uint64_t seq_len = 0;
uint64_t sum_of_abundances = 0;
data.abundances_builder.init(constants::most_frequent_abundance);

/* intervals of kmer_ids */
uint64_t kmer_id_value = constants::invalid;
uint64_t kmer_id_length = 1;
data.abundances_builder.init();

/* intervals of abundances */
uint64_t ab_value = constants::invalid;
uint64_t ab_length = 1;

uint64_t num_sequences_diff_abs = 0; // num sequences whose kmers have different abundances
uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa

auto process_abundance = [&](uint64_t ab) {
if (ab == ab_value) {
ab_length += 1;
} else {
if (ab_value != constants::invalid) {
data.abundances_builder.push_abundance_interval(ab_value, ab_length);
}
ab_value = ab;
ab_length = 1;
}
};
uint64_t ab_length = 0;

auto parse_header = [&]() {
if (sequence.empty()) return;

/*
Heder format:
>[seq_id] LN:i:[seq_len] ab:Z:[ab_seq]
>[id] LN:i:[seq_len] ab:Z:[ab_seq]
where [ab_seq] is a space-separated sequence of integer counters (the abundances),
whose length is equal to [seq_len]-k+1
*/
Expand All @@ -140,8 +113,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
uint64_t j = sequence.find_first_of(' ', i);
if (j == std::string::npos) throw parse_runtime_error();

char* end;
seq_len = std::strtoull(sequence.data() + i, &end, 10);
seq_len = std::strtoull(sequence.data() + i, nullptr, 10);
i = j + 1;
expect(sequence[i + 0], 'a');
expect(sequence[i + 1], 'b');
Expand All @@ -150,53 +122,23 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
expect(sequence[i + 4], ':');
i += 5;

kmer_id_value = constants::invalid;
kmer_id_length = 1;

bool kmers_have_all_mfa = true;
bool kmers_have_different_abundances = false;

for (uint64_t j = 0, num_kmers = data.num_kmers, prev_ab = constants::invalid;
j != seq_len - k + 1; ++j, ++num_kmers) {
uint64_t ab = std::strtoull(sequence.data() + i, &end, 10);
for (uint64_t j = 0; j != seq_len - k + 1; ++j) {
uint64_t ab = std::strtoull(sequence.data() + i, nullptr, 10);
i = sequence.find_first_of(' ', i) + 1;

if (ab != constants::most_frequent_abundance) kmers_have_all_mfa = false;
if (j > 0) {
if (ab != prev_ab) { kmers_have_different_abundances = true; }
}
prev_ab = ab;

data.abundances_builder.eat(ab);
sum_of_abundances += ab;

if (build_config.optimize_mfa) {
if (ab != constants::most_frequent_abundance) {
if (kmer_id_value == constants::invalid) {
kmer_id_value = num_kmers;
kmer_id_length = 1;
} else {
if (num_kmers == kmer_id_value + kmer_id_length) kmer_id_length += 1;
}
process_abundance(ab);
} else {
if (kmer_id_value != constants::invalid) {
data.abundances_builder.push_kmer_id_interval(kmer_id_value,
kmer_id_length);
}
kmer_id_value = constants::invalid;
}
if (ab == ab_value) {
ab_length += 1;
} else {
process_abundance(ab);
if (ab_value != constants::invalid) {
data.abundances_builder.push_abundance_interval(ab_value, ab_length);
}
ab_value = ab;
ab_length = 1;
}
}

if (kmer_id_value != constants::invalid) {
data.abundances_builder.push_kmer_id_interval(kmer_id_value, kmer_id_length);
}

num_sequences_diff_abs += kmers_have_different_abundances;
num_sequences_all_mfa += kmers_have_all_mfa;
};

while (!is.eof()) {
Expand Down Expand Up @@ -264,18 +206,6 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b

if (build_config.store_abundances) {
std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
std::cout << "num_sequences whose kmers have different abundances: "
<< num_sequences_diff_abs << "/" << num_sequences << " ("
<< (num_sequences_diff_abs * 100.0) / num_sequences << "%)" << std::endl;
std::cout << "num_sequences whose kmers all have the same abundance != mfa: "
<< (num_sequences - num_sequences_diff_abs) << "/" << num_sequences << " ("
<< ((num_sequences - num_sequences_diff_abs) * 100.0) / num_sequences << "%)"
<< std::endl;
std::cout << "num_sequences whose kmers all have the same abundance == mfa: "
<< num_sequences_all_mfa << "/" << (num_sequences - num_sequences_diff_abs)
<< " ("
<< (num_sequences_all_mfa * 100.0) / (num_sequences - num_sequences_diff_abs)
<< "%)" << std::endl;
data.abundances_builder.push_abundance_interval(ab_value, ab_length);
data.abundances_builder.finalize(data.num_kmers);
}
Expand Down
Loading

0 comments on commit 562ff63

Please sign in to comment.