Skip to content

Commit

Permalink
[MISC] Small refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Oct 7, 2024
1 parent a38580f commit 93122d2
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 45 deletions.
72 changes: 40 additions & 32 deletions src/estimate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,76 +29,84 @@ template <class IBFType, bool last_exp, bool normalization, typename exp_t>
void check_ibf(min_arguments const & args,
IBFType const & ibf,
std::vector<uint16_t> & estimations_i,
seqan3::dna4_vector const seq,
seqan3::dna4_vector const & seq,
std::vector<uint32_t> & prev_counts,
exp_t const & expressions,
uint16_t const k,
uint16_t const level,
std::vector<double> const fprs,
std::vector<int> & deleted)
{
// Check, if one expression threshold for all or individual thresholds
static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;

// Count minimisers in ibf of current level
std::vector<uint32_t> counter;
counter.assign(ibf.bin_count(), 0);
uint64_t minimiser_length = 0;
std::vector<uint32_t> counter(ibf.bin_count());
uint64_t minimiser_count{};
auto agent = ibf.membership_agent();
for (auto minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s))
{
auto agent = ibf.membership_agent();
std::transform(counter.begin(),
counter.end(),
agent.bulk_contains(minHash).begin(),
counter.begin(),
std::plus<int>());
++minimiser_length;
std::ranges::transform(counter, agent.bulk_contains(minHash), counter.begin(), std::plus<uint32_t>());
++minimiser_count;
}

// Defines, where the median should be
float minimiser_pos = minimiser_length / 2.0;
double const minimiser_pos = minimiser_count / 2.0;

// Check every experiment by going over the number of bins in the ibf.
for (size_t j = 0; j < counter.size(); j++)
for (size_t bin = 0; bin < counter.size(); ++bin)
{
if (std::find(deleted.begin(), deleted.end(), j) != deleted.end())
if (std::ranges::find(deleted, bin) != deleted.end())
continue;
// Correction by substracting the expected number of false positives
counter[j] = std::max((double)0.0, (double)((counter[j] - (minimiser_length * fprs[j])) / (1.0 - fprs[j])));
// Check, if considering previously seen minimisers and minimisers found ar current level equal to or are greater
auto const expected_false_positives = minimiser_count * fprs[bin];
auto const corrected_count = counter[bin] - expected_false_positives;
auto const normalized_count = corrected_count / (1.0 - fprs[bin]);

counter[bin] = std::max<double>(0.0, normalized_count);
// Check if considering previously seen minimisers and minimisers found at current level equal to or are greater
// than the minimiser_pow, which gives the median position.
// If estimation took already place (estimations_i[j]!=0), a second estimation is not performed.
if (((prev_counts[j] + counter[j]) >= minimiser_pos) && (estimations_i[j] == 0))
// If an estimation took already place (estimations_i[bin]!=0), a second estimation is not performed.
if (estimations_i[bin] == 0 && prev_counts[bin] + counter[bin] >= minimiser_pos)
{
// If there was no previous level, because we are looking at the last level.
if constexpr (last_exp)
{
if constexpr (multiple_expressions)
estimations_i[j] = expressions[k][j];
estimations_i[bin] = expressions[level][bin];
else
estimations_i[j] = expressions;
estimations_i[bin] = expressions;
}
else
{
// Actually calculate estimation, in the else case k stands for the prev_expression
auto const normalized_minimiser_pos = std::abs(minimiser_pos - prev_counts[bin]) / counter[bin];

// Actually calculate estimation, in the else case level stands for the prev_expression
if constexpr (multiple_expressions)
estimations_i[j] = std::max(expressions[k][j] * 1.0,
expressions[k + 1][j]
- ((abs(minimiser_pos - prev_counts[j]) / (counter[j] * 1.0))
* (expressions[k + 1][j] - expressions[k][j])));
{
auto const prev_level_expression = expressions[level + 1][bin];
auto const expression_difference = prev_level_expression - expressions[level][bin];
auto const estimate = prev_level_expression - (normalized_minimiser_pos * expression_difference);

estimations_i[bin] = std::max<double>(expressions[level][bin], estimate);
}
else
estimations_i[j] =
std::max(expressions * 1.0,
k - ((abs(minimiser_pos - prev_counts[j]) / (counter[j] * 1.0)) * (k - expressions)));
{
auto const prev_level_expression = level;
auto const expression_difference = prev_level_expression - expressions;
auto const estimate = prev_level_expression - (normalized_minimiser_pos * expression_difference);

estimations_i[bin] = std::max<double>(expressions, estimate);
}
}

// Perform normalization by dividing through the threshold of the first level. Only works, if multiple expressions were used.
// Perform normalization by dividing through the threshold of the first level. Only works if multiple expressions were used.
if constexpr (normalization && multiple_expressions)
estimations_i[j] = estimations_i[j] / expressions[1][j];
estimations_i[bin] /= expressions[1][bin];
}
else
{
// If not found at this level, add to previous count.
prev_counts[j] = prev_counts[j] + counter[j];
prev_counts[bin] += counter[bin];
}
}
}
Expand Down
44 changes: 31 additions & 13 deletions src/ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,31 +116,48 @@ void fill_hash_table(min_arguments const & args,
bool const only_include = false,
uint8_t cutoff = 0)
{
// Would result in no minimisers being added to hash_table.
bool const containment_check_in_empty_include_set = only_include && include_set_table.empty();
if (containment_check_in_empty_include_set)
return;

Check warning on line 122 in src/ibf.cpp

View check run for this annotation

Codecov / codecov/patch

src/ibf.cpp#L122

Added line #L122 was not covered by tests

// Would result in no filtering.
bool const exclusion_check_in_empty_exclude_set = !only_include && exclude_set_table.empty();

auto check_value_in_tables = [&](auto && minHash)
{
return (only_include && include_set_table.contains(minHash))
|| (!only_include && !exclude_set_table.contains(minHash));
};

for (auto & [seq] : fin)
{
for (auto && minHash : seqan3::views::minimiser_hash(seq, args.shape, args.w_size, args.s))
{
if ((only_include && include_set_table.contains(minHash))
|| (!only_include && !exclude_set_table.contains(minHash)))
if (exclusion_check_in_empty_exclude_set || check_value_in_tables(minHash))
{
// Note that hash_table[minHash] would insert minHash into the map if it does not yet exist.
auto it = hash_table.find(minHash);
// If minHash is already in hash table, increase count in hash table

// If minHash is already in hash table, increase count in hash table.
if (it != hash_table.end())
{
it->second = std::min<uint16_t>(65534u, hash_table[minHash] + 1);
}
// If minHash equals now the cutoff than add it to the hash table and add plus one for the current
// iteration.
else if (cutoff_table[minHash] == cutoff)
{
hash_table[minHash] = cutoff_table[minHash] + 1;
cutoff_table.erase(minHash);
// Prevent overflow. 65535 is the maximum value for uint16_t.
it->second = std::min<uint16_t>(65534u, it->second + 1);
}
// If there is no cutoff, add value to hash_table and set count to 1.
else if (cutoff == 0)
{
hash_table[minHash]++;
}
// If none of the above, increase count in cutoff table. Cutoff Table increases RAM usage by storing
// If minHash equals the cutoff, add it to the hash table and add plus one for the current
// iteration.
else if (uint8_t const value = cutoff_table[minHash]; value == cutoff)
{
hash_table[minHash] = value + 1u;
cutoff_table.erase(minHash);
}
// If none of the above, increase count in cutoff table. Cutoff Table reduces RAM usage by storing
// minimisers with a low occurence in a smaller hash table.
else
{
Expand Down Expand Up @@ -792,7 +809,8 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
minimiser_files[file_iterator].extension());
filesize = std::filesystem::file_size(minimiser_files[file_iterator]) * minimiser_args.samples[i]
* (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3);
filesize = filesize / ((cutoffs[i] + 1) * (is_fasta ? 1 : 2));
filesize /= ((cutoffs[i] + 1) * (is_fasta ? 1 : 2));
// ^^ why divide? --> estimate the number of minimisers
}
// If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for
// all files.
Expand Down

0 comments on commit 93122d2

Please sign in to comment.