Skip to content

Commit

Permalink
Merge pull request #117 from MitraDarja/improve_doc
Browse files Browse the repository at this point in the history
Improve doc
  • Loading branch information
MitraDarja authored Sep 20, 2021
2 parents f5b53e5 + 41fbed0 commit 4d4ce9b
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 13 deletions.
17 changes: 13 additions & 4 deletions src/estimate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,10 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
seqan3::dna4_vector const seq, std::vector<uint32_t> & prev_counts,
exp_t const & expressions, uint16_t const k, std::vector<double> const fprs)
{
// Check, if one expression threshold for all or individual thresholds
static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;

// Count minimisers in ibf of current level
std::vector<uint32_t> counter;
counter.assign(ibf.bin_count(), 0);
uint64_t minimiser_length = 0;
Expand All @@ -41,15 +43,20 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
++minimiser_length;
}

// Defines, where the median should be
float minimiser_pos = minimiser_length/2.0;

// Check every experiment by going over the number of bins in the ibf.
for(int j = 0; j < counter.size(); j++)
{
// Correction by substracting the expected number of false positives
counter[j] = std::max((double) 0.0, (double) ((counter[j]-(minimiser_length*fprs[j]))/(1.0-fprs[j])));
// Check, if considering previously seen minimisers and minimisers found ar current level equal to or are greater
// than the minimiser_pow, which gives the median position.
// If ań estimation took already place (estimations_i[j]!=0), a second estimation is not performed.
if (((prev_counts[j] + counter[j]) >= minimiser_pos) & (estimations_i[j] == 0))
{
// If there was nothing previous
// If there was no previous level, because we are looking at the last level.
if constexpr(last_exp)
{
if constexpr (multiple_expressions)
Expand All @@ -68,15 +75,15 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
estimations_i[j] = std::max(expressions[k][j] * 1.0, expressions[k+1][j] - ((abs(minimiser_pos - prev_counts[j])/(counter[j] * 1.0)) * (expressions[k+1][j]-expressions[k][j])));
else
estimations_i[j] = std::max(expressions * 1.0, k - ((abs(minimiser_pos - prev_counts[j])/(counter[j] * 1.0)) * (k-expressions)));
// Make sure, every transcript is only estimated once
prev_counts[j] = 0;
}

// Perform normalization by dividing through the threshold of the first level. Only works, if multiple expressions were used.
if constexpr (normalization & multiple_expressions)
estimations_i[j] = estimations_i[j]/expressions[0][j];
}
else
{
// If not found at this level, add to previous count.
prev_counts[j] = prev_counts[j] + counter[j];
}
}
Expand Down Expand Up @@ -160,7 +167,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
// Make sure expression levels are sorted.
sort(args.expression_thresholds.begin(), args.expression_thresholds.end());

// Initialse last expression
// Initialse last expression.
if constexpr (samplewise)
load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(args.number_expression_thresholds-1));
else
Expand Down Expand Up @@ -199,6 +206,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat

for (int j = args.number_expression_thresholds - 2; j >= 0; j--)
{
// Loadthe next ibf that should be considered.
if constexpr (samplewise)
load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(j));
else
Expand All @@ -223,6 +231,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
prev_expression = args.expression_thresholds[j];
}

// Write output file.
std::ofstream outfile;
outfile.open(std::string{file_out});
for (int i = 0; i < seqs.size(); ++i)
Expand Down
25 changes: 16 additions & 9 deletions src/ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ void get_include_set_table(min_arguments const & args, std::filesystem::path con
}
}

// Chech if file has fasta format to estimate cutoffs.
inline bool check_for_fasta_format(std::vector<std::string> const & valid_extensions, std::string const & file_path)
{

Expand All @@ -68,7 +69,7 @@ inline bool check_for_fasta_format(std::vector<std::string> const & valid_extens
// Determine cutoff for one experiment
uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples)
{
// Cutoff according to Mantis paper, divided by two because we store expression levels and
// Cutoff according to Mantis paper, divided by two because we store expression thresholds and
// -1 because we use "<" and not "<="
uint16_t const default_cutoff{24};
uint8_t cutoff{default_cutoff};
Expand Down Expand Up @@ -123,7 +124,8 @@ void fill_hash_table(min_arguments const & args,
hash_table[minHash] = cutoff_table[minHash] + 1;
cutoff_table.erase(minHash);
}
// If none of the above, increase count in cutoff table.
// If none of the above, increase count in cutoff table. Cutoff Table increases RAM usage by storing
// minimisers with a low occurence in a smaller hash table.
else
{
cutoff_table[minHash]++;
Expand Down Expand Up @@ -320,13 +322,13 @@ void check_fpr(uint8_t const number_expression_thresholds, std::vector<double> &
}
}

// Calculate expression levels and sizes
// Calculate expression thresholds and sizes
void get_expression_thresholds(uint8_t const number_expression_thresholds,
robin_hood::unordered_node_map<uint64_t, uint16_t> const & hash_table,
std::vector<uint16_t> & expression_thresholds, std::vector<uint64_t> & sizes,
robin_hood::unordered_set<uint64_t> const & genome, bool all = true)
{
// Calculate expression levels by taking median recursively
// Calculate expression thresholds by taking median recursively
std::vector<uint16_t> counts;
for (auto && elem : hash_table)
{
Expand Down Expand Up @@ -354,21 +356,23 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,
prev_pos = prev_pos + counts.size()/dev;
dev = dev*2;

// If expression does not change compared to previous one, do not store it again as an expression threshold.
if ((exp - prev_exp) > 1)
{
expression_thresholds.push_back(exp);
sizes.push_back(prev_pos);

}

prev_exp = exp;
}
// In case not all levels have a threshold, give the last levels a maximal threshold, which can not be met by any minimiser.
while(expression_thresholds.size() < number_expression_thresholds)
expression_thresholds.push_back(max_elem + 1);
counts.clear();
}

// Estimate the file size for every expression level, necessary when samplewise=false
// Estimate the file size for every expression level, necessary when samplewise=false, because then it is completly
// unclear how many minimisers are to store per file.
void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t const number_expression_thresholds,
std::vector<uint16_t> const & expression_thresholds, std::vector<uint64_t> & sizes,
robin_hood::unordered_set<uint64_t> const & genome, bool all = true)
Expand Down Expand Up @@ -398,6 +402,8 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
fin.read((char*)&minimiser_count, sizeof(minimiser_count));
if (all | genome.contains(minimiser))
{
// Find the level with the smallest greater value than the minimiser occurrence, in the level before that the
// minimiser is going to be stored.
auto p = std::upper_bound(expression_thresholds.begin(), expression_thresholds.end(), minimiser_count);
if(p != expression_thresholds.begin())
sizes[(p-expression_thresholds.begin())-1]++;
Expand Down Expand Up @@ -469,7 +475,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
else
{
// Estimate sizes on filesize, assuming every byte translates to one letter (which is obiously not true,
// because ids contain letters as well), so size might be overestimated
// because ids contain letters as well), so size might be overestimated. TODO: Find a better estimation!
unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0);

// Determine cutoffs
Expand Down Expand Up @@ -515,7 +521,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
}

std::ofstream outfile_fpr;
outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs");
outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); // File to store actual false positive rates per experiment.
// Create IBFs
std::vector<seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>> ibfs;
for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++)
Expand All @@ -539,7 +545,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,

for (unsigned i = 0; i < num_files; i++)
{
double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash);//std::pow((1.0-(std::exp((-1.0*num_hash*sizes[i][j])/size))), num_hash);
double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash);
outfile_fpr << fpr << " ";
}
outfile_fpr << "\n";
Expand Down Expand Up @@ -637,6 +643,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,

}

// Store all expression thresholds per level.
if constexpr(samplewise)
{
std::ofstream outfile;
Expand Down

0 comments on commit 4d4ce9b

Please sign in to comment.