Merge pull request #117 from MitraDarja/improve_doc

Improve doc
seqan · Sep 20, 2021 · 4d4ce9b · 4d4ce9b
2 parents f5b53e5 + 41fbed0
commit 4d4ce9b
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 13 deletions.
diff --git a/src/estimate.cpp b/src/estimate.cpp
@@ -28,8 +28,10 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
                seqan3::dna4_vector const seq, std::vector<uint32_t> & prev_counts,
                exp_t const & expressions, uint16_t const k, std::vector<double> const fprs)
 {
+    // Check, if one expression threshold for all or individual thresholds
     static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;
 
+    // Count minimisers in ibf of current level
     std::vector<uint32_t> counter;
     counter.assign(ibf.bin_count(), 0);
     uint64_t minimiser_length = 0;
@@ -41,15 +43,20 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
         ++minimiser_length;
     }
 
+    // Defines, where the median should be
     float minimiser_pos = minimiser_length/2.0;
 
+    // Check every experiment by going over the number of bins in the ibf.
     for(int j = 0; j < counter.size(); j++)
     {
         // Correction by substracting the expected number of false positives
         counter[j] = std::max((double) 0.0, (double) ((counter[j]-(minimiser_length*fprs[j]))/(1.0-fprs[j])));
+        // Check, if considering previously seen minimisers and minimisers found ar current level equal to or are greater
+        // than the minimiser_pow, which gives the median position.
+        // If ań estimation took already place (estimations_i[j]!=0), a second estimation is not performed.
         if (((prev_counts[j] + counter[j]) >= minimiser_pos) & (estimations_i[j] == 0))
         {
-            // If there was nothing previous
+            // If there was no previous level, because we are looking at the last level.
             if constexpr(last_exp)
             {
                 if constexpr (multiple_expressions)
@@ -68,15 +75,15 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
                    estimations_i[j] = std::max(expressions[k][j] * 1.0, expressions[k+1][j] - ((abs(minimiser_pos - prev_counts[j])/(counter[j] * 1.0)) * (expressions[k+1][j]-expressions[k][j])));
                else
                    estimations_i[j] = std::max(expressions * 1.0, k - ((abs(minimiser_pos - prev_counts[j])/(counter[j] * 1.0)) * (k-expressions)));
-               // Make sure, every transcript is only estimated once
-               prev_counts[j] = 0;
             }
 
+            // Perform normalization by dividing through the threshold of the first level. Only works, if multiple expressions were used.
             if constexpr (normalization & multiple_expressions)
                 estimations_i[j] = estimations_i[j]/expressions[0][j];
         }
         else
         {
+            // If not found at this level, add to previous count.
             prev_counts[j] = prev_counts[j] + counter[j];
         }
     }
@@ -160,7 +167,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
     // Make sure expression levels are sorted.
     sort(args.expression_thresholds.begin(), args.expression_thresholds.end());
 
-    // Initialse last expression
+    // Initialse last expression.
     if constexpr (samplewise)
         load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(args.number_expression_thresholds-1));
     else
@@ -199,6 +206,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
 
     for (int j = args.number_expression_thresholds - 2; j >= 0; j--)
     {
+        // Loadthe next ibf that should be considered.
         if constexpr (samplewise)
             load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(j));
         else
@@ -223,6 +231,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat
             prev_expression = args.expression_thresholds[j];
     }
 
+    // Write output file.
     std::ofstream outfile;
     outfile.open(std::string{file_out});
     for (int i = 0; i <  seqs.size(); ++i)

diff --git a/src/ibf.cpp b/src/ibf.cpp
@@ -42,6 +42,7 @@ void get_include_set_table(min_arguments const & args, std::filesystem::path con
     }
 }
 
+// Chech if file has fasta format to estimate cutoffs.
 inline bool check_for_fasta_format(std::vector<std::string> const & valid_extensions, std::string const & file_path)
 {
 
@@ -68,7 +69,7 @@ inline bool check_for_fasta_format(std::vector<std::string> const & valid_extens
 // Determine cutoff for one experiment
 uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples)
 {
-    // Cutoff according to Mantis paper, divided by two because we store expression levels and
+    // Cutoff according to Mantis paper, divided by two because we store expression thresholds and
     // -1 because we use "<" and not "<="
     uint16_t const default_cutoff{24};
     uint8_t cutoff{default_cutoff};
@@ -123,7 +124,8 @@ void fill_hash_table(min_arguments const & args,
                     hash_table[minHash] = cutoff_table[minHash] + 1;
                     cutoff_table.erase(minHash);
                 }
-                // If none of the above, increase count in cutoff table.
+                // If none of the above, increase count in cutoff table. Cutoff Table increases RAM usage by storing
+                // minimisers with a low occurence in a smaller hash table.
                 else
                 {
                     cutoff_table[minHash]++;
@@ -320,13 +322,13 @@ void check_fpr(uint8_t const number_expression_thresholds, std::vector<double> &
     }
 }
 
-// Calculate expression levels and sizes
+// Calculate expression thresholds and sizes
 void get_expression_thresholds(uint8_t const number_expression_thresholds,
                            robin_hood::unordered_node_map<uint64_t, uint16_t> const & hash_table,
                            std::vector<uint16_t> & expression_thresholds, std::vector<uint64_t> & sizes,
                            robin_hood::unordered_set<uint64_t> const & genome, bool all = true)
 {
-    // Calculate expression levels by taking median recursively
+    // Calculate expression thresholds by taking median recursively
     std::vector<uint16_t> counts;
     for (auto && elem : hash_table)
     {
@@ -354,21 +356,23 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,
         prev_pos = prev_pos + counts.size()/dev;
         dev = dev*2;
 
+        // If expression does not change compared to previous one, do not store it again as an expression threshold.
         if ((exp - prev_exp) > 1)
         {
             expression_thresholds.push_back(exp);
             sizes.push_back(prev_pos);
-
         }
 
         prev_exp = exp;
     }
+    // In case not all levels have a threshold, give the last levels a maximal threshold, which can not be met by any minimiser.
     while(expression_thresholds.size() < number_expression_thresholds)
         expression_thresholds.push_back(max_elem + 1);
     counts.clear();
 }
 
-// Estimate the file size for every expression level, necessary when samplewise=false
+// Estimate the file size for every expression level, necessary when samplewise=false, because then it is completly
+// unclear how many minimisers are to store per file.
 void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t const number_expression_thresholds,
                                       std::vector<uint16_t> const & expression_thresholds, std::vector<uint64_t> & sizes,
                                       robin_hood::unordered_set<uint64_t> const & genome, bool all = true)
@@ -398,6 +402,8 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
         fin.read((char*)&minimiser_count, sizeof(minimiser_count));
         if (all | genome.contains(minimiser))
         {
+            // Find the level with the smallest greater value than the minimiser occurrence, in the level before that the
+            // minimiser is going to be stored.
             auto p = std::upper_bound(expression_thresholds.begin(), expression_thresholds.end(), minimiser_count);
             if(p != expression_thresholds.begin())
                 sizes[(p-expression_thresholds.begin())-1]++;
@@ -469,7 +475,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
         else
         {
             // Estimate sizes on filesize, assuming every byte translates to one letter (which is obiously not true,
-            // because ids contain letters as well), so size might be overestimated
+            // because ids contain letters as well), so size might be overestimated. TODO: Find a better estimation!
             unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0);
 
             // Determine cutoffs
@@ -515,7 +521,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
     }
 
     std::ofstream outfile_fpr;
-    outfile_fpr.open(std::string{ibf_args.path_out} +  "IBF_FPRs.fprs");
+    outfile_fpr.open(std::string{ibf_args.path_out} +  "IBF_FPRs.fprs"); // File to store actual false positive rates per experiment.
     // Create IBFs
     std::vector<seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>> ibfs;
     for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++)
@@ -539,7 +545,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
 
         for (unsigned i = 0; i < num_files; i++)
         {
-            double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash);//std::pow((1.0-(std::exp((-1.0*num_hash*sizes[i][j])/size))), num_hash);
+            double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash);
             outfile_fpr << fpr << " ";
         }
         outfile_fpr << "\n";
@@ -637,6 +643,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
 
     }
 
+    // Store all expression thresholds per level.
     if constexpr(samplewise)
     {
         std::ofstream outfile;