Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coverage #12

Merged
merged 6 commits into from
Nov 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 0 additions & 16 deletions .github/workflows/ci_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,6 @@ jobs:
cc: "gcc-10"
build_type: Release

- name: "gcc9 (c++2a)"
cxx: "g++-9"
cc: "gcc-9"
build_type: Release
cxx_flags: "-std=c++2a"

- name: "gcc8"
cxx: "g++-8"
cc: "gcc-8"
build_type: Release

- name: "gcc7"
cxx: "g++-7"
cc: "gcc-7"
build_type: Release

steps:
- name: Checkout
uses: actions/checkout@v2
Expand Down
19 changes: 0 additions & 19 deletions .github/workflows/ci_macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,25 +30,6 @@ jobs:
cc: "gcc-10"
build_type: Release

- name: "gcc9 (c++2a)"
os: macos-10.15
cxx: "g++-9"
cc: "gcc-9"
build_type: Release
cxx_flags: "-std=c++2a"

- name: "gcc8"
os: macos-10.15
cxx: "g++-8"
cc: "gcc-8"
build_type: Release

- name: "gcc7"
os: macos-10.15
cxx: "g++-7"
cc: "gcc-7"
build_type: Release

steps:
- name: Checkout
uses: actions/checkout@v2
Expand Down
6 changes: 6 additions & 0 deletions include/compare.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,9 @@ struct my_traits2 : seqan3::sequence_file_input_default_traits_dna
* \param args The arguments about the view to be used.
*/
void do_comparison(std::vector<std::filesystem::path> sequence_files, range_arguments & args);

/*! \brief Function, comparing the methods in regard of their coverage.
* \param sequence_file A sequence file.
* \param args The arguments about the view to be used.
*/
void do_coverage(std::filesystem::path sequence_file, range_arguments & args);
153 changes: 150 additions & 3 deletions src/compare.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <chrono>
#include <ranges>

#include <index.hpp>
#include <seqan3/alphabet/adaptation/char.hpp>
Expand All @@ -12,7 +13,8 @@
* \param mean Variable to store the mean.
* \param stdev Variable to store the variance.
*/
void get_mean_and_var(std::vector<int> & results, double & mean, double & stdev)
template<typename urng_t>
void get_mean_and_var(urng_t & results, double & mean, double & stdev)
{
double sum = std::accumulate(results.begin(), results.end(), 0.0);
mean = sum / results.size();
Expand Down Expand Up @@ -56,7 +58,6 @@ void compare(std::vector<std::filesystem::path> sequence_files, urng_t input_vie
std::ofstream outfile;
for (int i = 0; i < sequence_files.size(); ++i)
{

robin_hood::unordered_node_map<uint64_t, uint16_t> hash_table{};
auto start = std::chrono::high_resolution_clock::now();
if constexpr (strobemers > 0)
Expand Down Expand Up @@ -105,11 +106,144 @@ void compare(std::vector<std::filesystem::path> sequence_files, urng_t input_vie
outfile.close();
}

/*! \brief Function, that increases how often a position is covered by one.
* \param covs A vector, where every entry represents a position in the sequence and how often it is covered.
* \param position Starting position.
* \param shape The positions of covered positions.
*/
void positions_covered(std::vector<uint32_t> & covs, int position, seqan3::shape shape)
{
for(int pos = 0; pos < shape.size(); pos++)
{
if (shape[pos] == 1)
covs[position+pos]++;
}
}

/*! \brief Function, get the actual coverage of some submers.
* \param kmers A view that contains all possible k-mers (minimisers with window length = kmer length, if reverse strand should be considered).
* \param submers The submers, which coverage should be obtained.
* \param covs A vector, where every entry represents a position in the sequence and how often it is covered.
* \param shape Shape of a submer.
*/
template <typename urng_t>
void coverage(urng_t kmers, std::deque<uint64_t> & submers, std::vector<uint32_t> & covs, seqan3::shape shape)
{
int i{};
for (auto && elem : kmers)
{
if (elem == submers[0])
{
positions_covered(covs, i, shape);
submers.pop_front();
}

if (submers.size() == 0)
break;
i++;
}
}

/*! \brief Function, get the coverage of one sequence file for a method.
* Strobemers not supported, because strobemers should cover every position at least once because they are not
* a sampling method.
* \param sequence_file A sequence file.
* \param kmer_view View to compare to (Should be k-mers in most cases or minimisers with window length = kmer length).
* \param input_view View that should be evaluated.
* \param method_name Name of the tested method.
* \param args The arguments about the view to be used, needed for strobemers.
*/
template <typename urng_t, typename urng_t2>
void compare_cov(std::filesystem::path sequence_file, urng_t kmer_view, urng_t2 input_view, std::string method_name, range_arguments & args)
{
std::vector<int> islands{};
int island{};
int covered{};
std::vector<int> avg_islands{};
std::vector<int> largest_islands{};
std::vector<int> covered_percentage{};
std::vector<int> covereage_avg{};
std::ofstream outfile;

seqan3::sequence_file_input<my_traits, seqan3::fields<seqan3::field::seq>> fin{sequence_file};
for (auto & [seq] : fin)
{
auto kmers = seq | kmer_view;
auto submers = seq | input_view;

std::vector<uint32_t> covs{};
covs.assign(seq.size(), 0);
std::deque<uint64_t> submers2{};

for(auto && sub: submers)
submers2.push_back(sub);
coverage(kmers, submers2, covs, args.shape);

for(auto & elem : covs)
{
if (elem == 0)
{
island++;
}
else
{
if (island > 0)
{
islands.push_back(island);
island = 0;
}
covered++;
}
}

covered_percentage.push_back(covered);

double mean_cov{};
double var_cov{};
get_mean_and_var(covs, mean_cov, var_cov);
covereage_avg.push_back(mean_cov);

if (islands.size() == 0)
{
avg_islands.push_back(0);
largest_islands.push_back(0);
}
else
{
double mean{};
double var{};
get_mean_and_var(islands, mean, var);
avg_islands.push_back(mean);
largest_islands.push_back(*std::max_element(islands.begin(), islands.end()));
}

islands.clear();
island = 0;
covered = 0;
}

double mean_covered, mean_largest_island, mean_avg_island, stdev_covered, stdev_largest_island, stdev_avg_island;
get_mean_and_var(covered_percentage, mean_covered, stdev_covered);
get_mean_and_var(largest_islands, mean_largest_island, stdev_largest_island);
get_mean_and_var(avg_islands, mean_avg_island, stdev_avg_island);

std::nth_element(covereage_avg.begin(), covereage_avg.begin() + covereage_avg.size()/2, covereage_avg.end());
int median{};

// Store speed and compression
outfile.open(std::string{args.path_out} + method_name + "_coverage.out");
outfile << "Covered\t"<< method_name << "\t" << *std::min_element(covered_percentage.begin(), covered_percentage.end()) << "\t" << mean_covered << "\t" << stdev_covered << "\t" << *std::max_element(covered_percentage.begin(), covered_percentage.end()) << "\n";
outfile << "Covered Median\t"<< method_name << "\t" << *std::min_element(covereage_avg.begin(), covereage_avg.end()) << "\t" << median << "\t" << *std::max_element(covereage_avg.begin(), covereage_avg.end()) << "\n";
outfile << "Largest Island\t"<< method_name << "\t" << *std::min_element(largest_islands.begin(), largest_islands.end()) << "\t" << mean_largest_island << "\t" << stdev_largest_island << "\t" << *std::max_element(largest_islands.begin(), largest_islands.end()) << "\n";
outfile << "Avg Island\t"<< method_name << "\t" << *std::min_element(avg_islands.begin(), avg_islands.end()) << "\t" << mean_avg_island << "\t" << stdev_avg_island << "\t" << *std::max_element(avg_islands.begin(), avg_islands.end()) << "\n";
outfile.close();
}

void do_comparison(std::vector<std::filesystem::path> sequence_files, range_arguments & args)
{
switch(args.name)
{
case kmer: compare(sequence_files, seqan3::views::kmer_hash(seqan3::ungapped{args.k_size}), "kmer_hash_"+std::to_string(args.k_size), args);
case kmer: compare(sequence_files, seqan3::views::kmer_hash(args.shape), "kmer_hash_"+std::to_string(args.k_size), args);
break;
case minimiser: compare(sequence_files, seqan3::views::minimiser_hash(args.shape,
args.w_size), "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args);
Expand All @@ -129,3 +263,16 @@ void do_comparison(std::vector<std::filesystem::path> sequence_files, range_argu
"minstrobemers_" + std::to_string(args.k_size) + "_" + std::to_string(args.order) + "_" + std::to_string(args.w_min) + "_" + std::to_string(args.w_max), args);
}
}

void do_coverage(std::filesystem::path sequence_file, range_arguments & args)
{
switch(args.name)
{
case kmer: compare_cov(sequence_file, seqan3::views::kmer_hash(args.shape), seqan3::views::kmer_hash(args.shape), "kmer_hash_"+std::to_string(args.k_size), args);
break;
case minimiser: compare_cov(sequence_file, seqan3::views::minimiser_hash(args.shape,
seqan3::window_size{args.shape.size()}), seqan3::views::minimiser_hash(args.shape,
args.w_size), "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args);
break;
}
}
47 changes: 43 additions & 4 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ void string_to_methods(std::string name, methods & m)
m = strobemer;
};

void read_range_arguments_minimiser(seqan3::argument_parser & parser, range_arguments & args)
void read_range_arguments_strobemers(seqan3::argument_parser & parser, range_arguments & args)
{
parser.add_option(args.w_min, '\0', "w-min", "Define w-min for strobemers.");
parser.add_option(args.w_max, '\0', "w-max", "Define w-ax for strobemers.");
Expand All @@ -29,7 +29,7 @@ void read_range_arguments_minimiser(seqan3::argument_parser & parser, range_argu
parser.add_flag(args.minstrobers, '\0', "minstrobers", "If minstrobemers should be calculated.");
}

void read_range_arguments_strobemers(seqan3::argument_parser & parser, range_arguments & args)
void read_range_arguments_minimiser(seqan3::argument_parser & parser, range_arguments & args)
{
parser.add_option(w_size, 'w', "window", "Define window size. Default: 60.");
parser.add_option(shape, '\0', "shape", "Define a shape by the decimal of a bitvector, where 0 symbolizes a "
Expand All @@ -47,6 +47,43 @@ void parsing(range_arguments & args)
args.seed_se = seqan3::seed{adjust_seed(args.k_size, se)};
}

int coverage(seqan3::argument_parser & parser)
{
range_arguments args{};
std::filesystem::path sequence_file{};
parser.add_positional_option(sequence_file, "Please provide at least one sequence file.");
parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved.");
parser.add_option(args.k_size, 'k', "kmer-size", "Define kmer size.");
std::string method{};
parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser"});

read_range_arguments_minimiser(parser, args);

try
{
parser.parse();
parsing(args);
}
catch (seqan3::argument_parser_error const & ext) // catch user errors
{
seqan3::debug_stream << "Error. Incorrect command line input for coverage. " << ext.what() << "\n";
return -1;
}

try
{
string_to_methods(method, args.name);
do_coverage(sequence_file, args);
}
catch (const std::invalid_argument & e)
{
std::cerr << e.what() << std::endl;
return -1;
}

return 0;
}

int speed(seqan3::argument_parser & parser)
{
range_arguments args{};
Expand Down Expand Up @@ -88,7 +125,7 @@ int speed(seqan3::argument_parser & parser)
int main(int argc, char ** argv)
{
seqan3::argument_parser top_level_parser{"minions", argc, argv, seqan3::update_notifications::on,
{"speed"}};
{"coverage", "speed"}};

// Parser
top_level_parser.info.author = "Mitra Darvish"; // give parser some infos
Expand All @@ -106,7 +143,9 @@ int main(int argc, char ** argv)

seqan3::argument_parser & sub_parser = top_level_parser.get_sub_parser(); // hold a reference to the sub_parser

if (sub_parser.info.app_name == std::string_view{"minions-speed"})
if (sub_parser.info.app_name == std::string_view{"minions-coverage"})
coverage(sub_parser);
else if (sub_parser.info.app_name == std::string_view{"minions-speed"})
speed(sub_parser);

return 0;
Expand Down
1 change: 1 addition & 0 deletions test/api/comparison_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ TEST(minions, small_example)
range_arguments args{};
args.name = kmer;
args.k_size = 19;
args.shape = seqan3::ungapped{19};
std::string expected{"kmer_hash_19 159493 159493 0 159493\n"};
args.path_out = std::filesystem::temp_directory_path();
do_comparison({DATADIR"example1.fasta"}, args);
Expand Down
4 changes: 3 additions & 1 deletion test/cli/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
cmake_minimum_required (VERSION 3.8)

add_cli_test (minions_options_test.cpp FILES example1.fasta)
add_cli_test (minions_options_test.cpp)
add_cli_test (minions_coverage_test.cpp FILES example1.fasta)
add_cli_test (minions_speed_test.cpp FILES example1.fasta)
53 changes: 53 additions & 0 deletions test/cli/minions_coverage_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "cli_test.hpp"

TEST_F(cli_test, no_options)
{
cli_test_result result = execute_app("minions coverage");
std::string expected
{
"minions-coverage\n"
"================\n"
" Try -h or --help for more information.\n"
};
EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, expected);
EXPECT_EQ(result.err, std::string{});
}

TEST_F(cli_test, kmer)
{
cli_test_result result = execute_app("minions coverage --method kmer -k 19", data("example1.fasta"));
EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{});
EXPECT_EQ(result.err, std::string{});
}

TEST_F(cli_test, gapped_kmer)
{
cli_test_result result = execute_app("minions coverage --method kmer --shape 524223", data("example1.fasta"));
EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{});
EXPECT_EQ(result.err, std::string{});
}

TEST_F(cli_test, minimiser)
{
cli_test_result result = execute_app("minions coverage --method minimiser -k 19 -w 19 ", data("example1.fasta"));
EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{});
EXPECT_EQ(result.err, std::string{});
}

TEST_F(cli_test, wrong_method)
{
cli_test_result result = execute_app("minions coverage --method submer -k 19", data("example1.fasta"));
std::string expected
{
"Error. Incorrect command line input for coverage. Validation failed "
"for option --method: Value submer is not one of [kmer,minimiser].\n"
};

EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.err, expected);
EXPECT_EQ(result.out, std::string{});
}
Loading