diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml index 9c5a555..cd65949 100644 --- a/.github/workflows/ci_linux.yml +++ b/.github/workflows/ci_linux.yml @@ -41,22 +41,6 @@ jobs: cc: "gcc-10" build_type: Release - - name: "gcc9 (c++2a)" - cxx: "g++-9" - cc: "gcc-9" - build_type: Release - cxx_flags: "-std=c++2a" - - - name: "gcc8" - cxx: "g++-8" - cc: "gcc-8" - build_type: Release - - - name: "gcc7" - cxx: "g++-7" - cc: "gcc-7" - build_type: Release - steps: - name: Checkout uses: actions/checkout@v2 diff --git a/.github/workflows/ci_macos.yml b/.github/workflows/ci_macos.yml index 7dc51c6..3c8cda2 100644 --- a/.github/workflows/ci_macos.yml +++ b/.github/workflows/ci_macos.yml @@ -30,25 +30,6 @@ jobs: cc: "gcc-10" build_type: Release - - name: "gcc9 (c++2a)" - os: macos-10.15 - cxx: "g++-9" - cc: "gcc-9" - build_type: Release - cxx_flags: "-std=c++2a" - - - name: "gcc8" - os: macos-10.15 - cxx: "g++-8" - cc: "gcc-8" - build_type: Release - - - name: "gcc7" - os: macos-10.15 - cxx: "g++-7" - cc: "gcc-7" - build_type: Release - steps: - name: Checkout uses: actions/checkout@v2 diff --git a/include/compare.h b/include/compare.h index 4354bcc..f4f17af 100644 --- a/include/compare.h +++ b/include/compare.h @@ -67,3 +67,9 @@ struct my_traits2 : seqan3::sequence_file_input_default_traits_dna * \param args The arguments about the view to be used. */ void do_comparison(std::vector sequence_files, range_arguments & args); + +/*! \brief Function, comparing the methods in regard of their coverage. + * \param sequence_file A sequence file. + * \param args The arguments about the view to be used. + */ +void do_coverage(std::filesystem::path sequence_file, range_arguments & args); diff --git a/src/compare.cpp b/src/compare.cpp index 62605d4..48b70e8 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -12,7 +13,8 @@ * \param mean Variable to store the mean. * \param stdev Variable to store the variance. */ -void get_mean_and_var(std::vector & results, double & mean, double & stdev) +template +void get_mean_and_var(urng_t & results, double & mean, double & stdev) { double sum = std::accumulate(results.begin(), results.end(), 0.0); mean = sum / results.size(); @@ -56,7 +58,6 @@ void compare(std::vector sequence_files, urng_t input_vie std::ofstream outfile; for (int i = 0; i < sequence_files.size(); ++i) { - robin_hood::unordered_node_map hash_table{}; auto start = std::chrono::high_resolution_clock::now(); if constexpr (strobemers > 0) @@ -105,11 +106,144 @@ void compare(std::vector sequence_files, urng_t input_vie outfile.close(); } +/*! \brief Function, that increases how often a position is covered by one. + * \param covs A vector, where every entry represents a position in the sequence and how often it is covered. + * \param position Starting position. + * \param shape The positions of covered positions. + */ +void positions_covered(std::vector & covs, int position, seqan3::shape shape) +{ + for(int pos = 0; pos < shape.size(); pos++) + { + if (shape[pos] == 1) + covs[position+pos]++; + } +} + +/*! \brief Function, get the actual coverage of some submers. + * \param kmers A view that contains all possible k-mers (minimisers with window length = kmer length, if reverse strand should be considered). + * \param submers The submers, which coverage should be obtained. + * \param covs A vector, where every entry represents a position in the sequence and how often it is covered. + * \param shape Shape of a submer. + */ +template +void coverage(urng_t kmers, std::deque & submers, std::vector & covs, seqan3::shape shape) +{ + int i{}; + for (auto && elem : kmers) + { + if (elem == submers[0]) + { + positions_covered(covs, i, shape); + submers.pop_front(); + } + + if (submers.size() == 0) + break; + i++; + } +} + +/*! \brief Function, get the coverage of one sequence file for a method. + * Strobemers not supported, because strobemers should cover every position at least once because they are not + * a sampling method. + * \param sequence_file A sequence file. + * \param kmer_view View to compare to (Should be k-mers in most cases or minimisers with window length = kmer length). + * \param input_view View that should be evaluated. + * \param method_name Name of the tested method. + * \param args The arguments about the view to be used, needed for strobemers. + */ +template +void compare_cov(std::filesystem::path sequence_file, urng_t kmer_view, urng_t2 input_view, std::string method_name, range_arguments & args) +{ + std::vector islands{}; + int island{}; + int covered{}; + std::vector avg_islands{}; + std::vector largest_islands{}; + std::vector covered_percentage{}; + std::vector covereage_avg{}; + std::ofstream outfile; + + seqan3::sequence_file_input> fin{sequence_file}; + for (auto & [seq] : fin) + { + auto kmers = seq | kmer_view; + auto submers = seq | input_view; + + std::vector covs{}; + covs.assign(seq.size(), 0); + std::deque submers2{}; + + for(auto && sub: submers) + submers2.push_back(sub); + coverage(kmers, submers2, covs, args.shape); + + for(auto & elem : covs) + { + if (elem == 0) + { + island++; + } + else + { + if (island > 0) + { + islands.push_back(island); + island = 0; + } + covered++; + } + } + + covered_percentage.push_back(covered); + + double mean_cov{}; + double var_cov{}; + get_mean_and_var(covs, mean_cov, var_cov); + covereage_avg.push_back(mean_cov); + + if (islands.size() == 0) + { + avg_islands.push_back(0); + largest_islands.push_back(0); + } + else + { + double mean{}; + double var{}; + get_mean_and_var(islands, mean, var); + avg_islands.push_back(mean); + largest_islands.push_back(*std::max_element(islands.begin(), islands.end())); + } + + islands.clear(); + island = 0; + covered = 0; + } + + double mean_covered, mean_largest_island, mean_avg_island, stdev_covered, stdev_largest_island, stdev_avg_island; + get_mean_and_var(covered_percentage, mean_covered, stdev_covered); + get_mean_and_var(largest_islands, mean_largest_island, stdev_largest_island); + get_mean_and_var(avg_islands, mean_avg_island, stdev_avg_island); + + std::nth_element(covereage_avg.begin(), covereage_avg.begin() + covereage_avg.size()/2, covereage_avg.end()); + int median{}; + + // Store speed and compression + outfile.open(std::string{args.path_out} + method_name + "_coverage.out"); + outfile << "Covered\t"<< method_name << "\t" << *std::min_element(covered_percentage.begin(), covered_percentage.end()) << "\t" << mean_covered << "\t" << stdev_covered << "\t" << *std::max_element(covered_percentage.begin(), covered_percentage.end()) << "\n"; + outfile << "Covered Median\t"<< method_name << "\t" << *std::min_element(covereage_avg.begin(), covereage_avg.end()) << "\t" << median << "\t" << *std::max_element(covereage_avg.begin(), covereage_avg.end()) << "\n"; + outfile << "Largest Island\t"<< method_name << "\t" << *std::min_element(largest_islands.begin(), largest_islands.end()) << "\t" << mean_largest_island << "\t" << stdev_largest_island << "\t" << *std::max_element(largest_islands.begin(), largest_islands.end()) << "\n"; + outfile << "Avg Island\t"<< method_name << "\t" << *std::min_element(avg_islands.begin(), avg_islands.end()) << "\t" << mean_avg_island << "\t" << stdev_avg_island << "\t" << *std::max_element(avg_islands.begin(), avg_islands.end()) << "\n"; + outfile.close(); +} + void do_comparison(std::vector sequence_files, range_arguments & args) { switch(args.name) { - case kmer: compare(sequence_files, seqan3::views::kmer_hash(seqan3::ungapped{args.k_size}), "kmer_hash_"+std::to_string(args.k_size), args); + case kmer: compare(sequence_files, seqan3::views::kmer_hash(args.shape), "kmer_hash_"+std::to_string(args.k_size), args); break; case minimiser: compare(sequence_files, seqan3::views::minimiser_hash(args.shape, args.w_size), "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args); @@ -129,3 +263,16 @@ void do_comparison(std::vector sequence_files, range_argu "minstrobemers_" + std::to_string(args.k_size) + "_" + std::to_string(args.order) + "_" + std::to_string(args.w_min) + "_" + std::to_string(args.w_max), args); } } + +void do_coverage(std::filesystem::path sequence_file, range_arguments & args) +{ + switch(args.name) + { + case kmer: compare_cov(sequence_file, seqan3::views::kmer_hash(args.shape), seqan3::views::kmer_hash(args.shape), "kmer_hash_"+std::to_string(args.k_size), args); + break; + case minimiser: compare_cov(sequence_file, seqan3::views::minimiser_hash(args.shape, + seqan3::window_size{args.shape.size()}), seqan3::views::minimiser_hash(args.shape, + args.w_size), "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args); + break; + } +} diff --git a/src/main.cpp b/src/main.cpp index 3a9c57d..a99b08d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -19,7 +19,7 @@ void string_to_methods(std::string name, methods & m) m = strobemer; }; -void read_range_arguments_minimiser(seqan3::argument_parser & parser, range_arguments & args) +void read_range_arguments_strobemers(seqan3::argument_parser & parser, range_arguments & args) { parser.add_option(args.w_min, '\0', "w-min", "Define w-min for strobemers."); parser.add_option(args.w_max, '\0', "w-max", "Define w-ax for strobemers."); @@ -29,7 +29,7 @@ void read_range_arguments_minimiser(seqan3::argument_parser & parser, range_argu parser.add_flag(args.minstrobers, '\0', "minstrobers", "If minstrobemers should be calculated."); } -void read_range_arguments_strobemers(seqan3::argument_parser & parser, range_arguments & args) +void read_range_arguments_minimiser(seqan3::argument_parser & parser, range_arguments & args) { parser.add_option(w_size, 'w', "window", "Define window size. Default: 60."); parser.add_option(shape, '\0', "shape", "Define a shape by the decimal of a bitvector, where 0 symbolizes a " @@ -47,6 +47,43 @@ void parsing(range_arguments & args) args.seed_se = seqan3::seed{adjust_seed(args.k_size, se)}; } +int coverage(seqan3::argument_parser & parser) +{ + range_arguments args{}; + std::filesystem::path sequence_file{}; + parser.add_positional_option(sequence_file, "Please provide at least one sequence file."); + parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved."); + parser.add_option(args.k_size, 'k', "kmer-size", "Define kmer size."); + std::string method{}; + parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser"}); + + read_range_arguments_minimiser(parser, args); + + try + { + parser.parse(); + parsing(args); + } + catch (seqan3::argument_parser_error const & ext) // catch user errors + { + seqan3::debug_stream << "Error. Incorrect command line input for coverage. " << ext.what() << "\n"; + return -1; + } + + try + { + string_to_methods(method, args.name); + do_coverage(sequence_file, args); + } + catch (const std::invalid_argument & e) + { + std::cerr << e.what() << std::endl; + return -1; + } + + return 0; +} + int speed(seqan3::argument_parser & parser) { range_arguments args{}; @@ -88,7 +125,7 @@ int speed(seqan3::argument_parser & parser) int main(int argc, char ** argv) { seqan3::argument_parser top_level_parser{"minions", argc, argv, seqan3::update_notifications::on, - {"speed"}}; + {"coverage", "speed"}}; // Parser top_level_parser.info.author = "Mitra Darvish"; // give parser some infos @@ -106,7 +143,9 @@ int main(int argc, char ** argv) seqan3::argument_parser & sub_parser = top_level_parser.get_sub_parser(); // hold a reference to the sub_parser - if (sub_parser.info.app_name == std::string_view{"minions-speed"}) + if (sub_parser.info.app_name == std::string_view{"minions-coverage"}) + coverage(sub_parser); + else if (sub_parser.info.app_name == std::string_view{"minions-speed"}) speed(sub_parser); return 0; diff --git a/test/api/comparison_test.cpp b/test/api/comparison_test.cpp index b2e8a24..b7c8d03 100644 --- a/test/api/comparison_test.cpp +++ b/test/api/comparison_test.cpp @@ -10,6 +10,7 @@ TEST(minions, small_example) range_arguments args{}; args.name = kmer; args.k_size = 19; + args.shape = seqan3::ungapped{19}; std::string expected{"kmer_hash_19 159493 159493 0 159493\n"}; args.path_out = std::filesystem::temp_directory_path(); do_comparison({DATADIR"example1.fasta"}, args); diff --git a/test/cli/CMakeLists.txt b/test/cli/CMakeLists.txt index 5a36173..8492e84 100644 --- a/test/cli/CMakeLists.txt +++ b/test/cli/CMakeLists.txt @@ -1,3 +1,5 @@ cmake_minimum_required (VERSION 3.8) -add_cli_test (minions_options_test.cpp FILES example1.fasta) +add_cli_test (minions_options_test.cpp) +add_cli_test (minions_coverage_test.cpp FILES example1.fasta) +add_cli_test (minions_speed_test.cpp FILES example1.fasta) diff --git a/test/cli/minions_coverage_test.cpp b/test/cli/minions_coverage_test.cpp new file mode 100644 index 0000000..892cf99 --- /dev/null +++ b/test/cli/minions_coverage_test.cpp @@ -0,0 +1,53 @@ +#include "cli_test.hpp" + +TEST_F(cli_test, no_options) +{ + cli_test_result result = execute_app("minions coverage"); + std::string expected + { + "minions-coverage\n" + "================\n" + " Try -h or --help for more information.\n" + }; + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, expected); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, kmer) +{ + cli_test_result result = execute_app("minions coverage --method kmer -k 19", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, gapped_kmer) +{ + cli_test_result result = execute_app("minions coverage --method kmer --shape 524223", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, minimiser) +{ + cli_test_result result = execute_app("minions coverage --method minimiser -k 19 -w 19 ", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, wrong_method) +{ + cli_test_result result = execute_app("minions coverage --method submer -k 19", data("example1.fasta")); + std::string expected + { + "Error. Incorrect command line input for coverage. Validation failed " + "for option --method: Value submer is not one of [kmer,minimiser].\n" + }; + + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.err, expected); + EXPECT_EQ(result.out, std::string{}); +} diff --git a/test/cli/minions_options_test.cpp b/test/cli/minions_options_test.cpp index 830da86..abf2b20 100644 --- a/test/cli/minions_options_test.cpp +++ b/test/cli/minions_options_test.cpp @@ -20,31 +20,10 @@ TEST_F(cli_test, fail_no_argument) std::string expected { "Parsing error. You either forgot or misspelled the subcommand! Please specify which sub-program you want to " - "use: one of [speed]. Use -h/--help for more information.\n" + "use: one of [coverage,speed]. Use -h/--help for more information.\n" }; EXPECT_NE(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, expected); } - -TEST_F(cli_test, with_argument) -{ - cli_test_result result = execute_app("minions speed --method kmer -k 19", data("example1.fasta")); - EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); -} - -TEST_F(cli_test, minimiser) -{ - cli_test_result result = execute_app("minions speed --method minimiser -k 19 -w 19 ", data("example1.fasta")); - EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); -} - -TEST_F(cli_test, strobemer) -{ - cli_test_result result = execute_app("minions speed --method strobemer -k 19 --w-min 16 --w-max 30 --order 2 --randstrobemers", data("example1.fasta")); - EXPECT_EQ(result.exit_code, 0); - EXPECT_EQ(result.out, std::string{}); -} diff --git a/test/cli/minions_speed_test.cpp b/test/cli/minions_speed_test.cpp new file mode 100644 index 0000000..60dccaf --- /dev/null +++ b/test/cli/minions_speed_test.cpp @@ -0,0 +1,52 @@ +#include "cli_test.hpp" + +TEST_F(cli_test, no_options) +{ + cli_test_result result = execute_app("minions speed"); + std::string expected + { + "minions-speed\n" + "=============\n" + " Try -h or --help for more information.\n" + }; + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, expected); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, with_argument) +{ + cli_test_result result = execute_app("minions speed --method kmer -k 19", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, minimiser) +{ + cli_test_result result = execute_app("minions speed --method minimiser -k 19 -w 19 ", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, strobemer) +{ + cli_test_result result = execute_app("minions speed --method strobemer -k 19 --w-min 16 --w-max 30 --order 2 --randstrobemers", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, wrong_method) +{ + cli_test_result result = execute_app("minions speed --method submer -k 19", data("example1.fasta")); + std::string expected + { + "Error. Incorrect command line input for speed. Validation failed " + "for option --method: Value submer is not one of [kmer,minimiser,strobemer].\n" + }; + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, expected); +}