From 0bd0503343a2f9389251d286ff201eabf0819656 Mon Sep 17 00:00:00 2001 From: Ivan Sovic Date: Wed, 23 Sep 2015 00:18:45 +0200 Subject: [PATCH] Fixed two bugs: there was a floating point precision problem on very large references that could be exhibited only on short (e.g. Illumina) reads and default semiglobal mode of alignment. The second problem was a bug in the anchored alignment, where in some rare cases two anchors might overlap. This could have caused some problematic CIGAR strings to appear (i.e. the CIGAR length would be shorter than the read length). --- src/alignment/local_realignment.cc | 12 +++++--- src/containers/path_graph_entry.h | 6 ++-- src/graphmap/experimental.cc | 48 ++++++++++++++++++++---------- src/graphmap/postprocess_lcs.cc | 22 +++++++------- 4 files changed, 54 insertions(+), 34 deletions(-) diff --git a/src/alignment/local_realignment.cc b/src/alignment/local_realignment.cc index db851f1..070b43d 100644 --- a/src/alignment/local_realignment.cc +++ b/src/alignment/local_realignment.cc @@ -355,7 +355,7 @@ int LocalRealignmentLinear(AlignmentFunctionType AlignmentFunction, const Single int64_t best_aligning_position = 0; int64_t l1_reference_start = best_path->get_l1_data().l1_lmin; - int64_t l1_reference_end = best_path->get_l1_data().l1_k * read->get_sequence_length() + best_path->get_l1_data().l1_lmax; + int64_t l1_reference_end = ((int64_t) (best_path->get_l1_data().l1_k * read->get_sequence_length())) + best_path->get_l1_data().l1_lmax; if (l1_reference_start < reference_start) l1_reference_start = reference_start; if (l1_reference_end >= (reference_start + reference_length)) @@ -456,7 +456,7 @@ int CalcEditDistanceLinear(EditDistanceFunctionType EditDistanceFunction, const int64_t best_aligning_position = 0; int64_t l1_reference_start = best_path->get_l1_data().l1_lmin; - int64_t l1_reference_end = best_path->get_l1_data().l1_k * read->get_sequence_length() + best_path->get_l1_data().l1_lmax; + int64_t l1_reference_end = ((int64_t) (best_path->get_l1_data().l1_k * read->get_sequence_length())) + best_path->get_l1_data().l1_lmax; if (l1_reference_start < reference_start) l1_reference_start = reference_start; if (l1_reference_end >= (reference_start + reference_length)) @@ -507,7 +507,7 @@ int LocalRealignmentCircular(AlignmentFunctionType AlignmentFunction, const Sing int64_t reference_id = best_path->get_region_data().reference_id; SeqOrientation orientation = kForward; int64_t l1_reference_start = best_path->get_l1_data().l1_lmin; - int64_t l1_reference_end = best_path->get_l1_data().l1_k * read->get_sequence_length() + best_path->get_l1_data().l1_lmax; + int64_t l1_reference_end = ((int64_t) (best_path->get_l1_data().l1_k * read->get_sequence_length())) + best_path->get_l1_data().l1_lmax; if (l1_reference_start < 0) l1_reference_start = 0; if (l1_reference_end >= region_length_joined) @@ -610,7 +610,11 @@ int LocalRealignmentCircular(AlignmentFunctionType AlignmentFunction, const Sing // CountAlignmentOperations(alignment, read->get_data(), index->get_data(), reference_id, best_aligning_position_start, orientation, ret_eq_op, ret_x_op, ret_i_op, ret_d_op); + if (CheckAlignmentSane(alignment_left_part, read, index, reference_id, best_aligning_position_left_part)) + return -1; + if (CheckAlignmentSane(alignment_right_part, read, index, reference_id, best_aligning_position_right_part)) + return -1; //#ifndef RELEASE_VERSION // if (parameters.verbose_level > 5 && ((int64_t) read->get_sequence_id()) == parameters.debug_read) { @@ -670,7 +674,7 @@ int CalcEditDistanceCircular(EditDistanceFunctionType EditDistanceFunction, cons int64_t reference_id = best_path->get_region_data().reference_id; SeqOrientation orientation = kForward; int64_t l1_reference_start = best_path->get_l1_data().l1_lmin; - int64_t l1_reference_end = best_path->get_l1_data().l1_k * read->get_sequence_length() + best_path->get_l1_data().l1_lmax; + int64_t l1_reference_end = ((int64_t) (best_path->get_l1_data().l1_k * read->get_sequence_length())) + best_path->get_l1_data().l1_lmax; if (l1_reference_start < 0) l1_reference_start = 0; if (l1_reference_end >= region_length_joined) diff --git a/src/containers/path_graph_entry.h b/src/containers/path_graph_entry.h index 0415c3e..1f3ae8e 100644 --- a/src/containers/path_graph_entry.h +++ b/src/containers/path_graph_entry.h @@ -68,11 +68,11 @@ typedef struct InfoMapping { typedef struct InfoL1 { int64_t l1_l = 0; - float l1_k = 1.0f; + double l1_k = 1.0f; int64_t l1_lmin = 0; int64_t l1_lmax = 0; - float l1_confidence_abs = 0; - float l1_std = 0; + double l1_confidence_abs = 0; + double l1_std = 0; int64_t l1_rough_start = 0; int64_t l1_rough_end = 0; } InfoL1; diff --git a/src/graphmap/experimental.cc b/src/graphmap/experimental.cc index c213b04..9e473a0 100644 --- a/src/graphmap/experimental.cc +++ b/src/graphmap/experimental.cc @@ -77,7 +77,7 @@ int GraphMap::ExperimentalPostProcessRegionWithLCS_(ScoreRegistry* local_score, if (CheckDistanceTooBig(local_score->get_registry_entries(), current_lcskp_index, current_lcskp_index, parameters) == true) continue; - if (last_nonskipped_i > lcskpp_indices.size()) { + if (new_cluster == NULL || last_nonskipped_i > lcskpp_indices.size()) { // cluster.push_back(lcskpp_indices.at(i)); } else { @@ -90,8 +90,14 @@ int GraphMap::ExperimentalPostProcessRegionWithLCS_(ScoreRegistry* local_score, if (wrong_to_previous1 == true && wrong_to_previous2 == true) { /// In this case, the new point is a general outlier to the previous LCSk, because it doesn't fit neither to the previous point, nor to the point before that. if (new_cluster != NULL) { - clusters.push_back(new_cluster); - new_cluster = NULL; + if (clusters.size() > 0 && (new_cluster->query.start <= clusters.back()->query.end || new_cluster->ref.start <= clusters.back()->ref.end)) { + delete new_cluster; + new_cluster = NULL; + } else { + clusters.push_back(new_cluster); + new_cluster = NULL; + } + } } else if (wrong_to_previous1 == true && wrong_to_previous2 == false) { /// In this case, the previous point was an outlier, because the new point fits better to the one before the previous one. Overwrite the previous entry in new_cluster. @@ -110,22 +116,32 @@ int GraphMap::ExperimentalPostProcessRegionWithLCS_(ScoreRegistry* local_score, continue; } } - if (new_cluster == NULL) { - new_cluster = new ClusterAndIndices; - new_cluster->query.start = local_score->get_registry_entries().query_starts[current_lcskp_index]; - new_cluster->ref.start = local_score->get_registry_entries().reference_starts[current_lcskp_index]; - } - new_cluster->query.end = local_score->get_registry_entries().query_ends[current_lcskp_index] + parameters->k_graph - 1; - new_cluster->ref.end = local_score->get_registry_entries().reference_ends[current_lcskp_index] + parameters->k_graph - 1; - new_cluster->num_anchors += 1; - new_cluster->coverage += local_score->get_registry_entries().covered_bases_queries[current_lcskp_index]; - new_cluster->lcskpp_indices.push_back(current_lcskp_index); - last_nonskipped_i = i; + if (clusters.size() == 0 || (clusters.size() > 0 && (local_score->get_registry_entries().query_starts[current_lcskp_index] > clusters.back()->query.end && local_score->get_registry_entries().reference_starts[current_lcskp_index] > clusters.back()->ref.end))) { + if (new_cluster == NULL) { + new_cluster = new ClusterAndIndices; + new_cluster->query.start = local_score->get_registry_entries().query_starts[current_lcskp_index]; + new_cluster->ref.start = local_score->get_registry_entries().reference_starts[current_lcskp_index]; + } + new_cluster->query.end = local_score->get_registry_entries().query_ends[current_lcskp_index] + parameters->k_graph - 1; + new_cluster->ref.end = local_score->get_registry_entries().reference_ends[current_lcskp_index] + parameters->k_graph - 1; + new_cluster->num_anchors += 1; + new_cluster->coverage += local_score->get_registry_entries().covered_bases_queries[current_lcskp_index]; + new_cluster->lcskpp_indices.push_back(current_lcskp_index); + + last_nonskipped_i = i; + } } if (new_cluster != NULL) { - clusters.push_back(new_cluster); - new_cluster = NULL; +// clusters.push_back(new_cluster); +// new_cluster = NULL; + if (clusters.size() > 0 && (new_cluster->query.start <= clusters.back()->query.end || new_cluster->ref.start <= clusters.back()->ref.end)) { + delete new_cluster; + new_cluster = NULL; + } else { + clusters.push_back(new_cluster); + new_cluster = NULL; + } } // for (int64_t i=0; i= maximum_allowed_deviation) + if (distance > maximum_allowed_deviation) continue; confidence_L1 += ((float) abs(distance)) / ((float) num_points_under_max_dev_threshold); @@ -332,8 +332,8 @@ int GraphMap::PostProcessRegionWithLCS_(ScoreRegistry* local_score, MappingData* } // Find the L1 parameters (median line and the confidence intervals). - float l_diff = read->get_sequence_length() * parameters->error_rate; - float maximum_allowed_deviation = l_diff * sqrt(2.0f) / 2.0f; + double l_diff = read->get_sequence_length() * parameters->error_rate; + double maximum_allowed_deviation = l_diff * sqrt(2.0f) / 2.0f; float sigma_L2 = 0.0f, confidence_L1 = 0.0f; int64_t k = 0, l = 0; // Actual L1 calculation. @@ -418,12 +418,12 @@ int GraphMap::PostProcessRegionWithLCS_(ScoreRegistry* local_score, MappingData* InfoL1 l1_info; l1_info.l1_l = l; l1_info.l1_k = 1.0f; - l1_info.l1_lmin = l - l_diff; - l1_info.l1_lmax = l + l_diff; + l1_info.l1_lmin = ((double) l) - ((double) l_diff); + l1_info.l1_lmax = ((double) l) + ((double) l_diff); l1_info.l1_confidence_abs = confidence_L1; l1_info.l1_std = sigma_L2; l1_info.l1_rough_start = l1_info.l1_k * 0 + l1_info.l1_lmin; - l1_info.l1_rough_end = l1_info.l1_k * read->get_sequence_length() + l1_info.l1_lmax; + l1_info.l1_rough_end = ((double) l1_info.l1_k * read->get_sequence_length()) + l1_info.l1_lmax; if (l1_info.l1_rough_start < index->get_reference_starting_pos()[local_score->get_region().reference_id]) l1_info.l1_rough_start = index->get_reference_starting_pos()[local_score->get_region().reference_id]; if (l1_info.l1_rough_end >= (index->get_reference_starting_pos()[local_score->get_region().reference_id] + index->get_reference_lengths()[local_score->get_region().reference_id])) @@ -482,12 +482,12 @@ int GraphMap::VerboseLocalScoresToFile(std::string file_path, const SingleSequen } else { float distance1 = abs((float) ((local_score->get_registry_entries().reference_starts[indices->at(i)] - local_score->get_registry_entries().query_starts[indices->at(i)]) - l_median) * (sqrt(2.0f)) / 2.0f); - if (distance1 < maximum_allowed_deviation) { + if (distance1 <= maximum_allowed_deviation) { fprintf (fp, "%ld\t%ld\n", local_score->get_registry_entries().query_starts[indices->at(i)], local_score->get_registry_entries().reference_starts[indices->at(i)]); } float distance2 = abs((float) ((local_score->get_registry_entries().reference_ends[indices->at(i)] - local_score->get_registry_entries().query_ends[indices->at(i)]) - l_median) * (sqrt(2.0f)) / 2.0f); - if (distance2 < maximum_allowed_deviation) { + if (distance2 <= maximum_allowed_deviation) { fprintf (fp, "%ld\t%ld\n", local_score->get_registry_entries().query_ends[indices->at(i)], local_score->get_registry_entries().reference_ends[indices->at(i)]); } } @@ -501,12 +501,12 @@ int GraphMap::VerboseLocalScoresToFile(std::string file_path, const SingleSequen } else { float distance1 = abs((float) ((local_score->get_registry_entries().reference_starts[i] - local_score->get_registry_entries().query_starts[i]) - l_median) * (sqrt(2.0f)) / 2.0f); - if (distance1 < maximum_allowed_deviation) { + if (distance1 <= maximum_allowed_deviation) { fprintf (fp, "%ld\t%ld\n", local_score->get_registry_entries().query_starts[i], local_score->get_registry_entries().reference_starts[i]); } float distance2 = abs((float) ((local_score->get_registry_entries().reference_ends[i] - local_score->get_registry_entries().query_ends[i]) - l_median) * (sqrt(2.0f)) / 2.0f); - if (distance2 < maximum_allowed_deviation) { + if (distance2 <= maximum_allowed_deviation) { fprintf (fp, "%ld\t%ld\n", local_score->get_registry_entries().query_ends[i], local_score->get_registry_entries().reference_ends[i]); } }