From db1362c39748ba56374ac770a26adc0ae3ec054a Mon Sep 17 00:00:00 2001 From: "Ivan Sovic (T530)" Date: Mon, 16 Nov 2015 01:52:31 +0100 Subject: [PATCH] Fixed issues with overhanging bases when aligning near a gap while using anchored alignment. The first issue was that the incremental hash key should have been reset when an unknown base was met, because it couldn't be 2bitpacked. The other one was that SeqAn, unlike EDlib, reports alignments even if all bases are Xs. I placed a threshold on the edit distance on alignment of the front and back part of a read (before first and after last alignment). These parts should not have edit distance larger than half their length. Otherwise, the read will be clipped on that end. --- .gitignore | 1 + Makefile | 24 +++++++++++++++++---- src/alignment/local_realignment_wrappers.cc | 7 ++++++ src/graphmap/anchored_mapping.cc | 12 +++++++++-- src/graphmap/core_graphmap.cc | 3 ++- src/index/index_hash.cc | 9 ++++++-- 6 files changed, 47 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 6ac2b07..d8abaec 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ obj_debug/ obj_linux/ obj_mac/ obj_test/ +obj_testext/ obj_extcigar/ # bin/graphmap-not_release # bin/graphmap-debug diff --git a/Makefile b/Makefile index 3318d06..a94241b 100755 --- a/Makefile +++ b/Makefile @@ -2,7 +2,8 @@ BIN = ./bin/graphmap-not_release BIN_DEBUG = ./bin/graphmap-debug BIN_LINUX = ./bin/Linux-x64/graphmap BIN_MAC = ./bin/Mac/graphmap -OBJ = ./obj_test +OBJ_TESTING = ./obj_test +OBJ_TESTING_EXT = ./obj_testext OBJ_DEBUG = ./obj_debug OBJ_LINUX = ./obj_linux OBJ_EXTCIGAR = ./obj_extcigar @@ -22,7 +23,8 @@ CC_FILES := $(wildcard $(SOURCE)/*/*.cc) $(wildcard $(SOURCE)/*.cc) H_FILES := $(wildcard $(SOURCE)/*/*.h) $(wildcard $(SOURCE)/*.h) OBJ_FILES := $(CPP_FILES:.cpp=.o) $(CC_FILES:.cc=.o) -OBJ_FILES_FOLDER := $(addprefix $(OBJ)/,$(OBJ_FILES)) +OBJ_FILES_FOLDER_TESTING := $(addprefix $(OBJ_TESTING)/,$(OBJ_FILES)) +OBJ_FILES_FOLDER_TESTING_EXT := $(addprefix $(OBJ_TESTING_EXT)/,$(OBJ_FILES)) OBJ_FILES_FOLDER_DEBUG := $(addprefix $(OBJ_DEBUG)/,$(OBJ_FILES)) OBJ_FILES_FOLDER_LINUX := $(addprefix $(OBJ_LINUX)/,$(OBJ_FILES)) OBJ_FILES_FOLDER_EXTCIGAR := $(addprefix $(OBJ_EXTCIGAR)/,$(OBJ_FILES)) @@ -36,6 +38,7 @@ CC_FLAGS_DEBUG = -O0 -g -rdynamic -c -fmessage-length=0 -ffreestanding -fopenmp CC_FLAGS_RELEASE = -DRELEASE_VERSION -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread # -march=native CC_FLAGS_EXTCIGAR = -DRELEASE_VERSION -DUSE_EXTENDED_CIGAR_FORMAT -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread -march=native CC_FLAGS_NOT_RELEASE = -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -Wuninitialized -pthread -march=native +CC_FLAGS_NOT_RELEASE_EXT = -O3 -DUSE_EXTENDED_CIGAR_FORMAT -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -Wuninitialized -pthread -march=native LD_FLAGS = -static-libgcc -static-libstdc++ -m64 -ffreestanding LD_LIBS = -lpthread -lgomp -lm -lz -ldivsufsort64 @@ -45,9 +48,9 @@ all: gcc_version_check linux -testing: $(OBJ_FILES_FOLDER) +testing: $(OBJ_FILES_FOLDER_TESTING) mkdir -p $(dir $(BIN)) - $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER) $(LD_LIBS) + $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER_TESTING) $(LD_LIBS) obj_test/%.o: %.cc $(H_FILES) mkdir -p $(dir $@) @@ -57,6 +60,19 @@ obj_test/%.o: %.cpp $(H_FILES) mkdir -p $(dir $@) $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE) -o $@ $< +testingext: $(OBJ_FILES_FOLDER_TESTING_EXT) + mkdir -p $(dir $(BIN)) + $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER_TESTING_EXT) $(LD_LIBS) + +obj_testext/%.o: %.cc $(H_FILES) + mkdir -p $(dir $@) + $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE_EXT) -o $@ $< + +obj_testext/%.o: %.cpp $(H_FILES) + mkdir -p $(dir $@) + $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE_EXT) -o $@ $< + + gcc_version_check: ifneq ($(GCC_MAJOR_VERSION_GE_4), 1) diff --git a/src/alignment/local_realignment_wrappers.cc b/src/alignment/local_realignment_wrappers.cc index 3fdcd69..f29e0c2 100644 --- a/src/alignment/local_realignment_wrappers.cc +++ b/src/alignment/local_realignment_wrappers.cc @@ -130,6 +130,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align std::vector alignment; // alignment.reserve(align.data_rows[0]. + int64_t aln_edit_distance = 0; // Reinitilaize the iterators. TGapsIterator itGapsText = seqan::begin(gapsText); @@ -141,6 +142,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align if (alignment_type == ALIGNMENT_TYPE_NW || alignment_type == ALIGNMENT_TYPE_SHW) { alignment.insert(alignment.begin(), (*ret_start_offset), (char) EDLIB_D); + aln_edit_distance += (*ret_start_offset); *ret_start_offset = 0; } @@ -155,6 +157,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align alignment.insert(alignment.end(), num_gaps, (char) EDLIB_D); itGapsText += num_gaps; itGapsPattern += num_gaps; + aln_edit_distance += num_gaps; continue; } // Count deletions. @@ -163,6 +166,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align alignment.insert(alignment.end(), num_gaps, (char) EDLIB_I); itGapsText += num_gaps; itGapsPattern += num_gaps; + aln_edit_distance += num_gaps; continue; } @@ -184,6 +188,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align { ++numChar; alignment.insert(alignment.end(), 1, (char) EDLIB_X); + aln_edit_distance += 1; ++itGapsText; ++itGapsPattern; } @@ -196,6 +201,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align if (alignment_type == ALIGNMENT_TYPE_NW) { alignment.insert(alignment.end(), (*ret_end_offset), (char) EDLIB_D); + aln_edit_distance += (*ret_end_offset); *ret_end_offset = 0; } @@ -214,6 +220,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align // *ret_end_offset = 0; ret_alignment = alignment; + *edit_distance = aln_edit_distance; // if (alignment.size() > 0) // cigar = AlignmentToCigar((unsigned char *) &alignment[0], alignment.size()); diff --git a/src/graphmap/anchored_mapping.cc b/src/graphmap/anchored_mapping.cc index 0c00b73..19cbfee 100644 --- a/src/graphmap/anchored_mapping.cc +++ b/src/graphmap/anchored_mapping.cc @@ -468,6 +468,7 @@ int AnchoredAlignment(bool is_linear, bool end_to_end, AlignmentFunctionType Ali int64_t query_start = best_path->get_mapping_data().clusters.front().query.start; // - clip_count_front; int64_t query_end = best_path->get_mapping_data().clusters.back().query.end; // + clip_count_back; /////I + /// Aligning the begining of the read (in front of the first anchor). if (clip_count_front > 0) { /// Check if we need to extend the alignment to the left boundary. Also, even if the user specified it, if we are to close to the boundary, just clip it. if (end_to_end == false || ((alignment_position_start - clip_count_front*2) < reference_start)) { @@ -491,7 +492,10 @@ int AnchoredAlignment(bool is_linear, bool end_to_end, AlignmentFunctionType Ali -1, parameters.match_score, -parameters.mismatch_penalty, -parameters.gap_open_penalty, -parameters.gap_extend_penalty, &leftover_left_start, &leftover_left_end, &leftover_left_edit_distance, leftover_left_alignment); - if (ret_code_right != 0) { + /// Check if the return code is ok. Otherwise, just clip the front. + /// Added on 15.11.2015.: check if the edit distance of the front part is too high. EDlib will automatically return an error code, but SeqAn won't. + /// An example is when the entire front part does not match (e.g. alignment of a read to a part of the reference consisted of N bases). + if (ret_code_right != 0 || leftover_left_edit_distance > clip_count_front/2) { // TODO: This is a nasty hack. EDlib used to crash when query and target are extremely small, e.g. query = "C" and target = "TC". // In this manner we just ignore the leading part, and clip it. std::vector insertions_front(clip_count_front, EDLIB_I); @@ -718,6 +722,7 @@ int AnchoredAlignment(bool is_linear, bool end_to_end, AlignmentFunctionType Ali + /// Aligning the end of the read. if (clip_count_back > 0) { /// Handle the clipping at the end, or extend alignment to the end of the sequence. if (end_to_end == false || (alignment_position_end + 1 + clip_count_back * 2) >= (reference_start + reference_length)) { @@ -737,7 +742,10 @@ int AnchoredAlignment(bool is_linear, bool end_to_end, AlignmentFunctionType Ali -1, parameters.match_score, -parameters.mismatch_penalty, -parameters.gap_open_penalty, -parameters.gap_extend_penalty, &leftover_right_start, &leftover_right_end, &leftover_right_edit_distance, leftover_right_alignment); - if (ret_code_right != 0) { + /// Check if the return code is ok. Otherwise, just clip the back. + /// Added on 15.11.2015.: check if the edit distance of the back part is too high. EDlib will automatically return an error code, but SeqAn won't. + /// An example is when the entire back part does not match (e.g. alignment of a read to a part of the reference consisted of N bases). + if (ret_code_right != 0 || leftover_right_edit_distance > clip_count_back/2) { // TODO: This is a nasty hack. EDlib used to crash when query and target are extremely small, e.g. query = "C" and target = "TC". // In this manner we just ignore the trailing part, and clip it. std::vector insertions_back(clip_count_back, EDLIB_I); diff --git a/src/graphmap/core_graphmap.cc b/src/graphmap/core_graphmap.cc index ac85fcd..352a1a3 100644 --- a/src/graphmap/core_graphmap.cc +++ b/src/graphmap/core_graphmap.cc @@ -77,13 +77,14 @@ int GraphMap::ProcessKmerCacheFriendly_(int8_t *kmer, int64_t kmer_start_positio uint64_t hits_start = -1; uint64_t num_hits = 0; int64_t *hits = NULL; + int ret_search = ((IndexHash *) index_read)->FindAllRawPositionsOfIncrementalSeed(kmer, (uint64_t) k, (uint64_t) parameters->max_num_hits, &hits, &hits_start, &num_hits); if (ret_search == 1) { // There are no hits for the current kmer. return 1; } else if (ret_search == 2) { // There are too many seeds for the current kmer. // We ignore this case, and we'll use all the hits. - } else if (ret_search > 2) { // Something other went wrong. + } else if (ret_search > 2 || ret_search < 0) { // Something other went wrong. return 2; } diff --git a/src/index/index_hash.cc b/src/index/index_hash.cc index 764c4e5..77cd51b 100644 --- a/src/index/index_hash.cc +++ b/src/index/index_hash.cc @@ -225,19 +225,23 @@ int IndexHash::GenerateFromSingleSequenceOnlyForward(const SingleSequence& seque // } int64_t hash_key = -1; + bool last_skipped = true; for (uint64_t i=0; i<(sequence.get_sequence_length() - k_ + 1); i++) { int8_t *seed_start = &(data_[i]); // int64_t hash_key = GenerateHashKey(seed_start, k_); - if (i == 0) { + if (i == 0 || last_skipped == true) { hash_key = GenerateHashKey(seed_start, k_); + last_skipped = false; } else { hash_key = UpdateHashKey(seed_start, k_, hash_key); } - if (hash_key < 0) + if (hash_key < 0) { + last_skipped = true; continue; + } // kmer_hash_[hash_key].push_back(((int64_t) i)); // printf ("\nkmer_counts[%ld] = %ld\n", hash_key, kmer_counts[hash_key]); @@ -319,6 +323,7 @@ int IndexHash::FindAllRawPositionsOfIncrementalSeed(int8_t* seed, uint64_t seed_ } if (hash_key < 0 || hash_key >= kmer_hash_size_) { + kmer_hash_last_key_initialized_ = false; return 3; }