diff --git a/bwape.c b/bwape.c index 2a7f46ef..7dc3d740 100644 --- a/bwape.c +++ b/bwape.c @@ -633,7 +633,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f gap_opt_t opt, opt0; khint_t iter; isize_info_t last_ii; // this is for the last batch of reads - char str[1024]; + char str[1024], magic[2][4]; bwt_t *bwt; uint8_t *pac; @@ -648,6 +648,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f g_hash = kh_init(b128); last_ii.avg = -1.0; + err_fread_noeof(magic[0], 1, 4, fp_sa[0]); + err_fread_noeof(magic[1], 1, 4, fp_sa[1]); + if (strncmp(magic[0], SAI_MAGIC, 4) != 0 || strncmp(magic[1], SAI_MAGIC, 4) != 0) { + fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__); + exit(1); + } err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); opt0 = opt; diff --git a/bwase.c b/bwase.c index 85165d64..dcf29bfa 100644 --- a/bwase.c +++ b/bwase.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "bwase.h" #include "bwtaln.h" #include "bntseq.h" @@ -36,6 +37,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma if (p->score > best) break; if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; + s->ref_shift = (int)p->n_del - (int)p->n_ins; s->score = p->score; s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); } @@ -71,6 +73,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma for (l = q->k; l <= q->l; ++l) { s->multi[z].pos = l; s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; s->multi[z++].mm = q->n_mm; } rest -= q->l - q->k + 1; @@ -81,6 +84,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma while (x < p) p -= p * j / (i--); s->multi[z].pos = q->l - i; s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; s->multi[z++].mm = q->n_mm; } rest = 0; @@ -107,16 +111,15 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm) return (23 < g_log_n[n])? 0 : 23 - g_log_n[n]; } -bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand) +bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand) { bwtint_t pos_f; int is_rev; - pos_f = bns_depos(bns, bwt_sa(bwt, sapos), &is_rev); // pos_f + pos_f = bwt_sa(bwt, sapos); // position on the forward-reverse coordinate + if (pos_f < bns->l_pac && bns->l_pac < pos_f + ref_len) return (bwtint_t)-1; + pos_f = bns_depos(bns, pos_f, &is_rev); // position on the forward strand; this may be the first base or the last base *strand = !is_rev; - /* NB: For gapped alignment, pacpos may not be correct, which will be fixed - * in bwa_refine_gapped_core(). This line also determines the way "x" is - * calculated in bwa_refine_gapped_core() when (ext < 0 && is_end == 0). */ - if (is_rev) pos_f = pos_f + 1 < len? 0 : pos_f - len + 1; // mapped to the forward strand + if (is_rev) pos_f = pos_f + 1 < ref_len? 0 : pos_f - ref_len + 1; // position of the first base return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset } @@ -132,9 +135,11 @@ void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return; max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm; seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); - seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len, &strand); + //fprintf(stderr, "%d\n", seq->ref_shift); + seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len + seq->ref_shift, &strand); seq->strand = strand; seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); + if (seq->pos == (bwtint_t)-1) seq->type = BWA_TYPE_NO_MATCH; } void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) @@ -150,9 +155,9 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr); for (j = n_multi = 0; j < p->n_multi; ++j) { bwt_multi1_t *q = p->multi + j; - q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len, &strand); + q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + p->ref_shift, &strand); q->strand = strand; - if (q->pos != p->pos) + if (q->pos != p->pos && q->pos != (bwtint_t)-1) p->multi[n_multi++] = *q; } p->n_multi = n_multi; @@ -162,64 +167,25 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se #define SW_BW 50 -bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, bwtint_t *_pos, int *n_cigar, int is_rev) +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t rb, int *n_cigar) { bwa_cigar_t *cigar = 0; uint32_t *cigar32 = 0; ubyte_t *rseq; - int tle, qle, gtle, gscore; - int64_t k, rb, re, rlen; + int64_t k, re, rlen; int8_t mat[25]; bwa_fill_scmat(1, 3, mat); - if (!is_rev) { // forward strand, the end position is correct - re = *_pos + len; - if (re > l_pac) re = l_pac; - rb = re - (len + SW_BW); - if (rb < 0) rb = 0; - rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); - seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences - seq_reverse(rlen, rseq, 0); - ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, >le, &gscore, 0); - if (gscore > 0) tle = gtle, qle = len; - rb = re - tle; rlen = tle; - seq_reverse(len, seq, 0); - seq_reverse(rlen, rseq, 0); - if (rlen == 0) goto refine_gapped_err; - ksw_global(qle, &seq[len-qle], rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); - if (qle < len) { // write soft clip - cigar = realloc(cigar, (*n_cigar + 1) * 4); - memmove(cigar + 1, cigar, *n_cigar * 4); - cigar[0] = (len - qle)<<4 | FROM_S; - ++(*n_cigar); - } - } else { // reverse strand, the start position is correct - rb = *_pos; re = rb + len + SW_BW; - if (re > l_pac) re = l_pac; - rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); - ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, >le, &gscore, 0); - if (gscore > 0) tle = gtle, qle = len; - re = rb + tle; rlen = tle; - if (rlen == 0) goto refine_gapped_err; - ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension - if (qle < len) { - cigar = realloc(cigar, (*n_cigar + 1) * 4); - cigar[*n_cigar - 1] = (len - qle)<<4 | FROM_S; - ++(*n_cigar); - } - } - *_pos = rb; - + re = rb + len + ref_shift; + assert(re <= l_pac); + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + assert(re - rb == rlen); + ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); free(rseq); return cigar; - -refine_gapped_err: - free(rseq); - *n_cigar = 0; - return 0; } char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq, @@ -311,7 +277,7 @@ void bwa_correct_trimmed(bwa_seq_t *s) void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) { ubyte_t *pacseq; - int i, j; + int i, j, k; kstring_t *str; if (!_pacseq) { @@ -322,15 +288,18 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t for (i = 0; i != n_seqs; ++i) { bwa_seq_t *s = seqs + i; seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!! - for (j = 0; j < s->n_multi; ++j) { + for (j = k = 0; j < s->n_multi; ++j) { bwt_multi1_t *q = s->multi + j; int n_cigar; - if (q->gap == 0) continue; - q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, &n_cigar, q->strand); - q->n_cigar = n_cigar; + if (q->gap) { // gapped alignment + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, q->pos, &n_cigar); + q->n_cigar = n_cigar; + if (q->cigar) s->multi[k++] = *q; + } else s->multi[k++] = *q; } + s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, &s->n_cigar, s->strand); + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, s->pos, &s->n_cigar); if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; } // generate MD tag @@ -339,8 +308,7 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t bwa_seq_t *s = seqs + i; if (s->type != BWA_TYPE_NO_MATCH) { int nm; - s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, - bns->l_pac, pacseq, str, &nm); + s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm); s->nm = nm; } } @@ -487,7 +455,6 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in for (i = 0; i < p->n_multi; ++i) { bwt_multi1_t *q = p->multi + i; int k; - if (q->cigar == 0) continue; j = pos_end_multi(q, p->len) - q->pos; nn = bns_cnt_ambi(bns, q->pos, j, &seqid); err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+', @@ -538,6 +505,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f bntseq_t *bns; FILE *fp_sa; gap_opt_t opt; + char magic[4]; // initialization bwase_initialize(); @@ -546,6 +514,11 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f fp_sa = xopen(fn_sa, "r"); m_aln = 0; + err_fread_noeof(magic, 1, 4, fp_sa); + if (strncmp(magic, SAI_MAGIC, 4) != 0) { + fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__); + exit(1); + } err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa); bwa_print_sam_hdr(bns, rg_line); //bwa_print_sam_PG(); diff --git a/bwtaln.c b/bwtaln.c index e7727924..68d0274a 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -116,6 +116,7 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, for (j = 0; j < p->len; ++j) // we need to complement p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); + //fprintf(stderr, "mm=%lld,ins=%lld,del=%lld,gapo=%lld\n", p->aln->n_mm, p->aln->n_ins, p->aln->n_del, p->aln->n_gapo); // clean up the unused data in the record free(p->name); free(p->seq); free(p->rseq); free(p->qual); p->name = 0; p->seq = p->rseq = p->qual = 0; @@ -173,6 +174,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) } // core loop + err_fwrite(SAI_MAGIC, 1, 4, stdout); err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { tot_seqs += n_seqs; diff --git a/bwtaln.h b/bwtaln.h index 556f2593..4616ff5a 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -33,14 +33,15 @@ #define FROM_D 2 #define FROM_S 3 +#define SAI_MAGIC "SAI\1" + typedef struct { bwtint_t w; int bid; } bwt_width_t; typedef struct { - uint32_t n_mm:16, n_gapo:8, n_gape:8; - int score; + uint64_t n_mm:8, n_gapo:8, n_gape:8, score:20, n_ins:10, n_del:10; bwtint_t k, l; } bwt_aln1_t; @@ -57,6 +58,7 @@ typedef uint16_t bwa_cigar_t; typedef struct { uint32_t n_cigar:15, gap:8, mm:8, strand:1; + int ref_shift; bwtint_t pos; bwa_cigar_t *cigar; } bwt_multi1_t; @@ -77,6 +79,7 @@ typedef struct { // alignment information bwtint_t sa, pos; uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ + int ref_shift; int n_cigar; bwa_cigar_t *cigar; // for multi-threading only diff --git a/bwtgap.c b/bwtgap.c index 16d90255..08bc1f48 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -45,7 +45,7 @@ static void gap_reset_stack(gap_stack_t *stack) stack->n_entries = 0; } -static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, +static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int n_ins, int n_del, int state, int is_diff, const gap_opt_t *opt) { int score; @@ -59,7 +59,9 @@ static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, i } p = q->stack + q->n_entries; p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; - p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state; + p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; + p->n_ins = n_ins; p->n_del = n_del; + p->state = state; p->last_diff_pos = is_diff? i : 0; ++(q->n_entries); ++(stack->n_entries); @@ -106,7 +108,7 @@ static inline int int_log2(uint32_t v) bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width, bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) -{ +{ // $seq is the reverse complement of the input read int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; int best_cnt = 0; @@ -126,7 +128,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); gap_reset_stack(stack); // reset stack - gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, opt); + gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, 0, 0, opt); while (stack->n_entries) { gap_entry_t e; @@ -186,8 +188,10 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid } p = aln + n_aln; p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; + p->n_ins = e.n_ins; p->n_del = e.n_del; p->k = k; p->l = l; p->score = score; + //fprintf(stderr, "*** n_mm=%d,n_gapo=%d,n_gape=%d,n_ins=%d,n_del=%d\n", e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del); ++n_aln; } continue; @@ -214,24 +218,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid if (e.state == STATE_M) { // gap open if (e.n_gapo < opt->max_gapo) { // gap open is allowed // insertion - gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt); + gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins + 1, e.n_del, STATE_I, 1, opt); // deletion for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt); + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins, e.n_del + 1, STATE_D, 1, opt); } } } else if (e.state == STATE_I) { // extention of an insertion if (e.n_gape < opt->max_gape) // gap extention is allowed - gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt); + gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins + 1, e.n_del, STATE_I, 1, opt); } else if (e.state == STATE_D) { // extention of a deletion if (e.n_gape < opt->max_gape) { // gap extention is allowed if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt); + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins, e.n_del + 1, STATE_D, 1, opt); } } } @@ -244,13 +248,13 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid int is_mm = (j != 4 || seq[i] > 3); k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt); + if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, is_mm, opt); } } else if (seq[i] < 4) { // try exact match only int c = seq[i] & 3; k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt); + if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, 0, opt); } } diff --git a/bwtgap.h b/bwtgap.h index 83987628..7dd61659 100644 --- a/bwtgap.h +++ b/bwtgap.h @@ -7,8 +7,9 @@ typedef struct { // recursion stack u_int32_t info; // score<<21 | i u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; - bwtint_t k, l; // (k,l) is the SA region of [i,n-1] + u_int32_t n_ins:16, n_del:16; int last_diff_pos; + bwtint_t k, l; // (k,l) is the SA region of [i,n-1] } gap_entry_t; typedef struct { diff --git a/main.c b/main.c index 5e5a0242..d243144d 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r396-beta" +#define PACKAGE_VERSION "0.7.4-r397-beta" #endif int bwa_fa2pac(int argc, char *argv[]);