Skip to content

Commit

Permalink
Make BAM input compatible with --interleaved, -1/2 and -U
Browse files Browse the repository at this point in the history
  • Loading branch information
ch4rr0 committed Dec 11, 2024
1 parent 04188b8 commit b50baa2
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 39 deletions.
2 changes: 1 addition & 1 deletion bt2_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1714,7 +1714,7 @@ static void parseOptions(int argc, const char **argv) {
<< "sequences must be specified with -1 and -2." << endl;
throw 1;
}
if(interleaved && (format != FASTA && format != FASTQ)) {
if(interleaved && (format != FASTA && format != FASTQ && format != BAM)) {
cerr << "Error: --interleaved only works in combination with FASTA (-f) and FASTQ (-q) formats." << endl;
throw 1;
}
Expand Down
61 changes: 40 additions & 21 deletions pat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ static uint32_t genRandSeed(
size_t qlen = qry.length();
// Throw all the characters of the read into the random seed
for(size_t i = 0; i < qlen; i++) {
int p = (int)qry[i];
int p = (int)qry[i];
assert_leq(p, 4);
size_t off = ((i & 15) << 1);
rseed ^= ((uint32_t)p << off);
Expand Down Expand Up @@ -302,20 +302,28 @@ pair<bool, int> DualPatternComposer::nextBatch(PerThreadReadBuf& pt) {
* the read and quality input, create a list of pattern sources to
* dispense them.
*/
PatternComposer* PatternComposer::setupPatternComposer(
const EList<string>& si, // singles, from argv
const EList<string>& m1, // mate1's, from -1 arg
const EList<string>& m2, // mate2's, from -2 arg
const EList<string>& m12, // both mates on each line, from --12 arg
const EList<string>& q, // qualities associated with singles
const EList<string>& q1, // qualities associated with m1
const EList<string>& q2, // qualities associated with m2
PatternComposer *PatternComposer::setupPatternComposer(
const EList<string> &si, // singles, from argv
const EList<string> &m1, // mate1's, from -1 arg
const EList<string> &m2, // mate2's, from -2 arg
const EList<string> &m12, // both mates on each line, from --12 arg
const EList<string> &q, // qualities associated with singles
const EList<string> &q1, // qualities associated with m1
const EList<string> &q2, // qualities associated with m2
#ifdef USE_SRA
const EList<string>& sra_accs, // SRA accessions
const EList<string> &sra_accs, // SRA accessions
#endif
PatternParams& p, // read-in parameters
bool verbose) // be talkative?
PatternParams &p, // read-in parameters
bool verbose) // be talkative?
{
// We save the value of PatternParams::align_paired_reads
// so that `--align-paired-reads` works as expected.
bool align_paired_reads = p.align_paired_reads;
// This serves as an indicator to the BAM pattern source
// that we intend on parsing paired reads from separate files
p.align_paired_reads = true;


EList<PatternSource*>* a = new EList<PatternSource*>();
EList<PatternSource*>* b = new EList<PatternSource*>();
// Create list of pattern sources for paired reads appearing
Expand Down Expand Up @@ -354,8 +362,7 @@ PatternComposer* PatternComposer::setupPatternComposer(
}
}
#endif

// Create list of pattern sources for paired reads
// Create list of pattern sources for paired reads
for(size_t i = 0; i < m1.size(); i++) {
const EList<string>* qs = &m1;
EList<string> tmpSeq;
Expand Down Expand Up @@ -391,7 +398,11 @@ PatternComposer* PatternComposer::setupPatternComposer(
// All mates/mate files must be paired
assert_eq(a->size(), b->size());

// Create list of pattern sources for the unpaired reads
// Create list of pattern sources for the unpaired reads
p.align_paired_reads = align_paired_reads;
if (p.align_paired_reads) {
p.interleaved = true;
}
for(size_t i = 0; i < si.size(); i++) {
const EList<string>* qs = &si;
PatternSource* patsrc = NULL;
Expand Down Expand Up @@ -1370,7 +1381,8 @@ std::pair<bool, int> BAMPatternSource::get_alignments(PerThreadReadBuf& pt, bool
i = i + sizeof(l_name) + l_name + sizeof(uint32_t);
}
first_ = false;
}
}

while (readi < pt.max_buf_) {
if (i >= alignment_batch.size()) {
return make_pair(false, readi);
Expand All @@ -1397,19 +1409,26 @@ std::pair<bool, int> BAMPatternSource::get_alignments(PerThreadReadBuf& pt, bool
i += sizeof(block_size);
memcpy(&flag, &alignment_batch[0] + i + offset[BAMField::flag], sizeof(flag));
if (currentlyBigEndian())
flag = endianSwapU16(flag);
EList<Read>& readbuf = (pp_.align_paired_reads && (flag & 0x80)) != 0 ? pt.bufb_ : pt.bufa_;
flag = endianSwapU16(flag);

if (interleaved_) {
batch_a = (flag & 0x80) != 0 ? false : true;
}

EList<Read> &readbuf = batch_a ? pt.bufa_ : pt.bufb_;

if ((flag & 0x4) == 0) {
readbuf[readi].readOrigBuf.clear();
i += block_size;
continue;
}
if (!pp_.align_paired_reads && ((flag & 0x40) != 0 || (flag & 0x80) != 0)) {
if (!align_paired_reads_ && (flag & 0x1) != 0) {
readbuf[readi].readOrigBuf.clear();
i += block_size;
continue;
}
if (pp_.align_paired_reads && ((flag & 0x40) == 0 && (flag & 0x80) == 0)) {
if (align_paired_reads_ &&
((batch_a && ((flag & 0x40) == 0)) || (!batch_a && (flag & 0x80) == 0))) {
readbuf[readi].readOrigBuf.clear();
i += block_size;
continue;
Expand All @@ -1418,7 +1437,7 @@ std::pair<bool, int> BAMPatternSource::get_alignments(PerThreadReadBuf& pt, bool
readbuf[readi].readOrigBuf.resize(block_size);
memcpy(readbuf[readi].readOrigBuf.wbuf(), &alignment_batch[0] + i, block_size);
i += block_size;
readi += (pp_.align_paired_reads &&
readi += (interleaved_ &&
pt.bufb_[readi].readOrigBuf.length() == 0) ? 0 : 1;
}

Expand Down
30 changes: 13 additions & 17 deletions pat.h
Original file line number Diff line number Diff line change
Expand Up @@ -851,22 +851,16 @@ class BAMPatternSource : public CFilePatternSource {
};

public:

BAMPatternSource(
const EList<std::string>& infiles,
const PatternParams& p) :
CFilePatternSource(infiles, p),
first_(true),
alignment_batch(0),
alignment_offset(0),
delta_(0),
pp_(p)
{
stream.zalloc = Z_NULL;
stream.zfree = Z_NULL;
stream.opaque = Z_NULL;
alignment_batch.reserve(1 << 16);
}
BAMPatternSource(const EList<std::string> &infiles, const PatternParams &p)
: CFilePatternSource(infiles, p), first_(true), alignment_batch(0),
alignment_offset(0), delta_(0), pp_(p), interleaved_(pp_.interleaved),
align_paired_reads_(pp_.align_paired_reads)
{
stream.zalloc = Z_NULL;
stream.zfree = Z_NULL;
stream.opaque = Z_NULL;
alignment_batch.reserve(1 << 16);
}

virtual void reset() {
first_ = true;
Expand Down Expand Up @@ -913,7 +907,9 @@ class BAMPatternSource : public CFilePatternSource {
size_t delta_;
z_stream stream;

PatternParams pp_;
PatternParams pp_;
bool interleaved_;
bool align_paired_reads_;
};

/**
Expand Down

0 comments on commit b50baa2

Please sign in to comment.