Skip to content

Commit

Permalink
wip: branch ready for testing now
Browse files Browse the repository at this point in the history
  • Loading branch information
gpertea committed May 4, 2015
1 parent 102e430 commit 719fbfb
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 98 deletions.
2 changes: 1 addition & 1 deletion rlink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ bool maxCovReached(int currentstart, GBamRecord& brec, BundleData& bdata) {
return true;
}

void countRead(BundleData& bdata, GBamRecord& brec, int hi) {
void countFragment(BundleData& bdata, GBamRecord& brec, int hi) {
static uint32_t BAM_R2SINGLE = BAM_FREAD2 | BAM_FMUNMAP ;
if (hi==0) {
for (int i=0;i<brec.exons.Count();i++) {
Expand Down
6 changes: 3 additions & 3 deletions rlink.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,9 +346,9 @@ struct BundleData {
}
Not needed here, we update the coverage span as each transcript is added
*/
void rc_store_t(GffObj* scaff);
void keepGuide(GffObj* scaff);

bool rc_count_hit(GBamRecord& brec, char strand, int nh); //, int hi);
bool evalReadAln(GBamRecord& brec, char& strand, int nh); //, int hi);

void Clear() {
keepguides.Clear();
Expand Down Expand Up @@ -376,7 +376,7 @@ struct BundleData {
int processRead(int currentstart, int currentend, BundleData& bdata,
GHash<int>& hashread, GBamRecord& brec, char strand, int nh, int hi);

void countRead(BundleData& bdata, GBamRecord& brec, int hi);
void countFragment(BundleData& bdata, GBamRecord& brec, int hi);

int printResults(BundleData* bundleData, int ngenes, int geneno, GStr& refname);

Expand Down
35 changes: 22 additions & 13 deletions stringtie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -356,8 +356,6 @@ if (ballgown)
alncounts[gseq_id]++;
prev_pos=pos;
xstrand=brec->spliceStrand();
if (xstrand=='+') strand=1;
else if (xstrand=='-') strand=-1;
nh=brec->tag_int("NH");
if (nh==0) nh=1;
hi=brec->tag_int("HI");
Expand All @@ -372,13 +370,16 @@ if (ballgown)
if (new_bundle || chr_changed) {
hashread.Clear();
if (bundle->readlist.Count()>0) { // process reads in previous bundle
if (guides && ng_end>=ng_start) {
//TODO: check that the same guides are now kept
// using bundle->keepGuide()
/*if (guides && ng_end>=ng_start) {
for (int gi=ng_start;gi<=ng_end;gi++) {
int tidx=bundle->keepguides.Add((*guides)[gi]);
(*guides)[gi]->udata=tidx; //tid when not ballgown
}
}
// geneno=infer_transcripts(geneno, lastref, $label,\@readlist,$readthr,\@junction,$junctionthr,$mintranscriptlen,\@keepguides);
}*/
// geneno=infer_transcripts(geneno, lastref, $label,\@readlist,$readthr,
// \@junction,$junctionthr,$mintranscriptlen,\@keepguides);
// (readthr, junctionthr, mintranscriptlen are globals)
bundle->getReady(currentstart, currentend);
#ifndef NOTHREADS
Expand Down Expand Up @@ -476,7 +477,10 @@ if (ballgown)
currentend=brec->end;
if (guides) { //guided and guides!=NULL
ng_start=ng_end+1;
while (ng_start<ng && (int)(*guides)[ng_start]->end < pos) { ng_start++; } // skip guides that have no read coverage
while (ng_start<ng && (int)(*guides)[ng_start]->end < pos) {
// skip guides that have no read coverage
ng_start++;
}
//if(ng_start<ng && (int)(*guides)[ng_start]->start<pos) {
int ng_ovlstart=ng_start;
//add all guides overlapping the current read
Expand All @@ -485,7 +489,8 @@ if (ballgown)
currentstart=(*guides)[ng_ovlstart]->start;
if (currentend<(int)(*guides)[ng_ovlstart]->end)
currentend=(*guides)[ng_ovlstart]->end;
if (ballgown) bundle->rc_store_t((*guides)[ng_ovlstart]);
//if (ballgown)
bundle->keepGuide((*guides)[ng_ovlstart]);
ng_ovlstart++;
}
if (ng_ovlstart>ng_start) ng_end=ng_ovlstart-1;
Expand All @@ -506,7 +511,8 @@ if (ballgown)
while (ng_end+1<ng && (int)(*guides)[ng_end+1]->start<=currentend) {
ng_end++;
//more transcripts overlapping this bundle
if (ballgown) bundle->rc_store_t((*guides)[ng_end]);
//if (ballgown)
bundle->keepGuide((*guides)[ng_end]);
if(currentend<(int)(*guides)[ng_end]->end) {
currentend=(*guides)[ng_end]->end;
cend_changed=true;
Expand All @@ -516,11 +522,14 @@ if (ballgown)
}
} //adjusted currentend and checked for overlapping reference transcripts
bool ref_overlap=false;
if (ballgown && bundle->rc_data) ref_overlap=bundle->rc_count_hit(*brec, xstrand, nh);
countRead(*bundle, *brec, hi);
if (!ballgown || ref_overlap) {
processRead(currentstart, currentend, *bundle, hashread, *brec, strand, nh, hi);
}
//if (ballgown && bundle->rc_data)
ref_overlap=bundle->evalReadAln(*brec, xstrand, nh);
if (xstrand=='+') strand=1;
else if (xstrand=='-') strand=-1;
countFragment(*bundle, *brec, hi);
//if (!ballgown || ref_overlap)
processRead(currentstart, currentend, *bundle, hashread, *brec, strand, nh, hi);

//update current end to be at least as big as the start of the read pair in the fragment?? -> maybe not because then I could introduce some false positives with paired reads mapped badly

/*
Expand Down
182 changes: 110 additions & 72 deletions tablemaker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ void rc_update_tdata(BundleData& bundle, GffObj& scaff,
(*tdata).fpkm=fpkm;
}
*/
void BundleData::rc_store_t(GffObj* t) {
//if (!rc_stage) return;
void BundleData::keepGuide(GffObj* t) {
if (rc_data==NULL) {
rc_init(t);
}
t->udata=keepguides.Add(t);
rc_data->addTranscript(*t);
}

Expand All @@ -47,82 +47,120 @@ void rc_updateExonCounts(const RC_Feature* exon, int nh) {
if (nh<=1) exon->ucount++;
}

bool BundleData::rc_count_hit(GBamRecord& brec, char strand, int nh) { //, int hi) {
if (rc_data==NULL) return false; //no ref transcripts available for this reads' region
if (rc_data->tdata.Count()==0) return false; //nothing to do without transcripts

//check this read alignment against ref exons and introns
/*
int gstart=brec.start; //alignment start position on the genome
We NEVER update read-counting boundaries of the bundle based on reads - they should only be based on reference transcripts
if (rc_data->f_cov.size()==0 && gstart<rc_data->lmin) {
fprintf(stderr, "Warning: adjusting lmin coverage bundle from %d to %d !\n", int(rc_data->lmin), (int)gstart);
rc_data->lmin=gstart;
bool BundleData::evalReadAln(GBamRecord& brec, char& strand, int nh) { //, int hi) {
if (rc_data==NULL) {
return false; //no ref transcripts available for this reads' region
}
*/
if ((int)brec.end<rc_data->lmin || (int)brec.start>rc_data->rmax) {
return false; //hit outside coverage area
}
/*
int gpos=brec.start; //current genomic position
int rlen=0; //read length, obtained here from the cigar string
int segstart=gstart;
*/
vector<RC_Seg> rsegs;
vector<RC_Seg> rintrons;
for (int i=0;i<brec.exons.Count();i++) {
rc_data->updateCov(strand, nh, brec.exons[i].start, brec.exons[i].len());
rsegs.push_back(RC_Seg(brec.exons[i].start, brec.exons[i].end) );
if (i>0) {
//add intron
rintrons.push_back(RC_Seg(brec.exons[i-1].end+1, brec.exons[i].start-1));
if (rc_data->exons.Count()==0) return false;

if (ballgown) {
if (rc_data->tdata.Count()==0) return false; //nothing to do without transcripts
//check this read alignment against ref exons and introns
/*
int gstart=brec.start; //alignment start position on the genome
We NEVER update read-counting boundaries of the bundle based on reads - they should only be based on reference transcripts
if (rc_data->f_cov.size()==0 && gstart<rc_data->lmin) {
fprintf(stderr, "Warning: adjusting lmin coverage bundle from %d to %d !\n", int(rc_data->lmin), (int)gstart);
rc_data->lmin=gstart;
}
}
//now check rexons and rintrons with findExons() and findIntron()
for (size_t i=0;i<rintrons.size();++i) {
/*RC_FeatIt ri=rc_data->findIntron(rintrons[i].l, rintrons[i].r, strand);
if (ri!=rc_data->introns.end()) {
(*ri).rcount++;
(*ri).mrcount += (nh > 1) ? (1.0/nh) : 1;
if (nh==1) (*ri).ucount++;
}
*/
RC_Feature* ri=rc_data->findIntron(rintrons[i].l, rintrons[i].r, strand);
if (ri) {
ri->rcount++;
ri->mrcount += (nh > 1) ? (1.0/nh) : 1;
if (nh==1) ri->ucount++;
}
} //for each intron
*/
/*
int gpos=brec.start; //current genomic position
int rlen=0; //read length, obtained here from the cigar string
int segstart=gstart;
*/
vector<RC_Seg> rsegs;
vector<RC_Seg> rintrons;
for (int i=0;i<brec.exons.Count();i++) {
rc_data->updateCov(strand, nh, brec.exons[i].start, brec.exons[i].len());
rsegs.push_back(RC_Seg(brec.exons[i].start, brec.exons[i].end) );
if (i>0) {
//add intron
rintrons.push_back(RC_Seg(brec.exons[i-1].end+1, brec.exons[i].start-1));
}
}
//now check rexons and rintrons with findExons() and findIntron()
for (size_t i=0;i<rintrons.size();++i) {
/*RC_FeatIt ri=rc_data->findIntron(rintrons[i].l, rintrons[i].r, strand);
if (ri!=rc_data->introns.end()) {
(*ri).rcount++;
(*ri).mrcount += (nh > 1) ? (1.0/nh) : 1;
if (nh==1) (*ri).ucount++;
}
*/
RC_Feature* ri=rc_data->findIntron(rintrons[i].l, rintrons[i].r, strand);
if (ri) {
ri->rcount++;
ri->mrcount += (nh > 1) ? (1.0/nh) : 1;
if (nh==1) ri->ucount++;
}
} //for each intron

for (size_t i=0;i<rsegs.size();++i) {
RC_FeatPtrSet ovlex=rc_data->findExons(rsegs[i].l, rsegs[i].r, strand);
if (ovlex.size()==0) continue;
if (ovlex.size()>1) {
vector< pair<int, const RC_Feature*> > xovl; //overlapped exons sorted by decreasing overlap length
for (RC_FeatPtrSet::iterator ox=ovlex.begin();ox != ovlex.end(); ++ox) {
int ovlen=(*ox)->ovlen(rsegs[i].l, rsegs[i].r);
if (ovlen>=5)
xovl.push_back(pair<int, const RC_Feature*>(ovlen, *ox));
}
if (xovl.size()>1) {
sort(xovl.begin(), xovl.end(), OvlSorter); //larger overlaps first
//update the counts only for ref exons with max overlap to this segment
int max_ovl=xovl.begin()->first;
for (vector<pair<int, const RC_Feature*> >::iterator xo=xovl.begin();xo!=xovl.end();++xo) {
if (max_ovl - xo->first > 5 ) break; //more than +5 bases coverage for the other exons
rc_updateExonCounts(xo->second, nh);
}
} else if (xovl.size() == 1) {
rc_updateExonCounts(xovl.begin()->second, nh);
}
} else {
// 1 exon overlap only
int ovlen=(*ovlex.begin())->ovlen(rsegs[i].l, rsegs[i].r);
if (ovlen>=5) rc_updateExonCounts(*ovlex.begin(), nh);
}
} //for each read "exon"
char strandbits=0;
for (size_t i=0;i<rsegs.size();++i) {
RC_FeatPtrSet ovlex=rc_data->findExons(rsegs[i].l, rsegs[i].r, strand);
if (ovlex.size()==0) continue;
if (ovlex.size()>1) {
vector< pair<int, const RC_Feature*> > xovl; //overlapped exons sorted by decreasing overlap length
for (RC_FeatPtrSet::iterator ox=ovlex.begin();ox != ovlex.end(); ++ox) {
int ovlen=(*ox)->ovlen(rsegs[i].l, rsegs[i].r);
if (ovlen>=5)
xovl.push_back(pair<int, const RC_Feature*>(ovlen, *ox));
}
if (xovl.size()>1) {
sort(xovl.begin(), xovl.end(), OvlSorter); //larger overlaps first
//update the counts only for ref exons with max overlap to this segment
int max_ovl=xovl.begin()->first;
for (vector<pair<int, const RC_Feature*> >::iterator xo=xovl.begin();xo!=xovl.end();++xo) {
if (xo->second->strand=='+') strandbits |= 0x01;
else if (xo->second->strand=='-') strandbits |= 0x02;
if (max_ovl - xo->first > 5 ) break; //more than +5 bases coverage for the other exons
rc_updateExonCounts(xo->second, nh);
}
} else if (xovl.size() == 1) {
rc_updateExonCounts(xovl.begin()->second, nh);
}
} else {
// 1 exon overlap only
int ovlen=(*ovlex.begin())->ovlen(rsegs[i].l, rsegs[i].r);
if (ovlen>=5) {
if ((*ovlex.begin())->strand=='+') strandbits |= 0x01;
else if ((*ovlex.begin())->strand=='-') strandbits |= 0x02;
rc_updateExonCounts(*ovlex.begin(), nh);
}
}
} //for each read alignment "exon"
if (strand=='.' && strandbits && strandbits<3) {
strand = (strandbits==1) ? '+' : '-';
}
}
else {
//don't keep track of ballgown coverage data, just check for exon overlaps
//TODO check and assign strand here if needed
if (strand=='.') {
vector< pair<int, const RC_Feature*> > xovl; //overlapped exons sorted by ovl len
for (int i=0;i<brec.exons.Count();i++) {
RC_FeatPtrSet ovlex=rc_data->findExons(brec.exons[i].start, brec.exons[i].end, strand);
for (RC_FeatPtrSet::iterator ox=ovlex.begin();ox != ovlex.end(); ++ox) {
int ovlen=(*ox)->ovlen(brec.exons[i].start, brec.exons[i].end);
if (ovlen>=5)
xovl.push_back(pair<int, const RC_Feature*>(ovlen, *ox));
}
} // for each read alignment "exon"
char strandbits=0;
for (vector<pair<int, const RC_Feature*> >::iterator xo=xovl.begin();xo!=xovl.end();++xo) {
if (xo->second->strand=='+') strandbits |= 0x01;
else if (xo->second->strand=='-') strandbits |= 0x02;
}
if (strandbits && strandbits<3) {
strand = (strandbits==1) ? '+' : '-';
}
}
}
return true;
}

Expand Down
31 changes: 22 additions & 9 deletions tablemaker.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ using namespace std;

#define RC_MIN_EOVL 5

extern bool ballgown;

void Ballgown_setupFiles(FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata,
FILE* &f_e2t, FILE* &f_i2t);
Expand Down Expand Up @@ -231,7 +232,7 @@ struct RC_BundleData {
exons(true, false, true), introns(true, false, true),
xcache(0), xcache_pos(0)
{
if (rmax>lmin) updateCovSpan();
if (rmax>lmin && ballgown) updateCovSpan();
}

~RC_BundleData() {
Expand All @@ -241,16 +242,29 @@ struct RC_BundleData {
r_mcov.clear();
}

//for non-ballgown tracking of guide features
void addTranscriptFeature(uint fstart, uint fend, char fstrand, GList<RC_Feature>& flist, uint tidx) {
RC_Feature* feat=new RC_Feature(fstart, fend, fstrand, 0, tidx);
int fidx=-1;
RC_Feature* p=flist.AddIfNew(feat, true, &fidx);
if (p==feat) {//exon not seen before
p->id=fidx;
}
else {//exon seen before, update t_id list
p->t_ids.Add(tidx);
}
}

void addTranscript(GffObj& t) {
//if (!ps.rc_id_data()) return;
//RC_ScaffIds& sdata = *(ps.rc_id_data());
if (t.uptr) { //ballgown case
bool boundary_changed=false;
if (lmin==0 || lmin>(int)t.start) { lmin=t.start; boundary_changed=true; }
if (rmax==0 || rmax<(int)t.end) { rmax=t.end; boundary_changed=true; }
if (t.uptr) { //ballgown case
RC_ScaffData& sdata=*(RC_ScaffData*)(t.uptr);
//tdata.insert(sdata);
tdata.Add(&sdata);
bool boundary_changed=false;
if (lmin==0 || lmin>(int)t.start) { lmin=t.start; boundary_changed=true; }
if (rmax==0 || rmax<(int)t.end) { rmax=t.end; boundary_changed=true; }
if (boundary_changed) updateCovSpan();
//for (vector<RC_ScaffSeg>::iterator it=sdata.exons.begin();it!=sdata.exons.end();++it) {
for (int i=0;i<sdata.t_exons.Count();i++) {
Expand All @@ -263,13 +277,12 @@ struct RC_BundleData {
}
else {
//non-ballgown, regular storage of bundle guides data
//TODO
//use GffObj::udata as tid, it's the index in bundles::keepguides
for (int i=0;i<t.exons.Count();++i) {
RC_Feature fexon(t.exons[i]->start, t.exons[i]->end, t.strand);
addTranscriptFeature(t.exons[i]->start, t.exons[i]->end, t.strand, exons, (uint)t.udata);
if (i>0) {
//add intron
RC_Feature fintron(t.exons[i-1]->end+1, t.exons[i]->start-1, t.strand);
//add intron
addTranscriptFeature(t.exons[i-1]->end+1, t.exons[i]->start-1, t.strand, introns, (uint)t.udata);
}
}
} //no ballgown coverage
Expand Down

0 comments on commit 719fbfb

Please sign in to comment.