From 111743d1d1377dd464bda8b5ce7b01544bb2b18f Mon Sep 17 00:00:00 2001 From: Geo Pertea Date: Wed, 13 May 2015 19:57:26 -0400 Subject: [PATCH] more refactoring/improving of read counting --- gclib/GBase.cpp | 5 +- gclib/GList.hh | 7 +- gclib/GVec.hh | 6 +- rlink.cpp | 47 ++++++------ rlink.h | 22 +++++- stringtie.cpp | 28 +++---- tablemaker.cpp | 189 +++++++++++++++++++++++++++++++++++++----------- tablemaker.h | 180 ++++++++++++++++++++++----------------------- 8 files changed, 308 insertions(+), 176 deletions(-) diff --git a/gclib/GBase.cpp b/gclib/GBase.cpp index f375aa4..4671e42 100644 --- a/gclib/GBase.cpp +++ b/gclib/GBase.cpp @@ -222,10 +222,13 @@ int Gmkdir(const char *path, bool recursive, int perms) { *psep=0; //now gpath is the path up to this / ss=psep; ++ss; //ss repositioned just after the / // create current level - if (fileExists(gpath)!=1 && G_mkdir(gpath, perms)!=0) + if (fileExists(gpath)!=1 && G_mkdir(gpath, perms)!=0) { + GFREE(gpath); return -1; + } *psep='/'; } + GFREE(gpath); return 0; } diff --git a/gclib/GList.hh b/gclib/GList.hh index 13e0729..886b930 100644 --- a/gclib/GList.hh +++ b/gclib/GList.hh @@ -86,7 +86,8 @@ template class GList:public GPVec { else return 0; } public: - void sortInsert(int idx, OBJ* item); + void sortInsert(int idx, OBJ* item); //special insert in sorted lists + //WARNING: the caller must know the insert index such that the sort order is preserved! GList(GCompareProc* compareProc=NULL); //free by default GList(GCompareProc* compareProc, //unsorted by default GFreeProc *freeProc, @@ -110,7 +111,7 @@ template class GList:public GPVec { } else fCompareProc=NULL; } - int Add(OBJ* item); //-- specific implementation if sorted + int Add(OBJ* item); //-- specific implementation if sorted - may become an Insert() void Add(GList& list); //add all pointers from another list OBJ* AddIfNew(OBJ* item, bool deleteIfFound=true, int* fidx=NULL); @@ -588,7 +589,7 @@ template bool GList::Found(OBJ* item, int& idx) { template void GList::sortInsert(int idx, OBJ* item) { //idx must be the new position this new item must have //so the allowed range is [0..fCount] - //the old idx item all the above will be shifted to idx+1 + //the current fList[idx] and all the above will be shifted +1 if (idx<0 || idx>this->fCount) GError(GVEC_INDEX_ERR, idx); if (this->fCount==this->fCapacity) { GPVec::Grow(idx, item); diff --git a/gclib/GVec.hh b/gclib/GVec.hh index 510cdbc..7b9800b 100644 --- a/gclib/GVec.hh +++ b/gclib/GVec.hh @@ -75,7 +75,7 @@ template class GVec { GVec(int init_capacity=2); GVec(int init_count, const OBJ init_val); GVec(int init_count, OBJ* init_val, bool delete_initval=true); //convenience constructor for complex vectors - GVec(GVec& array); //copy constructor + GVec(const GVec& array); //copy constructor const GVec& operator=(GVec& array); //copy operator virtual ~GVec(); void Insert(int idx, OBJ item) { Insert(idx, &item); } @@ -234,7 +234,7 @@ template GVec::GVec(int init_count, OBJ* init_val, bool delete_ } -template GVec::GVec(GVec& array) { //copy constructor +template GVec::GVec(const GVec& array) { //copy constructor this->fCount=array.fCount; this->fCapacity=array.fCapacity; this->fArray=NULL; @@ -246,7 +246,7 @@ template GVec::GVec(GVec& array) { //copy constructor else { fArray=new OBJ[this->fCapacity]; //]() // uses OBJ operator= - for (int i=0;ifCount;i++) fArray[i]=array[i]; + for (int i=0;ifCount;i++) fArray[i]=array.fArray[i]; } } this->fCount=array.fCount; diff --git a/rlink.cpp b/rlink.cpp index 1c106a2..f1dc459 100644 --- a/rlink.cpp +++ b/rlink.cpp @@ -110,11 +110,15 @@ void countFragment(BundleData& bdata, GBamRecord& brec, int hi) { } int processRead(int currentstart, int currentend, BundleData& bdata, - GHash& hashread, GBamRecord& brec, char strand, int nh, int hi) { + GHash& hashread, GReadAlnData& alndata) { + //GBamRecord& brec, char strand, int nh, int hi) { GList& readlist = bdata.readlist; GList& junction = bdata.junction; GVec& bpcov = bdata.bpcov; - + GBamRecord& brec=*(alndata.brec); + char strand=alndata.strand; + int nh=alndata.nh; + int hi=alndata.hi; int readstart=brec.start; CReadAln* readaln=NULL; bool covSaturated=false; @@ -143,8 +147,7 @@ int processRead(int currentstart, int currentend, BundleData& bdata, GPVec newjunction(false); for (int i=0;imaxleftsupport) { @@ -163,6 +166,8 @@ int processRead(int currentstart, int currentend, BundleData& bdata, int maxrightsupport = support > rightsupport ? support : rightsupport; nj=add_junction(seg.start, seg.end, maxleftsupport, maxrightsupport, junction, strand, nh); newjunction.Add(nj); + if (alndata.juncs.Count()) + nj->guide_match=alndata.juncs[i-1]->guide_match; } cov_add(bpcov, brec.exons[i].start-currentstart, brec.exons[i].end-currentstart, float(1)/nh); @@ -8088,7 +8093,7 @@ void clean_junctions(GList& junction) { */ void clean_junctions(GList& junction, int refstart, GVec& bpcov,GPVec& guides) { - + /* GArray guideintrons(true,true); if(guides.Count()) { // guides are not NULL for(int g=0;g& junction, int refstart, GVec& bpco } } } - +*/ //fprintf(stderr,"Clean junctions:\n"); for(int i=0;ilongintron && (!guides.Count() || guideintrons.IndexOf(jd)==-1)) { // very long intron -> hard to trust unless it's well covered + else //if((int)(jd.end-jd.start)>longintron && (!guides.Count() || guideintrons.IndexOf(jd)==-1)) { + // very long intron -> hard to trust unless it's well covered + if ((int)(jd.end-jd.start)>longintron && (!guides.Count() || !jd.guide_match)) { int leftreach = jd.start-longintronanchor-refstart; if(leftreach<0) leftreach=0; int rightreach = jd.end+longintronanchor-refstart; if(rightreach>=bpcov.Count()) rightreach=bpcov.Count()-1; //fprintf(stderr,"cov[%d]=%f cov[%d]=%f cov[%d][=%f cov[%d]=%f\n",leftreach+refstart,bpcov[leftreach],jd.start,bpcov[jd.start-refstart-1],rightreach+refstart,bpcov[rightreach],jd.end,bpcov[jd.end-refstart]); - if((bpcov[leftreach]<1 && bpcov[jd.start-refstart-1]<2) || (bpcov[rightreach]<1 && bpcov[jd.end-refstart]<2)) { + if((bpcov[leftreach]<1 && bpcov[jd.start-refstart-1]<2) || + (bpcov[rightreach]<1 && bpcov[jd.end-refstart]<2)) { jd.strand=0; } } - - /* - { // DEBUG ONLY - fprintf(stderr,"Junction %d: %d %d %d %g %g\n",i,jd.start,jd.end,jd.strand,jd.nreads,jd.nreads_good); - } - */ - } + //{ // DEBUG ONLY + // fprintf(stderr,"Junction %d: %d %d %d %g %g\n",i,jd.start,jd.end,jd.strand,jd.nreads,jd.nreads_good); + //} + } //for each read junction } //debug funcs @@ -8579,7 +8584,7 @@ int print_signcluster(char strand,GList& pred,GVec& genes,GVec uint t_id=0; if (pred[n]->t_eq && pred[n]->t_eq->uptr) { - t_id = ((RC_ScaffData*)pred[n]->t_eq->uptr)->t_id; + t_id = ((RC_TData*)pred[n]->t_eq->uptr)->t_id; } fprintf(f_out,"%d %d %d %.6f %.6f\n",pred[n]->exons.Count()+1,pred[n]->tlen, t_id, pred[n]->frag,pred[n]->cov); fprintf(f_out,"%s\tStringTie\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s.%d\"; transcript_id \"%s.%d.%d\"; ", @@ -8854,7 +8859,7 @@ int print_cluster(GPVec& pred,GVec& genes,GVec& transcrip transcripts[pred[n]->geneno]++; uint t_id=0; if (pred[n]->t_eq && pred[n]->t_eq->uptr) { - t_id = ((RC_ScaffData*)pred[n]->t_eq->uptr)->t_id; + t_id = ((RC_TData*)pred[n]->t_eq->uptr)->t_id; } fprintf(f_out,"%d %d %d %.6f %.6f\n",pred[n]->exons.Count()+1,pred[n]->tlen, t_id, pred[n]->frag,pred[n]->cov); fprintf(f_out,"%s\tStringTie\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s.%d\"; transcript_id \"%s.%d.%d\"; ", @@ -8982,7 +8987,7 @@ int print_cluster_inclusion(GPVec& pred,GVec& genes,GVec& transcripts[pred[n]->geneno]++; uint t_id=0; if (pred[n]->t_eq && pred[n]->t_eq->uptr) { - t_id = ((RC_ScaffData*)pred[n]->t_eq->uptr)->t_id; + t_id = ((RC_TData*)pred[n]->t_eq->uptr)->t_id; } fprintf(f_out,"%d %d %d %.6f %.6f\n",pred[n]->exons.Count()+1,pred[n]->tlen, t_id, pred[n]->frag,pred[n]->cov); fprintf(f_out,"%s\tStringTie\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s.%d\"; transcript_id \"%s.%d.%d\"; ", @@ -9077,7 +9082,7 @@ int print_transcript_signcluster(char strand,GList& pred,GVec& if(pred[n]->flag) { uint t_id=0; if (pred[n]->t_eq && pred[n]->t_eq->uptr) { - t_id = ((RC_ScaffData*)pred[n]->t_eq->uptr)->t_id; + t_id = ((RC_TData*)pred[n]->t_eq->uptr)->t_id; } fprintf(f_out,"%d %d %d %.6f %.6f\n",pred[n]->exons.Count()+1,pred[n]->tlen, t_id, pred[n]->frag,pred[n]->cov); fprintf(f_out,"%s\tStringTie\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s.%d\"; transcript_id \"%s.%d.%d\"; ", diff --git a/rlink.h b/rlink.h index a08282c..2f66a7c 100644 --- a/rlink.h +++ b/rlink.h @@ -252,11 +252,12 @@ struct CGraphnode:public GSeg { // # 0: strand; 1: start; 2: end; 3: nreads; 4: nreads_good; struct CJunction:public GSeg { - char strand; //-1,0,1 (unsigned byte) + char strand; //-1,0,1 + char guide_match; //exact match of a ref intron? double nreads; double nreads_good; CJunction(int s=0,int e=0, char _strand=0):GSeg(s,e), - strand(_strand), nreads(0), nreads_good(0) {} + strand(_strand), guide_match(0), nreads(0), nreads_good(0) {} bool operator<(CJunction& b) { if (startb.start) return false; @@ -270,6 +271,17 @@ struct CJunction:public GSeg { } }; +struct GReadAlnData { + GBamRecord* brec; + char strand; //-1, 0, 1 + int nh; + int hi; + GPVec juncs; + GPVec< GVec > g_exonovls; //>5bp overlaps with guide exons, for each read "exon" + GReadAlnData(GBamRecord* bamrec=NULL, char nstrand=0, int num_hits=0, int hit_idx=0):brec(bamrec), + strand(nstrand), nh(num_hits), hi(hit_idx), juncs(true), g_exonovls(true) { } +}; + struct CTCov { //covered transcript info int first_cov_exon; int last_cov_exon; @@ -346,7 +358,8 @@ struct BundleData { */ void keepGuide(GffObj* scaff); - bool evalReadAln(GBamRecord& brec, char& strand, int nh); //, int hi); + //bool evalReadAln(GBamRecord& brec, char& strand, int nh); //, int hi); + bool evalReadAln(GReadAlnData& alndata, char& strand); void Clear() { keepguides.Clear(); @@ -372,7 +385,8 @@ struct BundleData { }; int processRead(int currentstart, int currentend, BundleData& bdata, - GHash& hashread, GBamRecord& brec, char strand, int nh, int hi); + GHash& hashread, GReadAlnData& alndata); + //GBamRecord& brec, char strand, int nh, int hi); void countFragment(BundleData& bdata, GBamRecord& brec, int hi); diff --git a/stringtie.cpp b/stringtie.cpp index aa521ef..a84bcab 100644 --- a/stringtie.cpp +++ b/stringtie.cpp @@ -201,7 +201,7 @@ int main(int argc, char * const argv[]) { // == Done argument processing. GVec refguides; // plain vector with transcripts for each chromosome - GPVec refguides_RC_Data(true); //raw count data for all guide transcripts + GPVec refguides_RC_Data(true); //raw count data for all guide transcripts GPVec refguides_RC_exons(true); //raw count data for all guide exons GPVec refguides_RC_introns(true);//raw count data for all guide introns GVec alncounts(30,0); //keep track of the number of read alignments per chromosome [gseq_id] @@ -237,24 +237,24 @@ const char* ERR_BAM_SORT="\nError: the input alignment file is not sorted!\n"; uint cur_tid=0; uint cur_exon_id=0; uint cur_intron_id=0; - std::set exons; - std::set introns; + GList uexons(true, false, true); //sorted, free items, unique + GList uintrons(true, false, true); //assign unique transcript IDs based on the sorted order int last_refid=0; for (int i=0;iuptr=tdata; if (last_refid!=m->gseq_id) { //chromosome switch - exons.clear(); - introns.clear(); + uexons.Clear(); + uintrons.Clear(); last_refid=m->gseq_id; } refguides_RC_Data.Add(tdata); - tdata->rc_addFeatures(cur_exon_id, exons, refguides_RC_exons, - cur_intron_id, introns, refguides_RC_introns); + tdata->rc_addFeatures(cur_exon_id, uexons, refguides_RC_exons, + cur_intron_id, uintrons, refguides_RC_introns); } GRefData& grefdata = refguides[m->gseq_id]; @@ -322,7 +322,7 @@ if (ballgown) bool chr_changed=false; int pos=0; const char* refseqName=NULL; - char strand=0; + //char strand=0; char xstrand=0; int nh=1; int hi=0; @@ -526,12 +526,14 @@ if (ballgown) //bool ref_overlap=false; //if (ballgown && bundle->rc_data) //ref_overlap= - bundle->evalReadAln(*brec, xstrand, nh); - if (xstrand=='+') strand=1; - else if (xstrand=='-') strand=-1; + GReadAlnData alndata(brec, 0, nh, hi); + bundle->evalReadAln(alndata, xstrand); //xstrand, nh); + if (xstrand=='+') alndata.strand=1; + else if (xstrand=='-') alndata.strand=-1; countFragment(*bundle, *brec, hi); //if (!ballgown || ref_overlap) - processRead(currentstart, currentend, *bundle, hashread, *brec, strand, nh, hi); + processRead(currentstart, currentend, *bundle, hashread, alndata); + // *brec, strand, nh, hi); //update current end to be at least as big as the start of the read pair in the fragment?? -> maybe not because then I could introduce some false positives with paired reads mapped badly diff --git a/tablemaker.cpp b/tablemaker.cpp index 21b5fd7..2df0048 100755 --- a/tablemaker.cpp +++ b/tablemaker.cpp @@ -46,51 +46,65 @@ void rc_updateExonCounts(const RC_Feature* exon, int nh) { if (nh<=1) exon->ucount++; } -bool BundleData::evalReadAln(GBamRecord& brec, char& strand, int nh) { +bool BundleData::evalReadAln(GReadAlnData& alndata, char& xstrand) { + //GBamRecord& brec, char& strand, int nh) { if (rc_data==NULL) { return false; //no ref transcripts available for this reads' region } + GBamRecord& brec=*(alndata.brec); + int nh=alndata.nh; if ((int)brec.endlmin || (int)brec.start>rc_data->rmax) { return false; //hit outside coverage area } - if (rc_data->exons.Count()==0) return false; + if (rc_data->g_exons.Count()==0) return false; if (ballgown && rc_data->tdata.Count()==0) return false; //nothing to do without transcripts //check this read alignment against ref exons and introns char strandbits=0; + for (int i=0;iupdateCov(strand, nh, brec.exons[i].start, brec.exons[i].len()); - GArray exonOverlaps(true, true); //overlaps sorted by decreasing length + rc_data->updateCov(xstrand, nh, brec.exons[i].start, brec.exons[i].len()); + GArray exonOverlaps(true, true); //overlaps sorted by decreasing length if (rc_data->findOvlExons(exonOverlaps, brec.exons[i].start, - brec.exons[i].end, strand)) { + brec.exons[i].end, xstrand)) { int max_ovl=exonOverlaps[0].ovlen; - if (max_ovl>=5) { //only count exon overlaps of at least 5bp + alndata.g_exonovls.Add(new GVec(exonOverlaps)); + //if (max_ovl>=5) { //only count exon overlaps of at least 5bp for (int k=0;kstrand=='+') strandbits |= 0x01; else if (exonOverlaps[k].feature->strand=='-') strandbits |= 0x02; - if (max_ovl - exonOverlaps[k].ovlen > 5 ) + if (max_ovl - exonOverlaps[k].ovlen > 5) break; //ignore overlaps shorter than max_overlap - 5bp - //TODO: perhaps we could use a better approach for non-overlapping exons + //TODO: perhaps we could use a better approach for non-overlapping ref exons // spanned by this same read alignment //counting this overlap for multiple exons if it's similarly large //(in the shared region of overlapping exons) rc_updateExonCounts(exonOverlaps[k].feature, nh); } - } //exon overlap >= 5 - } //exon overlap processing + //} //exon overlap >= 5 + } //ref exon overlaps + else { //no ref exon overlaps + alndata.g_exonovls.Add(new GVec((int)0)); + } if (i>0) { //intron processing - RC_Feature* ri=rc_data->findIntron(brec.exons[i-1].end+1, brec.exons[i].start-1, strand); + int j_l=brec.exons[i-1].end+1; + int j_r=brec.exons[i].start-1; + RC_Feature* ri=rc_data->findIntron(j_l, j_r, xstrand); + alndata.juncs.Add(new CJunction(j_l, j_r)); //don't set strand, etc. for now if (ri) { ri->rcount++; ri->mrcount += (nh > 1) ? (1.0/nh) : 1; if (nh==1) ri->ucount++; + alndata.juncs.Last()->guide_match=1; } } //intron processing } - if (strand=='.' && strandbits && strandbits<3) { - strand = (strandbits==1) ? '+' : '-'; + if (xstrand=='.' && strandbits && strandbits<3) { + xstrand = (strandbits==1) ? '+' : '-'; } return true; @@ -160,8 +174,8 @@ void rc_write_f2t(FILE* fh, map >& f2t) { void rc_update_exons(RC_BundleData& rc) { //update stdev etc. for all exons in bundle - for (int f=0;f= rc.lmin ); int L=exon.l-rc.lmin; int xlen=exon.r-exon.l+1; @@ -215,11 +229,11 @@ void rc_update_exons(RC_BundleData& rc) { } -void rc_write_RCfeature( GPVec& rcdata, GPVec& features, FILE*& fdata, FILE*& f2t, +void rc_write_RCfeature( GPVec& rcdata, GPVec& features, FILE*& fdata, FILE*& f2t, bool is_exon=false) { for (int i=0;iscaff->getGSeqName(); + const char* ref_name=rcdata[f.t_ids[0]-1]->ref_t->getGSeqName(); if (is_exon) { fprintf(fdata, "%u\t%s\t%c\t%d\t%d\t%d\t%d\t%.2f\t%.4f\t%.4f\t%.4f\t%.4f\n", f.id, ref_name, f.strand, f.l, f.r, f.rcount, @@ -243,7 +257,7 @@ void rc_write_RCfeature( GPVec& rcdata, GPVec& feature //if (rc.exons.size()==0) return; * */ -void rc_writeRC(GPVec& RC_data, +void rc_writeRC(GPVec& RC_data, GPVec& RC_exons, GPVec& RC_introns, FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata, @@ -252,13 +266,13 @@ void rc_writeRC(GPVec& RC_data, for (int t=0;tgetGSeqName(); - const char* genename= sd.scaff->getGeneName(); + RC_TData& sd=*RC_data[t]; + const char* refname = sd.ref_t->getGSeqName(); + const char* genename= sd.ref_t->getGeneName(); if (genename==NULL) genename="."; fprintf(f_tdata, "%u\t%s\t%c\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%f\t%f\n", - sd.t_id, refname, sd.strand, sd.l, sd.r, sd.scaff->getID(), - sd.num_exons, sd.eff_len, sd.scaff->getGeneID(), + sd.t_id, refname, sd.ref_t->strand, sd.l, sd.r, sd.ref_t->getID(), + sd.t_exons.Count(), sd.eff_len, sd.ref_t->getGeneID(), genename, sd.cov, sd.fpkm); }//for each transcript //fflush(f_tdata); @@ -275,13 +289,27 @@ void rc_writeRC(GPVec& RC_data, //rc_write_f2t(rc.fe2t, rc.e2t); //rc_write_f2t(rc.fi2t, rc.i2t); } +/* +void RC_TData::rc_addFeatures(uint& c_e_id, set& exonSet, GPVec& exonTable, + uint& c_i_id, set& intronSet, GPVec& intronTable) { + GASSERT(ref_t); + GffObj& m = *(ref_t); + for (int i = 0; i < m.exons.Count(); ++i) { + set::iterator eit=exonSet.end(); + set::iterator iit=intronSet.end(); + addFeature(m.exons[i]->start, m.exons[i]->end, t_exons, c_e_id, exonSet, eit, exonTable); + if (i>0) { //store intron + addFeature(m.exons[i-1]->end+1, m.exons[i]->start-1, t_introns, c_i_id, intronSet, iit, intronTable); + } + } //for each exon +} -void RC_ScaffData::addFeature(int fl, int fr, GPVec& fvec, - uint& f_id, set& fset, set::iterator& fit, +void RC_TData::addFeature(int fl, int fr, GPVec& fvec, + uint& f_id, set& fset, set::iterator& fit, GPVec& fdata) { //f_id is the largest f_id inserted so far in fset bool add_new = true; - RC_ScaffSeg newseg(fl,fr,this->strand); + RC_Feature newseg(fl,fr,this->strand); //RC_Feature* newfeature=NULL; if (fset.size()>0) { if (fit == fset.end()) --fit; @@ -317,7 +345,7 @@ void RC_ScaffData::addFeature(int fl, int fr, GPVec& fvec, } if (add_new) { //never seen this feature before newseg.id = ++f_id; - pair< set::iterator, bool> ret = fset.insert(newseg); + pair< set::iterator, bool> ret = fset.insert(newseg); //ret.second = was_inserted (indeed new) if (!ret.second) { GError("Error: feature %d-%d (%c) already in segment set!\n", @@ -340,6 +368,99 @@ void RC_ScaffData::addFeature(int fl, int fr, GPVec& fvec, fvec.Add(fdata[newseg.id-1]); } +*/ +void RC_TData::rc_addFeatures(uint& c_e_id, GList& exonSet, GPVec& exonTable, + uint& c_i_id, GList& intronSet, GPVec& intronTable) { + GASSERT(ref_t); + GffObj& m = *(ref_t); + int ecache_idx = exonSet.Count()-1; + int icache_idx = intronSet.Count()-1; + //int ecache_idx = e_idx_cache>=0 ? e_idx_cache : exonSet.Count()-1; + //int icache_idx = i_idx_cache>=0 ? i_idx_cache : intronSet.Count()-1; + for (int i = 0; i < m.exons.Count(); ++i) { + addFeature((int)m.exons[i]->start, (int)m.exons[i]->end, t_exons, c_e_id, exonSet, exonTable, ecache_idx); + //if (i==0) e_idx_cache=ecache_idx; + if (i>0) { //store intron + //if (i==1) i_idx_cache=icache_idx; + addFeature(m.exons[i-1]->end+1, m.exons[i]->start-1, t_introns, c_i_id, + intronSet, intronTable, icache_idx); + } //for each intron + } //for each exon +} + +void RC_TData::addFeature(int fl, int fr, GPVec& fvec, + uint& f_id, GList& fset, + GPVec& fdata, int& cache_idx) { + //f_id is the largest f_id inserted so far in fset + bool add_new = true; + RC_Feature* newseg=new RC_Feature(fl, fr, ref_t->strand, 0, this->t_id); + //RC_Feature* newfeature=NULL; + int fit=cache_idx<0 ? fset.Count()-1 : cache_idx; + int fp_id=-1; + if (fset.Count()>0) { + if (*newseg < *(fset[fit])) { + bool eq=false; + while (*newseg < *(fset[fit]) || (eq = (*newseg==*(fset[fit])))) { + if (eq) { + add_new = false; + fp_id = fset[fit]->id; //fset[fit]->id; + break; + } + //newseg< fset[fit] + --fit; + if (fit<0) break; //newseg should be inserted at 0 + } //while newseg= *fset[fit] + bool eq=false; + while (*(fset[fit]) < *newseg || (eq = (*newseg==*(fset[fit])))) { + if (eq) { + add_new = false; + fp_id = fset[fit]->id; + break; + } + ++fit; + if (fit==fset.Count()) { + //newseg should be appended to the list + break; + } + } + //if (fit<=fset.Count() && !add_new) { + //fset[fit-1] < newseg < fset[fit] + //we'll insert newseg at position fit + //} + } + } //check existing set + if (add_new) { //did not see this feature before + newseg->id = ++f_id; + if (fit<0) fit = fset.Add(newseg); + else fset.sortInsert(fit, newseg); + if (fit<0) { + GError("Error: feature %d-%d (%c) already in feature set!\n", + newseg->l, newseg->r, newseg->strand); + } + cache_idx=fit; + fp_id=fdata.Add(newseg)+1; + +#ifdef DEBUG + if (fdata.Count()!=(int)f_id) { + GMessage("Error: fdata.Count=%d, f_id=%d\n", fdata.Count(), f_id); + } +#endif + GASSERT((uint)fdata.Count()==f_id); + } + else { //feature seen before, update its parent list + fdata[fp_id-1]->t_ids.Add(this->t_id); + delete newseg; + } + //fvec.push_back(newseg); + GASSERT(fdata[fp_id-1]->id==(uint)fp_id); + fvec.Add(fdata[fp_id-1]); +} + void Ballgown_setupFiles(FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata, FILE* &f_e2t, FILE* &f_i2t) { if (f_tdata == NULL) { @@ -356,17 +477,3 @@ void Ballgown_setupFiles(FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata, fprintf(f_i2t, "i_id\tt_id\n"); } } - -void RC_ScaffData::rc_addFeatures(uint& c_e_id, set& fexons, GPVec& edata, - uint& c_i_id, set& fintrons, GPVec& idata) { - GASSERT(scaff); - GffObj& m = *(scaff); - for (int i = 0; i < m.exons.Count(); ++i) { - set::iterator eit=fexons.end(); - set::iterator iit=fintrons.end(); - addFeature(m.exons[i]->start, m.exons[i]->end, t_exons, c_e_id, fexons, eit, edata); - if (i>0) { //store intron - addFeature(m.exons[i-1]->end+1, m.exons[i]->start-1, t_introns, c_i_id, fintrons, iit, idata); - } - } //for each exon -} diff --git a/tablemaker.h b/tablemaker.h index 8b41f45..0c47198 100755 --- a/tablemaker.h +++ b/tablemaker.h @@ -22,12 +22,12 @@ void Ballgown_setupFiles(FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata, FILE* &f_e2t, FILE* &f_i2t); //Bundle raw count data - -struct RC_ScaffSeg { +/* +struct RC_TSeg { uint id; //feature id (>0) int l; int r; //genomic coordinates char strand; - bool operator<(const RC_ScaffSeg& o) const { + bool operator<(const RC_TSeg& o) const { //if (id == o.id) return false; if (l != o.l) return (l < o.l); if (r != o.r) return (r < o.r); @@ -36,31 +36,35 @@ struct RC_ScaffSeg { return false; } - bool operator==(const RC_ScaffSeg& o) const { + bool operator==(const RC_TSeg& o) const { //if (id == o.id) return true; return (l==o.l && r==o.r && (strand == o.strand || strand == '.' || o.strand == '.')); } - RC_ScaffSeg(int fl=0, int fr=0, char s='.', int fid=0) : id(fid), + RC_TSeg(int fl=0, int fr=0, char s='.', int fid=0) : id(fid), l(fl), r(fr), strand(s) { } }; +*/ struct RC_Feature { //exon or intron of a reference transcript - uint id; //feature id (>0) + uint id; //feature id (>0) only used with -B (full Ballgown data) + //it's the index in the global refguides_RC_exons/introns array + 1 GVec t_ids; //transcripts owning this feature - int l; int r; //genomic coordinates + //if -B, this is the index in the global refguides_RC_Data array + 1 + // otherwise it is the index in the BundleData::keepguides array + 1 + int l; int r; //genomic coordinates for the feature char strand; - mutable uint rcount; //# reads covering this feature - mutable uint ucount; //# uniquely mapped reads covering this feature - mutable double mrcount; //multi-mapping-weighted counts + mutable uint rcount; //# read alignments overlapping this feature (>5bp overlaps for exons; + // exact coord. match for introns) + mutable uint ucount; //# uniquely mapped reads overlapping this ref feature + mutable double mrcount; //multi-mapping weighted counts double avg; double stdev; double mavg; double mstdev; - //mutable vector coverage; //per-base exon coverage data struct PCompare { bool operator()(const RC_Feature* p1, const RC_Feature* p2) { return (*p1 < *p2); @@ -72,8 +76,11 @@ struct RC_Feature { //exon or intron of a reference transcript if (l>r) { int t=l; l=r; r=t; } if (tid>0) t_ids.Add(tid); } + RC_Feature(const RC_Feature& seg): id(seg.id), t_ids(seg.t_ids), l(seg.l), r(seg.r), + strand(seg.strand), rcount(0),ucount(0),mrcount(0), avg(0), stdev(0), mavg(0), mstdev(0) { + } - RC_Feature(RC_ScaffSeg& seg, uint tid=0): id(seg.id), t_ids(1), l(seg.l), r(seg.r), + RC_Feature(const RC_Feature& seg, uint tid): id(seg.id), t_ids(1), l(seg.l), r(seg.r), strand(seg.strand), rcount(0),ucount(0),mrcount(0), avg(0), stdev(0), mavg(0), mstdev(0) { if (l>r) { int t=l; l=r; r=t; } if (tid>0) t_ids.Add(tid); @@ -118,85 +125,73 @@ struct RC_Feature { //exon or intron of a reference transcript } }; -struct RC_FeatOvl { - RC_Feature* feature; +struct RC_ExonOvl { + RC_Feature* feature; //pointer to an item of RC_BundleData::g_exons int ovlen; - bool operator<(const RC_FeatOvl& o) const { + bool operator<(const RC_ExonOvl& o) const { if (ovlen==o.ovlen) { return (featureo.ovlen); } //operator < - bool operator==(const RC_FeatOvl& o) const { + bool operator==(const RC_ExonOvl& o) const { return (ovlen==o.ovlen && feature==o.feature); } - RC_FeatOvl(RC_Feature* f=NULL, int olen=0):feature(f), ovlen(olen) { + RC_ExonOvl(RC_Feature* f=NULL, int olen=0):feature(f), ovlen(olen) { } }; -typedef set RC_FeatPtrSet; +//typedef set RC_FeatPtrSet; typedef set::iterator RC_FeatIt; typedef map > RC_Map2Set; typedef map >::iterator RC_Map2SetIt; -struct RC_Seg { //just a genomic interval holder +/*struct RC_Seg { //just a genomic interval holder int l; int r; RC_Seg(int l0=0, int r0=0):l(l0), r(r0) { } }; +*/ -struct RC_ScaffData { //storing RC data for a transcript - GffObj* scaff; +struct RC_TData { //storing RC data for a transcript + //only used with -B (full Ballgown data) + GffObj* ref_t; uint t_id; - GStr t_name; //original GFF ID for the transcript + //GStr t_name; //original GFF ID for the transcript int l; int r; //char strand; - int num_exons; + //int num_exons; int eff_len; double cov; double fpkm; - //other mutable fields here, to be updated by rc_update_scaff() - char strand; - //vector exons; - //vector introns; + //char strand; GPVec t_exons; GPVec t_introns; - //RC_ScaffIds(uint id=0, char s='.'):t_id(id),strand(s) { } - void rc_addFeatures(uint& c_e_id, set& fexons, GPVec& edata, - uint& c_i_id, set& fintrons, GPVec& idata); + //int e_idx_cache; + //int i_idx_cache; + void rc_addFeatures(uint& c_e_id, GList& fexons, GPVec& edata, + uint& c_i_id, GList& fintrons, GPVec& idata); void addFeature(int fl, int fr, GPVec& fvec, uint& f_id, - set& fset, set::iterator& fit, GPVec& fdata); - RC_ScaffData(GffObj& s, uint id=0):scaff(&s), t_id(id), t_name(s.getID()), l(s.start), r(s.end), - num_exons(s.exons.Count()), eff_len(s.covlen), cov(0), fpkm(0), strand(s.strand), - t_exons(false), t_introns(false) { - /*RC_ScaffIds& sdata = *(scaff->rc_id_data()); - t_id = sdata.t_id; - t_name=scaff->annotated_trans_id(); - strand=sdata.strand; - l=scaff->left(); - r=scaff->right(); - num_exons=scaff->exons.Count(); - strand=scaff->strand; - for (size_t i=0;i& fset, GPVec& fdata, + int& cache_idx); + RC_TData(GffObj& s, uint id=0):ref_t(&s), t_id(id), l(s.start), r(s.end), //t_name(s.getID()), + //num_exons(s.exons.Count()), + eff_len(s.covlen), cov(0), fpkm(0), //strand(s.strand), + t_exons(false), t_introns(false) { //, e_idx_cache(-1), i_idx_cache(-1) { } - bool operator<(const RC_ScaffData& o) const { + bool operator<(const RC_TData& o) const { if (l != o.l) return (l < o.l); if (r != o.r) return (r < o.r); - if (strand != o.strand) return (strand < o.strand); - return (t_name < o.t_name); - return false; + if (char c=(ref_t->strand - o.ref_t->strand)) return (c<0); + return (strcmp(ref_t->getID(), o.ref_t->getID())<0); + //return false; } - bool operator==(const RC_ScaffData& o) const { - if (t_id!=0 && o.t_id!=0 && t_id!=o.t_id) return false; - return (l==o.l && r==o.r && strand == o.strand && - t_name == o.t_name); + bool operator==(const RC_TData& o) const { + if (t_id!=0 && o.t_id!=0) return (t_id==o.t_id); + return (l==o.l && r==o.r && ref_t->strand == o.ref_t->strand && + strcmp(ref_t->getID(),o.ref_t->getID())==0); } }; @@ -206,9 +201,7 @@ void rc_frendel(const char* fname); struct BundleData; -//void rc_write_counts(const char* refname, BundleData& bundle); - -void rc_writeRC(GPVec& RC_data, +void rc_writeRC(GPVec& RC_data, GPVec& RC_exons, GPVec& RC_introns, FILE* &f_tdata, FILE* &f_edata, FILE* &f_idata, @@ -231,12 +224,13 @@ struct RC_BundleData { int init_lmin; int lmin; int rmax; - GPVec tdata; //all transcripts in this bundle - GList exons; //all unique exons in this bundle, by their start coordinate - GList introns; //all unique introns in this bundle, by their start coordinate + GPVec tdata; //all transcripts in this bundle (not used unless -B) + GList g_exons; //all unique guide exons in this bundle, sorted by their start coordinate + GList g_introns; //all unique guide introns in this bundle, sorted by their start coordinate //RC_FeatIt xcache; //cache the first exon overlapping xcache_pos to speed up exon-overlap queries (findOvlExons()) int xcache; //exons index of the first exon overlapping xcache_pos int xcache_pos; // left coordinate of last cached exon overlap query (findOvlExons()) + // the following coverage arrays will only used with Ballgown data (-B) vector f_mcov; //coverage data, multi-map aware, per strand vector f_cov; vector r_mcov; //coverage data on the reverse strand @@ -244,14 +238,14 @@ struct RC_BundleData { // RC_BundleData(int t_l=0, int t_r=0):init_lmin(0), lmin(t_l), rmax(t_r), tdata(false), // features:(sorted, free, unique) - exons(true, false, true), introns(true, false, true), + g_exons(true, false, true), g_introns(true, false, true), xcache(0), xcache_pos(0) { if (ballgown) { if (rmax>lmin) updateCovSpan(); - }else { - exons.setFreeItem(true); - introns.setFreeItem(true); + }else { //this will be deallocated when the bundle is cleared + g_exons.setFreeItem(true); + g_introns.setFreeItem(true); } } @@ -263,11 +257,14 @@ struct RC_BundleData { } //for non-ballgown tracking of guide features - void addTranscriptFeature(uint fstart, uint fend, char fstrand, GList& flist, uint tidx) { + void addTranscriptFeature(uint fstart, uint fend, char fstrand, GList& flist, GffObj& t) { + uint tidx=(uint)t.udata; //in non-Ballgown case, this is the index in the BundleData::keepguides array + 1 RC_Feature* feat=new RC_Feature(fstart, fend, fstrand, 0, tidx); int fidx=-1; RC_Feature* p=flist.AddIfNew(feat, true, &fidx); - if (p!=feat) {//exon seen before, update t_id list + if (p==feat) {//exon never seen before + //TODO: use some stable e_id index here and assign it to t.exons[eidx]->uptr->e_id (?) for quick retrieval + } else {//exon seen before, just update t_id list p->t_ids.Add(tidx); } } @@ -279,27 +276,27 @@ struct RC_BundleData { if (lmin==0 || lmin>(int)t.start) { lmin=t.start; boundary_changed=true; } if (rmax==0 || rmax<(int)t.end) { rmax=t.end; boundary_changed=true; } if (t.uptr) { //ballgown case - RC_ScaffData& sdata=*(RC_ScaffData*)(t.uptr); + RC_TData& sdata=*(RC_TData*)(t.uptr); //tdata.insert(sdata); tdata.Add(&sdata); if (boundary_changed) updateCovSpan(); //for (vector::iterator it=sdata.exons.begin();it!=sdata.exons.end();++it) { for (int i=0;istart, t.exons[i]->end, t.strand, exons, (uint)t.udata); + addTranscriptFeature(t.exons[i]->start, t.exons[i]->end, t.strand, g_exons, t); if (i>0) { //add intron - addTranscriptFeature(t.exons[i-1]->end+1, t.exons[i]->start-1, t.strand, introns, (uint)t.udata); + addTranscriptFeature(t.exons[i-1]->end+1, t.exons[i]->start-1, t.strand, g_introns, t); } } } //no ballgown coverage @@ -348,27 +345,27 @@ struct RC_BundleData { } } - bool findOvlExons(GArray& exovls, int hl, int hr, char strand='.', bool update_cache=true) { - //exovls should be clear, unless the caller knows what she's doing - bool foundovls=false; - if (exons.Count()==0) return false; + bool findOvlExons(GArray& exovls, int hl, int hr, char strand='.', bool update_cache=true) { + //exovls should be clear, unless the caller knows what s/he's doing + bool hasOverlaps=false; + if (g_exons.Count()==0) return false; RC_Feature q(hl, hr); int xstart=0; bool no_cache=(xcache_pos==0 || xcache_pos>hl); if (no_cache) { if (update_cache) { //xcache=exons.end(); - xcache=exons.Count()-1; + xcache=g_exons.Count()-1; xcache_pos=0; } } else xstart=xcache; //must have a valid value bool upd_cache(update_cache); - int last_checked_exon=exons.Count()-1; - for (int p=xstart;p < exons.Count();++p) { + int last_checked_exon=g_exons.Count()-1; + for (int p=xstart;p < g_exons.Count();++p) { last_checked_exon=p; - int l=exons[p]->l; - int r=exons[p]->r; + int l=g_exons[p]->l; + int r=g_exons[p]->r; if (l > hr) break; if (hl > r) continue; //exon overlap here @@ -384,16 +381,19 @@ struct RC_BundleData { xcache=p; upd_cache=false; } - if (strand!='.' && strand!=exons[p]->strand) continue; //non-matching strand - foundovls=true; - RC_FeatOvl fovl(exons[p], ovlen); - exovls.Add(fovl); + if (strand!='.' && strand!=g_exons[p]->strand) continue; //non-matching strand + if (ovlen>=5) { + //TODO: check this, arbitrary ovl minimum of 5bp + hasOverlaps=true; + RC_ExonOvl fovl(g_exons[p], ovlen); + exovls.Add(fovl); + } } if (update_cache) { if (upd_cache) xcache=last_checked_exon; //there was no overlap found xcache_pos=hl; } - return foundovls; + return hasOverlaps; } /* RC_FeatPtrSet findExons(int hl, int hr, char strand='.', bool update_cache=true) { @@ -441,8 +441,8 @@ struct RC_BundleData { int fidx=0; RC_Feature* r=NULL; RC_Feature t(hl, hr, strand); - if (introns.Found(&t, fidx)) - r=introns[fidx]; + if (g_introns.Found(&t, fidx)) + r=g_introns[fidx]; return r; } }; //struct RC_BundleData