Skip to content

Commit

Permalink
more refactoring/improving of read counting
Browse files Browse the repository at this point in the history
  • Loading branch information
gpertea committed May 13, 2015
1 parent 50dd7fc commit 111743d
Show file tree
Hide file tree
Showing 8 changed files with 308 additions and 176 deletions.
5 changes: 4 additions & 1 deletion gclib/GBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,10 +222,13 @@ int Gmkdir(const char *path, bool recursive, int perms) {
*psep=0; //now gpath is the path up to this /
ss=psep; ++ss; //ss repositioned just after the /
// create current level
if (fileExists(gpath)!=1 && G_mkdir(gpath, perms)!=0)
if (fileExists(gpath)!=1 && G_mkdir(gpath, perms)!=0) {
GFREE(gpath);
return -1;
}
*psep='/';
}
GFREE(gpath);
return 0;
}

Expand Down
7 changes: 4 additions & 3 deletions gclib/GList.hh
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ template <class OBJ> class GList:public GPVec<OBJ> {
else return 0;
}
public:
void sortInsert(int idx, OBJ* item);
void sortInsert(int idx, OBJ* item); //special insert in sorted lists
//WARNING: the caller must know the insert index such that the sort order is preserved!
GList(GCompareProc* compareProc=NULL); //free by default
GList(GCompareProc* compareProc, //unsorted by default
GFreeProc *freeProc,
Expand All @@ -110,7 +111,7 @@ template <class OBJ> class GList:public GPVec<OBJ> {
}
else fCompareProc=NULL;
}
int Add(OBJ* item); //-- specific implementation if sorted
int Add(OBJ* item); //-- specific implementation if sorted - may become an Insert()
void Add(GList<OBJ>& list); //add all pointers from another list

OBJ* AddIfNew(OBJ* item, bool deleteIfFound=true, int* fidx=NULL);
Expand Down Expand Up @@ -588,7 +589,7 @@ template <class OBJ> bool GList<OBJ>::Found(OBJ* item, int& idx) {
template <class OBJ> void GList<OBJ>::sortInsert(int idx, OBJ* item) {
//idx must be the new position this new item must have
//so the allowed range is [0..fCount]
//the old idx item all the above will be shifted to idx+1
//the current fList[idx] and all the above will be shifted +1
if (idx<0 || idx>this->fCount) GError(GVEC_INDEX_ERR, idx);
if (this->fCount==this->fCapacity) {
GPVec<OBJ>::Grow(idx, item);
Expand Down
6 changes: 3 additions & 3 deletions gclib/GVec.hh
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ template <class OBJ> class GVec {
GVec(int init_capacity=2);
GVec(int init_count, const OBJ init_val);
GVec(int init_count, OBJ* init_val, bool delete_initval=true); //convenience constructor for complex vectors
GVec(GVec<OBJ>& array); //copy constructor
GVec(const GVec<OBJ>& array); //copy constructor
const GVec<OBJ>& operator=(GVec<OBJ>& array); //copy operator
virtual ~GVec();
void Insert(int idx, OBJ item) { Insert(idx, &item); }
Expand Down Expand Up @@ -234,7 +234,7 @@ template <class OBJ> GVec<OBJ>::GVec(int init_count, OBJ* init_val, bool delete_
}


template <class OBJ> GVec<OBJ>::GVec(GVec<OBJ>& array) { //copy constructor
template <class OBJ> GVec<OBJ>::GVec(const GVec<OBJ>& array) { //copy constructor
this->fCount=array.fCount;
this->fCapacity=array.fCapacity;
this->fArray=NULL;
Expand All @@ -246,7 +246,7 @@ template <class OBJ> GVec<OBJ>::GVec(GVec<OBJ>& array) { //copy constructor
else {
fArray=new OBJ[this->fCapacity]; //]()
// uses OBJ operator=
for (int i=0;i<this->fCount;i++) fArray[i]=array[i];
for (int i=0;i<this->fCount;i++) fArray[i]=array.fArray[i];
}
}
this->fCount=array.fCount;
Expand Down
47 changes: 26 additions & 21 deletions rlink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,15 @@ void countFragment(BundleData& bdata, GBamRecord& brec, int hi) {
}

int processRead(int currentstart, int currentend, BundleData& bdata,
GHash<int>& hashread, GBamRecord& brec, char strand, int nh, int hi) {
GHash<int>& hashread, GReadAlnData& alndata) {
//GBamRecord& brec, char strand, int nh, int hi) {
GList<CReadAln>& readlist = bdata.readlist;
GList<CJunction>& junction = bdata.junction;
GVec<float>& bpcov = bdata.bpcov;

GBamRecord& brec=*(alndata.brec);
char strand=alndata.strand;
int nh=alndata.nh;
int hi=alndata.hi;
int readstart=brec.start;
CReadAln* readaln=NULL;
bool covSaturated=false;
Expand Down Expand Up @@ -143,8 +147,7 @@ int processRead(int currentstart, int currentend, BundleData& bdata,
GPVec<CJunction> newjunction(false);
for (int i=0;i<brec.exons.Count();i++) {
CJunction* nj=NULL;
if (i) {
//deal with introns
if (i) { //deal with introns
GSeg seg(brec.exons[i-1].end, brec.exons[i].start);
leftsupport=brec.exons[i-1].len();
if (leftsupport>maxleftsupport) {
Expand All @@ -163,6 +166,8 @@ int processRead(int currentstart, int currentend, BundleData& bdata,
int maxrightsupport = support > rightsupport ? support : rightsupport;
nj=add_junction(seg.start, seg.end, maxleftsupport, maxrightsupport, junction, strand, nh);
newjunction.Add(nj);
if (alndata.juncs.Count())
nj->guide_match=alndata.juncs[i-1]->guide_match;
}
cov_add(bpcov, brec.exons[i].start-currentstart,
brec.exons[i].end-currentstart, float(1)/nh);
Expand Down Expand Up @@ -8088,7 +8093,7 @@ void clean_junctions(GList<CJunction>& junction) {
*/

void clean_junctions(GList<CJunction>& junction, int refstart, GVec<float>& bpcov,GPVec<GffObj>& guides) {

/*
GArray <CJunction> guideintrons(true,true);
if(guides.Count()) { // guides are not NULL
for(int g=0;g<guides.Count();g++) {
Expand All @@ -8101,32 +8106,32 @@ void clean_junctions(GList<CJunction>& junction, int refstart, GVec<float>& bpco
}
}
}

*/
//fprintf(stderr,"Clean junctions:\n");
for(int i=0;i<junction.Count();i++) {
CJunction& jd=*(junction[i]);
//if(jd.nreads_good<junctionthr) {
if(jd.nreads_good<junctionthr && (!guides.Count() || guideintrons.IndexOf(jd)==-1)) {
//if(jd.nreads_good<junctionthr && (!guides.Count() || guideintrons.IndexOf(jd)==-1)) {
if (jd.nreads_good<junctionthr && (!guides.Count() || !jd.guide_match)) {
//fprintf(stderr,"deleted junction: %d-%d (%d)\n",jd.start,jd.end,jd.strand);
jd.strand=0;
}
else if((int)(jd.end-jd.start)>longintron && (!guides.Count() || guideintrons.IndexOf(jd)==-1)) { // very long intron -> hard to trust unless it's well covered
else //if((int)(jd.end-jd.start)>longintron && (!guides.Count() || guideintrons.IndexOf(jd)==-1)) {
// very long intron -> hard to trust unless it's well covered
if ((int)(jd.end-jd.start)>longintron && (!guides.Count() || !jd.guide_match)) {
int leftreach = jd.start-longintronanchor-refstart;
if(leftreach<0) leftreach=0;
int rightreach = jd.end+longintronanchor-refstart;
if(rightreach>=bpcov.Count()) rightreach=bpcov.Count()-1;
//fprintf(stderr,"cov[%d]=%f cov[%d]=%f cov[%d][=%f cov[%d]=%f\n",leftreach+refstart,bpcov[leftreach],jd.start,bpcov[jd.start-refstart-1],rightreach+refstart,bpcov[rightreach],jd.end,bpcov[jd.end-refstart]);
if((bpcov[leftreach]<1 && bpcov[jd.start-refstart-1]<2) || (bpcov[rightreach]<1 && bpcov[jd.end-refstart]<2)) {
if((bpcov[leftreach]<1 && bpcov[jd.start-refstart-1]<2) ||
(bpcov[rightreach]<1 && bpcov[jd.end-refstart]<2)) {
jd.strand=0;
}
}

/*
{ // DEBUG ONLY
fprintf(stderr,"Junction %d: %d %d %d %g %g\n",i,jd.start,jd.end,jd.strand,jd.nreads,jd.nreads_good);
}
*/
}
//{ // DEBUG ONLY
// fprintf(stderr,"Junction %d: %d %d %d %g %g\n",i,jd.start,jd.end,jd.strand,jd.nreads,jd.nreads_good);
//}
} //for each read junction
}

//debug funcs
Expand Down Expand Up @@ -8579,7 +8584,7 @@ int print_signcluster(char strand,GList<CPrediction>& pred,GVec<int>& genes,GVec

uint t_id=0;
if (pred[n]->t_eq && pred[n]->t_eq->uptr) {
t_id = ((RC_ScaffData*)pred[n]->t_eq->uptr)->t_id;
t_id = ((RC_TData*)pred[n]->t_eq->uptr)->t_id;
}
fprintf(f_out,"%d %d %d %.6f %.6f\n",pred[n]->exons.Count()+1,pred[n]->tlen, t_id, pred[n]->frag,pred[n]->cov);
fprintf(f_out,"%s\tStringTie\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s.%d\"; transcript_id \"%s.%d.%d\"; ",
Expand Down Expand Up @@ -8854,7 +8859,7 @@ int print_cluster(GPVec<CPrediction>& pred,GVec<int>& genes,GVec<int>& transcrip
transcripts[pred[n]->geneno]++;
uint t_id=0;
if (pred[n]->t_eq && pred[n]->t_eq->uptr) {
t_id = ((RC_ScaffData*)pred[n]->t_eq->uptr)->t_id;
t_id = ((RC_TData*)pred[n]->t_eq->uptr)->t_id;
}
fprintf(f_out,"%d %d %d %.6f %.6f\n",pred[n]->exons.Count()+1,pred[n]->tlen, t_id, pred[n]->frag,pred[n]->cov);
fprintf(f_out,"%s\tStringTie\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s.%d\"; transcript_id \"%s.%d.%d\"; ",
Expand Down Expand Up @@ -8982,7 +8987,7 @@ int print_cluster_inclusion(GPVec<CPrediction>& pred,GVec<int>& genes,GVec<int>&
transcripts[pred[n]->geneno]++;
uint t_id=0;
if (pred[n]->t_eq && pred[n]->t_eq->uptr) {
t_id = ((RC_ScaffData*)pred[n]->t_eq->uptr)->t_id;
t_id = ((RC_TData*)pred[n]->t_eq->uptr)->t_id;
}
fprintf(f_out,"%d %d %d %.6f %.6f\n",pred[n]->exons.Count()+1,pred[n]->tlen, t_id, pred[n]->frag,pred[n]->cov);
fprintf(f_out,"%s\tStringTie\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s.%d\"; transcript_id \"%s.%d.%d\"; ",
Expand Down Expand Up @@ -9077,7 +9082,7 @@ int print_transcript_signcluster(char strand,GList<CPrediction>& pred,GVec<int>&
if(pred[n]->flag) {
uint t_id=0;
if (pred[n]->t_eq && pred[n]->t_eq->uptr) {
t_id = ((RC_ScaffData*)pred[n]->t_eq->uptr)->t_id;
t_id = ((RC_TData*)pred[n]->t_eq->uptr)->t_id;
}
fprintf(f_out,"%d %d %d %.6f %.6f\n",pred[n]->exons.Count()+1,pred[n]->tlen, t_id, pred[n]->frag,pred[n]->cov);
fprintf(f_out,"%s\tStringTie\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s.%d\"; transcript_id \"%s.%d.%d\"; ",
Expand Down
22 changes: 18 additions & 4 deletions rlink.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,11 +252,12 @@ struct CGraphnode:public GSeg {

// # 0: strand; 1: start; 2: end; 3: nreads; 4: nreads_good;
struct CJunction:public GSeg {
char strand; //-1,0,1 (unsigned byte)
char strand; //-1,0,1
char guide_match; //exact match of a ref intron?
double nreads;
double nreads_good;
CJunction(int s=0,int e=0, char _strand=0):GSeg(s,e),
strand(_strand), nreads(0), nreads_good(0) {}
strand(_strand), guide_match(0), nreads(0), nreads_good(0) {}
bool operator<(CJunction& b) {
if (start<b.start) return true;
if (start>b.start) return false;
Expand All @@ -270,6 +271,17 @@ struct CJunction:public GSeg {
}
};

struct GReadAlnData {
GBamRecord* brec;
char strand; //-1, 0, 1
int nh;
int hi;
GPVec<CJunction> juncs;
GPVec< GVec<RC_ExonOvl> > g_exonovls; //>5bp overlaps with guide exons, for each read "exon"
GReadAlnData(GBamRecord* bamrec=NULL, char nstrand=0, int num_hits=0, int hit_idx=0):brec(bamrec),
strand(nstrand), nh(num_hits), hi(hit_idx), juncs(true), g_exonovls(true) { }
};

struct CTCov { //covered transcript info
int first_cov_exon;
int last_cov_exon;
Expand Down Expand Up @@ -346,7 +358,8 @@ struct BundleData {
*/
void keepGuide(GffObj* scaff);

bool evalReadAln(GBamRecord& brec, char& strand, int nh); //, int hi);
//bool evalReadAln(GBamRecord& brec, char& strand, int nh); //, int hi);
bool evalReadAln(GReadAlnData& alndata, char& strand);

void Clear() {
keepguides.Clear();
Expand All @@ -372,7 +385,8 @@ struct BundleData {
};

int processRead(int currentstart, int currentend, BundleData& bdata,
GHash<int>& hashread, GBamRecord& brec, char strand, int nh, int hi);
GHash<int>& hashread, GReadAlnData& alndata);
//GBamRecord& brec, char strand, int nh, int hi);

void countFragment(BundleData& bdata, GBamRecord& brec, int hi);

Expand Down
28 changes: 15 additions & 13 deletions stringtie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ int main(int argc, char * const argv[]) {
// == Done argument processing.

GVec<GRefData> refguides; // plain vector with transcripts for each chromosome
GPVec<RC_ScaffData> refguides_RC_Data(true); //raw count data for all guide transcripts
GPVec<RC_TData> refguides_RC_Data(true); //raw count data for all guide transcripts
GPVec<RC_Feature> refguides_RC_exons(true); //raw count data for all guide exons
GPVec<RC_Feature> refguides_RC_introns(true);//raw count data for all guide introns
GVec<int> alncounts(30,0); //keep track of the number of read alignments per chromosome [gseq_id]
Expand Down Expand Up @@ -237,24 +237,24 @@ const char* ERR_BAM_SORT="\nError: the input alignment file is not sorted!\n";
uint cur_tid=0;
uint cur_exon_id=0;
uint cur_intron_id=0;
std::set<RC_ScaffSeg> exons;
std::set<RC_ScaffSeg> introns;
GList<RC_Feature> uexons(true, false, true); //sorted, free items, unique
GList<RC_Feature> uintrons(true, false, true);
//assign unique transcript IDs based on the sorted order
int last_refid=0;
for (int i=0;i<gffr.gflst.Count();i++) {
GffObj* m=gffr.gflst[i];
if (ballgown) {
RC_ScaffData* tdata=new RC_ScaffData(*m, ++cur_tid);
RC_TData* tdata=new RC_TData(*m, ++cur_tid);
m->uptr=tdata;
if (last_refid!=m->gseq_id) {
//chromosome switch
exons.clear();
introns.clear();
uexons.Clear();
uintrons.Clear();
last_refid=m->gseq_id;
}
refguides_RC_Data.Add(tdata);
tdata->rc_addFeatures(cur_exon_id, exons, refguides_RC_exons,
cur_intron_id, introns, refguides_RC_introns);
tdata->rc_addFeatures(cur_exon_id, uexons, refguides_RC_exons,
cur_intron_id, uintrons, refguides_RC_introns);
}

GRefData& grefdata = refguides[m->gseq_id];
Expand Down Expand Up @@ -322,7 +322,7 @@ if (ballgown)
bool chr_changed=false;
int pos=0;
const char* refseqName=NULL;
char strand=0;
//char strand=0;
char xstrand=0;
int nh=1;
int hi=0;
Expand Down Expand Up @@ -526,12 +526,14 @@ if (ballgown)
//bool ref_overlap=false;
//if (ballgown && bundle->rc_data)
//ref_overlap=
bundle->evalReadAln(*brec, xstrand, nh);
if (xstrand=='+') strand=1;
else if (xstrand=='-') strand=-1;
GReadAlnData alndata(brec, 0, nh, hi);
bundle->evalReadAln(alndata, xstrand); //xstrand, nh);
if (xstrand=='+') alndata.strand=1;
else if (xstrand=='-') alndata.strand=-1;
countFragment(*bundle, *brec, hi);
//if (!ballgown || ref_overlap)
processRead(currentstart, currentend, *bundle, hashread, *brec, strand, nh, hi);
processRead(currentstart, currentend, *bundle, hashread, alndata);
// *brec, strand, nh, hi);

//update current end to be at least as big as the start of the read pair in the fragment?? -> maybe not because then I could introduce some false positives with paired reads mapped badly

Expand Down
Loading

0 comments on commit 111743d

Please sign in to comment.