Skip to content

Commit

Permalink
keep transcript data uptr around for all guides
Browse files Browse the repository at this point in the history
  • Loading branch information
gpertea committed May 11, 2016
1 parent a950425 commit cbd9980
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 167 deletions.
2 changes: 2 additions & 0 deletions rlink.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ extern bool debugMode;
//collect all refguide transcripts for a single genomic sequence
struct GRefData {
GList<GffObj> rnas; //all transcripts on this genomic seq
GList<GRefLocus> loci;
int gseq_id;
const char* gseq_name;
//GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here)
Expand All @@ -60,6 +61,7 @@ struct GRefData {
}
rnas.Add(t);
t->isUsed(true);
//setLocus(t); //use the GRefLocus::mexons to quickly find an overlap with existing loci, or create a new one
}

bool operator==(GRefData& d){
Expand Down
11 changes: 6 additions & 5 deletions stringtie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ int main(int argc, char * const argv[]) {
GVec<GRefData> refguides; // plain vector with transcripts for each chromosome

//table indexes for Ballgown Raw Counts data (-B/-b option)
GPVec<RC_TData> guides_RC_tdata(true); //raw count data for all guide transcripts
GPVec<RC_TData> guides_RC_tdata(true); //raw count data or other info for all guide transcripts
GPVec<RC_Feature> guides_RC_exons(true); //raw count data for all guide exons
GPVec<RC_Feature> guides_RC_introns(true);//raw count data for all guide introns

Expand Down Expand Up @@ -321,10 +321,11 @@ const char* ERR_BAM_SORT="\nError: the input alignment file is not sorted!\n";
m->getID(),m->start, m->end);
m->exons.Add(new GffExon(m->start, m->end));
}
if (ballgown) {
RC_TData* tdata=new RC_TData(*m, ++c_tid);
m->uptr=tdata;
guides_RC_tdata.Add(tdata);
//TODO: always keep a RC_TData pointer around, with additional info about guides
RC_TData* tdata=new RC_TData(*m, ++c_tid);
m->uptr=tdata;
guides_RC_tdata.Add(tdata);
if (ballgown) { //already gather exon & intron info for all ref transcripts
tdata->rc_addFeatures(c_exon_id, uexons, guides_RC_exons,
c_intron_id, uintrons, guides_RC_introns);
}
Expand Down
83 changes: 0 additions & 83 deletions tablemaker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,91 +288,8 @@ void rc_writeRC(GPVec<RC_TData>& RC_data,
//File: i_data.ctab
//i_id chr gstart gend rcount ucount mrcount
rc_write_RCfeature(RC_data, RC_introns, f_idata, f_i2t);

// feature-to-transcript link files
//rc_write_f2t(rc.fe2t, rc.e2t);
//rc_write_f2t(rc.fi2t, rc.i2t);
}
/*
void RC_TData::rc_addFeatures(uint& c_e_id, set<RC_Feature>& exonSet, GPVec<RC_Feature>& exonTable,
uint& c_i_id, set<RC_Feature>& intronSet, GPVec<RC_Feature>& intronTable) {
GASSERT(ref_t);
GffObj& m = *(ref_t);
for (int i = 0; i < m.exons.Count(); ++i) {
set<RC_Feature>::iterator eit=exonSet.end();
set<RC_Feature>::iterator iit=intronSet.end();
addFeature(m.exons[i]->start, m.exons[i]->end, t_exons, c_e_id, exonSet, eit, exonTable);
if (i>0) { //store intron
addFeature(m.exons[i-1]->end+1, m.exons[i]->start-1, t_introns, c_i_id, intronSet, iit, intronTable);
}
} //for each exon
}
void RC_TData::addFeature(int fl, int fr, GPVec<RC_Feature>& fvec,
uint& f_id, set<RC_Feature>& fset, set<RC_Feature>::iterator& fit,
GPVec<RC_Feature>& fdata) {
//f_id is the largest f_id inserted so far in fset
bool add_new = true;
RC_Feature newseg(fl,fr,this->strand);
//RC_Feature* newfeature=NULL;
if (fset.size()>0) {
if (fit == fset.end()) --fit;
if (newseg < (*fit)) {
bool eq=false;
while (newseg < (*fit) || (eq = (newseg==(*fit)))) {
if (eq) {
add_new = false;
newseg.id = fit->id;
break;
}
if (fit==fset.begin()) {
break;
}
--fit;
}
}
else { //newseg >= *fit
bool eq=false;
while ((*fit) < newseg || (eq = (newseg==(*fit)))) {
if (eq) {
add_new = false;
newseg.id = fit->id;
break;
}
++fit;
if (fit==fset.end()) {
--fit;
break;
}
}
}
}
if (add_new) { //never seen this feature before
newseg.id = ++f_id;
pair< set<RC_Feature>::iterator, bool> ret = fset.insert(newseg);
//ret.second = was_inserted (indeed new)
if (!ret.second) {
GError("Error: feature %d-%d (%c) already in segment set!\n",
newseg.l, newseg.r, newseg.strand);
//newseg.id = ret.first->id;
}
fdata.Add(new RC_Feature(newseg, this->t_id));
#ifdef DEBUG
if (fdata.Count()!=(int)f_id) {
GMessage("Error: fdata.Count=%d, f_id=%d\n", fdata.Count(), f_id);
}
#endif
GASSERT((uint)fdata.Count()==f_id);
}
else { //feature seen before, update its parent list
fdata[newseg.id-1]->t_ids.Add(this->t_id);
}
//fvec.push_back(newseg);
GASSERT(fdata[newseg.id-1]->id==newseg.id);
fvec.Add(fdata[newseg.id-1]);
}

*/
void RC_TData::rc_addFeatures(uint& c_e_id, GList<RC_Feature>& exonSet, GPVec<RC_Feature>& exonTable,
uint& c_i_id, GList<RC_Feature>& intronSet, GPVec<RC_Feature>& intronTable) {
GASSERT(ref_t);
Expand Down
149 changes: 70 additions & 79 deletions tablemaker.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ struct RC_TSeg {
*/

struct RC_Feature { //exon or intron of a reference transcript
uint id; //feature id (>0) only used with -B (full Ballgown data)
//it's the index in the global refguides_RC_exons/introns array + 1
uint id; //feature id (>0), +1 to the index either in global guides_RC_exons/introns if ballgown,
// or in bundle_RC_exons/introns if not ballgown
GVec<uint> t_ids; //transcripts owning this feature
//if -B, this is the index in the global refguides_RC_Data array + 1
// otherwise it is the index in the BundleData::keepguides array + 1
Expand Down Expand Up @@ -128,6 +128,25 @@ struct RC_Feature { //exon or intron of a reference transcript
}
};

//for locus tracking and coverage, keep merged exons/introns in the locus
struct GSegInfo {
int t_id; //index of RC_TData in the guides_RC_data table + 1
int exonum; //exon number
GSegInfo(int tid=-1, int en=-1):t_id(tid), exonum(en) { }
};
class GMSeg:public GSeg {
public:
GVec<GSegInfo> msegs; //keep track of exons contributing to this merged exon
GMSeg(int l=0, int r=0, int tid=-1, int eno=-1):GSeg(l,r), msegs(tid, eno) {
}
};

//reference locus - formed by exon-overlapping transcripts
class GRefLocus:public GSeg {
GArray<GMSeg> mexons; //merged exons in this locus (from any transcript)
GPVec<GffObj> rnas; //transcripts in this locus
};

struct RC_ExonOvl {
RC_Feature* feature; //pointer to an item of RC_BundleData::g_exons
int mate_ovl; // = 1 if the mate of this read overlaps the same exon
Expand Down Expand Up @@ -167,27 +186,23 @@ struct RC_TData { //storing RC data for a transcript
//only used with -B (full Ballgown data)
GffObj* ref_t;
uint t_id;
//GStr t_name; //original GFF ID for the transcript
int l;
int r;
//char strand;
//int num_exons;
char in_bundle; // 1 if used by read bundles, 2 if it has any direct read overlap
GRefLocus* locus; //pointer to a locus info
int eff_len;
double cov;
double fpkm;
//char strand;
GPVec<RC_Feature> t_exons;
GPVec<RC_Feature> t_introns;
//int e_idx_cache;
//int i_idx_cache;
void rc_addFeatures(uint& c_e_id, GList<RC_Feature>& fexons, GPVec<RC_Feature>& edata,
uint& c_i_id, GList<RC_Feature>& fintrons, GPVec<RC_Feature>& idata);
void addFeature(int fl, int fr, GPVec<RC_Feature>& fvec, uint& f_id,
GList<RC_Feature>& fset, GPVec<RC_Feature>& fdata,
int& cache_idx);
RC_TData(GffObj& s, uint id=0):ref_t(&s), t_id(id), l(s.start), r(s.end), //t_name(s.getID()),
//num_exons(s.exons.Count()),
eff_len(s.covlen), cov(0), fpkm(0), //strand(s.strand),
RC_TData(GffObj& s, uint id=0):ref_t(&s), t_id(id), l(s.start), r(s.end),
in_bundle(0), locus(NULL), eff_len(s.covlen), cov(0), fpkm(0), //strand(s.strand),
t_exons(false), t_introns(false) { //, e_idx_cache(-1), i_idx_cache(-1) {
}

Expand All @@ -196,7 +211,6 @@ struct RC_TData { //storing RC data for a transcript
if (r != o.r) return (r < o.r);
if (char c=(ref_t->strand - o.ref_t->strand)) return (c<0);
return (strcmp(ref_t->getID(), o.ref_t->getID())<0);
//return false;
}
bool operator==(const RC_TData& o) const {
if (t_id!=0 && o.t_id!=0) return (t_id==o.t_id);
Expand Down Expand Up @@ -236,8 +250,8 @@ struct RC_BundleData {
int rmax;
GPVec<RC_TData> g_tdata; //raw counting data for all transcripts in this bundle
// RC_TData::t_id-1 = the element index in this array
GList<RC_Feature> g_exons; //set of guide exons in this bundle, sorted by their start coordinate
GList<RC_Feature> g_introns; //set of guide introns in this bundle, sorted by their start coordinate
GList<RC_Feature> g_exons; //set of guide exons in this bundle, sorted by start coordinate
GList<RC_Feature> g_introns; //set of guide introns in this bundle, sorted by start coordinate
//RC_FeatIt xcache; //cache the first exon overlapping xcache_pos to speed up exon-overlap queries (findOvlExons())
int xcache; //exons index of the first exon overlapping xcache_pos
int xcache_pos; // left coordinate of last cached exon overlap query (findOvlExons())
Expand All @@ -250,10 +264,11 @@ struct RC_BundleData {

//-- when no global Ballgown data is to be generated, these are
// local (bundle) stable order tables of guide features
GPVec<RC_TData>* guides_RC_tdata; //just a pointer to the current RC tdata table
GPVec<RC_TData>* bundle_RC_tdata; //pointer to the global RC tdata table
// RC_Feature::id-1 = the index in these arrays:
GPVec<RC_Feature>* guides_RC_exons;
GPVec<RC_Feature>* guides_RC_introns;
GPVec<RC_Feature>* bundle_RC_exons; //pointers to global (if ballgown)
GPVec<RC_Feature>* bundle_RC_introns;// OR local exon/intron RC data
//local exon/intron ids within the bundle
uint c_exon_id;
uint c_intron_id;
//--
Expand All @@ -263,16 +278,18 @@ struct RC_BundleData {
// features:(sorted, free, unique)
g_exons(true, false, true), g_introns(true, false, true),
xcache(0), xcache_pos(0),
guides_RC_tdata(rc_tdata), guides_RC_exons(rc_exons), guides_RC_introns(rc_introns),
bundle_RC_tdata(rc_tdata), bundle_RC_exons(rc_exons), bundle_RC_introns(rc_introns),
c_exon_id(0), c_intron_id(0)
{
if (ballgown) {
if (rmax>lmin) updateCovSpan();
}else {
g_tdata.setFreeItem(true);
guides_RC_tdata = &g_tdata;
guides_RC_exons = new GPVec<RC_Feature>(true);
guides_RC_introns= new GPVec<RC_Feature>(true);
//g_tdata.setFreeItem(true);
//guides_RC_tdata = &g_tdata;
//-- override the passed rc_exons/rc_introns if not ballgown
//-- because these are now locally maintained so they'll be deallocated with the bundle
bundle_RC_exons = new GPVec<RC_Feature>(true);
bundle_RC_introns= new GPVec<RC_Feature>(true);
}
}

Expand All @@ -282,72 +299,46 @@ struct RC_BundleData {
r_cov.clear();
r_mcov.clear();
if (!ballgown) {
delete guides_RC_exons;
delete guides_RC_introns;
delete bundle_RC_exons;
delete bundle_RC_introns;
}
}

//for non-ballgown tracking of guide features
/*
void addTranscriptFeature(uint fstart, uint fend, char fstrand, GList<RC_Feature>& flist, GffObj& t) {
uint tidx=(uint)t.udata; //in non-Ballgown case, this is the index in the BundleData::keepguides array + 1
RC_Feature* feat=new RC_Feature(fstart, fend, fstrand, 0, tidx);
int fidx=-1;
RC_Feature* p=flist.AddIfNew(feat, true, &fidx);
if (p==feat) {//exon never seen before
//TODO: use some stable e_id index here and assign it to t.exons[eidx]->uptr->e_id (?) for quick retrieval
} else {//exon seen before, just update t_id list
p->t_ids.Add(tidx);
}
}
*/
uint addTranscript(GffObj& t) { //shouuld return the guide index in *guides_RC_tdata
//if (!ps.rc_id_data()) return;
//RC_ScaffIds& sdata = *(ps.rc_id_data());
bool boundary_changed=false;
if (lmin==0 || lmin>(int)t.start) { lmin=t.start; boundary_changed=true; }
if (rmax==0 || rmax<(int)t.end) { rmax=t.end; boundary_changed=true; }
RC_TData* tdata=NULL;
uint addTranscript(GffObj& t) { //should return the guide index in *guides_RC_tdata
bool boundary_changed=false;
if (lmin==0 || lmin>(int)t.start) { lmin=t.start; boundary_changed=true; }
if (rmax==0 || rmax<(int)t.end) { rmax=t.end; boundary_changed=true; }
GASSERT(t.uptr); //we should always have a RC_TData for each guide
RC_TData* tdata=(RC_TData*)(t.uptr);
/*RC_TData* tdata=NULL;
if (ballgown) {
tdata=(RC_TData*)(t.uptr);
tdata=(RC_TData*)(t.uptr);
}
else {
//int c_tid=g_tdata.Count();
//add RC transcript data locally for the bundle
tdata=new RC_TData(t, g_tdata.Count()+1);
t.uptr=tdata;
//guides_RC_Data.Add(tdata);
tdata->rc_addFeatures(c_exon_id, g_exons, *guides_RC_exons,
c_intron_id, g_introns, *guides_RC_introns);
}
//if (ballgown) { //ballgown case
//RC_TData& sdata=*(RC_TData*)(t.uptr);
g_tdata.Add(tdata);
if (ballgown) {
if (boundary_changed) updateCovSpan();
//need to add exons and introns because rc_addFeatures()
// was NOT using g_exons and g_introns as unique sets if ballgown
for (int i=0;i<tdata->t_exons.Count();i++) {
g_exons.Add(tdata->t_exons[i]);
}
for (int i=0;i<tdata->t_introns.Count();i++) {
g_introns.Add(tdata->t_introns[i]);
}
//add RC transcript data locally for the bundle
tdata=new RC_TData(t, g_tdata.Count()+1);
t.uptr=tdata;
//guides_RC_Data.Add(tdata);
tdata->rc_addFeatures(c_exon_id, g_exons, *guides_RC_exons,
c_intron_id, g_introns, *guides_RC_introns);
}*/
g_tdata.Add(tdata);
if (ballgown) {
if (boundary_changed) updateCovSpan();
//rc_addFeatures() called before, but we still need to add exons
// and introns to the local sets: g_exons, g_introns
for (int i=0;i<tdata->t_exons.Count();i++) {
g_exons.Add(tdata->t_exons[i]);
}
//} //if ballgown
/*
else { //non-ballgown, regular storage of bundle guides data
//use GffObj::udata as tid, it's the index in bundledata::keepguides
for (int i=0;i<t.exons.Count();++i) {
addTranscriptFeature(t.exons[i]->start, t.exons[i]->end, t.strand, g_exons, t);
if (i>0) {
//add intron
addTranscriptFeature(t.exons[i-1]->end+1, t.exons[i]->start-1, t.strand, g_introns, t);
}
for (int i=0;i<tdata->t_introns.Count();i++) {
g_introns.Add(tdata->t_introns[i]);
}
} //no ballgown coverage
*/
return tdata->t_id;
}
else {
tdata->rc_addFeatures(c_exon_id, g_exons, *bundle_RC_exons,
c_intron_id, g_introns, *bundle_RC_introns);
}
return tdata->t_id;
}

void updateCovSpan() {
Expand Down

0 comments on commit cbd9980

Please sign in to comment.