-
Notifications
You must be signed in to change notification settings - Fork 1
/
SSRContigLists.cpp
200 lines (179 loc) · 6.88 KB
/
SSRContigLists.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
/*******************************************************************************
+
+ SSRContigLists.cpp
+
+ Copyright (c) 2002 Genoscope, CEA, CNS, Evry, France
+ Author : Jean-Marc Aury, [email protected]
+
*******************************************************************************/
#include "SSRContigLists.h"
using namespace std;
// class constructor
SSRContigLists::SSRContigLists(GffRecordList listRecord, map<string, string>& sequences, map<string, s32>& badintrons) {
string seqname, tagPrevious, exonIdPrevious;
s32 start, end, tmpEnd;
map<string,SSRContig*> seenCovtigs;
f8 coverage;
s32 startPrevious = 0, endPrevious = 0;
Tstrand strand;
map<s32,TSSRList*> mapTranscrit;
s32 removed = 0;
GffRecordL* allRecords = listRecord.getRecords();
for(GffRecordL::iterator itRecord = allRecords->begin() ; itRecord != allRecords->end(); ++itRecord) {
GffRecord* record = *itRecord;
seqname = record->getSeqName();
string currentId = record->getAttribute();
if(seqname.empty()) { break; }
start = record->getStart(); //+ 1;
end = record->getEnd();
coverage = 0;
if(record->getStrand() == "+") strand = SSRContig::FORWARD;
else if(record->getStrand() == "-") strand = SSRContig::REVERSE;
else strand = SSRContig::UNKNOWN;
map<string, string>::iterator itSeq = sequences.find(seqname);
if (itSeq == sequences.end()) {
cerr << "[SSRContigList] Can't find sequence " << seqname << " in map structure." << endl;
exit(2);
}
string* seq = &itSeq->second;
SSRContig* ctg = new SSRContig(seqname, start, end, coverage, seq, strand);
s32 color = record->getColor();
ctg->setIdTranscrit(color);
ostringstream oss;
oss << seqname << "@"<< start << "@"<<end << "@"<<strand;
string key = oss.str();
map<string,SSRContig*>::iterator itSeenCovtigs = seenCovtigs.find(key);
if( itSeenCovtigs == seenCovtigs.end()){// The covtig doesnt exist
if(end-start+1 > SSRContig::MINSIZEEXON) {
TcontigLists::iterator itContigs = _contigs.find(seqname);
if (itContigs == _contigs.end()) {
SSRContigList* liste = new SSRContigList();
itContigs = _contigs.insert(make_pair(seqname, liste)).first;
}
(itContigs->second)->push_back(ctg); // it->second = TSSRList _contigs
map<s32,list<string> >::iterator itTranscrit = _transcrit.find(color);
if(itTranscrit != _transcrit.end()){
itTranscrit->second.push_back(key);
}
else{
list<string> tmpList;
tmpList.push_back(key);
_transcrit.insert(make_pair(color,tmpList));
}
seenCovtigs.insert(make_pair(key,((itContigs->second)->back())));
}
else delete(ctg);
}
else {
(itSeenCovtigs->second)->setIdTranscrit(color);
map<s32,list<string> >::iterator itTranscrit = _transcrit.find(color);
if(itTranscrit != _transcrit.end()){
itTranscrit->second.push_back(key);
}
else{
list<string> tmpList;
tmpList.push_back(key);
_transcrit.insert(make_pair(color,tmpList));
}
delete(ctg);
}
/*
* Load Junctions
*/
map<string, s32>::iterator itJS; // Known Junctions
tmpEnd = end;
if(record->getAttribute() == exonIdPrevious){
if(strand == SSRContig::UNKNOWN) {
removed += addJunction(seqname, startPrevious, endPrevious, start, tmpEnd, 1, record->getDatatype(), badintrons);
removed += addJunction(seqname, startPrevious, endPrevious, start, tmpEnd, -1, record->getDatatype(), badintrons);
} else {
removed += addJunction(seqname, startPrevious, endPrevious, start, tmpEnd, (s32)strand, record->getDatatype(), badintrons);
}
}
//update previous value before relooping
startPrevious = start;
endPrevious = tmpEnd;
exonIdPrevious = record->getAttribute();
}
cout << removed << " introns from GFF files were removed thanks to the exclude_introns file." << endl << endl;
for( TcontigLists::iterator it=_contigs.begin(); it != _contigs.end(); it++ ) (it->second)->startposSort();
}
s32 SSRContigLists::addJunction(string seqname, s32 start1, s32 end1, s32 start2,
s32 end2, s32 strand, s8 datatype, map<string, s32>& badintrons) {
ostringstream ossintron, oss;
ossintron << seqname << "@" << (end1 + 1) << "@" << (start2 - 1) << "@" << strand;
string keyI = ossintron.str();
map<string, s32>::iterator it = badintrons.find(keyI);
if (it == badintrons.end()) {
oss << seqname << "@" << start1 << "@" << end1 << "@" << start2 << "@" << end2 << "@" << strand;
string key = oss.str();
it = _kwJunctions.find(key);
if (it == _kwJunctions.end()) _kwJunctions.insert(make_pair(key, datatype));
else it->second += 1;
return 0;
} else return 1;
}
// to test junctions using the dictionary and the junctions file given as input
list<NetEx*> SSRContigLists::buildGraph() {
list<NetEx*> NetList;
for( TcontigLists::iterator it =_contigs.begin(); it != _contigs.end(); it++ ){//loop on transcrits by scaffold
NetList.push_back( (it->second)->buildGraph( _kwJunctions) );
}
return NetList;
}
std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
std::stringstream ss(s);
std::string item;
while (std::getline(ss, item, delim)) {
elems.push_back(item);
}
return elems;
}
std::vector<std::string> split(const std::string &s, char delim) {
std::vector<std::string> elems;
split(s, delim, elems);
return elems;
}
void SSRContigLists::cleanJunctions(){
//Delete some jonctions based on length and coverage
map<string,pair<s32,list<string> > >::iterator itJ;
string key;
map<string,pair<s32,list<string> > > newJunctions;
for(map<string,s32>::iterator itKJ = _kwJunctions.begin(); itKJ != _kwJunctions.end(); ++itKJ){
vector<string> blop = split(itKJ->first,'@');
s32 startJunction = atoi(blop[2].c_str());
s32 endJunction = atoi(blop[3].c_str());
s32 sizeJunction = endJunction-startJunction+1;
if(sizeJunction > 5000 ){
key = blop[2];
key += "@";
key += blop[3];
itJ = newJunctions.find(key);
if(itJ == newJunctions.end()){
list<string> tmpList;
tmpList.push_back(itKJ->first);
pair<s32,list<string> > tmpP = make_pair(itKJ->second,tmpList);
pair<string,pair<s32,list<string> > > tmpPP = make_pair(key,tmpP);
newJunctions.insert(tmpPP);
}
else{
itJ->second.first += itKJ->second;
itJ->second.second.push_back(itKJ->first);
}
}
}
for(map<string,pair<s32,list<string> > >::iterator itNJ =newJunctions.begin(); itNJ != newJunctions.end(); ++itNJ){
vector<string> blop = split(itNJ->first,'@');
if(itNJ->second.first < 2 ){
for(list<string>::iterator itList = itNJ->second.second.begin(); itList != itNJ->second.second.end(); ++itList){
_kwJunctions.erase(*itList);
}
}
}
}
// to print the covtigs
ostream& operator<<(ostream& ostr, const SSRContigLists& d) {
TcontigLists::const_iterator it;
for( it=(d._contigs).begin(); it != (d._contigs).end(); it++ ) ostr << *(it->second);
return ostr;
}