-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathevo_PBS.cpp
419 lines (373 loc) · 24.8 KB
/
evo_PBS.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
//
// evo_PBS.cpp
// process_vcf
//
// Created by Milan Malinsky on 03/01/2019.
// Copyright © 2019 Milan Malinsky. All rights reserved.
//
#include "evo_PBS.h"
#include "process_vcf_annotation_tools.h"
#include <deque>
#define SUBPROGRAM "PBS"
#define DEBUG 1
static const char *PBS_USAGE_MESSAGE =
"Usage: " PROGRAM_BIN " " SUBPROGRAM " [OPTIONS] INPUT_FILE.vcf POPULATIONS.txt PBS_trios.txt\n"
"Calculate the PBS statistic from:\n"
"Sequencing of 50 human exomes reveals adaptation to high altitude. Science 329, 75–78 (2010).\n"
"The POPULATIONS.txt file should have two columns: SAMPLE_ID POPULATION_ID\n"
"The PBS_trios.txt should contain names of three populations for which the PBS will be calculated:\n"
"POP1 POP2 POP3\n"
"There can be multiple lines and then the program generates multiple ouput files, named like POP1_POP2_POP3_PBS_SIZE_STEP.txt\n"
"\n"
" -h, --help display this help and exit\n"
" -f, --fixedW sizeKb fixed window size (default: 10kb)\n"
" -w SIZE,STEP --window=SIZE,STEP the parameters of the sliding window: contains SIZE SNPs and move by STEP (default: 20,10)\n"
" --af (optional) output a file with allele frequencies per-population\n"
" --annot=ANNOTATION.gffExtract (optional)gene annotation in the same format as for the 'getCodingSeq' subprogram\n"
" outputs PBS per gene (only exons, with introns, and with 3kb upstream)\n"
" -i, --allow-indels-and-multiallelics (optional) also calculate the PBS score for indels, and multiallelics\n"
" for multiallelics, the first alternate allele is considered\n"
" sites where the ALT allele is simply '*' are ignored\n"
" -r , --region=start,length (optional) only process a subset of the VCF file\n"
" -n, --run-name run-name will be included in the output file name\n"
"\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
static const char* shortopts = "hw:n:f:i";
enum { OPT_ANNOT, OPT_AF };
static const struct option longopts[] = {
{ "fixedW", required_argument, NULL, 'f' },
{ "window", required_argument, NULL, 'w' },
{ "annot", required_argument, NULL, OPT_ANNOT },
{ "af", required_argument, NULL, OPT_AF },
{ "help", no_argument, NULL, 'h' },
{ "run-name", required_argument, NULL, 'n' },
{ "allow-indels-and-multiallelics", no_argument, NULL, 'i' },
{ NULL, 0, NULL, 0 }
};
namespace opt
{
static string vcfFile;
static string setsFile;
static string PBStriosFile;
static string annotFile;
static string runName = "";
static int fixedWindowSize = 10000;
static int windowSize = 20;
static int windowStep = 10;
static bool af = false;
static bool allowIndels = false;
}
inline std::vector<double> calculatePBSfromAFs(double p1, double p2, double p3, double p1AlleleCount, double p2AlleleCount, double p3AlleleCount) {
// std::cerr << "p:\t" << p1 << "\t" << p2 << "\t" << p3 << std::endl;
double Fst12; double Fst13; double Fst23;
double power12 = pow(p1-p2, 2);
double power13 = pow(p1-p3, 2);
double power23 = pow(p2-p3, 2);
double fraction1 = (p1*(1-p1))/(p1AlleleCount-1);
double fraction2 = (p2*(1-p2))/(p2AlleleCount-1);
double fraction3 = (p3*(1-p3))/(p3AlleleCount-1);
double numerator12 = power12 - fraction1 - fraction2;
double numerator13 = power13 - fraction1 - fraction3;
double numerator23 = power23 - fraction2 - fraction3;
double denominator12 = (p1*(1-p2))+(p2*(1-p1));
double denominator13 = (p1*(1-p3))+(p3*(1-p1));
double denominator23 = (p2*(1-p3))+(p3*(1-p2));
if ((p1 == 0 && p2 == 0) || (p1 == 1 && p2 == 1)) { Fst12 = 0.0; } else { Fst12 = numerator12/denominator12; }
if ((p1 == 0 && p3 == 0) || (p1 == 1 && p3 == 1)) { Fst13 = 0.0; } else { Fst13 = numerator13/denominator13; }
if ((p2 == 0 && p3 == 0) || (p2 == 1 && p3 == 1)) { Fst23 = 0.0; } else { Fst23 = numerator23/denominator23; }
// std::cerr << Fst12 << "\t" << Fst13 << "\t" << Fst23 << std::endl;
if (Fst12 < 0) Fst12 = 0; if (Fst13 < 0) Fst13 = 0; if (Fst23 < 0) Fst23 = 0;
if (Fst12 == 1) Fst12 = 1 - (Fst12/p1AlleleCount); if (Fst13 == 1) Fst13 = 1 - (Fst13/p1AlleleCount); if (Fst23 == 1) Fst23 = 1 - (Fst23/p2AlleleCount);
double T12 = -log(1-Fst12); double T13 = -log(1-Fst13); double T23 = -log(1-Fst23);
double PBS1 = (T12+T13-T23)/2.0;
double PBS2 = (T12+T23-T13)/2.0;
double PBS3 = (T13+T23-T12)/2.0;
if (PBS1 < 0) PBS1 = 0; if (PBS2 < 0) PBS2 = 0; if (PBS3 < 0) PBS3 = 0;
//std::cerr << PBS1 << "\t" << PBS2 << "\t" << PBS3 << std::endl;
std::vector<double> PBS; PBS.push_back(PBS1); PBS.push_back(PBS2); PBS.push_back(PBS3);
return PBS;
}
inline std::vector<double> calculatePBSnumerators(const GeneralSetCounts* c,std::vector<string>& PBStrio) {
double power12 = pow((c->setAAFs.at(PBStrio[0])-c->setAAFs.at(PBStrio[1])), 2);
double power13 = pow((c->setAAFs.at(PBStrio[0])-c->setAAFs.at(PBStrio[2])), 2);
double power23 = pow((c->setAAFs.at(PBStrio[1])-c->setAAFs.at(PBStrio[2])), 2);
double fraction1 = (c->setAAFs.at(PBStrio[0])*(1-c->setAAFs.at(PBStrio[0])))/(c->setAlleleCounts.at(PBStrio[0])-1);
double fraction2 = (c->setAAFs.at(PBStrio[1])*(1-c->setAAFs.at(PBStrio[1])))/(c->setAlleleCounts.at(PBStrio[1])-1);
double fraction3 = (c->setAAFs.at(PBStrio[2])*(1-c->setAAFs.at(PBStrio[2])))/(c->setAlleleCounts.at(PBStrio[2])-1);
double numerator12 = power12 - fraction1 - fraction2;
double numerator13 = power13 - fraction1 - fraction3;
double numerator23 = power23 - fraction2 - fraction3;
std::vector<double> PBSnumerators; PBSnumerators.push_back(numerator12);
PBSnumerators.push_back(numerator13); PBSnumerators.push_back(numerator23);
return PBSnumerators;
}
inline std::vector<double> calculatePBSdenominator(const GeneralSetCounts* c,std::vector<string>& PBStrio) {
double denominator12 = (c->setAAFs.at(PBStrio[0])*(1-c->setAAFs.at(PBStrio[1])))+(c->setAAFs.at(PBStrio[1])*(1-c->setAAFs.at(PBStrio[0])));
double denominator13 = (c->setAAFs.at(PBStrio[0])*(1-c->setAAFs.at(PBStrio[2])))+(c->setAAFs.at(PBStrio[2])*(1-c->setAAFs.at(PBStrio[0])));
double denominator23 = (c->setAAFs.at(PBStrio[1])*(1-c->setAAFs.at(PBStrio[2])))+(c->setAAFs.at(PBStrio[2])*(1-c->setAAFs.at(PBStrio[1])));
std::vector<double> PBSdenominators; PBSdenominators.push_back(denominator12);
PBSdenominators.push_back(denominator13); PBSdenominators.push_back(denominator23);
return PBSdenominators;
}
int PBSmain(int argc, char** argv) {
parsePBSoptions(argc, argv);
string line; // for reading the input files
// Load up the annotation file
Annotation wgAnnotation; std::ifstream* annotFile;
if (!opt::annotFile.empty()) {
annotFile = new std::ifstream(opt::annotFile.c_str());
Annotation Annot(annotFile, false); // Does not use transcripts annotated as 5' or 3' partial
wgAnnotation = Annot;
}
std::istream* vcfFile = createReader(opt::vcfFile.c_str());
std::ifstream* setsFile = new std::ifstream(opt::setsFile.c_str());
std::ifstream* PBStriosFile = new std::ifstream(opt::PBStriosFile.c_str());
std::map<string, std::vector<string>> popToIDsMap;
std::map<string, string> IDsToPopMap;
std::map<string, std::vector<size_t>> popToPosMap;
std::map<size_t, string> posToPopMap;
// Get the sample sets
while (getline(*setsFile, line)) {
// std::cerr << line << std::endl;
std::vector<string> ID_Pop = split(line, '\t');
popToIDsMap[ID_Pop[1]].push_back(ID_Pop[0]);
IDsToPopMap[ID_Pop[0]] = ID_Pop[1];
//std::cerr << ID_Species[1] << "\t" << ID_Species[0] << std::endl;
}
// Get a vector of set names (usually populations/species)
std::vector<string> populations;
for(std::map<string,std::vector<string>>::iterator it = popToIDsMap.begin(); it != popToIDsMap.end(); ++it) {
populations.push_back(it->first);
// std::cerr << it->first << std::endl;
} std::cerr << "There are " << populations.size() << " populations " << std::endl;
// Get the PBS trios
std::vector<std::ofstream*> outFiles;
std::vector<std::ofstream*> outFilesFixedWindow;
std::vector<std::ofstream*> outFilesGenes;
std::vector<std::vector<string> > PBStrios;
while (getline(*PBStriosFile,line)) {
// std::cerr << line << std::endl;
std::vector<string> threePops = split(line, '\t'); assert(threePops.size() == 3);
std::ofstream* outFile = new std::ofstream(threePops[0] + "_" + threePops[1] + "_" + threePops[2]+ "_PBS_" + opt::runName + "_" + numToString(opt::windowSize) + "_" + numToString(opt::windowStep) + ".txt");
std::ofstream* outFileFixedWindow = new std::ofstream(threePops[0] + "_" + threePops[1] + "_" + threePops[2]+ "_PBS_" + opt::runName + "_FW" + numToString(opt::fixedWindowSize) + ".txt");
*outFile << "chr\twStart\twEnd\t" << threePops[0] << "\t" << threePops[1] << "\t" << threePops[2] << std::endl;
*outFileFixedWindow << "chr\twStart\twEnd\t" << threePops[0] << "\t" << threePops[1] << "\t" << threePops[2] << "\t" << "nFwSNPs1" << "\t" << "nFwSNPs2" << "\t" << "nFwSNPs3" << std::endl;
//outFile->setf(std::ios_base::fixed); // Avoid scientific notation in the coordinates
outFiles.push_back(outFile); outFilesFixedWindow.push_back(outFileFixedWindow);
if (!opt::annotFile.empty()) {
std::ofstream* outFileGenes = new std::ofstream(threePops[0] + "_" + threePops[1] + "_" + threePops[2]+ "_PBSGenes_" + opt::runName + "_" + numToString(opt::windowSize) + "_" + numToString(opt::windowStep) + ".txt");
// *outFileGenes << "gene\t" << "numSNPsExons\t" << "numSNPsWithIntrons\t" << "numSNPsWith3kbUpstr\t" << threePops[0] << "_exons\t" << threePops[1] << "_exons\t" << threePops[2] << "_exons\t" << threePops[0] << "_wIntrons\t" << threePops[1] << "_wIntrons\t" << threePops[2] << "_wIntrons\t" << threePops[0] << "_w3kbUpstr\t" << threePops[1] << "_w3kbUpstr\t" << threePops[2] << "_w3kbUpstr" << std::endl;
*outFileGenes << "gene\t" << "numSNPsExons\tnumSNPsIntrons\tnumSNPs3kbPromoter\t" << threePops[0] << "_exons\t" << threePops[1] << "_exons\t" << threePops[2] << "_exons\t" << threePops[0] << "_wIntrons\t" << threePops[1] << "_wIntrons\t" << threePops[2] << "_wIntrons\t" << threePops[0] << "_promoter\t" << threePops[1] << "_promoter\t" << threePops[2] << "_promoter" << std::endl;
outFilesGenes.push_back(outFileGenes);
}
PBStrios.push_back(threePops);
}
// And need to prepare the vectors to hold the PBS values and the coordinates:
std::deque<double> initDeq(opt::windowSize,0.0); // deque to initialise per-site PBS values
std::vector<std::deque<double>> initThreeDeques(4,initDeq); // vector of three per-site PBS deques - for each population in the trio, and the fourth is for the coordinates
std::vector<std::vector<std::deque<double>>> PBSresults(PBStrios.size(),initThreeDeques);
// Fixed window size vectors:
std::vector<std::vector<double>> initVectorFixed(3);
std::vector<std::vector<std::vector<double>>> PBSfixedWindowResults(PBStrios.size(),initVectorFixed);
int currentWindowStart = 0; int currentWindowEnd = currentWindowStart + opt::fixedWindowSize;
// if (!opt::annotFile.empty()) {
std::vector<std::vector<double>> initGeneVectors(9); // For the nine PBS columns in the _PBSGenes_ files
std::vector<std::vector<std::vector<double>>> PBSgeneResults(PBStrios.size(),initGeneVectors);
std::string currentGene = ""; std::string previousGene = "";
//}
//std::deque<std::vector<double>> regionPBSnums; regionPBSnums.assign(opt::windowSize,init);
//std::deque<std::vector<double>> regionPBSdenoms; regionPBSdenoms.assign(opt::windowSize,init);
//std::vector<double> allPs(populations.size(),0.0);
int totalVariantNumber = 0;
std::vector<int> usedVars(PBStrios.size(),0); // Will count the number of used variants for each trio
std::vector<string> sampleNames; std::vector<std::string> fields;
int reportProgressEvery = 1000; string chr; string coord; double coordDouble;
std::clock_t start; std::clock_t startGettingCounts; std::clock_t startCalculation;
double durationOverall; double durationGettingCounts; double durationCalculation;
while (getline(*vcfFile, line)) {
line.erase(std::remove(line.begin(), line.end(), '\r'), line.end()); // Deal with any left over \r from files prepared on Windows
if (line[0] == '#' && line[1] == '#')
continue;
else if (line[0] == '#' && line[1] == 'C') {
fields = split(line, '\t');
std::vector<std::string> sampleNames(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());
// print_vector_stream(sampleNames, std::cerr);
for (std::vector<std::string>::size_type i = 0; i != sampleNames.size(); i++) {
posToPopMap[i] = IDsToPopMap[sampleNames[i]];
}
// Iterate over all the keys in the map to find the samples in the VCF:
// Give an error if no sample is found for a species:
for(std::map<string, std::vector<string>>::iterator it = popToIDsMap.begin(); it != popToIDsMap.end(); ++it) {
string sp = it->first;
//std::cerr << "sp " << sp << std::endl;
std::vector<string> IDs = it->second;
std::vector<size_t> spPos = locateSet(sampleNames, IDs);
if (spPos.empty()) {
std::cerr << "Did not find any samples in the VCF for \"" << sp << "\"" << std::endl;
assert(!spPos.empty());
}
popToPosMap[sp] = spPos;
}
start = std::clock();
// std::cerr << " " << std::endl;
// std::cerr << "Outgroup at pos: "; print_vector_stream(speciesToPosMap["Outgroup"], std::cerr);
// std::cerr << "telvit at pos: "; print_vector_stream(speciesToPosMap["telvit"], std::cerr);
} else {
totalVariantNumber++;
if (totalVariantNumber % reportProgressEvery == 0) {
durationOverall = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
std::cerr << "Processed " << totalVariantNumber << " variants in " << durationOverall << "secs" << std::endl;
std::cerr << "GettingCounts " << durationGettingCounts << " calculation " << durationCalculation << "secs" << std::endl;
}
fields = split(line, '\t'); chr = fields[0]; coord = fields[1]; coordDouble = stringToDouble(coord);
std::vector<std::string> genotypes(fields.begin()+NUM_NON_GENOTYPE_COLUMNS,fields.end());
// Only consider biallelic SNPs
string refAllele = fields[3]; string altAllele = fields[4]; bool ignoreSite = false;
if (altAllele == "*") ignoreSite = true;
if (!opt::allowIndels) {
if (refAllele.length() > 1 || altAllele.length() > 1) ignoreSite = true;
}
if (ignoreSite) {
refAllele.clear(); refAllele.shrink_to_fit(); altAllele.clear(); altAllele.shrink_to_fit();
genotypes.clear(); genotypes.shrink_to_fit(); continue;
}
startGettingCounts = std::clock();
GeneralSetCounts* c = new GeneralSetCounts(popToPosMap, (int)genotypes.size());
c->getSetVariantCountsSimple(genotypes, posToPopMap);
genotypes.clear(); genotypes.shrink_to_fit();
durationGettingCounts = ( std::clock() - startGettingCounts ) / (double) CLOCKS_PER_SEC;
// std::cerr << "Here:" << totalVariantNumber << std::endl;
if (opt::af) {
std::ofstream* outFileAF = new std::ofstream(stripExtension(opt::setsFile) + "_AF" + ".txt");
*outFileAF << chr << "\t" << coord << "\t" << refAllele << "\t" << altAllele;
for(std::map<string,double>::iterator iter = c->setAAFs.begin(); iter != c->setAAFs.end(); ++iter) {
*outFileAF << "\t" << iter->second;
}
*outFileAF << "\n";
}
startCalculation = std::clock();
// for (std::vector<std::string>::size_type i = 0; i != populations.size(); i++) {
// allPs[i] = c->setAAFs.at(populations[i]);
// }
// durationFirstLoop = ( std::clock() - startCalculation ) / (double) CLOCKS_PER_SEC;
// find if we are in a gene:
std::vector<string> SNPgeneDetails = wgAnnotation.getSNPgeneDetails(chr, atoi(coord.c_str()));
if (SNPgeneDetails[0] != "") {
currentGene = SNPgeneDetails[0];
if (previousGene == "") previousGene = currentGene;
}
// Check if we are still in the same physical window...
if (coordDouble > currentWindowEnd || coordDouble < currentWindowStart) {
for (int i = 0; i != PBStrios.size(); i++) {
int nFwSNPs1 = (int)PBSfixedWindowResults[i][0].size(); int nFwSNPs2 = (int)PBSfixedWindowResults[i][1].size(); int nFwSNPs3 = (int)PBSfixedWindowResults[i][2].size();
double PBSfw1 = 0; if (nFwSNPs1 > 0) { PBSfw1 = vector_average(PBSfixedWindowResults[i][0]); }
double PBSfw2 = 0; if (nFwSNPs2 > 0) { PBSfw2 = vector_average(PBSfixedWindowResults[i][1]); }
double PBSfw3 = 0; if (nFwSNPs3 > 0) { PBSfw3 = vector_average(PBSfixedWindowResults[i][2]); }
*outFilesFixedWindow[i] << chr << "\t" << currentWindowStart << "\t" << currentWindowEnd << "\t" << PBSfw1 << "\t" << PBSfw2 << "\t" << PBSfw3 << "\t" << nFwSNPs1 << "\t" << nFwSNPs2 << "\t" << nFwSNPs3 << std::endl;
PBSfixedWindowResults[i][0].clear(); PBSfixedWindowResults[i][1].clear(); PBSfixedWindowResults[i][2].clear();
}
if (coordDouble > currentWindowEnd) {
currentWindowStart = currentWindowStart + opt::fixedWindowSize; currentWindowEnd = currentWindowEnd + opt::fixedWindowSize;
} else if (coordDouble < currentWindowStart) {
currentWindowStart = 0; currentWindowEnd = 0 + opt::fixedWindowSize;
}
}
// std::cerr << coord << "\t";
// print_vector_stream(SNPgeneDetails, std::cerr);
// Now calculate the PBS stats:
double p1; double p2; double p3;
for (int i = 0; i != PBStrios.size(); i++) {
p1 = c->setAAFs.at(PBStrios[i][0]); //assert(p_S1 == pS1test);
if (p1 == -1) continue; // If any member of the trio has entirely missing data, just move on to the next trio
p2 = c->setAAFs.at(PBStrios[i][1]); //assert(p_S2 == pS2test);
if (p2 == -1) continue;
p3 = c->setAAFs.at(PBStrios[i][2]); // assert(p_S3 == pS3test);
if (p3 == -1) continue;
if (p1 == 0 && p2 == 0 && p3 == 0) { continue; }
if (p1 == 1 && p2 == 1 && p3 == 1) { continue; }
usedVars[i]++;
std::vector<double> thisSNP_PBS = calculatePBSfromAFs(p1,p2,p3,
c->setAlleleCounts.at(PBStrios[i][0]),
c->setAlleleCounts.at(PBStrios[i][1]),
c->setAlleleCounts.at(PBStrios[i][2]));
PBSresults[i][0].push_back(thisSNP_PBS[0]); PBSresults[i][1].push_back(thisSNP_PBS[1]); PBSresults[i][2].push_back(thisSNP_PBS[2]);
PBSresults[i][3].push_back(stringToDouble(coord));
PBSresults[i][0].pop_front(); PBSresults[i][1].pop_front(); PBSresults[i][2].pop_front(); PBSresults[i][3].pop_front();
PBSfixedWindowResults[i][0].push_back(thisSNP_PBS[0]); PBSfixedWindowResults[i][1].push_back(thisSNP_PBS[1]); PBSfixedWindowResults[i][2].push_back(thisSNP_PBS[2]);
if (!opt::annotFile.empty()) { if (SNPgeneDetails[0] != "") {
if (SNPgeneDetails[1] == "exon") {
PBSgeneResults[i][0].push_back(thisSNP_PBS[0]); PBSgeneResults[i][1].push_back(thisSNP_PBS[1]); PBSgeneResults[i][2].push_back(thisSNP_PBS[2]);
} else if (SNPgeneDetails[1] == "intron") {
PBSgeneResults[i][3].push_back(thisSNP_PBS[0]); PBSgeneResults[i][4].push_back(thisSNP_PBS[1]); PBSgeneResults[i][5].push_back(thisSNP_PBS[2]);
} else if (SNPgeneDetails[1] == "promoter") {
PBSgeneResults[i][6].push_back(thisSNP_PBS[0]); PBSgeneResults[i][7].push_back(thisSNP_PBS[1]); PBSgeneResults[i][8].push_back(thisSNP_PBS[2]);
}
}}
if (usedVars[i] > opt::windowSize && (usedVars[i] % opt::windowStep == 0)) {
// std::cerr << PBSresults[i][0][0] << std::endl;
*outFiles[i] << chr << "\t" << (int)PBSresults[i][3][0] << "\t" << coord << "\t" << vector_average(PBSresults[i][0]) << "\t" << vector_average(PBSresults[i][1]) << "\t" << vector_average(PBSresults[i][2]) << std::endl;
}
// }
}
if (!opt::annotFile.empty()) { if (previousGene != "" && currentGene != previousGene) {
for (int i = 0; i != PBStrios.size(); i++) {
int nExonSNPs = (int)PBSgeneResults[i][0].size(); int nIntronSNPs = (int)PBSgeneResults[i][3].size(); int nPromoterSNPs = (int)PBSgeneResults[i][6].size();
double ExonPBS1 = 0; double ExonPBS2 = 0; double ExonPBS3 = 0;
if (nExonSNPs > 0) { ExonPBS1 = vector_average(PBSgeneResults[i][0]); ExonPBS2 = vector_average(PBSgeneResults[i][1]); ExonPBS3 = vector_average(PBSgeneResults[i][2]);}
double IntronPBS1 = 0; double IntronPBS2 = 0; double IntronPBS3 = 0;
if (nIntronSNPs > 0) { IntronPBS1 = vector_average(PBSgeneResults[i][3]); IntronPBS2 = vector_average(PBSgeneResults[i][4]); IntronPBS3 = vector_average(PBSgeneResults[i][5]);}
double PromoterPBS1 = 0; double PromoterPBS2 = 0; double PromoterPBS3 = 0;
if (nPromoterSNPs > 0) { PromoterPBS1 = vector_average(PBSgeneResults[i][6]); PromoterPBS2 = vector_average(PBSgeneResults[i][7]); PromoterPBS3 = vector_average(PBSgeneResults[i][8]);}
*outFilesGenes[i] << previousGene << "\t" << nExonSNPs << "\t" << nIntronSNPs << "\t" << nPromoterSNPs << "\t" << ExonPBS1 << "\t" << ExonPBS2 << "\t" << ExonPBS3 << "\t" << IntronPBS1 << "\t" << IntronPBS2 << "\t" << IntronPBS3 << "\t" << PromoterPBS1 << "\t" << PromoterPBS2 << "\t" << PromoterPBS3 << std::endl;
for (int j = 0; j <= 8; j++) { PBSgeneResults[i][j].clear(); }
}
previousGene = currentGene;
}}
durationCalculation = ( std::clock() - startCalculation ) / (double) CLOCKS_PER_SEC;
delete c;
}
}
return 0;
}
void parsePBSoptions(int argc, char** argv) {
bool die = false; string regionArgString; std::vector<string> regionArgs;
std::vector<string> windowSizeStep;
for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
{
std::istringstream arg(optarg != NULL ? optarg : "");
switch (c)
{
case '?': die = true; break;
case 'f': arg >> opt::fixedWindowSize; break;
case 'w':
windowSizeStep = split(arg.str(), ',');
opt::windowSize = atoi(windowSizeStep[0].c_str());
opt::windowStep = atoi(windowSizeStep[1].c_str());
break;
case 'n': arg >> opt::runName; break;
case 'i': opt::allowIndels = true; break;
case OPT_ANNOT: arg >> opt::annotFile; break;
case OPT_AF: opt::af = true; break;
case 'h':
std::cout << PBS_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
}
}
if (argc - optind < 3) {
std::cerr << "missing arguments\n";
die = true;
}
else if (argc - optind > 3)
{
std::cerr << "too many arguments\n";
die = true;
}
if (die) {
std::cout << "\n" << PBS_USAGE_MESSAGE;
exit(EXIT_FAILURE);
}
// Parse the input filenames
opt::vcfFile = argv[optind++];
opt::setsFile = argv[optind++];
opt::PBStriosFile = argv[optind++];
}