-
Notifications
You must be signed in to change notification settings - Fork 0
/
Phylip.cpp
113 lines (100 loc) · 2.83 KB
/
Phylip.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cstdlib>
#include "Phylip.h"
#include "Exceptions.h"
/// Load species names and sequences from a sequential PHYLIP-style file.
///
/// The first line that is not blank must start with two non-negative
/// integers: the number of species and the number of bases per sequence.
/// Every following record starts with the species name (the first
/// whitespace-delimited token of the line); all remaining tokens of that line
/// are concatenated into the sequence. While the sequence is shorter than the
/// declared number of bases, further lines are consumed and appended to it.
///
/// @param[in]  aFilename  Path of the file to read.
/// @param[out] aSpecies   Receives the species names in file order; any
///                        previous content is discarded.
/// @param[out] aSequences Receives the sequences, parallel to aSpecies; any
///                        previous content is discarded.
///
/// @exception FastCodeMLFatal The file cannot be opened, is empty, has a
///            malformed header, holds fewer species than declared, contains a
///            sequence whose length differs from the declared one, or
///            declares a number of bases that is not a multiple of 3.
void Phylip::loadData(const char *aFilename, std::vector<std::string> &aSpecies,
                      std::vector<std::string> &aSequences) {
  // Characters treated as separators on every line ('\r' copes with CRLF files)
  const char *const blanks = " \t\r";

  // The counting and validation below index both vectors from zero, so stale
  // content left by the caller would corrupt the result.
  aSpecies.clear();
  aSequences.clear();

  // Open the file (the stream closes itself if an exception unwinds the stack)
  std::ifstream in(aFilename);
  if (!in) {
    std::ostringstream o;
    o << "Cannot open gene file \"" << aFilename << '"';
    throw FastCodeMLFatal(o);
  }

  // Skip blank lines (empty or whitespace only, as done for the records
  // below) and verify the file has at least one line with content
  std::string str;
  do {
    if (!std::getline(in, str)) {
      std::ostringstream o;
      o << "File \"" << aFilename << "\" is empty";
      throw FastCodeMLFatal(o);
    }
  } while (str.find_first_not_of(blanks) == std::string::npos);

  // From the first line extract number of species and number of basis.
  // Parse as signed values so a negative count is detected instead of
  // silently wrapping around to a huge unsigned number.
  char *endptr = 0;
  const char *next = str.c_str();
  const long speciesCount = strtol(next, &endptr, 10);
  bool headerValid = (endptr != next) && (speciesCount >= 0);
  long basisCount = 0;
  if (headerValid) {
    next = endptr;
    basisCount = strtol(next, &endptr, 10);
    headerValid = (endptr != next) && (basisCount >= 0);
  }
  if (!headerValid) {
    std::ostringstream o;
    o << "File \"" << aFilename << "\" is malformed";
    throw FastCodeMLFatal(o);
  }
  const unsigned long nspecies = static_cast<unsigned long>(speciesCount);
  const unsigned long nbasis = static_cast<unsigned long>(basisCount);

  // Read and parse the genes
  while (aSpecies.size() < nspecies && std::getline(in, str)) {
    // Extract the specie name; blank lines between records are ignored
    size_t p1 = str.find_first_not_of(blanks);
    if (p1 == std::string::npos)
      continue;
    size_t p2 = str.find_first_of(blanks, p1);
    if (p2 == std::string::npos)
      p2 = str.size(); // The name is the only token on this line
    aSpecies.push_back(str.substr(p1, p2 - p1));

    // Extract the gene specification
    std::string s;
    for (;;) {
      // Append every remaining token of the current line
      for (;;) {
        p1 = str.find_first_not_of(blanks, p2);
        if (p1 == std::string::npos)
          break;
        p2 = str.find_first_of(blanks, p1);
        if (p2 == std::string::npos)
          p2 = str.size();
        s.append(str, p1, p2 - p1);
      }

      // Stop when enough bases have been collected or the file ends;
      // otherwise the sequence continues on the next line
      if (s.size() >= nbasis)
        break;
      if (!std::getline(in, str))
        break;
      p2 = 0;
    }
    aSequences.push_back(s);
  }
  in.close();

  // Check correct number of species loaded
  if (nspecies != aSpecies.size()) {
    std::ostringstream o;
    o << "File \"" << aFilename
      << "\" has number of species mismatch (or is malformed)";
    throw FastCodeMLFatal(o);
  }

  // Check the number of nucleotides read (names and sequences are pushed in
  // pairs, so aSequences holds exactly nspecies entries here)
  for (unsigned long n = 0; n < nspecies; ++n) {
    if (aSequences[n].length() != nbasis) {
      std::ostringstream o;
      o << "File \"" << aFilename << "\" gene " << n
        << " has wrong number of nucleotides";
      throw FastCodeMLFatal(o);
    }
  }

  // Other sanity checks: the sequence length must be a multiple of 3
  if (nbasis % 3) {
    std::ostringstream o;
    o << "File \"" << aFilename << "\" number of basis is not multiple of 3";
    throw FastCodeMLFatal(o);
  }
}