-
Notifications
You must be signed in to change notification settings - Fork 2
/
msdatafile.cc
144 lines (120 loc) · 3.68 KB
/
msdatafile.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include "msdatafile.h"
#include "sputil.h"
using namespace std;
typedef SPIOException SPIOE;
/** Reads datasetfile and extracts DNA sequences from all population and
the outgroup.
@param inp file to read data from.
@param dataset number of current data set.
@param n_sequences #sequences[population][locus].
@param n_sites number of sites[locus].
@param sequences sequences[population][locus][number].
@param outgroup outgroup data[locus].
@param mask missing data bool[locus][site].
*/
void read_dataset(istream & inp, size_t dataset,
const vector<vector<size_t> > & n_sequences, const vector<size_t> & n_sites,
vector<vector<StrSample> > & sequences, vector<Sequence> & outgroups,
const vector<vector<bool> > & mask )
{
size_t nseg;
const string match_segsites = "segsites:";
vector<int> positions;
string str;
// this is a bit non-obvious
const size_t n_pops = n_sequences.size();
const size_t n_loci = n_sites.size();
// loop over all loci
for (size_t l=0; l<n_loci; l++)
{
istringstream itmp;
string stmp;
// skip all lines that don't start with 'segsites:'
do
{
// next non-empty line
skip_space(inp, str);
itmp.clear();
itmp.str(str);
// get first part of line
stmp = "";
itmp >> stmp;
if (!inp.good() || itmp.fail())
throw SPIOE(string(ERR_LOC " Error in reading dataset ") +
lexical_cast<string>(dataset) + " at locus " +
lexical_cast<string>(l) + ": expected '" + match_segsites +
"', found '" + stmp + "'");
}
while (stmp != match_segsites);
itmp >> nseg;
if (itmp.fail())
throw SPIOE(string(ERR_LOC " Error in reading dataset ") +
lexical_cast<string>(dataset) + " at locus " +
lexical_cast<string>(l) + ": couldn't read number of seg sites ");
if (nseg > n_sites[l])
{ /*if too many segregating sites*/
nseg = n_sites[l];
// TODO: ask Ludovic whether error message in this case
//errorfile << "\nerror in reading dataset: nseg=" << nseg
// << " > total number of sites=" << n_sites[l]
// << " in locus " << l;
}
skip_space(inp, str); // next non-blank line
if (mask.size()) // we need position info
{
positions.clear();
positions.resize(nseg);
itmp.clear();
itmp.str(str);
// throw away first part of line
stmp = "";
itmp >> stmp;
float p;
for (size_t s=0; s<nseg; s++)
{
itmp >> p;
positions[s] = lround(p * mask[l].size());
}
}
// for all population
for (size_t p=0; p<n_pops; p++)
{
sequences[p][l].set_tot_n_sites(n_sites[l]);
// all sequences of population p
for (size_t h=0; h<n_sequences[p][l]; h++)
{
string str("");
str.reserve(nseg);
// careful, this is two things in one
if (nseg>0 && !getline(inp, str))
throw SPIOE(
string(ERR_LOC
" error in reading dataset: cannot read seq of "
"haplotype ") +
lexical_cast<string>(h) + " in population " +
lexical_cast<string>(p) + " at locus " +
lexical_cast<string>(l));
Sequence & seq = sequences[p][l].sequence(h);
seq.clear(); seq.reserve(nseg);
if (mask.size())
{
for (size_t i=0; i<str.size(); i++)
// only add non-masked sites
if (mask[l][positions[i]] == 0)
seq.push_back(int(str[i]) - int('0'));
}
else
for (size_t i=0; i<str.size(); i++)
{
if (str[i] != '0' && str[i] != '1')
cerr << "unexpected input: " << str[i] << std::endl;
seq.push_back(int(str[i]) - int('0'));
}
}
} /* end loop over haplotypes in population A*/
outgroups[l].clear();
outgroups[l].resize(nseg, 0);
} /*end loop over loci*/
} /*end of get_dataset*/