-
Notifications
You must be signed in to change notification settings - Fork 13
/
nxtrim.cpp
executable file
·234 lines (216 loc) · 8.06 KB
/
nxtrim.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#include "version.h"
#include "matepair.h"
#include "fastqlib.h"
#include <getopt.h>
using namespace std;
string percent(int num,int den)
{
char buffer[100];
sprintf(buffer,"%d / %d\t( %.2f%% )\t",num,den,100. * float(num)/float(den));
return(buffer);
}
void usage()
{
cerr << "\nProgram:\tnxtrim" << endl;
cerr << "Version:\t" << VERSION <<endl;
cerr << "Contact:\[email protected]\n" << endl;
cerr << "Copyright (c) 2018, Illumina, Inc. All rights reserved. See LICENSE for further details.\n"<<endl;
cerr << "Usage:\tnxtrim -1 R1.fastq.gz -2 R2.fastq.gz [options]\n" << endl;
cerr << "Required arguments:" <<endl;
cerr << " -1 [ --r1 ] arg read 1 in fastq format (gzip allowed)"<<endl;
cerr << " -2 [ --r2 ] arg read 2 in fastq format (gzip allowed)"<<endl;
cerr << "Allowed options:"<<endl;
cerr << " -O [ --output-prefix ] arg output prefix"<<endl;
cerr << " --justmp just creates a the mp/unknown libraries (reads with adapter at the start will be completely N masked)"<<endl;
cerr << " --stdout print trimmed reads to stdout (equivalent to justmp)"<<endl;
cerr << " --stdout-mp print only known MP reads to stdout (good for scaffolding)"<<endl;
cerr << " --stdout-un print only unknown reads to stdout"<<endl;
cerr << " --rf leave mate pair reads in RF orientation [by default are flipped into FR]"<<endl;
cerr << " --preserve-mp preserve MPs even when the corresponding PE has longer reads"<<endl;
cerr << " --ignorePF ignore chastity/purity filters in read headers"<<endl;
cerr << " --separate output paired reads in separate files (prefix_R1/prefix_r2). Default is interleaved."<<endl;
cerr << " -a, --aggressive more aggressive adapter search (see docs/adapter.md)"<<endl;
cerr << " -s, --similarity arg (=0.85) The minimum similarity between strings to be considered a match (Hamming distance divided by string length)"<<endl;
cerr << " -v, --minoverlap arg (=12) The minimum overlap to be considered for matching"<<endl;
cerr << " -l, --minlength arg (=21) The minimum read length to output (smaller reads will be filtered)"<<endl;
exit(0);
}
#define STDOUT 0
#define JUSTMP 1
#define JOINREADS 2
#define NORC 3
#define PMP 4
#define IGNOREPF 5
#define MP 6
#define UNKNOWN 7
#define SEPARATE 8
#define STDOUT_MP 9
#define STDOUT_UN 10
int main(int argc,char **argv)
{
int c;
if(argc<2)
usage();
bool joinreads=false;
bool preserve_mp=false;
bool justmp=false;
int minoverlap=12;
float similarity=0.85;
int minlen=21;
char *r1 = NULL;
char *r2 = NULL;
string prefix;
bool rc = true;
bool ignorePF = false;
bool write_stdout=false;
bool write_stdout_mp=false;
bool write_stdout_un=false;
bool hamming = true;
bool separate=false;
bool aggressive=false;
static struct option loptions[] = {
{"r1",1,0,'1'},
{"r2",1,0,'2'},
{"output-prefix",1,0,'O'},
{"stdout",0,0,STDOUT},
{"stdout-mp",0,0,STDOUT_MP},
{"stdout-un",0,0,STDOUT_UN},
{"justmp",0,0,JUSTMP},
{"joinreads",0,0,JOINREADS},
{"rf",0,0,NORC},
{"preserve-mp",0,0,PMP},
{"aggressive",0,0,'a'},
{"ignorePF",0,0,IGNOREPF},
{"mp",0,0,MP},
{"unknown",0,0,UNKNOWN},
{"separate",0,0,SEPARATE},
{"smith-waterman",0,0,'w'},
{"similarity",1,0,'s'},
{"minoverlap",1,0,'v'},
{"minlength",1,0,'l'},
{0,0,0,0}
};
while ((c = getopt_long(argc, argv, "a1:2:O:s:v:l:",loptions,NULL)) >= 0)
{
switch (c)
{
case '1': r1 = optarg; break;
case '2': r2 = optarg; break;
case 'O': prefix = optarg; break;
case 's': similarity = atof(optarg); break;
case 'v': minoverlap = atoi(optarg); break;
case 'l': minlen = atoi(optarg); break;
case 'a': aggressive = true; break;
case STDOUT: write_stdout=true; break;
case STDOUT_MP: write_stdout_mp=true; break;
case STDOUT_UN: write_stdout_un=true; break;
case JUSTMP:justmp=true; break;
case JOINREADS:joinreads=true; break;
case NORC:rc=false; break;
case PMP:preserve_mp=true; break;
case IGNOREPF:ignorePF=true; break;
case SEPARATE:separate=true; break;
case 'w':hamming=false; break;
default: die("Unrecognised argument");
}
}
if(!(r1==NULL&&r2==NULL) && !(r1!=NULL&&r2!=NULL))
die("both --r1 and --r2 must be speicified");
if(write_stdout && !prefix.empty())
die("--stdout and -O are incompatible");
if(!write_stdout && !write_stdout_mp && !write_stdout_un && prefix.empty() )
die("one of --stdout / --stdout-mp / --stdout-un / -O must be specified");
if(preserve_mp&&justmp)
die("the --preserve_mp and --justmp flags are incompatible!");
if( (write_stdout+write_stdout_mp+write_stdout_un)>1)
die("only one of --stdout / --stdout-mp / --stdout-un may be specified!");
if(minlen<=0)
{
die("--minlength must be >0");
}
if(minoverlap<=0)
{
die("--minoverlap must be >0");
}
if(write_stdout||write_stdout_mp||write_stdout_un)
{
cerr << "Writing to stdout"<<endl;
justmp=true;
}
else
{
cerr << "Output: " << prefix <<".*.fastq.gz"<<endl;
}
cerr << "Trimming:\nR1:\t" <<r1<<"\nR2:\t"<<r2<<endl;
if(preserve_mp) cerr<< "--preserve-mp is on: will favour MPs over PEs" <<endl;
if(joinreads) {
die("--joinreads is deprecated");
cerr<< "--joinreads is on: will attempt to merge R1 with R2 that proceeds an adapter" <<endl;
}
if(aggressive) cerr<< "--aggressive is on: will spend extra time searching for adapter" <<endl;
if(justmp)
preserve_mp=true;
pairReader infile(r1,r2);
int nodata=0;
readPair p;
pair<int,int> pos;
matePair m;
m.setAggressive(aggressive);
int num_reads_with_multiple_adapters=0,npass=0,nread=0;
bool trim_warn=true;
nxtrimWriter out;
if(write_stdout||write_stdout_un||write_stdout_mp)
{
out.open();
if(write_stdout_un) out.setMP(false);
if(write_stdout_mp) out.setUN(false);
}
else
out.open(prefix,justmp,separate);
int se_only = 0;
while(infile.next(p))
{
if(p.r1.l!=p.r2.l && trim_warn)
{
cerr << "WARNING: reads with differing lengths. Has this data already been processed with nxtrim?"<<endl;
trim_warn=false;
}
if((!p.r1.filtered && !p.r2.filtered)||ignorePF)
{
bool read_pair_had_multiple_adapters=m.build(p,minoverlap,similarity,minlen,joinreads,hamming,preserve_mp,justmp);
num_reads_with_multiple_adapters+=read_pair_had_multiple_adapters;
if(!read_pair_had_multiple_adapters)
{
nodata+=( (m.mp.r1.l==0||m.mp.r2.l==0) && (m.pe.r1.l==0||m.pe.r2.l==0) && (m.unknown.r1.l==0||m.unknown.r2.l==0) && m.se.l==0);
se_only += ( (m.mp.r1.l==0||m.mp.r2.l==0) && (m.pe.r1.l==0||m.pe.r2.l==0) && (m.unknown.r1.l==0||m.unknown.r2.l==0) ) && m.se.l>0;
}
if(rc)
{
m.mp.rc();
m.unknown.rc();
}
else
{
m.pe.rc();
}
out.write(m);
npass++;
}
nread++;
if(nread%10000==0)
cerr << "READ PAIR "<<nread<<endl;
}
cerr << "\nTrimming summary:"<<endl;
cerr << percent(npass,nread) << "reads passed chastity/purity filters."<<endl;
cerr << percent(num_reads_with_multiple_adapters,npass) << "reads had multiple copies of adapter (filtered)."<<endl;
npass-=num_reads_with_multiple_adapters;
cerr << percent(nodata,npass) << "read pairs were ignored because template length appeared less than read length"<<endl;
npass-=nodata;
cerr << npass << " remaining reads were trimmed"<<endl<<endl;
cerr << percent(out.n_mp,npass) << "read pairs had MP orientation"<<endl;
cerr << percent(out.n_pe,npass) << "read pairs had PE orientation"<<endl;
cerr << percent(out.n_unk,npass) << "read pairs had unknown orientation"<<endl;
cerr << percent(se_only,npass) << "were single end reads"<<endl<<endl;
cerr << percent(out.n_se - se_only,npass) << "extra single end reads were generated from overhangs"<<endl;
return(0);
}