-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathpandaseq.h
343 lines (313 loc) · 11.5 KB
/
pandaseq.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/* PANDAseq -- Assemble paired FASTQ Illumina reads and strip the region between amplification primers.
Copyright (C) 2011-2012 Andre Masella
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _PANDASEQ_H
# define _PANDASEQ_H
/*
* EXTERN_C_BEGIN / EXTERN_C_END bracket declarations so that they are wrapped
* in an extern "C" { ... } block when compiled as C++; in C both macros expand
* to nothing.
*/
# ifdef __cplusplus
# define EXTERN_C_BEGIN extern "C" {
# define EXTERN_C_END }
# else
# define EXTERN_C_BEGIN
# define EXTERN_C_END
# endif
# include <pandaseq-common.h>
EXTERN_C_BEGIN
/*
* While this is not a GLib-based file, it generally follows GLib-style
* conventions, particularly in the documentation.
*
* Every pointer is assumed to be not-null unless specified otherwise by
* (allow-none).
*
* All returned values and pointers must be freed or unreferenced by the
* caller unless marked as (transfer none).
*
* Arrays are generally followed by a length. If the array is being returned,
* the array length is an out parameter.
*
* The major library interfaces are reference counted. When every reference
* has been released, using the appropriate function, the object is garbage
* collected. If it needs to be stored in multiple places, it can simply be
* referenced.
*
* PANDAseq makes heavy use of closures which involve a function pointer and a
* void pointer containing opaque data passed to the function pointer. If the
* opaque data must persist beyond the life of the call, another function
* pointer is required to clean up the data. If neither the closure nor the
* clean up is necessary, these may be null.
*
* If compiled against pthreads, PANDAseq is relatively thread-safe. All
* functions that change reference counts may be called at any time from
* any thread. All other functions have no guarantees unless explicitly
* stated.
*
* See https://live.gnome.org/GObjectIntrospection/Annotations for more
* information.
*/
/**
* Integer constant identifying this API.
* NOTE(review): presumably the API revision, to be incremented on incompatible changes -- confirm.
*/
# define PANDA_API 3
/**
* Find the best offset of a small sequence in a large sequence of panda_qual entries.
*
* See panda_compute_offset_result for the same search over panda_result entries.
*
* @threshold: the minimum log probability to match
* @penalty: the penalty to subtract from the probability for each base from the start of the sequence
* @reverse: if false, scan the sequence from start to finish, else, scan in the opposite direction
* @haystack: (array length=haystack_length): the sequence to be searched
* @needle: (array length=needle_length): the sequence for which to look
* Returns: 0 if the sequence is not found, or one more than the offset. Since 0 is reserved for failure, subtract one from a non-zero result to obtain the offset itself.
*/
size_t panda_compute_offset_qual(
double threshold,
double penalty,
bool reverse,
const panda_qual *haystack,
size_t haystack_length,
const panda_nt *needle,
size_t needle_length);
/**
* Find the best offset of a small sequence in a large sequence of panda_result entries.
*
* See panda_compute_offset_qual for the same search over panda_qual entries.
*
* @threshold: the minimum log probability to match
* @penalty: the penalty to subtract from the probability for each base from the start of the sequence
* @reverse: if false, scan the sequence from start to finish, else, scan in the opposite direction
* @haystack: (array length=haystack_length): the sequence to be searched
* @needle: (array length=needle_length): the sequence for which to look
* Returns: 0 if the sequence is not found, or one more than the offset. Since 0 is reserved for failure, subtract one from a non-zero result to obtain the offset itself.
*/
size_t panda_compute_offset_result(
double threshold,
double penalty,
bool reverse,
const panda_result *haystack,
size_t haystack_length,
const panda_nt *needle,
size_t needle_length);
/**
* Create an object to read sequences from two character streams of FASTQ data.
*
* @forward: (closure forward_data) (scope notified): the functions to provide the stream of forward characters. Every time new data is required, forward(forward_data) is called. When the stream has returned EOF or the assembler is deallocated, forward_destroy(forward_data) is called.
* @reverse: (closure reverse_data) (scope notified): the same for the reverse
* sequence.
* @logger: the logging function to use during assembly.
* @qualmin: the quality to subtract from the incoming file (usually 33 or 64, depending on CASAVA version)
* @policy: method to handle unbarcoded sequences
* @index: (closure index_data) (scope notified): the functions to provide the stream of index/barcode characters.
* NOTE(review): presumably @index may be null when there is no separate index stream -- confirm; this header does not mark it (allow-none).
* @user_data: (out): the opaque data to pass to the returned function.
* @destroy: (out): the function that cleans up @user_data.
* Returns: (closure user_data) (scope notified): The function to call.
*/
PandaNextSeq panda_create_fastq_reader(
PandaBufferRead forward,
void *forward_data,
PandaDestroy forward_destroy,
PandaBufferRead reverse,
void *reverse_data,
PandaDestroy reverse_destroy,
PandaLogProxy logger,
unsigned char qualmin,
PandaTagging policy,
PandaBufferRead index,
void *index_data,
PandaDestroy index_destroy,
void **user_data,
PandaDestroy *destroy);
/**
* Compare sequences assembled by two different assemblers.
*
* No clean-up functions are taken for any of the three closures.
*
* @reader: (closure reader_data): the source of the sequences.
* @control: (closure control_data): the control assembly process.
* @experiment: (closure experiment_data): the experiment assembly process.
* @suppress_quality_diffs: consider nucleotides that have different quality scores to be identical.
* Returns: NOTE(review): the meaning of the boolean result is not documented in this header -- confirm against the implementation.
*/
bool panda_diff(
PandaNextSeq reader,
void *reader_data,
PandaAssemble control,
void *control_data,
PandaAssemble experiment,
void *experiment_data,
bool suppress_quality_diffs);
/**
* Wraps an existing stream of reads and clips off reads that have the too-long overlap problem.
*
* @inner: (closure inner_data) (scope notified): the stream to wrap. inner_destroy is the clean-up function for inner_data.
* @logger: the logging function to use.
* @forward: (array length=forward_length): the sequence to trim from the forward read.
* @reverse: (array length=reverse_length): the sequence to trim from the reverse read.
* @skip: whether to try to assemble sequences that don't contain a trim sequence.
* @threshold: a log probability threshold for cut-off alignment
* @next_data: (out): the opaque data to pass to the returned function.
* @next_destroy: (out): the function that cleans up @next_data.
* Returns: (closure next_data) (scope notified): the sequence stream
*/
PandaNextSeq panda_trim_overhangs(
PandaNextSeq inner,
void *inner_data,
PandaDestroy inner_destroy,
PandaLogProxy logger,
panda_nt *forward,
size_t forward_length,
panda_nt *reverse,
size_t reverse_length,
bool skip,
double threshold,
void **next_data,
PandaDestroy *next_destroy);
/**
* Compute log(1 - exp(p)) efficiently.
*
* See [[http://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf|Mächler, 2012]].
*
* @p: the exponent. NOTE(review): presumably a log probability (p <= 0), which keeps 1 - exp(p) non-negative -- confirm.
* Returns: the value of log(1 - exp(p))
*/
double panda_log1mexp(
double p);
/**
* The current version string.
*
* Returns: the version as a constant string.
* NOTE(review): presumably (transfer none), as for panda_code_str -- confirm before freeing.
*/
char const *panda_version(
void);
/* === Flags === */
/*
* Each flag below is a distinct bit of a PandaDebug value, so flags can be
* combined with bitwise OR.
*/
/** Usual output about assembly. */
# define PANDA_DEBUG_BUILD ((PandaDebug) 1)
/** Input processing-related errors. */
# define PANDA_DEBUG_FILE ((PandaDebug) 2)
/** Extra statistics. */
# define PANDA_DEBUG_STAT ((PandaDebug) 4)
/** Information about building the k-mer table (long and boring). */
# define PANDA_DEBUG_KMER ((PandaDebug) 8)
/** Excruciating detail about the reconstruction. */
# define PANDA_DEBUG_RECON ((PandaDebug) 16)
/** Bucket loads of data about mismatches. */
# define PANDA_DEBUG_MISMATCH ((PandaDebug) 32)
/** The default combination: assembly output, input processing errors and extra statistics. */
# define PANDA_DEBUG_DEFAULT (PANDA_DEBUG_BUILD | PANDA_DEBUG_FILE | PANDA_DEBUG_STAT)
/**
* The current flags used by the assembler to report errors. Some errors are always reported.
*
* The value is a bitwise OR of the PANDA_DEBUG_* flags.
*/
PANDA_EXTERN PandaDebug panda_debug_flags;
/* === Methods (for things from elsewhere) === */
/**
* The output-file representation of an error code.
*
* @code: the error code to describe
* Returns: (transfer none): The string representation
*/
char const *panda_code_str(
PandaCode code);
/**
* Convert the PHRED quality score to a log probability.
*
* @q: the entry whose quality score is converted
* Returns: the log probability corresponding to the score
*/
double panda_quality_log_probability(
const panda_qual *q);
/**
* Convert the PHRED quality score to a probability.
*
* @q: the entry whose quality score is converted
* Returns: the probability corresponding to the score
*/
double panda_quality_probability(
const panda_qual *q);
/**
* Convert the probability to a PHRED quality score.
*
* @r: the result entry whose probability is converted
* Returns: the PHRED quality score.
* NOTE(review): this header does not say whether the returned char is the raw score or an ASCII-encoded quality character -- confirm.
*/
char panda_result_phred(
const panda_result *r);
/* === I/O Methods === */
/**
* Open a file that might be uncompressed or compressed with gzip or bzip2.
*
* @file_name: the file to open
* @logger: the logger to write data
* @user_data: (out): the opaque data to pass to the returned function.
* @destroy: (out): the function that cleans up @user_data.
* Returns: (scope notified) (closure user_data): the buffer read function to use.
* NOTE(review): the result when the file cannot be opened is not documented here -- confirm.
*/
PandaBufferRead panda_open_buffer(
const char *file_name,
PandaLogProxy logger,
void **user_data,
PandaDestroy *destroy);
/**
* Open a pair of FASTQ files for reading.
*
* Files may be uncompressed, or compressed with gzip or bzip2.
*
* @forward: the forward filename
* @reverse: the reverse filename
* @logger: the logging function to use during assembly.
* @qualmin: the value to strip from the quality scores. Usually 33 or 64, depending on CASAVA version.
* @policy: method to handle unbarcoded sequences
* @index: a file containing separate barcodes
* NOTE(review): presumably @index may be null when there is no separate barcode file -- confirm; this header does not mark it (allow-none).
* @user_data: (out): the opaque data to pass to the returned function.
* @destroy: (out): the function that cleans up @user_data.
* Returns: (closure user_data) (scope notified): The function to call.
*/
PandaNextSeq panda_open_fastq(
const char *forward,
const char *reverse,
PandaLogProxy logger,
unsigned char qualmin,
PandaTagging policy,
const char *index,
void **user_data,
PandaDestroy *destroy);
/**
* Write an unassembled sequence to a FASTA file as a concatenated pair.
*
* @assembler: the assembler involved. NOTE(review): how it is used is not visible in this header -- confirm.
* @id: the identifier of the sequence pair
* @forward: (array length=forward_length): the forward read
* @reverse: (array length=reverse_length): the reverse read
* @writer: where the output is written
*/
void panda_output_fail(
PandaAssembler assembler,
const panda_seq_identifier *id,
const panda_qual *forward,
size_t forward_length,
const panda_qual *reverse,
size_t reverse_length,
PandaWriter writer);
/**
* Write an unassembled sequence to a FASTQ file as a concatenated pair.
*
* @assembler: the assembler involved. NOTE(review): how it is used is not visible in this header -- confirm.
* @id: the identifier of the sequence pair
* @forward: (array length=forward_length): the forward read
* @reverse: (array length=reverse_length): the reverse read
* @writer: where the output is written
*/
void panda_output_fail_qual(
PandaAssembler assembler,
const panda_seq_identifier *id,
const panda_qual *forward,
size_t forward_length,
const panda_qual *reverse,
size_t reverse_length,
PandaWriter writer);
/**
* Write an assembly to a FASTA file.
*
* @sequence: the assembled sequence to write
* @writer: where the output is written
* Returns: NOTE(review): presumably whether the sequence was written -- the meaning is not documented in this header, confirm.
*/
bool panda_output_fasta(
const panda_result_seq *sequence,
PandaWriter writer);
/**
* Write an assembly to a FASTQ file.
*
* @sequence: the assembled sequence to write
* @writer: where the output is written
* Returns: NOTE(review): presumably whether the sequence was written -- the meaning is not documented in this header, confirm.
*/
bool panda_output_fastq(
const panda_result_seq *sequence,
PandaWriter writer);
/**
* Create a sequence stream from an existing sequence stream.
*
* NOTE(review): this function is undocumented in the header; everything below other than the closure wiring is inferred from the names and must be confirmed against the implementation.
*
* @next: (closure next_data) (scope notified): the stream to wrap. next_destroy is the clean-up function for next_data.
* @length: NOTE(review): presumably the number of sequences to buffer ahead of the consumer -- confirm.
* @user_data: (out): the opaque data to pass to the returned function.
* @destroy: (out): the function that cleans up @user_data.
* Returns: (closure user_data) (scope notified): the sequence stream
*/
PandaNextSeq panda_create_async_reader(
PandaNextSeq next,
void *next_data,
PandaDestroy next_destroy,
size_t length,
void **user_data,
PandaDestroy *destroy);
/**
* The first base in the result sequence in the overlap.
*
* @result: a pointer to a structure with forward_length, forward_offset and overlap members. It is evaluated three times, so it must not have side effects.
*/
# define PANDA_RESULT_OVERLAP_OFFSET(result) ((result)->forward_length - (result)->forward_offset - (result)->overlap)
/**
* The first base in the forward sequence in the overlap.
*
* @result: a pointer to a structure with forward_length and overlap members. It is evaluated twice, so it must not have side effects.
*/
# define PANDA_RESULT_OVERLAP_FORWARD_OFFSET(result) ((result)->forward_length - (result)->overlap)
/**
* The first base in the reverse sequence in the overlap.
*
* Computed from the length of the reverse read, mirroring
* PANDA_RESULT_OVERLAP_FORWARD_OFFSET, which uses the length of the forward read.
*
* @result: a pointer to a structure with reverse_length and overlap members. It is evaluated twice, so it must not have side effects.
*/
# define PANDA_RESULT_OVERLAP_REVERSE_OFFSET(result) ((result)->reverse_length - (result)->overlap)
/* === Convenience macro for Vala === */
/**
* Open a file and select the handler that writes failed pairs.
*
* @file: the path passed to fopen
* @append: if true, the file is opened with mode "a"; otherwise with mode "w"
* @user_data: (out): receives the FILE * returned by fopen, which is NULL if the file could not be opened. Evaluated twice, so it must not have side effects.
* @destroy: (out): always set to fclose
*
* Evaluates to NULL if the file could not be opened, otherwise to panda_output_fail cast to PandaFailAlign.
*
* @file and @append are parenthesised in the expansion so that an argument
* containing an operator that binds no tighter than ?: (an assignment or
* another conditional expression) still selects the mode as a whole.
*
* NOTE(review): when fopen fails, *(destroy) is still fclose while
* *(user_data) is NULL; callers must not invoke the destroy function in that
* case. Also, panda_output_fail is declared with a PandaWriter as its final
* argument whereas this macro supplies a FILE * as the user data -- confirm
* the macro is still valid for current callers.
*/
# define PANDA_FAIL(file, append, user_data, destroy) (*(user_data) = fopen((file), (append) ? "a" : "w"), *(destroy) = fclose, *(user_data) == NULL ? NULL : (PandaFailAlign) panda_output_fail)
/**
* Paste two tokens together exactly as written; macro arguments are not expanded before pasting.
*/
# define PANDACONCATE(x,y) x ## y
/**
* Paste two tokens together after macro-expanding both arguments.
* The extra level of indirection through PANDACONCATE is what allows the
* arguments to be expanded before ## is applied.
*/
# define PANDACONCAT(x,y) PANDACONCATE(x, y)
/* === Everything else === */
# include<pandaseq-algorithm.h>
# include<pandaseq-args.h>
# include<pandaseq-assembler.h>
# include<pandaseq-iter.h>
# include<pandaseq-linebuf.h>
# include<pandaseq-log.h>
# include<pandaseq-module.h>
# include<pandaseq-nt.h>
# include<pandaseq-seqid.h>
# include<pandaseq-set.h>
# include<pandaseq-writer.h>
EXTERN_C_END
#endif