-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhashtagging.nf
291 lines (246 loc) · 8.66 KB
/
hashtagging.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#!/usr/bin/env nextflow
// Enable DSL 2 syntax
nextflow.enable.dsl = 2
// Format the multi config CSVs for each sample.
// Reads the grouping, multiplexing, and feature-reference CSVs and emits
// the config CSV(s) that drive the downstream cellranger multi steps.
process multi_config {
    // Load the appropriate dependencies
    label "python"
    // Copy all output files to the folder specified by the user with --output
    // NOTE(review): the original comment said outputs land in a 'config/'
    // subdirectory, but publishDir targets the output root — confirm intent.
    publishDir "${params.output}/", mode: 'copy', overwrite: true
    input:
    // The sample grouping CSV (indicating which FASTQ files contain which data)
    path "grouping.csv"
    // The multiplexing CSV (which hashtags were used for each sample)
    path "multiplexing.csv"
    // The optional feature barcode CSV (indicating whether additional
    // feature barcode analysis should be run after demultiplexing)
    path "feature_reference.csv"
    output:
    // The config used for demultiplexing (always produced)
    path "demux.config.csv", emit: demux
    // The config for the optional post-demux cellranger multi run;
    // only emitted when the template decides a post-demux pass is needed
    path "post_demux.config.csv", optional: true, emit: post_demux
    script:
    // Run the code defined in templates/multi_config_hashtagging.py
    template "multi_config_hashtagging.py"
}
// Define the process used to run cellranger multi for hashtag demultiplexing.
// Splits the pooled FASTQ data into per-sample outputs (including per-sample
// BAM files) based on the hashtag assignments.
process demux_hashtags {
    // Load the appropriate dependencies
    label "cellranger"
    // Copy all output files to the folder specified by the user with --output
    publishDir "${params.output}/", mode: 'copy', overwrite: true
    input:
    // The run configuration for demultiplexing is driven by a multi config CSV
    path "demux.config.csv"
    // Stage the FASTQ folder (by symlink) in the working directory
    path "FASTQ_DIR"
    // Stage the reference transcriptome (by symlink) in the working directory
    path "GEX_REF"
    // The hashtags CSV describing the hashtag oligos used
    path "hashtags.csv"
    output:
    // Capture any created files as outputs
    path "*", emit: all
    // Capture the per-sample BAM files (optional: may be absent if
    // demultiplexing produced no per-sample BAMs)
    path "demultiplexed_samples/*.bam", emit: bam, optional: true
    script:
    // Run the code defined in templates/demux_hashtags.sh
    template "demux_hashtags.sh"
}
// Convert the BAM file for each sample into FASTQs.
// Runs 10x `bamtofastq` on the per-sample BAM, then keeps only the GEX
// reads (the first FASTQ folder, per the ordering set in the demux config).
process bam_to_fastq {
    // Load the appropriate dependencies
    label "bamtofastq"
    // Copy all output files to the folder specified by the user with --output
    publishDir "${params.output}/", mode: 'copy', overwrite: true
    input:
    // One demultiplexed sample: its name and the BAM produced by demux_hashtags
    tuple val(sample_name), path(BAM)
    output:
    // The folder of GEX FASTQ files for this sample
    tuple val(sample_name), path("${sample_name}/"), emit: fastqs
    // Log capturing the bamtofastq version and run output
    path "*.log", emit: log
    script:
    """#!/bin/bash
set -eo pipefail
# pipefail is required: without it, a bamtofastq failure is masked by the
# exit status of `tee` and the pipeline would continue on partial output
bamtofastq --version | tee -a bamtofastq.${sample_name}.log
bamtofastq \
    --reads-per-fastq=2200000000 \
    ${BAM} \
    ${sample_name} \
    2>&1 | tee -a bamtofastq.${sample_name}.log
# Since the GEX reads are listed first in the demux step
# (note that we explicitly set this in the config)
# as a result, the GEX reads will be the first FASTQ folder
# We will drop the other reads and keep only the GEX reads
mv ${sample_name}/demultiplexed_samples_0*/* ${sample_name}/
rmdir ${sample_name}/demultiplexed_samples_0*
rm -r ${sample_name}/demultiplexed_samples_1*
    """
}
// Optionally run cellranger multi on the demuxed reads.
// Invoked once per demultiplexed sample, combining that sample's FASTQs with
// the shared references and the post-demux config.
process post_demux_multi {
    // Load the appropriate dependencies
    label "cellranger"
    // Copy all output files to the folder specified by the user with --output
    publishDir "${params.output}/", mode: 'copy', overwrite: true
    input:
    // The run configuration for this step is driven by a multi config CSV
    path "post_demux.config.csv"
    // The FASTQ files for each sample
    tuple val(sample_name), path("DEMUX_DIR")
    // Stage the FASTQ folder (by symlink) in the working directory
    path "FASTQ_DIR"
    // Stage the reference transcriptome (by symlink) in the working directory
    path "GEX_REF"
    // Stage the reference V(D)J (by symlink) in the working directory
    path "VDJ_REF"
    // The feature reference CSV
    path "feature.csv"
    output:
    // Capture any created files as outputs
    path "*", emit: all
    script:
    // Run the code defined in templates/post_demux_multi.sh
    // (previous comment incorrectly referenced templates/demux_hashtags.sh)
    template "post_demux_multi.sh"
}
// Entry point: validate parameters, stage inputs, build the multi configs,
// and (unless --dryrun) demultiplex, convert BAMs to FASTQs, and run the
// optional post-demux cellranger multi step.
workflow {
    log.info"""
Parameters:
    output: ${params.output}
    grouping: ${params.grouping}
    include_introns: ${params.include_introns}
    fastq_dir: ${params.fastq_dir}
    transcriptome_dir: ${params.transcriptome_dir}
    vdj_dir: ${params.vdj_dir}
    multiplexing: ${params.multiplexing}
    feature_csv: ${params.feature_csv}
    probes_csv: ${params.probes_csv}
    hashtag_csv: ${params.hashtag_csv}
    dryrun: ${params.dryrun}
    cellranger_version: ${params.cellranger_version}
    """
    // Check that the user specified the output parameter
    if("${params.output}" == "false"){
        error "Parameter 'output' must be specified"
    }
    // Check that the user specified the grouping parameter
    if("${params.grouping}" == "false"){
        error "Parameter 'grouping' must be specified"
    }
    // Check that the user specified the fastq_dir parameter
    if("${params.fastq_dir}" == "false"){
        error "Parameter 'fastq_dir' must be specified"
    }
    // Check that the user specified the transcriptome_dir parameter
    if("${params.transcriptome_dir}" == "false"){
        error "Parameter 'transcriptome_dir' must be specified"
    }
    // Check that the user specified the vdj_dir parameter
    if("${params.vdj_dir}" == "false"){
        error "Parameter 'vdj_dir' must be specified"
    }
    // Check that the user specified the feature_csv parameter
    if("${params.feature_csv}" == "false"){
        error "Parameter 'feature_csv' must be specified"
    }
    // Check that the user specified the probes_csv parameter
    if("${params.probes_csv}" == "false"){
        error "Parameter 'probes_csv' must be specified"
    }
    // Check that the user specified the hashtag_csv parameter
    // (it is read below via file(), so it must be validated like the others)
    if("${params.hashtag_csv}" == "false"){
        error "Parameter 'hashtag_csv' must be specified"
    }
    // Point to the FASTQ directory
    fastq_dir = file(
        "${params.fastq_dir}",
        checkIfExists: true,
        type: "dir",
        glob: false
    )
    // Point to the reference transcriptome
    transcriptome_dir = file(
        "${params.transcriptome_dir}",
        checkIfExists: true,
        type: "dir",
        glob: false
    )
    // Point to the reference vdj
    vdj_dir = file(
        "${params.vdj_dir}",
        checkIfExists: true,
        type: "dir",
        glob: false
    )
    // Point to the grouping CSV provided by the user
    grouping = file(
        "${params.grouping}",
        checkIfExists: true,
        type: "file",
        glob: false
    )
    // Point to the multiplexing CSV (which by default is an empty table in templates/)
    // Indicates which samples were processed with which hashtag
    multiplexing = file(
        "${params.multiplexing}",
        checkIfExists: true,
        type: "file",
        glob: false
    )
    // Point to the feature reference CSV (which by default is an empty table in templates/)
    feature_csv = file(
        "${params.feature_csv}",
        checkIfExists: true,
        type: "file",
        glob: false
    )
    // Point to the hashtag CSV which describes the hashtagging library
    hashtag_csv = file(
        "${params.hashtag_csv}",
        checkIfExists: true,
        type: "file",
        glob: false
    )
    // Build the multi config CSV for each sample
    multi_config(grouping, multiplexing, feature_csv)
    // If the user has not set the `dryrun` parameter
    if("${params.dryrun}" == "false"){
        // Split up each sample
        demux_hashtags(
            multi_config.out.demux,
            fastq_dir,
            transcriptome_dir,
            hashtag_csv
        )
        // Convert the BAM files to FASTQ, keying each BAM by the
        // sample name derived from its filename
        bam_to_fastq(
            demux_hashtags.out.bam
                .flatten()
                .map {
                    it -> [
                        it.name.replace(".bam", ""),
                        it
                    ]
                }
        )
        // If there is data beyond the GEX, run cellranger multi
        // on the reads from each individual sample
        post_demux_multi(
            multi_config.out.post_demux,
            bam_to_fastq.out.fastqs,
            fastq_dir,
            transcriptome_dir,
            vdj_dir,
            feature_csv
        )
    }else{
        // Log the location of all output configs.
        // multi_config has two named outputs, so the channels must be
        // combined explicitly — `.out` alone is not a single channel.
        multi_config.out.demux
            .mix(multi_config.out.post_demux)
            .map { it -> it.name }
            .toSortedList()
            .view {
                """
Multi config CSVs have been written to:
"${params.output}/"
${it}
                """
            }
    }
}