-
Notifications
You must be signed in to change notification settings - Fork 1
/
segway.wdl
502 lines (443 loc) · 15.7 KB
/
segway.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
version 1.0
# Bundle of container image references that the workflow passes to every task.
# Each task reads both members in its runtime section.
struct RuntimeEnvironment {
    # Value used for the task `docker` runtime attribute.
    String docker
    # Value used for the task `singularity` runtime attribute.
    String singularity
}
# Top-level workflow. It runs in one of two mutually exclusive modes, chosen by
# whether `segway_output_bed` is supplied:
#
#   * not supplied: build genomedata from `bigwigs`, train, annotate, run
#     segtools, then run interpretation on the segtools tables.
#   * supplied: skip all of the above and run interpretation directly on the
#     caller-provided segtools tables and `tracknames`.
#
# In both modes the resulting mnemonics are used to relabel the bed, which is
# then recolored and converted to bigBed.
workflow segway {
    meta {
        version: "1.2.0"
        caper_docker: "encodedcc/segway-pipeline:1.2.0"
        caper_singularity: "docker://encodedcc/segway-pipeline:1.2.0"
        description: "ENCODE Segway pipeline, see https://github.com/ENCODE-DCC/segway-pipeline for details."
    }

    input {
        # Pipeline inputs to run from beginning
        # `bigwigs` is optional at the type level but is unwrapped (and therefore
        # required) whenever `segway_output_bed` is not provided.
        Array[File]? bigwigs
        Array[String] assays
        File chrom_sizes
        File annotation_gtf

        # Segway resource parameter
        # Passed as `ncpus` to both segway_train and segway_annotate.
        Int num_segway_cpus = 96

        # Segway training hyperparameters. First three defaults taken from Libbrecht et al 2019
        Int resolution = 100
        Float minibatch_fraction = 0.01
        Int max_train_rounds = 25
        Int num_instances = 10
        # When omitted, the value computed by make_genomedata is used instead.
        Int? num_labels
        Float prior_strength = 1.0
        Float segtransition_weight_scale = 1.0
        Float track_weight = 0.01
        Int ruler_scale = 100

        # Segtools parameters
        Int segtools_aggregation_flank_bases = 10000

        # Optional inputs for reinterpretation
        # Supplying `segway_output_bed` switches the workflow into the second
        # mode; the remaining values in this group are then unwrapped and must
        # all be present.
        File? segway_output_bed
        Array[String]? tracknames
        File? feature_aggregation_tab
        File? signal_distribution_tab
        File? segment_sizes_tab
        File? length_distribution_tab

        # Container images, forwarded to every task via RuntimeEnvironment.
        String docker = "encodedcc/segway-pipeline:1.2.0"
        String singularity = "docker://encodedcc/segway-pipeline:1.2.0"
    }

    RuntimeEnvironment runtime_environment = {
        "docker": docker,
        "singularity": singularity
    }

    # Mode 1: no existing segmentation was given, so run the full pipeline.
    if (!defined(segway_output_bed)) {
        call make_genomedata { input:
            # Unwrap the optional the same way as every other optional in this
            # workflow (previously `select_all([bigwigs])[0]`).
            bigwigs = select_first([bigwigs]),
            chrom_sizes = chrom_sizes,
            runtime_environment = runtime_environment,
        }

        call segway_train { input:
            genomedata = make_genomedata.genomedata,
            # An explicit `num_labels` input takes precedence over the computed one.
            num_labels = select_first([num_labels, make_genomedata.num_labels]),
            resolution = resolution,
            prior_strength = prior_strength,
            segtransition_weight_scale = segtransition_weight_scale,
            ruler_scale = ruler_scale,
            track_weight = track_weight,
            minibatch_fraction = minibatch_fraction,
            max_train_rounds = max_train_rounds,
            num_instances = num_instances,
            ncpus = num_segway_cpus,
            runtime_environment = runtime_environment,
        }

        call segway_annotate { input:
            genomedata = make_genomedata.genomedata,
            traindir = segway_train.traindir,
            ncpus = num_segway_cpus,
            runtime_environment = runtime_environment,
        }

        call segtools { input:
            genomedata = make_genomedata.genomedata,
            segway_output_bed = segway_annotate.output_bed,
            annotation_gtf = annotation_gtf,
            segway_params = segway_annotate.segway_params,
            flank_bases = segtools_aggregation_flank_bases,
            runtime_environment = runtime_environment,
        }

        call make_trackname_assay { input:
            # The bigwig files themselves are passed as the track names here
            # (Array[File] coerced to Array[String]).
            tracknames = select_first([bigwigs]),
            assays = assays,
            runtime_environment = runtime_environment,
        }

        call interpretation { input:
            trackname_assay = make_trackname_assay.trackname_assay,
            feature_aggregation_tab = segtools.feature_aggregation_tab,
            signal_distribution_tab = segtools.signal_distribution_tab,
            segment_sizes_tab = segtools.segment_sizes_tab,
            length_distribution_tab = segtools.length_distribution_tab,
            runtime_environment = runtime_environment,
        }
    }

    # Mode 2: an existing segmentation was given, so only reinterpret it.
    if (defined(segway_output_bed)) {
        call make_trackname_assay as make_trackname_assay_from_strings { input:
            tracknames = select_first([tracknames]),
            assays = assays,
            runtime_environment = runtime_environment,
        }

        call interpretation as interpret_existing_bed { input:
            trackname_assay = make_trackname_assay_from_strings.trackname_assay,
            feature_aggregation_tab = select_first([feature_aggregation_tab]),
            signal_distribution_tab = select_first([signal_distribution_tab]),
            segment_sizes_tab = select_first([segment_sizes_tab]),
            length_distribution_tab = select_first([length_distribution_tab]),
            runtime_environment = runtime_environment,
        }
    }

    # Exactly one of the two branches above ran; pick whichever value exists.
    File segway_output_bed_ = select_first([segway_output_bed, segway_annotate.output_bed])
    File mnemonics = select_first([interpretation.mnemonics, interpret_existing_bed.mnemonics])

    call relabel { input:
        bed = segway_output_bed_,
        mnemonics = mnemonics,
        runtime_environment = runtime_environment,
    }

    call recolor_bed { input:
        bed = relabel.relabeled_bed,
        runtime_environment = runtime_environment,
    }

    # NOTE(review): `chrom_sizes` is declared as a required `File`, so this
    # condition is always true. The guard is kept so the call's outputs remain
    # optional for anything consuming them.
    if (defined(chrom_sizes)) {
        call bed_to_bigbed as recolored_bed_to_bigbed { input:
            bed = recolor_bed.recolored_bed,
            chrom_sizes = chrom_sizes,
            output_stem = "recolored",
            runtime_environment = runtime_environment,
        }
    }
}
# Builds a genomedata archive from the given bigwigs and derives a label count
# from the number of tracks.
#
# Inputs:
#   bigwigs              signal files passed to make_genomedata.py via --files
#   chrom_sizes          passed to make_genomedata.py via --sizes
#   runtime_environment  container images for the runtime section
# Outputs:
#   genomedata           the archive written to files.genomedata
#   num_labels           integer read back from num_labels.txt
#   num_tracks           number of bigwigs supplied
task make_genomedata {
    input {
        Array[File] bigwigs
        File chrom_sizes
        RuntimeEnvironment runtime_environment
    }

    command <<<
        set -euo pipefail

        python "$(which make_genomedata.py)" \
            --files ~{sep=" " bigwigs} \
            --sizes ~{chrom_sizes} \
            -o files.genomedata

        python "$(which calculate_num_labels.py)" \
            --num-tracks ~{length(bigwigs)} \
            -o num_labels.txt
    >>>

    output {
        File genomedata = "files.genomedata"
        Int num_labels = read_int("num_labels.txt")
        Int num_tracks = length(bigwigs)
    }

    runtime {
        docker: runtime_environment.docker
        singularity: runtime_environment.singularity
        cpu: 4
        memory: "16 GB"
        disks: "local-disk 500 SSD"
    }
}
# Runs `segway train` locally on a genomedata archive and packages the
# resulting training directory as a reproducible gzipped tarball.
#
# Inputs:
#   genomedata           archive to train on
#   num_labels .. num_instances
#                        forwarded one-to-one to the matching `segway train`
#                        command-line options
#   ncpus                sets SEGWAY_NUM_LOCAL_JOBS and the runtime cpu count
#   runtime_environment  container images for the runtime section
# Outputs:
#   traindir             traindir.tar.gz
#   trained_params       traindir/params/params.params
task segway_train {
    input {
        File genomedata
        Int num_labels
        Int resolution
        Float prior_strength
        Float segtransition_weight_scale
        Int ruler_scale
        Float track_weight
        Float minibatch_fraction
        Int max_train_rounds
        Int num_instances
        Int ncpus
        RuntimeEnvironment runtime_environment
    }

    command <<<
        set -euo pipefail

        # Keep temporary files inside the task's working directory.
        mkdir tmp
        export TMPDIR="${PWD}/tmp"

        # Fixed seed, local job fan-out sized by ncpus, one OpenMP thread per job.
        export SEGWAY_RAND_SEED=112344321
        export SEGWAY_NUM_LOCAL_JOBS=~{ncpus}
        export OMP_NUM_THREADS=1

        mkdir traindir
        SEGWAY_CLUSTER=local segway train \
            --num-labels ~{num_labels} \
            --resolution ~{resolution} \
            --minibatch-fraction ~{minibatch_fraction} \
            --num-instances ~{num_instances} \
            --prior-strength ~{prior_strength} \
            --segtransition-weight-scale ~{segtransition_weight_scale} \
            --ruler-scale ~{ruler_scale} \
            --track-weight ~{track_weight} \
            --max-train-rounds ~{max_train_rounds} \
            ~{genomedata} \
            traindir

        # Archive reproducibly: sorted entry order, fixed owner and mtime, no
        # atime/ctime in the pax headers. Background:
        # https://stackoverflow.com/a/54908072 and
        # https://reproducible-builds.org/docs/archives/
        find traindir -print0 |
            LC_ALL=C sort -z |
            tar --owner=0 --group=0 --numeric-owner --mtime='2019-01-01 00:00Z' \
                --pax-option=exthdr.name=%d/PaxHeaders/%f,delete=atime,delete=ctime \
                --no-recursion --null -T - -cf traindir.tar

        # -n keeps the original file name and timestamp out of the gzip header.
        gzip -nc traindir.tar > traindir.tar.gz
    >>>

    output {
        File traindir = "traindir.tar.gz"
        # Checks that the model training actually emitted final params, not used
        File trained_params = "traindir/params/params.params"
    }

    runtime {
        docker: runtime_environment.docker
        singularity: runtime_environment.singularity
        cpu: ncpus
        memory: "300 GB"
        disks: "local-disk 1000 SSD"
    }
}
# Runs `segway annotate` locally using a previously trained model, then
# packages a subset of the training directory, the headerless bed and the
# identify directory as reproducible gzipped artifacts.
#
# Inputs:
#   genomedata           archive to annotate
#   traindir             tarball produced by training; unpacked into ./traindir
#   ncpus                sets SEGWAY_NUM_LOCAL_JOBS and the runtime cpu count
#   runtime_environment  container images for the runtime section
# Outputs:
#   segway_params        training_params.tar.gz (selected traindir entries)
#   identifydir          identifydir.tar.gz
#   output_bed           segway.bed.gz (segway.bed without its first line)
task segway_annotate {
    input {
        File genomedata
        File traindir
        Int ncpus
        RuntimeEnvironment runtime_environment
    }

    command <<<
        set -euo pipefail

        # Keep temporary files inside the task's working directory.
        mkdir tmp
        export TMPDIR="${PWD}/tmp"

        export SEGWAY_RAND_SEED=112344321
        export SEGWAY_NUM_LOCAL_JOBS=~{ncpus}
        export OMP_NUM_THREADS=1

        # Unpack the training tarball, dropping its top-level directory.
        mkdir traindir && tar xf ~{traindir} -C traindir --strip-components 1

        mkdir identifydir
        SEGWAY_CLUSTER=local segway annotate ~{genomedata} --bed=segway.bed traindir identifydir

        # Archive only the listed traindir entries (and anything beneath them),
        # with sorted order and fixed metadata so the tarball is reproducible.
        find traindir -regextype egrep -regex 'traindir/(auxiliary|params/input.master|params/params.params|segway.str|triangulation)($|/.*)' -print0 |
            LC_ALL=C sort -z |
            tar --owner=0 --group=0 --numeric-owner --mtime='2019-01-01 00:00Z' \
                --pax-option=exthdr.name=%d/PaxHeaders/%f,delete=atime,delete=ctime \
                --no-recursion --null -T - -cf training_params.tar
        gzip -nc training_params.tar > training_params.tar.gz

        # Drop the first line of the bed before compressing it.
        tail -n +2 segway.bed > segway_no_header.bed
        gzip -nc segway_no_header.bed > segway.bed.gz

        # Same reproducible archiving for the whole identify directory.
        find identifydir -print0 |
            LC_ALL=C sort -z |
            tar --owner=0 --group=0 --numeric-owner --mtime='2019-01-01 00:00Z' \
                --pax-option=exthdr.name=%d/PaxHeaders/%f,delete=atime,delete=ctime \
                --no-recursion --null -T - -cf identifydir.tar
        gzip -nc identifydir.tar > identifydir.tar.gz
    >>>

    output {
        File segway_params = "training_params.tar.gz"
        File identifydir = "identifydir.tar.gz"
        File output_bed = "segway.bed.gz"
    }

    runtime {
        docker: runtime_environment.docker
        singularity: runtime_environment.singularity
        cpu: ncpus
        memory: "400 GB"
        disks: "local-disk 1000 SSD"
    }
}
# Converts a gzipped bed file to bigBed with bedToBigBed.
#
# Inputs:
#   bed                  gzip-compressed bed file
#   chrom_sizes          chromosome sizes file passed to bedToBigBed
#   output_stem          base name for the intermediate bed and the .bb output
#   runtime_environment  container images for the runtime section
# Outputs:
#   bigbed               <output_stem>.bb
task bed_to_bigbed {
    input {
        File bed
        File chrom_sizes
        String output_stem = "segway"
        RuntimeEnvironment runtime_environment
    }

    command <<<
        set -euo pipefail

        # bedToBigBed is given a plain-text bed, so decompress first.
        gzip -dc ~{bed} > ~{output_stem}.bed
        bedToBigBed ~{output_stem}.bed ~{chrom_sizes} ~{output_stem}.bb

        # Recompress the intermediate bed in place; it is not a declared output.
        gzip -n ~{output_stem}.bed
    >>>

    output {
        File bigbed = "~{output_stem}.bb"
    }

    # Only the container images are pinned here; no cpu, memory or disk
    # attributes are set for this task.
    runtime {
        docker: runtime_environment.docker
        singularity: runtime_environment.singularity
    }
}
# Runs four segtools commands on a segmentation: length distribution, GMTK
# parameters, feature aggregation and signal distribution.
#
# Inputs:
#   genomedata           archive passed to segtools-signal-distribution
#   segway_output_bed    segmentation analysed by the segtools commands
#   annotation_gtf       annotation passed to segtools-aggregation
#   segway_params        tarball unpacked into ./segway_params; its
#                        params/params.params is read by segtools-gmtk-parameters
#   flank_bases          value for segtools-aggregation --flank-bases
#   runtime_environment  container images for the runtime section
# Outputs:
#   every file in each of the four output directories, plus the individual
#   .tab files named in the output section.
task segtools {
    input {
        File genomedata
        File segway_output_bed
        File annotation_gtf
        File segway_params
        Int flank_bases
        RuntimeEnvironment runtime_environment
    }

    command <<<
        # Strict mode is intentionally left off because some of the commands
        # below exit nonzero:
        # set -euo pipefail

        # Unpack the parameter tarball, dropping its top-level directory.
        mkdir segway_params && tar xf ~{segway_params} -C segway_params --strip-components 1

        segtools-length-distribution -o length_distribution ~{segway_output_bed}

        segtools-gmtk-parameters -o gmtk_parameters segway_params/params/params.params

        segtools-aggregation \
            --normalize \
            -o feature_aggregation \
            --mode=gene \
            --flank-bases=~{flank_bases} \
            ~{segway_output_bed} \
            ~{annotation_gtf}

        # TODO: undo temporary env fix once segtools is patched. Use conda run to avoid bashrc wackiness
        # The trailing `|| true` makes this last command, and therefore the
        # whole script, exit 0; missing results surface only when the declared
        # output files are collected.
        conda run -n segtools-signal-distribution \
            segtools-signal-distribution \
            --transformation arcsinh \
            -o signal_distribution \
            ~{segway_output_bed} \
            ~{genomedata} \
            || true
    >>>

    output {
        Array[File] length_distribution_info = glob("length_distribution/*")
        File length_distribution_tab = "length_distribution/length_distribution.tab"
        File segment_sizes_tab = "length_distribution/segment_sizes.tab"
        Array[File] gmtk_info = glob("gmtk_parameters/*")
        Array[File] feature_aggregation_info = glob("feature_aggregation/*")
        File feature_aggregation_tab = "feature_aggregation/feature_aggregation.tab"
        Array[File] signal_distribution_info = glob("signal_distribution/*")
        File signal_distribution_tab = "signal_distribution/signal_distribution.tab"
    }

    runtime {
        docker: runtime_environment.docker
        singularity: runtime_environment.singularity
        cpu: 8
        memory: "16 GB"
        disks: "local-disk 250 SSD"
    }
}
# Calls make_trackname_assay.py with a list of track names and a list of
# assays and returns the file it writes.
#
# Inputs:
#   tracknames           values passed after --tracknames, space separated
#   assays               values passed after --assays, space separated
#   output_filename      name of the file the script is told to write
#   runtime_environment  container images for the runtime section
# Outputs:
#   trackname_assay      the file named by output_filename
task make_trackname_assay {
    input {
        Array[String] tracknames
        Array[String] assays
        String output_filename = "trackname_assay.txt"
        RuntimeEnvironment runtime_environment
    }

    command <<<
        set -euo pipefail

        python "$(which make_trackname_assay.py)" \
            --tracknames ~{sep=" " tracknames} \
            --assays ~{sep=" " assays} \
            --output-filename ~{output_filename}
    >>>

    output {
        File trackname_assay = "~{output_filename}"
    }

    runtime {
        docker: runtime_environment.docker
        singularity: runtime_environment.singularity
        cpu: 1
        memory: "2 GB"
        disks: "local-disk 10 SSD"
    }
}
# Stages the segtools tables and the trackname/assay file into the directory
# layout given to apply_samples.py, runs the script, and collects its results.
#
# Staged layout:
#   segwayOutput/<trackname_assay file>
#   segwayOutput/sample/<the four .tab inputs>
#
# Inputs:
#   feature_aggregation_tab, signal_distribution_tab,
#   segment_sizes_tab, length_distribution_tab
#                        tables placed under segwayOutput/sample/
#   trackname_assay      file placed directly under segwayOutput/
#   runtime_environment  container images for the runtime section
# Outputs:
#   stats                   every file under interpretation-output/stats/
#   mnemonics               classification/sample/mnemonics.txt
#   classifier_data         classification/sample/classifier_data.tab
#   classifier_probailities classification/sample/probs.txt
task interpretation {
    input {
        File feature_aggregation_tab
        File signal_distribution_tab
        File trackname_assay
        File segment_sizes_tab
        File length_distribution_tab
        RuntimeEnvironment runtime_environment
    }

    command <<<
        set -euo pipefail

        export SEGWAY_OUTPUT=segwayOutput
        export SAMPLE_NAME=sample
        mkdir -p "${SEGWAY_OUTPUT}/${SAMPLE_NAME}/"

        # Copy rather than move: localized inputs may be read-only or shared
        # with other tasks, so they must not be removed from where the engine
        # placed them. Paths are quoted so unusual characters in the
        # localization path cannot split the arguments.
        cp "~{trackname_assay}" "${SEGWAY_OUTPUT}/"
        cp \
            "~{feature_aggregation_tab}" \
            "~{signal_distribution_tab}" \
            "~{segment_sizes_tab}" \
            "~{length_distribution_tab}" \
            "${SEGWAY_OUTPUT}/${SAMPLE_NAME}/"

        # The model pickle is read from a fixed path inside the container image.
        python \
            "$(which apply_samples.py)" \
            interpretation-output \
            --model-path /opt/interpretation_samples/model_300_reg.020_auc0.89V04.pickle.gz \
            --input-path "${PWD}/${SEGWAY_OUTPUT}"
    >>>

    output {
        Array[File] stats = glob("interpretation-output/stats/*")
        File mnemonics = "interpretation-output/classification/sample/mnemonics.txt"
        File classifier_data = "interpretation-output/classification/sample/classifier_data.tab"
        # The spelling of this output name is part of the task's interface and
        # is kept as-is for compatibility.
        File classifier_probailities = "interpretation-output/classification/sample/probs.txt"
    }

    runtime {
        cpu: 4
        memory: "16 GB"
        disks: "local-disk 100 SSD"
        docker: runtime_environment.docker
        singularity: runtime_environment.singularity
    }
}
# Runs relabel.py on a decompressed bed together with a mnemonics file and
# returns the gzipped result.
#
# Inputs:
#   bed                  gzip-compressed bed file
#   mnemonics            mnemonics file passed to relabel.py
#   output_stem          base name of the output; the result is <output_stem>.bed.gz
#   runtime_environment  container images for the runtime section
# Outputs:
#   relabeled_bed        <output_stem>.bed.gz
task relabel {
    input {
        File bed
        File mnemonics
        String output_stem = "relabeled"
        RuntimeEnvironment runtime_environment
    }

    command <<<
        set -euo pipefail

        # relabel.py is given the plain-text bed.
        gzip -dc ~{bed} > decompressed.bed

        python "$(which relabel.py)" -o ~{output_stem}.bed decompressed.bed ~{mnemonics}

        # Compress in place; -n keeps name and timestamp out of the gzip header.
        gzip -n ~{output_stem}.bed
    >>>

    output {
        File relabeled_bed = "~{output_stem}.bed.gz"
    }

    runtime {
        docker: runtime_environment.docker
        singularity: runtime_environment.singularity
        cpu: 1
        memory: "2 GB"
        disks: "local-disk 20 SSD"
    }
}
# Runs recolor_bed.py on a decompressed bed and returns the gzipped result.
#
# Inputs:
#   bed                  gzip-compressed bed file
#   output_filename      name of the uncompressed file written by recolor_bed.py
#   runtime_environment  container images for the runtime section
# Outputs:
#   recolored_bed        <output_filename>.gz
task recolor_bed {
    input {
        File bed
        String output_filename = "recolored.bed"
        RuntimeEnvironment runtime_environment
    }

    command <<<
        set -euo pipefail

        # recolor_bed.py is given the plain-text bed.
        gzip -dc ~{bed} > decompressed.bed

        python "$(which recolor_bed.py)" \
            -o ~{output_filename} \
            decompressed.bed

        # Compress in place; -n keeps name and timestamp out of the gzip header.
        gzip -n ~{output_filename}
    >>>

    output {
        File recolored_bed = "~{output_filename}.gz"
    }

    runtime {
        docker: runtime_environment.docker
        singularity: runtime_environment.singularity
        cpu: 1
        memory: "2 GB"
        disks: "local-disk 20 SSD"
    }
}