-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmutect2.wdl
951 lines (825 loc) · 34 KB
/
mutect2.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
## Copyright Broad Institute, 2017
##
## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample,
## and performs additional filtering and functional annotation tasks.
##
## Main requirements/expectations :
## - One analysis-ready BAM file (and its index) for each sample
##
## Description of inputs:
##
## ** Runtime **
## gatk_docker, oncotator_docker: docker images to use for GATK 4 Mutect2 and for Oncotator
## preemptible_attempts: how many preemptions to tolerate before switching to a non-preemptible machine (on Google)
## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar
## in the docker image. This must be supplied when running in an environment that does not support docker
## (e.g. SGE cluster on a Broad on-prem VM)
##
## ** Workflow options **
## intervals: genomic intervals (will be used for scatter)
## scatter_count: number of parallel jobs to generate when scattering over intervals
## artifact_modes: types of artifacts to consider in the orientation bias filter (optional)
## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional)
## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional)
## run_orientation_bias_filter: if true, run the orientation bias filter post-processing step (optional, false by default)
## run_oncotator: if true, annotate the M2 VCFs using oncotator (to produce a TCGA MAF). Important: This requires a
## docker image and should not be run in environments where docker is unavailable (e.g. SGE cluster on
## a Broad on-prem VM). Access to docker hub is also required, since the task downloads a public docker image.
## (optional, false by default)
##
## ** Primary inputs **
## ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary
## tumor_bam, tumor_bam_index: BAM and index for the tumor sample
## normal_bam, normal_bam_index: BAM and index for the normal sample
##
## ** Primary resources ** (optional but strongly recommended)
## pon, pon_index: optional panel of normals in VCF format containing probable technical artifacts (false positives)
## gnomad, gnomad_index: optional database of known germline variants (see http://gnomad.broadinstitute.org/downloads)
## variants_for_contamination, variants_for_contamination_index: VCF of common variants with allele frequencies for calculating contamination
##
## ** Secondary resources ** (for optional tasks)
## onco_ds_tar_gz, default_config_file: Oncotator datasources and config file
## sequencing_center, sequence_source: metadata for Oncotator
##
## Outputs :
## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam
## file of reassembled reads if requested
##
## Cromwell version support
## - Successfully tested on v29
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information
## pertaining to the included programs.
# tag gatk4.0.1.2 release
workflow Mutect2 {
    # Mutect2 inputs
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai
    File? normal_bam
    File? normal_bai
    File? pon
    File? pon_index
    Int scatter_count
    File? gnomad
    File? gnomad_index
    File? variants_for_contamination
    File? variants_for_contamination_index
    Boolean? run_orientation_bias_filter
    Boolean run_ob_filter = select_first([run_orientation_bias_filter, false])
    Array[String]? artifact_modes
    File? tumor_sequencing_artifact_metrics
    String? m2_extra_args
    String? m2_extra_filtering_args
    String? split_intervals_extra_args
    Boolean? make_bamout
    Boolean make_bamout_or_default = select_first([make_bamout, false])
    Boolean? compress_vcfs
    Boolean compress = select_first([compress_vcfs, false])

    # oncotator inputs
    Boolean? run_oncotator
    Boolean run_oncotator_or_default = select_first([run_oncotator, false])
    File? onco_ds_tar_gz
    String? onco_ds_local_db_dir
    String? sequencing_center
    String? sequence_source
    File? default_config_file

    # funcotator inputs
    Boolean? run_funcotator
    Boolean run_funcotator_or_default = select_first([run_funcotator, false])
    String? reference_version
    String? data_sources_tar_gz
    String? transcript_selection_mode
    Array[String]? transcript_selection_list
    Array[String]? annotation_defaults
    Array[String]? annotation_overrides

    File? gatk_override

    # runtime
    String gatk_docker
    String basic_bash_docker = "ubuntu:16.04"
    String? oncotator_docker
    Int? preemptible_attempts

    # Use as a last resort to increase the disk given to every task in case of ill behaving data
    Int? emergency_extra_disk

    # Disk sizes used for dynamic sizing
    Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB"))
    Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bai, "GB"))
    Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB") + size(gnomad_index, "GB")) else 0
    Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0

    # If no tar is provided, the task downloads one from broads ftp server
    Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100
    Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0

    # This is added to every task as padding, should increase if systematically you need more disk for every call
    Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0])

    # These are multipliers to multiply inputs by to make sure we have enough disk to accommodate for possible output sizes
    # Large is for Bams/WGS vcfs
    # Small is for metrics/other vcfs
    Float large_input_to_output_multiplier = 2.25
    Float small_input_to_output_multiplier = 2.0

    # logic about output file names -- these are the names *without* .vcf extensions
    String output_basename = basename(tumor_bam, ".bam")
    String unfiltered_name = output_basename + "-unfiltered"
    String filtered_name = output_basename + "-filtered"
    String funcotated_name = output_basename + "-funcotated"

    # NOTE(review): this declaration is not referenced anywhere in the workflow; kept only so that
    # existing input JSONs that override it do not break. Consider removing in a later release.
    String output_vcf_name = basename(tumor_bam, ".bam") + ".vcf"

    call SplitIntervals {
        input:
            intervals = intervals,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            scatter_count = scatter_count,
            split_intervals_extra_args = split_intervals_extra_args,
            gatk_override = gatk_override,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts,
            disk_space = ref_size + ceil(size(intervals, "GB") * small_input_to_output_multiplier) + disk_pad
    }

    # Each scattered M2 shard only sees a fraction of the genome, so only needs a fraction of the output disk.
    Int m2_output_size = tumor_bam_size / scatter_count

    scatter (subintervals in SplitIntervals.interval_files ) {
        call M2 {
            input:
                intervals = subintervals,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                normal_bam = normal_bam,
                normal_bai = normal_bai,
                pon = pon,
                pon_index = pon_index,
                gnomad = gnomad,
                gnomad_index = gnomad_index,
                m2_extra_args = m2_extra_args,
                make_bamout = make_bamout_or_default,
                compress = compress,
                gatk_override = gatk_override,
                gatk_docker = gatk_docker,
                # BUG FIX: preemptible_attempts was passed twice in this call (a duplicate call-input key);
                # it is now supplied exactly once.
                preemptible_attempts = preemptible_attempts,
                disk_space = tumor_bam_size + normal_bam_size + ref_size + gnomad_vcf_size + m2_output_size + disk_pad
        }

        Float sub_vcf_size = size(M2.unfiltered_vcf, "GB")
        Float sub_bamout_size = size(M2.output_bamOut, "GB")
    }

    call SumFloats as SumSubVcfs {
        input:
            sizes = sub_vcf_size,
            preemptible_attempts = preemptible_attempts
    }

    call MergeVCFs {
        input:
            input_vcfs = M2.unfiltered_vcf,
            input_vcf_indices = M2.unfiltered_vcf_index,
            output_name = unfiltered_name,
            compress = compress,
            gatk_override = gatk_override,
            gatk_docker = gatk_docker,
            preemptible_attempts = preemptible_attempts,
            disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad
    }

    if (make_bamout_or_default) {
        call SumFloats as SumSubBamouts {
            input:
                sizes = sub_bamout_size,
                preemptible_attempts = preemptible_attempts
        }

        call MergeBamOuts {
            input:
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                bam_outs = M2.output_bamOut,
                output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"),
                gatk_override = gatk_override,
                gatk_docker = gatk_docker,
                # Consistency: propagate preemptible_attempts like every other call in this workflow.
                preemptible_attempts = preemptible_attempts,
                disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad
        }
    }

    # Only collect artifact metrics when the orientation-bias filter is requested and the
    # metrics were not supplied directly as a workflow input.
    if (run_ob_filter && !defined(tumor_sequencing_artifact_metrics)) {
        call CollectSequencingArtifactMetrics {
            input:
                gatk_docker = gatk_docker,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                preemptible_attempts = preemptible_attempts,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                gatk_override = gatk_override,
                disk_space = tumor_bam_size + ref_size + disk_pad
        }
    }

    if (defined(variants_for_contamination)) {
        call CalculateContamination {
            input:
                gatk_override = gatk_override,
                intervals = intervals,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                preemptible_attempts = preemptible_attempts,
                gatk_docker = gatk_docker,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                normal_bam = normal_bam,
                normal_bai = normal_bai,
                variants_for_contamination = variants_for_contamination,
                variants_for_contamination_index = variants_for_contamination_index,
                disk_space = tumor_bam_size + normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad
        }
    }

    call Filter {
        input:
            gatk_override = gatk_override,
            gatk_docker = gatk_docker,
            intervals = intervals,
            unfiltered_vcf = MergeVCFs.merged_vcf,
            unfiltered_vcf_index = MergeVCFs.merged_vcf_index,
            output_name = filtered_name,
            compress = compress,
            preemptible_attempts = preemptible_attempts,
            contamination_table = CalculateContamination.contamination_table,
            m2_extra_filtering_args = m2_extra_filtering_args,
            disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad
    }

    if (run_ob_filter) {
        # Get the metrics either from the workflow input or CollectSequencingArtifactMetrics if no workflow input is provided
        File input_artifact_metrics = select_first([tumor_sequencing_artifact_metrics, CollectSequencingArtifactMetrics.pre_adapter_metrics])

        call FilterByOrientationBias {
            input:
                gatk_override = gatk_override,
                input_vcf = Filter.filtered_vcf,
                input_vcf_index = Filter.filtered_vcf_index,
                output_name = filtered_name,
                compress = compress,
                gatk_docker = gatk_docker,
                preemptible_attempts = preemptible_attempts,
                pre_adapter_metrics = input_artifact_metrics,
                artifact_modes = artifact_modes,
                disk_space = ceil(size(Filter.filtered_vcf, "GB") * small_input_to_output_multiplier) + ceil(size(input_artifact_metrics, "GB")) + disk_pad
        }
    }

    if (run_oncotator_or_default) {
        # Annotate the most-filtered VCF available (orientation-bias output if produced).
        File oncotate_vcf_input = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
        call oncotate_m2 {
            input:
                m2_vcf = oncotate_vcf_input,
                onco_ds_tar_gz = onco_ds_tar_gz,
                onco_ds_local_db_dir = onco_ds_local_db_dir,
                sequencing_center = sequencing_center,
                sequence_source = sequence_source,
                default_config_file = default_config_file,
                case_id = M2.tumor_sample[0],
                control_id = M2.normal_sample[0],
                oncotator_docker = select_first([oncotator_docker, "NO_ONCOTATOR_DOCKER_GIVEN"]),
                preemptible_attempts = preemptible_attempts,
                disk_space = ceil(size(oncotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad
        }
    }

    if (run_funcotator_or_default) {
        # Annotate the most-filtered VCF available (orientation-bias output if produced).
        File funcotate_vcf_input = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
        File funcotate_vcf_input_index = select_first([FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
        call Funcotate {
            input:
                m2_vcf = funcotate_vcf_input,
                m2_vcf_index = funcotate_vcf_input_index,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                reference_version = select_first([reference_version, "NO_REFERENCE_VERSION_GIVEN"]),
                output_name = funcotated_name,
                compress = compress,
                data_sources_tar_gz = data_sources_tar_gz,
                transcript_selection_mode = transcript_selection_mode,
                transcript_selection_list = transcript_selection_list,
                annotation_defaults = annotation_defaults,
                annotation_overrides = annotation_overrides,
                gatk_docker = gatk_docker,
                gatk_override = gatk_override
        }
    }

    output {
        File unfiltered_vcf = MergeVCFs.merged_vcf
        File unfiltered_vcf_index = MergeVCFs.merged_vcf_index
        File filtered_vcf = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
        File filtered_vcf_index = select_first([FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
        File? contamination_table = CalculateContamination.contamination_table

        File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf
        File? funcotated_vcf = Funcotate.funcotated_vcf
        File? funcotated_vcf_index = Funcotate.funcotated_vcf_index
        File? preadapter_detail_metrics = CollectSequencingArtifactMetrics.pre_adapter_metrics
        File? bamout = MergeBamOuts.merged_bam_out
        File? bamout_index = MergeBamOuts.merged_bam_out_index
    }
}
# Splits the calling intervals (or, when no intervals are given, the whole reference)
# into approximately scatter_count interval files so Mutect2 can be scattered over them.
task SplitIntervals {
    # inputs
    File? intervals                     # optional interval list; omitted => whole reference
    File ref_fasta
    File ref_fai
    File ref_dict
    Int scatter_count                   # target number of interval files to produce
    String? split_intervals_extra_args  # passed verbatim to gatk SplitIntervals

    File? gatk_override                 # local GATK jar to use instead of the one in the docker image

    # runtime
    String gatk_docker
    Int? mem                            # GB; converted to MB below
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    # Leave 500 MB of headroom between the JVM heap and the machine memory.
    Int command_mem = machine_mem - 500

    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        mkdir interval-files
        gatk --java-options "-Xmx${command_mem}m" SplitIntervals \
            -R ${ref_fasta} \
            ${"-L " + intervals} \
            -scatter ${scatter_count} \
            -O interval-files \
            ${split_intervals_extra_args}
        # Copy to the working directory so the glob() in outputs can find them.
        cp interval-files/*.intervals .
    }

    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        cpu: select_first([cpu, 1])
    }

    output {
        # One file per shard, consumed by the scatter in the workflow.
        Array[File] interval_files = glob("*.intervals")
    }
}
# Runs Mutect2 on one interval shard, for a tumor-only or tumor-normal pair.
# Also extracts the tumor (and, if present, normal) sample names with GetSampleName,
# which downstream oncotator annotation consumes.
task M2 {
    # inputs
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai
    File? normal_bam                # omit for tumor-only calling
    File? normal_bai
    File? pon                       # panel of normals VCF
    File? pon_index
    File? gnomad                    # germline resource VCF
    File? gnomad_index
    String? m2_extra_args           # passed verbatim to Mutect2
    Boolean? make_bamout            # if true, emit reassembled reads as bamout.bam
    Boolean compress                # if true, outputs are .vcf.gz/.tbi instead of .vcf/.idx

    String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem                        # GB; converted to MB below
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    Int command_mem = machine_mem - 500

    command <<<
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        # We need to create these files regardless, even if they stay empty
        # (the outputs section reads both unconditionally).
        touch bamout.bam
        echo "" > normal_name.txt

        gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode
        tumor_command_line="-I ${tumor_bam} -tumor `cat tumor_name.txt`"

        # normal_bam interpolates to "" when absent, so -f fails and the normal branch is skipped.
        if [[ -f "${normal_bam}" ]]; then
            gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${normal_bam} -O normal_name.txt -encode
            normal_command_line="-I ${normal_bam} -normal `cat normal_name.txt`"
        fi

        gatk --java-options "-Xmx${command_mem}m" Mutect2 \
            -R ${ref_fasta} \
            $tumor_command_line \
            $normal_command_line \
            ${"--germline-resource " + gnomad} \
            ${"-pon " + pon} \
            ${"-L " + intervals} \
            -O "${output_vcf}" \
            ${true='--bam-output bamout.bam' false='' make_bamout} \
            ${m2_extra_args}
    >>>

    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        cpu: select_first([cpu, 1])
    }

    output {
        File unfiltered_vcf = "${output_vcf}"
        File unfiltered_vcf_index = "${output_vcf_index}"
        File output_bamOut = "bamout.bam"            # empty placeholder unless make_bamout was set
        String tumor_sample = read_string("tumor_name.txt")
        String normal_sample = read_string("normal_name.txt")  # empty string for tumor-only runs
    }
}
# Merges the per-shard unfiltered VCFs from the M2 scatter into a single VCF + index.
task MergeVCFs {
    # inputs
    Array[File] input_vcfs
    Array[File] input_vcf_indices
    String output_name              # basename without extension
    Boolean compress                # if true, outputs are .vcf.gz/.tbi instead of .vcf/.idx
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem                        # GB; converted to MB below
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    Int command_mem = machine_mem - 1000

    # using MergeVcfs instead of GatherVcfs so we can create indices
    # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs.
    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
        gatk --java-options "-Xmx${command_mem}m" MergeVcfs -I ${sep=' -I ' input_vcfs} -O ${output_vcf}
    }

    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        cpu: select_first([cpu, 1])
    }

    output {
        File merged_vcf = "${output_vcf}"
        File merged_vcf_index = "${output_vcf_index}"  # produced by MergeVcfs alongside the VCF
    }
}
# Gathers the per-shard bamout files from the M2 scatter into one indexed BAM.
task MergeBamOuts {
    # inputs
    File ref_fasta
    File ref_fai
    File ref_dict
    Array[File]+ bam_outs           # non-empty array enforced by the '+' quantifier
    String output_vcf_name          # basename used for the merged BAM (despite the name)
    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem                        # GB; converted to MB below
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 1000

    command <<<
        # This command block assumes that there is at least one file in bam_outs.
        # Do not call this task if len(bam_outs) == 0
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
        # NOTE(review): ref_dict is an input but is not referenced in this command; presumably it is
        # localized alongside ref_fasta for GatherBamFiles' benefit — confirm it lands in the same directory.
        gatk --java-options "-Xmx${command_mem}m" GatherBamFiles \
            -I ${sep=" -I " bam_outs} -O ${output_vcf_name}.out.bam -R ${ref_fasta}
        samtools index ${output_vcf_name}.out.bam ${output_vcf_name}.out.bam.bai
    >>>

    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        cpu: select_first([cpu, 1])
    }

    output {
        File merged_bam_out = "${output_vcf_name}.out.bam"
        File merged_bam_out_index = "${output_vcf_name}.out.bam.bai"
    }
}
# Collects Picard sequencing artifact metrics on the tumor BAM; the pre-adapter detail
# metrics feed the orientation-bias filter downstream.
task CollectSequencingArtifactMetrics {
    # inputs
    File ref_fasta
    File ref_fai
    # NOTE(review): the reference .dict is not localized by this task — Picard-style tools usually
    # require it next to the fasta; confirm this works in the target backend.
    File tumor_bam
    File tumor_bai
    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem                        # GB; converted to MB below
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 1000

    command {
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
        # -O "gatk" is a file-name prefix: the tool writes gatk.pre_adapter_detail_metrics etc.
        gatk --java-options "-Xmx${command_mem}m" CollectSequencingArtifactMetrics \
            -I ${tumor_bam} -O "gatk" -R ${ref_fasta} -VALIDATION_STRINGENCY LENIENT
    }

    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        cpu: select_first([cpu, 1])
    }

    output {
        File pre_adapter_metrics = "gatk.pre_adapter_detail_metrics"
    }
}
# Estimates cross-sample contamination of the tumor from pileups at common germline
# variant sites, optionally using matched-normal pileups to improve the estimate.
task CalculateContamination {
    # inputs
    File? intervals
    File ref_fasta
    File ref_fai
    File ref_dict
    File tumor_bam
    File tumor_bai
    File? normal_bam                # optional matched normal
    File? normal_bai
    File? variants_for_contamination        # VCF of common variants with allele frequencies
    File? variants_for_contamination_index
    File? gatk_override

    # runtime
    Int? preemptible_attempts
    String gatk_docker
    Int? disk_space
    Int? mem                        # GB; converted to MB below

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        # Initialize so the final command is well-defined even without a matched normal
        # (and robust if the shell runs with nounset).
        NORMAL_CMD=""

        # normal_bam interpolates to "" when absent, so -f fails and this branch is skipped.
        if [[ -f "${normal_bam}" ]]; then
            gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O normal_pileups.table
            NORMAL_CMD="-matched normal_pileups.table"
        fi

        gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O pileups.table
        gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I pileups.table -O contamination.table $NORMAL_CMD
    }

    runtime {
        docker: gatk_docker
        # BUG FIX: was command_mem, which left the JVM heap (-Xmx command_mem) with zero headroom
        # on the provisioned machine; every other task in this file provisions machine_mem.
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + " HDD"
        preemptible: select_first([preemptible_attempts, 10])
    }

    output {
        File pileups = "pileups.table"
        File contamination_table = "contamination.table"
    }
}
# Applies FilterMutectCalls (the primary Mutect2 filtering step) to the merged unfiltered VCF,
# optionally using a contamination table from CalculateContamination.
task Filter {
    # inputs
    File? intervals
    File unfiltered_vcf
    File unfiltered_vcf_index
    String output_name              # basename without extension
    Boolean compress                # if true, outputs are .vcf.gz/.tbi instead of .vcf/.idx
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
    File? contamination_table       # from CalculateContamination; omitted => no contamination filtering
    String? m2_extra_filtering_args # passed verbatim to FilterMutectCalls
    File? gatk_override

    # runtime
    String gatk_docker
    Int? mem                        # GB; converted to MB below
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \
            -O ${output_vcf} \
            ${"--contamination-table " + contamination_table} \
            ${m2_extra_filtering_args}
    }

    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        cpu: select_first([cpu, 1])
    }

    output {
        File filtered_vcf = "${output_vcf}"
        File filtered_vcf_index = "${output_vcf_index}"
    }
}
# Secondary filtering: flags orientation-bias artifacts (e.g. OxoG, FFPE deamination)
# using pre-adapter metrics from CollectSequencingArtifactMetrics.
task FilterByOrientationBias {
    # input
    File? gatk_override
    File input_vcf                  # output of the Filter task
    File input_vcf_index
    String output_name              # basename without extension
    Boolean compress                # if true, outputs are .vcf.gz/.tbi instead of .vcf/.idx
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
    File pre_adapter_metrics
    Array[String]? artifact_modes   # e.g. ["G/T", "C/T"]

    # runtime
    Int? preemptible_attempts
    String gatk_docker
    Int? disk_space
    Int? mem                        # GB; converted to MB below
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 7000
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m" FilterByOrientationBias \
            -V ${input_vcf} \
            -AM ${sep=" -AM " artifact_modes} \
            -P ${pre_adapter_metrics} \
            -O ${output_vcf}
    }

    runtime {
        docker: gatk_docker
        # BUG FIX: was command_mem, which left the JVM heap (-Xmx command_mem) with zero headroom
        # on the provisioned machine; every other task in this file provisions machine_mem.
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        cpu: select_first([cpu, 1])
    }

    output {
        File filtered_vcf = "${output_vcf}"
        File filtered_vcf_index = "${output_vcf_index}"
    }
}
# Annotates the filtered M2 VCF with Oncotator, producing a TCGA MAF. The datasource
# directory is resolved from (in order): a local db dir, a provided tar.gz, or a download
# from the Broad FTP site.
task oncotate_m2 {
    # inputs
    File m2_vcf
    File? onco_ds_tar_gz            # oncotator datasources tarball
    String? onco_ds_local_db_dir    # pre-extracted datasources dir (non-docker runs only)
    String? oncotator_exe           # override path to the oncotator executable
    String? sequencing_center       # MAF "Center" annotation
    String? sequence_source         # MAF "source" annotation
    File? default_config_file
    String case_id                  # tumor sample name (from M2.tumor_sample)
    String? control_id              # normal sample name (from M2.normal_sample)

    # runtime
    String oncotator_docker
    Int? mem                        # GB; converted to MB below
    Int? preemptible_attempts
    Int? disk_space
    Int? cpu
    Boolean use_ssd = false

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3500
    Int command_mem = machine_mem - 500

    command <<<
        # fail if *any* command below (not just the last) doesn't return 0, in particular if wget fails
        set -e

        # local db dir is a directory and has been specified
        if [[ -d "${onco_ds_local_db_dir}" ]]; then
            echo "Using local db-dir: ${onco_ds_local_db_dir}"
            echo "THIS ONLY WORKS WITHOUT DOCKER!"
            ln -s ${onco_ds_local_db_dir} onco_dbdir
        elif [[ "${onco_ds_tar_gz}" == *.tar.gz ]]; then
            echo "Using given tar file: ${onco_ds_tar_gz}"
            mkdir onco_dbdir
            tar zxvf ${onco_ds_tar_gz} -C onco_dbdir --strip-components 1
        else
            echo "Downloading and installing oncotator datasources from Broad FTP site..."
            # Download and untar the db-dir
            wget ftp://[email protected]/bundle/oncotator/oncotator_v1_ds_April052016.tar.gz
            tar zxvf oncotator_v1_ds_April052016.tar.gz
            ln -s oncotator_v1_ds_April052016 onco_dbdir
        fi

        ${default="/root/oncotator_venv/bin/oncotator" oncotator_exe} --db-dir onco_dbdir/ -c $HOME/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \
            -v ${m2_vcf} ${case_id}.maf.annotated hg19 -i VCF -o TCGAMAF --skip-no-alt --infer-onps --collapse-number-annotations --log_name oncotator.log \
            -a Center:${default="Unknown" sequencing_center} \
            -a source:${default="Unknown" sequence_source} \
            -a normal_barcode:${control_id} \
            -a tumor_barcode:${case_id} \
            ${"--default_config " + default_config_file}
    >>>

    runtime {
        docker: oncotator_docker
        memory: machine_mem + " MB"
        bootDiskSizeGb: 12
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        cpu: select_first([cpu, 1])
    }

    output {
        File oncotated_m2_maf="${case_id}.maf.annotated"
    }
}
# Calculates sum of a list of floats
# Sums an array of floats by generating a "a+b+c" expression and evaluating it in Python.
# Used by the workflow to size disks for gather steps from per-shard output sizes.
task SumFloats {
    Array[Float] sizes              # values to sum (GB sizes in this workflow)

    # Runtime parameters
    Int? preemptible_attempts

    command <<<
        # ROBUSTNESS: print(...) works under both Python 2 (parentheses around a single
        # expression are a no-op) and Python 3, so this survives a future image bump.
        python -c "print(${sep="+" sizes})"
    >>>

    output {
        Float total_size = read_float(stdout())
    }

    runtime {
        docker: "python:2.7"
        disks: "local-disk " + 10 + " HDD"
        preemptible: select_first([preemptible_attempts, 10])
    }
}
# Functionally annotates the filtered M2 VCF with GATK Funcotator. Datasources are taken
# from the provided tarball or downloaded from the Broad FTP site when absent.
task Funcotate {
    # inputs
    File ref_fasta
    File ref_fai
    File ref_dict
    File m2_vcf
    File m2_vcf_index
    String reference_version        # e.g. "hg19"/"hg38"; passed as --ref-version
    String output_name              # basename without extension
    Boolean compress                # if true, outputs are .vcf.gz/.tbi instead of .vcf/.idx
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
    File? data_sources_tar_gz       # funcotator datasources tarball; omitted => download default
    String? transcript_selection_mode
    Array[String]? transcript_selection_list
    Array[String]? annotation_defaults
    Array[String]? annotation_overrides

    # ==============
    # Process input args:
    # Each flag is emitted only when its array is defined; the matching array is then
    # joined with the same flag as separator so every element gets the flag.
    String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else ""
    String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else ""
    String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else ""
    # ==============

    # runtime
    String gatk_docker
    File? gatk_override
    Int? mem                        # GB; converted to MB below
    Int? preemptible_attempts
    Int? disk_space_gb
    Int? cpu
    Boolean use_ssd = false

    # You may have to change the following two parameter values depending on the task requirements
    Int default_ram_mb = 3000
    # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples.
    Int default_disk_space_gb = 100

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb
    Int command_mem = machine_mem - 1000

    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        # DATA_SOURCES_TAR_GZ is empty when no tarball was provided, so -e fails and we download.
        DATA_SOURCES_TAR_GZ=${data_sources_tar_gz}
        if [[ ! -e $DATA_SOURCES_TAR_GZ ]] ; then
            # We have to download the data sources:
            echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ"
            echo "Downloading default data sources..."
            wget ftp://[email protected]/bundle/funcotator/funcotator_dataSources.v1.0.20180105.tar.gz
            tar -zxf funcotator_dataSources.v1.0.20180105.tar.gz
            DATA_SOURCES_FOLDER=funcotator_dataSources.v1.0.20180105
        else
            # Extract the tar.gz:
            mkdir datasources_dir
            tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1
            DATA_SOURCES_FOLDER="$PWD/datasources_dir"
        fi

        gatk --java-options "-Xmx${command_mem}m" Funcotator \
            --data-sources-path $DATA_SOURCES_FOLDER \
            --ref-version ${reference_version} \
            -R ${ref_fasta} \
            -V ${m2_vcf} \
            -O ${output_vcf} \
            ${"--transcript-selection-mode " + transcript_selection_mode} \
            ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \
            ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \
            ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides}
    >>>

    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File funcotated_vcf = "${output_vcf}"
        File funcotated_vcf_index = "${output_vcf_index}"
    }
}