-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfaster_alternative_workflow_with_bwa.wdl
258 lines (238 loc) · 6.9 KB
/
faster_alternative_workflow_with_bwa.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
workflow FasterAltWorkflow {
File input1_fq
File? input2_fq
String sample
String output_bam_basename
String vcf_basename
File wgs_calling_interval_list
Int haplotype_scatter_count
Int break_bands_at_multiples_of
# Reference Files
File ref_fasta
File ref_fasta_index
File ref_dict
File ref_alt
File ref_bwt
File ref_sa
File ref_amb
File ref_ann
File ref_pac
# Path to tools
String tool_path
#Optimization flags
Int compression_level
call BamMapMarkSort {
input:
input1_fq = input1_fq,
input2_fq = input2_fq,
sample = sample,
tool_path = tool_path,
output_bam_basename = output_bam_basename,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
ref_amb = ref_amb,
ref_ann = ref_ann,
ref_bwt = ref_bwt,
ref_pac = ref_pac,
ref_sa = ref_sa
}
# Break the calling interval_list into sub-intervals
# Perform variant calling on the sub-intervals, and then gather the results
call ScatterIntervalList {
input:
interval_list = wgs_calling_interval_list,
scatter_count = haplotype_scatter_count,
break_bands_at_multiples_of = break_bands_at_multiples_of,
compression_level = compression_level,
tool_path = tool_path
}
# Call variants in parallel over WGS calling intervals
scatter (index in range(ScatterIntervalList.interval_count)) {
# Generate GVCF by interval
call HaplotypeCaller {
input:
input_bam = BamMapMarkSort.output_bam,
input_bam_index = BamMapMarkSort.output_bam_index,
interval_list = ScatterIntervalList.out[index],
vcf_basename = vcf_basename,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
compression_level = compression_level,
tool_path = tool_path
}
}
# Combine by-interval GVCFs into a single sample GVCF file
call GatherHC {
input:
input_vcfs = HaplotypeCaller.output_vcf,
input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
vcf_basename = vcf_basename,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
tool_path = tool_path
}
}
task BamMapMarkSort {
File input1_fq
File? input2_fq
String sample
String tool_path
String bwa_options
String output_bam_basename
# reference files
File ref_fasta
File ref_fasta_index
File ref_dict
File ref_amb
File ref_ann
File ref_bwt
File ref_pac
File ref_sa
# config
Int cores
String sort_memory
Int cpu
String memory
String tmpDir
command <<<
set -euo pipefail
${tool_path}/bwa ${bwa_options} ${input1_fq} ${input2_fq} \
| \
${tool_path}/samblaster \
| \
${tool_path}/samtools sort -m ${sort_memory} -@ ${cores} -l 0 -o ${output_bam_basename}.bam /dev/stdin \
&& \
${tool_path}/samtools index ${output_bam_basename}.bam ${output_bam_basename}.bai
#Using Samtools is 2x faster
#${tool_path}/sambamba view -S -f bam -l 0 /dev/stdin \
#| \
#${tool_path}/sambamba sort --tmpdir=${tmpDir} -m ${sort_memory} -t ${cores} -l 0 -o ${output_bam_basename}.bam /dev/stdin
>>>
runtime {
memory: memory
cpu: cpu
}
output {
File output_bam = "${output_bam_basename}.bam"
File output_bam_index = "${output_bam_basename}.bai"
#if using sambamba - use the .bam.bai index output file
#File output_bam_index = "${output_bam_basename}.bam.bai"
}
}
# This task calls picard's IntervalListTools to scatter the input interval list into scatter_count sub interval lists
# Note that the number of sub interval lists may not be exactly equal to scatter_count. There may be slightly more or less.
# Thus we have the block of python to count the number of generated sub interval lists.
task ScatterIntervalList {
File interval_list
Int scatter_count
Int break_bands_at_multiples_of
Int compression_level
String memory
Int cpu
String tool_path
command <<<
set -e
mkdir out
java -Dsamjdk.compression_level=${compression_level} -Xmx10g -jar ${tool_path}/picard.jar \
IntervalListTools \
SCATTER_COUNT=${scatter_count} \
SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \
UNIQUE=true \
SORT=true \
BREAK_BANDS_AT_MULTIPLES_OF=${break_bands_at_multiples_of} \
INPUT=${interval_list} \
OUTPUT=out
python3 <<CODE
import glob, os
# Works around a JES limitation where multiples files with the same name overwrite each other when globbed
intervals = sorted(glob.glob("out/*/*.interval_list"))
for i, interval in enumerate(intervals):
(directory, filename) = os.path.split(interval)
newName = os.path.join(directory, str(i + 1) + filename)
os.rename(interval, newName)
print(len(intervals))
CODE
>>>
output {
Array[File] out = glob("out/*/*.interval_list")
Int interval_count = read_int(stdout())
}
runtime {
memory: memory
cpu: cpu
}
}
# Call variants on a single sample with HaplotypeCaller to produce a GVCF
task HaplotypeCaller {
File input_bam
File input_bam_index
File interval_list
String vcf_basename
File ref_dict
File ref_fasta
File ref_fasta_index
Float? contamination
String gatk_gkl_pairhmm_implementation
Int gatk_gkl_pairhmm_threads
String smith_waterman_implementation
Int compression_level
String memory
Int cpu
String tool_path
# We use interval_padding 500 below to make sure that the HaplotypeCaller has context on both sides around
# the interval because the assembly uses them.
command <<<
export GATK_LOCAL_JAR=${tool_path}/gatk4.0.0.0/gatk.jar && \
${tool_path}/gatk4.0.0.0/gatk --java-options -Xmx6g \
HaplotypeCaller \
-R ${ref_fasta} \
-I ${input_bam} \
-O ${vcf_basename}.vcf.gz \
-L ${interval_list} \
-ip 100 \
-contamination ${default=0 contamination} \
--max-alternate-alleles 3 \
-pairHMM ${gatk_gkl_pairhmm_implementation} \
--native-pair-hmm-threads ${gatk_gkl_pairhmm_threads} \
--smith-waterman ${smith_waterman_implementation}
#-pairHMM AVX_LOGLESS_CACHING \
>>>
runtime {
memory: memory
cpu: cpu
#require_fpga: "yes"
}
output {
File output_vcf = "${vcf_basename}.vcf.gz"
File output_vcf_index = "${vcf_basename}.vcf.gz.tbi"
}
}
task GatherHC {
Array[File] input_vcfs
Array[File] input_vcfs_indexes
String vcf_basename
File ref_dict
File ref_fasta
File ref_fasta_index
String tool_path
String memory
Int cpu
command {
java -cp ${tool_path}/GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants \
-V ${sep=' -V ' input_vcfs} \
-R ${ref_fasta} \
-out ${vcf_basename}.vcf \
-assumeSorted
}
runtime {
memory: memory
cpu: cpu
}
output {
File gathered_vcf = "${vcf_basename}.vcf"
File gathered_vcf_index = "${vcf_basename}.vcf.idx"
}
}