Skip to content

Commit

Permalink
Merge pull request #6 from UW-GAC/merge-vcf-workflow
Browse files Browse the repository at this point in the history
Add workflow to merge VCFs
  • Loading branch information
amstilp authored Nov 20, 2024
2 parents 1c8f0ca + c423794 commit 95ef9e5
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 9 deletions.
3 changes: 3 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,6 @@ workflows:
- name: bcftools_merge_overlap
subclass: WDL
primaryDescriptorPath: /bcftools_merge_overlap.wdl
- name: bcftools_merge
subclass: WDL
primaryDescriptorPath: /bcftools_merge.wdl
17 changes: 9 additions & 8 deletions .github/workflows/womtool.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,15 @@ jobs:
fail-fast: false
matrix:
wdl-file: [
./plink2_pgen2bed.wdl,
./liftover_vcf_picard.wdl,
./plink2_vcf2bed.wdl,
./bcftools_merge_overlap.wdl,
./plink2_pgen2vcf.wdl,
./plink2_bed2vcf.wdl,
./plink2_vcf2pgen.wdl,
./crossmap.wdl
bcftools_merge.wdl,
bcftools_merge_overlap.wdl,
crossmap.wdl,
liftover_vcf_picard.wdl,
plink2_bed2vcf.wdl,
plink2_pgen2bed.wdl,
plink2_pgen2vcf.wdl,
plink2_vcf2bed.wdl,
plink2_vcf2pgen.wdl
]

name: "Run womtool: ${{ matrix.wdl-file }}"
Expand Down
24 changes: 23 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ PRIMED file conversion workflows

This workflow uses [plink2](https://www.cog-genomics.org/plink/2.0/) to convert a file from binary PLINK format (bed/bim/fam) to VCF.

Default behavior is to output SNPs only, omitting any "I/D" codes for indels, as these are not accepted by downstream workflows such as liftover and imputation.
Default behavior is to output SNPs only, omitting any "I/D" codes for indels, as these are not accepted by downstream workflows such as liftover and imputation.

Any pseudoautomsomal SNPs ('XY' code) will be merged with the X chromosome using plink2's "merge-x" option. The default is to add 'chr' prefixes to chromosome codes, as this is the standard for hg19 and hg38 and facilitates using UCSC chain files for liftover.

Expand Down Expand Up @@ -156,3 +156,25 @@ out_file | VCF file with coordinates in target build
md5sum | md5 checksum of out_file
rejects_file | VCF file with variants that could not be lifted over
num_rejects | number of variants in the rejects file


## bcftools_merge

This workflow uses [bcftools](https://samtools.github.io/bcftools/bcftools.html#merge) to merge VCFs into a single VCF. Before merging, it creates an index for each VCF. It can run in parallel for multiple sets of VCFs.

Inputs:

input | description
--- | ---
vcf_files | An array of arrays of VCF files to merge. Each array of VCF files will be merged into a single VCF file.
output_prefixes | Array of output prefixes for the merged VCF files. This should be an array of the same length as vcf_files.
missing_to_ref | Set genotypes at missing sites to the reference allele (0/0). Default is false.
merge_options | (optional) if specified, additional options to pass to `bcftools merge`
mem_gb | (optional, default 16 GB) RAM required for merging. If the job fails due to lack of memory, try setting this to a larger value.

Outputs:

output | description
--- | ---
out_file | Array of merged VCF files, same length as vcf_files
out_index_file | Array of index files for the merged VCF files, same length as vcf_files
9 changes: 9 additions & 0 deletions bcftools_merge.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"bcftools_merge.vcf_files": [
["tmp/AMR_subset1000_chr22.vcf.gz", "tmp/EUR_subset1000_chr22.vcf.gz"]
],
"bcftools_merge.output_prefixes": [
"merged_chr22"
],
"bcftools_merge.missing_to_ref": false
}
119 changes: 119 additions & 0 deletions bcftools_merge.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
version 1.0


workflow bcftools_merge {
input {
Array[Array[File]] vcf_files
Array[String] output_prefixes
Boolean missing_to_ref = false
String? merge_options
Int mem_gb = 16
}

scatter (pair in zip(vcf_files, output_prefixes)) {

scatter (vcf_file in pair.left) {
call create_index_file {
input: vcf_file = vcf_file,
output_prefix = "index"
}
}

call merge_vcfs {
input: vcf_files = pair.left,
out_prefix = pair.right,
mem_gb = mem_gb,
missing_to_ref = missing_to_ref,
merge_options = merge_options,
index_files = create_index_file.index_file
}
}

output {
Array[File] out_files = merge_vcfs.out_file
Array[File] out_index_files = merge_vcfs.out_index_file
}

meta {
author: "Adrienne Stilp"
email: "[email protected]"
}
}

task create_index_file {

input {
File vcf_file
String? output_prefix = "index"
}

Int disk_size = ceil(2 * size(vcf_file, "GB")) + 2

command <<<

echo {~vcf_file}

bcftools index \
~{vcf_file} \
-o ~{output_prefix}.csi
>>>

output {
File index_file = "~{output_prefix}.csi"
}

runtime {
docker: "nanozoo/bcftools:1.19--1dccf69"
disks: "local-disk " + disk_size + " SSD"
}
}

task merge_vcfs {
input {
Array[File] vcf_files
Array[File] index_files
String? merge_options
String out_prefix
Int mem_gb = 16
Boolean missing_to_ref = false
}

Int disk_size = ceil(3*(size(vcf_files, "GB"))) + 10

command <<<
set -e -o pipefail

echo "writing input file"
VCF_ARRAY=(~{sep=" " vcf_files}) # Load array into bash variable
INDEX_ARRAY=(~{sep=" " index_files}) # Load array into bash variable
for idx in ${!VCF_ARRAY[*]}
do
echo "${VCF_ARRAY[$idx]}##idx##${INDEX_ARRAY[$idx]}"
done > files.txt

echo "printing files to merge"
cat files.txt


echo "Merging files..."
# Merge files.
bcftools merge \
-l files.txt \
~{if missing_to_ref then "--missing-to-ref" else ""} \
~{if defined(merge_options) then merge_options else ""} \
-o ~{out_prefix}.vcf.gz \
--write-index
>>>

output {
File out_file = "~{out_prefix}.vcf.gz"
File out_index_file = "~{out_prefix}.vcf.gz.csi"
}

runtime {
docker: "nanozoo/bcftools:1.19--1dccf69"
disks: "local-disk " + disk_size + " SSD"
memory: mem_gb + " GB"

}
}

0 comments on commit 95ef9e5

Please sign in to comment.