Merge pull request #6 from UW-GAC/merge-vcf-workflow

Add workflow to merge VCFs
UW-GAC · Nov 20, 2024 · 95ef9e5 · 95ef9e5
2 parents 1c8f0ca + c423794
commit 95ef9e5
Show file tree

Hide file tree

Showing 5 changed files with 163 additions and 9 deletions.
diff --git a/.dockstore.yml b/.dockstore.yml
@@ -33,3 +33,6 @@ workflows:
    - name: bcftools_merge_overlap
      subclass: WDL
      primaryDescriptorPath: /bcftools_merge_overlap.wdl
+   - name: bcftools_merge
+     subclass: WDL
+     primaryDescriptorPath: /bcftools_merge.wdl
diff --git a/.github/workflows/womtool.yml b/.github/workflows/womtool.yml
@@ -26,14 +26,15 @@ jobs:
       fail-fast: false
       matrix:
         wdl-file: [
-          ./plink2_pgen2bed.wdl,
-          ./liftover_vcf_picard.wdl,
-          ./plink2_vcf2bed.wdl,
-          ./bcftools_merge_overlap.wdl,
-          ./plink2_pgen2vcf.wdl,
-          ./plink2_bed2vcf.wdl,
-          ./plink2_vcf2pgen.wdl,
-          ./crossmap.wdl
+          bcftools_merge.wdl,
+          bcftools_merge_overlap.wdl,
+          crossmap.wdl,
+          liftover_vcf_picard.wdl,
+          plink2_bed2vcf.wdl,
+          plink2_pgen2bed.wdl,
+          plink2_pgen2vcf.wdl,
+          plink2_vcf2bed.wdl,
+          plink2_vcf2pgen.wdl
         ]
 
     name: "Run womtool: ${{ matrix.wdl-file }}"

diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@ PRIMED file conversion workflows
 
 This workflow uses [plink2](https://www.cog-genomics.org/plink/2.0/) to convert a file from binary PLINK format (bed/bim/fam) to VCF.
 
-Default behavior is to output SNPs only, omitting any "I/D" codes for indels, as these are not accepted by downstream workflows such as liftover and imputation. 
+Default behavior is to output SNPs only, omitting any "I/D" codes for indels, as these are not accepted by downstream workflows such as liftover and imputation.
 
 Any pseudoautomsomal SNPs ('XY' code) will be merged with the X chromosome using plink2's "merge-x" option. The default is to add 'chr' prefixes to chromosome codes, as this is the standard for hg19 and hg38 and facilitates using UCSC chain files for liftover.
 
@@ -156,3 +156,25 @@ out_file | VCF file with coordinates in target build
 md5sum | md5 checksum of out_file
 rejects_file | VCF file with variants that could not be lifted over
 num_rejects | number of variants in the rejects file
+
+
+## bcftools_merge
+
+This workflow uses [bcftools](https://samtools.github.io/bcftools/bcftools.html#merge) to merge VCFs into a single VCF. Before merging, it creates an index for each VCF. It can run in parallel for multiple sets of VCFs.
+
+Inputs:
+
+input | description
+--- | ---
+vcf_files | An array of arrays of VCF files to merge. Each array of VCF files will be merged into a single VCF file.
+output_prefixes | Array of output prefixes for the merged VCF files. This should be an array of the same length as vcf_files.
+missing_to_ref | Set genotypes at missing sites to the reference allele (0/0). Default is false.
+merge_options | (optional) if specified, additional options to pass to `bcftools merge`
+mem_gb | (optional, default 16 GB) RAM required for merging. If the job fails due to lack of memory, try setting this to a larger value.
+
+Outputs:
+
+output | description
+--- | ---
+out_file | Array of merged VCF files, same length as vcf_files
+out_index_file | Array of index files for the merged VCF files, same length as vcf_files
diff --git a/bcftools_merge.json b/bcftools_merge.json
@@ -0,0 +1,9 @@
+{
+    "bcftools_merge.vcf_files": [
+        ["tmp/AMR_subset1000_chr22.vcf.gz", "tmp/EUR_subset1000_chr22.vcf.gz"]
+    ],
+    "bcftools_merge.output_prefixes": [
+        "merged_chr22"
+    ],
+    "bcftools_merge.missing_to_ref": false
+}
diff --git a/bcftools_merge.wdl b/bcftools_merge.wdl
@@ -0,0 +1,119 @@
+version 1.0
+
+
+workflow bcftools_merge {
+    input {
+        Array[Array[File]] vcf_files
+        Array[String] output_prefixes
+        Boolean missing_to_ref = false
+        String? merge_options
+        Int mem_gb = 16
+    }
+
+    scatter (pair in zip(vcf_files, output_prefixes)) {
+
+        scatter (vcf_file in pair.left) {
+            call create_index_file {
+                input: vcf_file = vcf_file,
+                    output_prefix = "index"
+            }
+        }
+
+        call merge_vcfs {
+            input: vcf_files = pair.left,
+                out_prefix = pair.right,
+                mem_gb = mem_gb,
+                missing_to_ref = missing_to_ref,
+                merge_options = merge_options,
+                index_files = create_index_file.index_file
+        }
+    }
+
+    output {
+        Array[File] out_files = merge_vcfs.out_file
+        Array[File] out_index_files = merge_vcfs.out_index_file
+    }
+
+    meta {
+        author: "Adrienne Stilp"
+        email: "[email protected]"
+    }
+}
+
+task create_index_file {
+
+    input {
+        File vcf_file
+        String? output_prefix = "index"
+    }
+
+    Int disk_size = ceil(2 * size(vcf_file, "GB")) + 2
+
+    command <<<
+
+        echo {~vcf_file}
+
+        bcftools index \
+            ~{vcf_file} \
+            -o ~{output_prefix}.csi
+    >>>
+
+    output {
+        File index_file = "~{output_prefix}.csi"
+    }
+
+    runtime {
+        docker: "nanozoo/bcftools:1.19--1dccf69"
+        disks: "local-disk " + disk_size + " SSD"
+    }
+}
+
+task merge_vcfs {
+    input {
+        Array[File] vcf_files
+        Array[File] index_files
+        String? merge_options
+        String out_prefix
+        Int mem_gb = 16
+        Boolean missing_to_ref = false
+    }
+
+    Int disk_size = ceil(3*(size(vcf_files, "GB"))) + 10
+
+    command <<<
+        set -e -o pipefail
+
+        echo "writing input file"
+        VCF_ARRAY=(~{sep=" " vcf_files}) # Load array into bash variable
+        INDEX_ARRAY=(~{sep=" " index_files}) # Load array into bash variable
+        for idx in ${!VCF_ARRAY[*]}
+        do
+            echo "${VCF_ARRAY[$idx]}##idx##${INDEX_ARRAY[$idx]}"
+        done > files.txt
+
+        echo "printing files to merge"
+        cat files.txt
+
+
+        echo "Merging files..."
+        # Merge files.
+        bcftools merge \
+            -l files.txt \
+            ~{if missing_to_ref then "--missing-to-ref" else ""} \
+            ~{if defined(merge_options) then merge_options else ""} \
+            -o ~{out_prefix}.vcf.gz \
+            --write-index
+    >>>
+
+    output {
+        File out_file = "~{out_prefix}.vcf.gz"
+        File out_index_file = "~{out_prefix}.vcf.gz.csi"
+    }
+
+    runtime {
+        docker: "nanozoo/bcftools:1.19--1dccf69"
+        disks: "local-disk " + disk_size + " SSD"
+        memory: mem_gb + " GB"
+
+    }
+}