Skip to content

Commit

Permalink
Merge pull request #145 from parsaeskandar/master
Browse files Browse the repository at this point in the history
Haplotype sampling WDL
  • Loading branch information
jmonlong authored Dec 12, 2023
2 parents 65dd739 + 4a477dc commit 00167b2
Show file tree
Hide file tree
Showing 10 changed files with 721 additions and 112 deletions.
261 changes: 186 additions & 75 deletions README.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions params/haplotype_sampling.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"HaplotypeSampling.IN_GBZ_FILE": "tests/small_sim_graph/graph.gbz",
"HaplotypeSampling.INPUT_READ_FILE_FIRST": "tests/small_sim_graph/reads_1.fastq.gz",
"HaplotypeSampling.INPUT_READ_FILE_SECOND": "tests/small_sim_graph/reads_2.fastq.gz"
}
14 changes: 14 additions & 0 deletions params/haplotype_sampling_and_giraffe.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"Giraffe.INPUT_READ_FILE_1": "tests/small_sim_graph/reads_1.fastq.gz",
"Giraffe.INPUT_READ_FILE_2": "tests/small_sim_graph/reads_2.fastq.gz",
"Giraffe.GBZ_FILE": "tests/small_sim_graph/graph.gbz",
"Giraffe.REFERENCE_PREFIX": "chr",
"Giraffe.SAMPLE_NAME": "s0",
"Giraffe.MIN_FILE": "tests/small_sim_graph/graph.min",
"Giraffe.DIST_FILE": "tests/small_sim_graph/graph.dist",
"Giraffe.OUTPUT_GAF": true,
"Giraffe.OUTPUT_CALLING_BAMS": true,
"Giraffe.OUTPUT_SINGLE_BAM": true,
"Giraffe.MAP_MEM": 8,
"Giraffe.HAPLOTYPE_SAMPLING": true
}
2 changes: 2 additions & 0 deletions params/happy_evaluation.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
"HappyEvaluation.REFERENCE_FILE": "tests/small_sim_graph/ref.fa",
"HappyEvaluation.REMOVE_HOM_REFS": true
}


44 changes: 44 additions & 0 deletions tasks/bioinfo_utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -544,3 +544,47 @@ task convertCRAMtoFASTQ {
docker: "quay.io/biocontainers/samtools:1.14--hb421002_0"
}
}


# Count k-mers in one or two read files with KMC and collect the result as a KFF file.
#
# Inputs:
#   input_read_file_1  - first (or only) read file
#   input_read_file_2  - optional second read file
#   output_file_name   - output prefix handed to kmc; the task collects "<prefix>.kff"
#   kmer_length        - value passed to `kmc -k`
#   max_ram            - value passed to `kmc -m`; also the memory (GB) requested for the task
#   nb_cores           - value passed to `kmc -t`; also the number of CPUs requested
#   disk_size          - local SSD size (GB); defaults to the rounded input size plus 10
# Output:
#   kff_file           - "<output_file_name>.kff"
task kmerCountingKMC {
    input {
        File input_read_file_1
        File? input_read_file_2
        String output_file_name

        Int kmer_length
        Int max_ram = 64
        Int nb_cores = 16
        Int disk_size = round(size(input_read_file_1, "G") + size(input_read_file_2, "G")) + 10
    }

    command <<<
        # -e: stop at the first failing command, -u: unset variables are errors,
        # -x: echo each command before running it, pipefail: a pipeline fails if
        # any of its stages fails.
        set -eux -o pipefail

        # The inputs are handed to kmc through an @-prefixed list file, one path per line.
        # Paths are single-quoted so that names containing spaces or glob characters
        # are written to the list unchanged instead of being split or expanded.
        echo '~{input_read_file_1}' > scratch_file.txt
        ~{if defined(input_read_file_2) then "echo '~{input_read_file_2}' >> scratch_file.txt" else ""}

        # Positional arguments: input list, output prefix, working directory.
        kmc -k~{kmer_length} -m~{max_ram} -okff -t~{nb_cores} @scratch_file.txt '~{output_file_name}' .

        rm scratch_file.txt
    >>>

    output {
        File kff_file = output_file_name + ".kff"
    }

    runtime {
        preemptible: 2
        cpu: nb_cores
        memory: max_ram + " GB"
        disks: "local-disk " + disk_size + " SSD"
        docker: "quay.io/biocontainers/kmc:3.2.1--hf1761c0_2"
    }
}
14 changes: 7 additions & 7 deletions tasks/gam_gaf_utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ task mergeGAMandSort {
memory: in_mem + " GB"
cpu: in_cores
disks: "local-disk " + disk_size + " SSD"
docker: "quay.io/vgteam/vg:ci-684-bc9aa5dfc4b0d14519ea47333075906a4ec74656"
docker: "quay.io/vgteam/vg:v1.50.1"
}
}

Expand Down Expand Up @@ -56,7 +56,7 @@ task mergeGAFandSort {
memory: in_mem + " GB"
cpu: in_cores
disks: "local-disk " + disk_size + " SSD"
docker: "quay.io/vgteam/vg:ci-684-bc9aa5dfc4b0d14519ea47333075906a4ec74656"
docker: "quay.io/vgteam/vg:v1.50.1"
}
}

Expand Down Expand Up @@ -92,7 +92,7 @@ task splitGAM {
memory: in_mem + " GB"
cpu: in_cores
disks: "local-disk " + disk_size + " SSD"
docker: "quay.io/vgteam/vg:ci-684-bc9aa5dfc4b0d14519ea47333075906a4ec74656"
docker: "quay.io/vgteam/vg:v1.50.1"
}
}

Expand Down Expand Up @@ -128,7 +128,7 @@ task splitGAF {
memory: in_mem + " GB"
cpu: in_cores
disks: "local-disk " + disk_size + " SSD"
docker: "quay.io/vgteam/vg:ci-684-bc9aa5dfc4b0d14519ea47333075906a4ec74656"
docker: "quay.io/vgteam/vg:v1.50.1"
}
}

Expand Down Expand Up @@ -161,7 +161,7 @@ task mergeGAF {
memory: "6GB"
cpu: 1
disks: "local-disk " + in_disk + " SSD"
docker: "quay.io/vgteam/vg:ci-684-bc9aa5dfc4b0d14519ea47333075906a4ec74656"
docker: "quay.io/vgteam/vg:v1.50.1"
}
}

Expand Down Expand Up @@ -225,7 +225,7 @@ task surjectGAFtoSortedBAM {
memory: mem_gb + " GB"
cpu: nb_cores
disks: "local-disk " + disk_size + " SSD"
docker: "quay.io/vgteam/vg:ci-684-bc9aa5dfc4b0d14519ea47333075906a4ec74656"
docker: "quay.io/vgteam/vg:v1.50.1"
}
}

Expand Down Expand Up @@ -280,7 +280,7 @@ task surjectGAFtoBAM {
memory: mem_gb + " GB"
cpu: nb_cores
disks: "local-disk " + disk_size + " SSD"
docker: "quay.io/vgteam/vg:ci-684-bc9aa5dfc4b0d14519ea47333075906a4ec74656"
docker: "quay.io/vgteam/vg:v1.50.1"
}
}

162 changes: 162 additions & 0 deletions tasks/vg_indexing.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
version 1.0


# Build a distance index for a GBZ graph by running `vg index -j`.
#
# Inputs:
#   in_gbz_file      - graph in GBZ format
#   nb_cores         - value passed to `vg index -t`; also the number of CPUs requested
#   in_extract_mem   - memory (GB) requested for the task
#   in_extract_disk  - local SSD size (GB); defaults to twice the rounded GBZ size plus 20
# Output:
#   output_dist_index - "<input basename without .gbz>.dist"
task createDistanceIndex {
    input {
        File in_gbz_file
        Int nb_cores = 16
        Int in_extract_mem = 120
        Int in_extract_disk = 2 * round(size(in_gbz_file, "G")) + 20
    }

    # The output is named after the input graph with a trailing ".gbz" removed.
    String output_prefix = sub(basename(in_gbz_file), "\\.gbz$", "")

    command {
        set -eux -o pipefail

        # Both paths are quoted so names with spaces or glob characters survive the shell.
        vg index -t ~{nb_cores} -j "~{output_prefix}.dist" "~{in_gbz_file}"
    }

    output {
        File output_dist_index = "~{output_prefix}.dist"
    }

    runtime {
        preemptible: 2
        cpu: nb_cores
        memory: in_extract_mem + " GB"
        disks: "local-disk " + in_extract_disk + " SSD"
        docker: "quay.io/vgteam/vg:v1.50.1"
    }
}

# Build an r-index for a GBZ graph by running `vg gbwt -r`.
#
# Inputs:
#   in_gbz_file      - graph in GBZ format (given to `vg gbwt` via -Z)
#   nb_cores         - value passed to `--num-threads`; also the number of CPUs requested
#   in_extract_mem   - memory (GB) requested for the task
#   in_extract_disk  - local SSD size (GB); defaults to twice the rounded GBZ size plus 20
# Output:
#   output_R_index   - "<input basename without .gbz>.ri"
task createRIndex {
    input {
        File in_gbz_file
        Int nb_cores = 16
        Int in_extract_mem = 120
        Int in_extract_disk = 2 * round(size(in_gbz_file, "G")) + 20
    }

    # The output is named after the input graph with a trailing ".gbz" removed.
    String out_prefix_name = sub(basename(in_gbz_file), "\\.gbz$", "")

    command {
        # -e: stop at the first failing command, -u: unset variables are errors,
        # -x: echo each command before running it, pipefail: a pipeline fails if
        # any of its stages fails.
        set -eux -o pipefail

        # Both paths are quoted so names with spaces or glob characters survive the shell.
        vg gbwt -p --num-threads ~{nb_cores} -r "~{out_prefix_name}.ri" -Z "~{in_gbz_file}"
    }

    output {
        File output_R_index = "~{out_prefix_name}.ri"
    }

    runtime {
        preemptible: 2
        cpu: nb_cores
        memory: in_extract_mem + " GB"
        disks: "local-disk " + in_extract_disk + " SSD"
        docker: "quay.io/vgteam/vg:v1.50.1"
    }
}

# Generate the haplotype information file (.hapl) for a GBZ graph with `vg haplotypes`.
#
# Inputs:
#   in_gbz_file      - graph in GBZ format
#   in_dist_index    - distance index, passed with -d
#   in_R_index       - r-index, passed with -r
#   kmer_length      - value passed to `--kmer-length`
#   window_length    - value passed to `--window-length`
#   subchain_length  - value passed to `--subchain-length`
#   nb_cores         - value passed to `-t`; also the number of CPUs requested
#   in_extract_mem   - memory (GB) requested for the task
#   in_extract_disk  - local SSD size (GB); defaults to twice the rounded total
#                      size of the three input files plus 20
# Output:
#   output_hap_index - "<input basename without .gbz>.hapl"
task createHaplotypeIndex {
    input {
        File in_gbz_file
        File in_dist_index
        File in_R_index
        Int kmer_length
        Int window_length
        Int subchain_length
        Int nb_cores = 16
        Int in_extract_mem = 120
        Int in_extract_disk = 2 * round(size(in_gbz_file, "G") + size(in_dist_index, "G") + size(in_R_index, "G")) + 20
    }

    # The output is named after the input graph with a trailing ".gbz" removed.
    String out_prefix_name = sub(basename(in_gbz_file), "\\.gbz$", "")

    command {
        # -e: stop at the first failing command, -u: unset variables are errors,
        # -x: echo each command before running it, pipefail: a pipeline fails if
        # any of its stages fails.
        set -eux -o pipefail

        # Every path is quoted so names with spaces or glob characters survive the shell.
        vg haplotypes -v 2 --kmer-length ~{kmer_length} \
            --window-length ~{window_length} \
            --subchain-length ~{subchain_length} \
            -t ~{nb_cores} -d "~{in_dist_index}" \
            -r "~{in_R_index}" -H "~{out_prefix_name}.hapl" "~{in_gbz_file}"
    }

    output {
        File output_hap_index = "~{out_prefix_name}.hapl"
    }

    runtime {
        preemptible: 2
        cpu: nb_cores
        memory: in_extract_mem + " GB"
        disks: "local-disk " + in_extract_disk + " SSD"
        docker: "quay.io/vgteam/vg:v1.50.1"
    }
}


# Build a minimizer index for a GBZ graph with `vg minimizer`.
#
# Inputs:
#   in_gbz_file      - graph in GBZ format
#   in_dist_index    - distance index, passed with -d
#   out_name         - output prefix; the index is written to "<out_name>.min"
#   nb_cores         - value passed to `-t`; also the number of CPUs requested
#   in_extract_mem   - memory (GB) requested for the task
#   in_extract_disk  - local SSD size (GB); defaults to four times the rounded total
#                      size of the GBZ and distance index plus 20
# Output:
#   output_minimizer - "<out_name>.min"
task createMinimizerIndex {
    input {
        File in_gbz_file
        File in_dist_index
        String out_name
        Int nb_cores = 16
        Int in_extract_mem = 120
        Int in_extract_disk = 4 * round(size(in_gbz_file, "G") + size(in_dist_index, "G")) + 20
    }

    command {
        # -e: stop at the first failing command, -u: unset variables are errors,
        # -x: echo each command before running it, pipefail: a pipeline fails if
        # any of its stages fails.
        set -eux -o pipefail

        # out_name is caller-supplied, so it and the file paths are quoted to keep
        # spaces or glob characters from being split or expanded by the shell.
        vg minimizer -p -t ~{nb_cores} -o "~{out_name}.min" -d "~{in_dist_index}" "~{in_gbz_file}"
    }

    output {
        File output_minimizer = "~{out_name}.min"
    }

    runtime {
        preemptible: 2
        cpu: nb_cores
        memory: in_extract_mem + " GB"
        disks: "local-disk " + in_extract_disk + " SSD"
        docker: "quay.io/vgteam/vg:v1.50.1"
    }
}

Loading

0 comments on commit 00167b2

Please sign in to comment.