forked from biowdl/tasks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ncbi.wdl
131 lines (119 loc) · 4.22 KB
/
ncbi.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
version 1.0
# Runs `executable` (default: ncbi-genome-download) with the options built
# from the inputs, then verifies every downloaded file against the MD5SUMS
# file found in the same folder. The task fails if any checksum check fails.
task GenomeDownload {
    input {
        # Passed as --output-folder; also the root of all output globs.
        String outputPath
        # Passed as --section.
        String? section = "refseq"
        # Passed as --format.
        String? format = "all"
        # Passed as --assembly-level.
        String? assemblyLevel = "all"
        # Passed as --taxid when defined.
        String? taxId
        # Passed as --refseq-category when defined.
        String? refseqCategory
        # Adds --human-readable when true.
        Boolean? humanReadable
        # Passed as --uri when defined.
        String? ncbiBaseUri
        # Passed as --parallel when defined.
        Int? parallel
        # Passed as --retries when defined.
        Int? retries
        # Adds --verbose when true.
        Boolean verbose = true
        # Adds --debug when true.
        Boolean debug = false
        # Last positional argument of the executable.
        String? domain = "all"
        String executable = "ncbi-genome-download"
        # Optional shell snippet executed before the executable.
        String? preCommand
    }

    command {
        set -e -o pipefail
        ~{preCommand}
        ~{executable} \
        ~{"--section " + section} \
        ~{"--format " + format} \
        ~{"--assembly-level " + assemblyLevel} \
        ~{"--taxid " + taxId} \
        ~{"--refseq-category " + refseqCategory} \
        ~{"--output-folder " + outputPath} \
        ~{true="--human-readable" false="" humanReadable} \
        ~{"--uri " + ncbiBaseUri} \
        ~{"--parallel " + parallel} \
        ~{"--retries " + retries} \
        ~{true="--verbose" false="" verbose} \
        ~{true="--debug" false="" debug} \
        ~{domain}

        # Check md5sums for all downloaded files.
        # The checksum lines are streamed straight into md5sum so that every
        # entry stays on its own line. An unquoted echo of the collected
        # lines would join them into a single line and break the check as
        # soon as a folder holds more than one file.
        for folder in "$(realpath ~{outputPath})"/*/*/*
        do
            (
                cd "$folder"
                for file in *
                do
                    if [[ "$file" != "MD5SUMS" ]]
                    then
                        # -F: match the file name literally, not as a regex.
                        # Files without an entry in MD5SUMS are skipped.
                        grep -F -- "$file" MD5SUMS || true
                    fi
                done | md5sum -c
            )
        done
    }

    output {
        # Each output collects the files below outputPath, three folder
        # levels deep, whose names end in the given suffix.
        Array[File] fastaGzFiles = glob(outputPath + "/*/*/*/*_genomic.fna.gz")
        Array[File] genbankGzFiles = glob(outputPath + "/*/*/*/*_genomic.gbff.gz")
        Array[File] featuresGzFiles = glob(outputPath + "/*/*/*/*_feature_table.txt.gz")
        Array[File] gffGzFiles = glob(outputPath + "/*/*/*/*_genomic.gff.gz")
        Array[File] proteinFastaGzFiles = glob(outputPath + "/*/*/*/*_protein.faa.gz")
        Array[File] genpeptGzFiles = glob(outputPath + "/*/*/*/*_protein.gpff.gz")
        Array[File] wgsGzFiles = glob(outputPath + "/*/*/*/*_wgsmaster.gbff.gz")
        Array[File] cdsFastaGzFiles = glob(outputPath + "/*/*/*/*_cds_from_genomic.fna.gz")
        Array[File] rnaFastaGzFiles = glob(outputPath + "/*/*/*/*_rna_from_genomic.fna.gz")
        Array[File] assemblyReportFiles = glob(outputPath + "/*/*/*/*_assembly_report.txt")
        Array[File] assemblyStatsFiles = glob(outputPath + "/*/*/*/*_assembly_stats.txt")
    }
}
# Fetches nt.gz and its companion files from the NCBI rsync server into
# ntDir, verifies nt.gz against nt.gz.md5 and, when requested, decompresses
# it to ntFilePath.
task DownloadNtFasta {
    input {
        # Base directory; returned unchanged as the `library` output.
        String libraryPath
        # NOTE(review): not referenced by the command or the outputs of this
        # task; kept because it is part of the task interface.
        String seqTaxMapPath
        # When true, nt.gz is decompressed to ntFilePath.
        Boolean unzip = true
        # Directory that receives the downloaded files.
        String ntDir = libraryPath + "/nt"
        # Destination of the decompressed FASTA.
        String ntFilePath = ntDir + "/nt.fna"
    }

    command {
        set -e -o pipefail
        mkdir -p ~{ntDir}
        rsync -av --partial \
            rsync://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz* \
            ~{ntDir}
        # Verify the download from inside ntDir, in a subshell so the
        # working directory of the rest of the script is untouched.
        (
            cd ~{ntDir}
            md5sum -c nt.gz.md5
        )
        # Only unzip when necessary
        if [ "~{true='true' false='false' unzip}" = "true" ]
        then
            zcat ~{ntDir}/nt.gz > ~{ntFilePath}
        fi
    }

    output {
        File ntFileGz = ntDir + "/nt.gz"
        File library = libraryPath
        # Added array file to allow for multiple downloads later.
        # Also allows for easier pipeline logic.
        Array[File] ntFastas = glob(ntDir + "/*.fna")
        Array[File] ntFastasGz = glob(ntDir + "/nt*.gz")
    }
}
# Downloads the nucl_*.accession2taxid.gz files (plus companion files
# matching the same prefix) from the NCBI rsync server, verifies them with
# the downloaded .md5 files and writes a two-column ".seqtaxmap" file next
# to each archive.
task DownloadAccessionToTaxId {
    input {
        # Directory that receives the downloads and the generated maps.
        String downloadDir
        # When true, each map is gzip-compressed and gets a ".gz" suffix.
        Boolean gzip = false
    }

    command {
        set -e -o pipefail
        mkdir -p ~{downloadDir}
        rsync -av --partial \
            rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_*.accession2taxid.gz* \
            ~{downloadDir}
        # Run the checksum verification from inside the download directory.
        (
            cd ~{downloadDir}
            md5sum -c *.md5
        )
        for archive in ~{downloadDir}/nucl_*.accession2taxid.gz
        do
            mapping=$archive.seqtaxmap~{true='.gz' false='' gzip}
            # Drop the first line (presumably the column header) and keep
            # only columns 2 and 3 of every remaining line.
            zcat $archive | tail -n +2 | cut -f 2,3 ~{true="| gzip" false='' gzip} > $mapping
        done
    }

    output {
        # Uncompressed maps (written when gzip is false).
        Array[File] seqTaxMaps = glob(downloadDir + "/*.seqtaxmap")
        # Compressed maps (written when gzip is true).
        Array[File] seqTaxMapsGz = glob(downloadDir + "/*.seqtaxmap.gz")
    }
}