-
Notifications
You must be signed in to change notification settings - Fork 0
/
mergeVCF.pbs
128 lines (99 loc) · 4.73 KB
/
mergeVCF.pbs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/bin/bash
####### RITCHIELAB PBS TEMPLATE FILE
#
# Make a copy of this script to use as the basis for your own script.
#
# Most of the available PBS options are described below, with a default
# or example setting. Lines starting with "##PBS" are ignored; to enable
# them, remove the second #.
#
# Put your own job commands inside the marked off section near the bottom,
# leaving the commands above and below it in place. In order to avoid an
# excessive IO burden on the network filesystem, it is best to copy your
# input data to the provided ${TMPDIR}, generate any output there, and then
# copy the final results back to the group directory.
####### user-assigned job name; avoid special characters besides _.-
#PBS -N merge_variants
####### email address to send notifications to: user@host[,user@host[...]]
#PBS -M [email protected]
####### types of email notifications to send: [a]bort, [b]egin, [e]nd, [n]one
#PBS -m bae
####### restart job from the beginning if it crashes (will overwrite previous output!): [y]es, [n]o
##PBS -r y
####### special queue name (we have "lionxf-mdr23" on LionXF only)
####### leave this out to let our qsub wrapper detect and use any available priority queue
##PBS -q queuename
####### run as an array job with these (numeric) ID numbers
##PBS -t 0,1,2-7
####### Allow others in the group to see the output
#PBS -W umask=0027
####### Throttle jobs by using a virtual resource (LionXF ONLY)
####### N can be any of 1,2,3,4,5
####### M is the amount of capacity to consume per job (max capacity is 1000)
##PBS -l gres=ritchielab_N:M
####### number of cluster nodes and/or processors to use (ACCRE:always append ":x86")
####### "nodes=X:ppn=Y" for Y cores each on X different nodes
####### "nodes=X" for X cores on any (or the same) node
#PBS -l nodes=1
####### maximum per-job memory (total shared by all cores/nodes)
#PBS -l mem=30gb
####### maximum per-core memory
#PBS -l pmem=30gb
####### maximum job execution time (real time, not just CPU time): DD:HH:MM:SS
#PBS -l walltime=12:00:00
####### output filename (default:"<script.pbs>.o<jobid>")
##PBS -o output.file
####### combine output streams: std[o]ut, std[e]rr
#PBS -j oe
####### these env vars are available when the job runs:
####### PBS_JOBNAME user-assigned job name as provided at submission
####### PBS_O_HOST name of the host on which qsub was run
####### PBS_O_LOGNAME name of user who submitted the job
####### PBS_O_HOME absolute path of the home directory of the user who submitted the job
####### PBS_O_WORKDIR absolute path from which the job was submitted
####### PBS_O_QUEUE name of the scheduling queue to which the job was submitted
####### PBS_SERVER name of the host to which qsub submitted the job
####### PBS_QUEUE name of the scheduling queue from which the job is being run
####### PBS_JOBID unique job number assigned by the scheduler
####### PBS_NODEFILE filename containing the names of nodes assigned to the job
####### PBS_ARRAYID array identifier for this sub-job within an array job
####### TMPDIR absolute path of temp directory on the assigned node's local disk (not GPFS) -- not provided by ACCRE!
# Derive PBS_BASEID: the leading run of digits in PBS_JOBID (anything after
# it, such as array/queue labels, is dropped), then "." and PBS_QUEUE.
PBS_BASEID=$(grep -Po "^[0-9]+" <<<"${PBS_JOBID}")
# No leading digits means the job id is not in the expected form; give up.
[[ -n "${PBS_BASEID}" ]] || {
  echo "ERROR: unable to identify PBS_BASEID from PBS_JOBID '${PBS_JOBID}'"
  exit 1
}
PBS_BASEID+=".${PBS_QUEUE}"
# Create a private temp directory in $TMPDIR if provided, otherwise /tmp or
# ~/group/tmp, and point TMPDIR at it.
# mktemp -d adds a random suffix to the name, so a directory left behind by an
# earlier run with the same PBS_JOBID cannot make creation fail. Note that
# mktemp creates the directory readable by the owner only.
job_tmpdir=""
for d in "${TMPDIR}" "/tmp" "${RITCHIELAB_GROUP_DIR}/tmp"; do
  # skip candidates that are empty/unset or do not exist on this node
  [[ -d "${d}" ]] || continue
  job_tmpdir=$(mktemp -d "${d}/ritchie_lab.pbstmp.${PBS_JOBID}.XXXXXX") && break
  job_tmpdir=""
done
if [[ -z "${job_tmpdir}" || ! -d "${job_tmpdir}" ]]; then
  echo "ERROR: unable to create temp directory in \$TMPDIR, '/tmp' or '~/group/tmp'" >&2
  exit 1
fi
TMPDIR="${job_tmpdir}"
unset job_tmpdir
# Remove the temp directory whenever the script exits, including early
# "exit N" paths in the job commands. The path is expanded (and shell-quoted)
# now, on purpose, so a later change to TMPDIR cannot redirect the rm.
# shellcheck disable=SC2064
trap "rm -rf -- $(printf '%q' "${TMPDIR}")" EXIT
# PBS always starts scripts in $HOME but most folks expect the script to run in the directory it was submitted from
cd "${PBS_O_WORKDIR}"
####### v---- JOB COMMANDS BELOW ----v
#######################################
# Merge every *.vcf / *.vcf.gz file found directly in $IN_DIR with GATK
# CombineVariants, writing "$PREFIX.merged.vcf.gz", then index it with tabix.
# Globals read:
#   REFERENCE  optional reference FASTA (defaults to the g1k v37 decoy below)
#   IN_DIR     required directory holding the input VCF files
#   PREFIX     optional output prefix (defaults to "$PWD/$PBS_JOBID")
# Globals written:
#   ANALYSIS_FILE_DIR, REF_GENOME, PREFIX, OUT_F
# Returns:
#   2 if IN_DIR is empty/unset or contains no VCF files;
#   the GATK exit status if GATK fails (tabix is then skipped);
#   otherwise the tabix exit status.
# The function returns instead of calling exit so that the clean-up code at
# the bottom of the script still runs and sees the status in $?.
#######################################
merge_vcfs() {
  local -a variant_args=()
  local candidate

  ANALYSIS_FILE_DIR="/gpfs/group1/m/mdr23/datasets/GATK/2.5"
  if [[ -n "${REFERENCE}" ]]; then
    REF_GENOME="${REFERENCE}"
  else
    REF_GENOME="${ANALYSIS_FILE_DIR}/human_g1k_v37_decoy.fasta"
  fi

  if [[ -z "${IN_DIR}" ]]; then
    echo "ERROR: Input directory of VCF files must be given" >&2
    return 2
  fi

  if [[ -z "${PREFIX}" ]]; then
    PREFIX="${PWD}/${PBS_JOBID}"
  fi
  OUT_F="${PREFIX}.merged.vcf.gz"

  # Build one "-V <file>" pair per input as array elements so that paths
  # containing whitespace survive intact. A single glob over the directory
  # keeps the inputs in one sorted sequence across both extensions.
  for candidate in "${IN_DIR}"/*; do
    [[ -f "${candidate}" ]] || continue
    case "${candidate}" in
      *.vcf | *.vcf.gz) variant_args+=(-V "${candidate}") ;;
    esac
  done
  if (( ${#variant_args[@]} == 0 )); then
    echo "ERROR: no .vcf or .vcf.gz files found in '${IN_DIR}'" >&2
    return 2
  fi

  # "|| return" hands GATK's failure status back to the caller and prevents
  # tabix from running on a missing or partial output file.
  JAVA_OPTIONS="-d64 -Xms512m -Xmx27G" GenomeAnalysisTK-3.3-0 \
    -T CombineVariants \
    -R "${REF_GENOME}" \
    -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED \
    "${variant_args[@]}" \
    -o "${OUT_F}" || return

  tabix -p vcf "${OUT_F}"
}

# Must stay the last command of the job section: its status is what the
# clean-up code below picks up from $?.
merge_vcfs
####### ^---- JOB COMMANDS ABOVE ----^
# clean up TMPDIR (but preserve previous exit code)
# CODE must be captured before any other command runs, otherwise $? no longer
# holds the status of the last job command.
CODE=$?
# Only delete a path that still looks like the per-job scratch directory
# created earlier in this script (".../ritchie_lab.pbstmp.<jobid>..."); if the
# job commands changed or emptied TMPDIR, refuse to rm -rf it.
case "${TMPDIR}" in
  */ritchie_lab.pbstmp.*)
    rm -rf -- "${TMPDIR}"
    ;;
  *)
    echo "WARNING: not removing unexpected TMPDIR '${TMPDIR}'" >&2
    ;;
esac
exit "${CODE}"