-
Notifications
You must be signed in to change notification settings - Fork 0
/
Run_ref_prepper.sh
88 lines (60 loc) · 3.51 KB
/
Run_ref_prepper.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/bin/bash
# This script generates K-nearest neighbours using Run_FNN.R script
# This script when run on a given target files creates windows for CNV calling
# This script also outputs extreme GC and low mappability regions given a target file
# Needs the following information from reference shell script specific to batch of interest
# QC BAM STATS FILE (from Picard)
# TARGET FILE (can also be given with -t option)
# FASTA REFERENCE
# MAPPABILITY BED (for appropriate read length)
# SPECIAL REGIONS BED (from CLAMMS)
# INSERT SIZE
# SEQDB (from PLINK)
# MAX and MIN GC
# COMPLEXITY THRESHOLD
# WIll out put the following:
# WINDOWS BED
# TARGET REG
# LOW COMPLEXITY
# EXTREME GC
# k-NN REFERENCES (one file per sample)
# Requires PLINK/SEQ installed and in pathi
usage="
this script prepares reference files for CNV calling with XHMM, CANOES, CLAMMS
it requires an cnv.exome.references.sh containing correct path to each file
and will generate K-nearest neighbours for input to CLAMMS
as well as a windows.bed file for given exon capture kit
along with extreme GC and low complexity targets for input to XHMM
Run_ref_prepper.sh
-r (required) path/to/cnv.exome.references.sh
-t (optional) path/to/targets.bed (unless specified in -r)
-H (flag) echo this message
Usage: bash Run_ref_prepper.sh -r path/to/RefFil -t path/to/TgtFil
(make sure to run on compute node rather than head node on biocluster
and update cnv.exome.references.sh with path to windows.bed afterwards)
"
while getopts r:t:H opt; do
case "$opt" in
r) RefFil="$OPTARG";;
t) TgtFil="$OPTARG";;
H) echo "$usage"; exit;;
esac
done
# Import variables / read in reference file
RefFil=`readlink -f $RefFil`
source $RefFil
# Double check that TgtFil is loaded
TgtFil=`readlink -f $TgtFil`
TgtNam=`basename $TgtFil | cut -f1 -d "."`
# Find K nearest neighbours based on QC file
Rscript $RES_DIR/Run_FNN.R $RES_DIR/knn/$BATCH.QC.bam_stats.txt &
# Create a windows file based on capture kit to calculate read counts
$CLAMMS/annotate_windows.sh $TgtFil $REF $MAP_REG $INSERT_SIZE $SPEC_REG > $RES_DIR/$TgtNam.windows.bed &
# Create locus complexity and gc content files
echo -e "#CHR\tBP1\tBP2\tID" "\n$(cat $TgtFil)" | awk -F "\t" '{if(NR==1){print $0} else {print "chr"$0, $NF="target_"NR-1}}' OFS="\t" | sed -e 's/[[:space:]]*$//' > $RES_DIR/$TgtNam.targets.reg
pseq . loc-load --locdb $RES_DIR/$TgtNam.targets.LOCDB --file $RES_DIR/$TgtNam.targets.reg --group targets --out $RES_DIR/$TgtNam.LOCDB.loc-load.log --noweb
pseq . loc-stats --locdb $RES_DIR/$TgtNam.targets.LOCDB --group targets --seqdb /home/sai2116/bin/plinkseq/hg19/seqdb.hg19 --noweb | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste $TgtFil - | awk '{print $1"\t"$2"\t"$3"\t"$4}' > $RES_DIR/$TgtNam.locus_complexity.txt
cat $RES_DIR/$TgtNam.locus_complexity.txt | awk '{if ($4 > 0.25) print $0}' | awk '{split ($0,a); print (a[1]":"a[2]"-"a[3])}' > $RES_DIR/$TgtNam.low_complexity_targets.txt
pseq . loc-stats --locdb $RES_DIR/$TgtNam.targets.LOCDB --group targets --seqdb /home/sai2116/bin/plinkseq/hg19/seqdb.hg19 --noweb | awk '{if (NR > 1) print $_}' | awk '{if ($8 < $minGC || $8 > $maxGC) print $4}' | sed 's/chr//' | sed 's/\../-/' > $RES_DIR/$TgtNam.extreme_gc_targets.txt
# 1 bp adjustement for book ended intervals
#cat $1.low_complexity_targets.txt | awk '{split($0,1,"-"); print a[1]"-"a[2]-1}' > $1.xhmm.low_complexity_targets.txt