-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHFS_first.slurm
83 lines (72 loc) · 3.61 KB
/
HFS_first.slurm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/bin/bash
#SBATCH -J R
#!/bin/bash
#SBATCH -J R
#SBATCH -p 64c512g
#SBATCH --mail-type=end
#SBATCH --mail-user=xx
#SBATCH -N 1
#SBATCH --ntasks-per-node=2 ###depends on RAM
#SBATCH -o array_%A_%a.out
#SBATCH -e array_%A_%a.err
#SBATCH --array=1-1361%1361
########modify these. CHR is chromosome number without "chr".#######
project_id="UKB"
map_path() { echo "~/shapeit5/resources/maps/b38/chr$CHR.b38.gmap.gz"; } ###if your genotype data is already phased, left empty
bgen_path() { echo "/dssg/home/acct-bioxsyy/share/ukb/ukb22828_c${CHR}_b0_v3.bgen"; }
sample_path() { echo "/dssg/home/acct-bioxsyy/share/ukb/ukb22828_c${CHR}_b0_v3.sample"; }
keep_path="/dssg/home/acct-bioxsyy/share/ukb/all_chr/indlist" ###!!remember to SORT this one-column sample id file.
shapeit5_path=~/shapeit5/SHAPEIT5_phase_common_static_v1.0.0 ###if your genotype data is already phased, left empty
###########################
module load miniconda3
source activate flat
export PATH="$PATH:~/FLAT/code/"
chunk=$SLURM_ARRAY_TASK_ID
line=$(awk -v id="$chunk" '$2 == id {print $0}' ~/FLAT/data/chunkchr)
read -r chr _ <<< "$line"
CHR="${chr//chr}"
mkdir -p ~/FLAT/$project_id/${chr}/$chunk
cd ~/FLAT/$project_id/${chr}/$chunk
awk -v id="$chunk" '$6 == id {print $1,$2,$3,$4}' ~/FLAT/data/loci_sliding_full.bed > int.bed
grep -v '^[[:space:]]*$' int.bed | while IFS=' ' read -r _ start end i; do
range="${chr}:${start}-${end}"
mkdir -p ~/FLAT/$project_id/${chr}/${chunk}/$i
cd ~/FLAT/$project_id/${chr}/${chunk}/$i
samtools faidx ~/FLAT/data/$chr.fa $range > int.fa
bgenix -g $(bgen_path) -incl-rsids ~/FLAT/snplist/loci$CHR/$chunk/$i > int.bgen
plink2 --bgen int.bgen ref-first --sample $(sample_path) --hwe 1e-6 --mac 10 --max-alleles 2 --geno 0.1 --recode vcf --out int -keep $keep_path
if [ -z "$shapeit5_path" ]; then
plink2 --vcf int.vcf --mind 0.1 --export haps ref-first --out geno
else
bcftools view -c 0 int.vcf -Oz -o int.vcf.gz
bcftools index int.vcf.gz
if $shapeit5_path --input int.vcf.gz --map $(map_path) --region $CHR --output phased.vcf.gz; then
plink2 --vcf phased.vcf.gz --mind 0.1 --export haps ref-first --out geno
rm phased.vcf.gz
else
sed -i 's/\//|/g' int.vcf
sed -i 's/\.|./0|0/g' int.vcf
plink2 --vcf int.vcf --mind 0.1 --export haps ref-first --out geno
echo "" > failflag
fi
fi
awk 'BEGIN{OFS="\t"}{print "chr"$1,$3,$2,$4,$5,".",".",".","GT"}' geno.haps | awk 'BEGIN{print "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"}1' > variant.txt
cut -d " " -f 6- geno.haps | datamash transpose -W > int.txt
cut -s -f 2 -d " " geno.sample | tail -n +3 | awk -F"_" '{for(i=0;i<2;i++)print$1}' | paste -d"," - int.txt > int1.txt
sort int1.txt -k 2 -t "," > int.txt
awk -F"," '!_[$2]++' int.txt | awk -F"," '{$0=$2","NR} 1' >haps.txt
LC_ALL=C join -1 2 -2 1 -t "," -o1.1 -o2.2 int.txt haps.txt | LC_ALL=C sort -k1 -t "," | datamash -t"," -g1 collapse 2 > haps.geno
LC_ALL=C join -o 1.2,1.3 -t "," -a 2 haps.geno <(tail -n +2 $keep_path | awk 'BEGIN{OFS=","} {$1=$1; print}') | awk 'BEGIN{FS=",";OFS="\t"} {$1=$1; print}' | gzip > hapsgeno.gz
sed -i 's/,/\t/g' haps.txt
nr=$(wc -l haps.txt)
nr=(${nr// / })
nr=${nr[0]}
yes $i | head -n $nr | nl > hap.index
awk 'BEGIN{OFS="\t";} NF{NF--};1' haps.txt | nl | datamash -W transpose | paste variant.txt - | awk 'BEGIN{print "##fileformat=VCFv4.2"}1' | bgzip > haplotype.vcf.gz
bcftools index haplotype.vcf.gz
cp int.fa ref.fa
rm -r *txt geno* int* lift* phased* haps.geno
cd ~/FLAT/$project_id/${chr}/${chunk}
done
# Record completion
echo "$chunk" >> "~/FLAT/$project_id/finishlist"