-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsample.vcf.sh
87 lines (73 loc) · 2.31 KB
/
sample.vcf.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Generate a set of VCF files from the 1000 genomes project
# Each file corresponds to 1 individual (here also a sample) and the
# filename is a catenation of the sample ID and file shecksum (using sha1)
# To reduce VCF file size, the variants are filtered against the AF field value:
# modify the AF_THRESHOLD value accordingly.
# The VCF from 1000 genomes for Chromosome 22 (because it is small)
ORIG_VCF=ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz
FILTERED_VCF=filtered.vcf
AF_THRESHOLD=0.97
CACHE_DIR=./cache
DATA_DIR=./data
check_prerequisites()
{
# bcftools
if ! command -v bcftools &> /dev/null
then
echo "Error: bcftools could not be found"
exit 1
fi
# compgen
if ! command -v compgen &> /dev/null
then
echo "Error: compgen utility could not be found"
exit 1
fi
}
download_orig_file()
{
echo "Downloading Chr22 VCF from 1000 genomes project"
if [ -f "${CACHE_DIR}/${ORIG_VCF}" ]; then
echo "Using Chr22 VCF file from cache."
return 0
fi
curl -v "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/${ORIG_VCF}" -o "${CACHE_DIR}/${ORIG_VCF}"
}
filter_vcf()
{
echo "Generating a downsized filtered version of Chr22 VCF file. This may take a few minutes."
if [ -f "${CACHE_DIR}/${FILTERED_VCF}" ]; then
echo "Using filtered VCF file from cache."
return 0;
fi
bcftools view -i "AF>${AF_THRESHOLD}" "${CACHE_DIR}/${ORIG_VCF}" > "${CACHE_DIR}/${FILTERED_VCF}"
}
create_dirs()
{
if [ ! -d $CACHE_DIR ]; then
mkdir $CACHE_DIR
fi
if [ ! -d $DATA_DIR ]; then
mkdir $DATA_DIR
fi
}
check_prerequisites
create_dirs
download_orig_file
filter_vcf
echo "Generating sample VCF files in ${DATA_DIR}"
FILE="${CACHE_DIR}/${FILTERED_VCF}"
SAMPLE_LIST=$(bcftools query -l $FILE)
SAMPLE_NB=$(echo "${SAMPLE_LIST}" | wc -l | awk '{$1=$1};1')
COUNT=0
for sample in $SAMPLE_LIST; do
((COUNT+=1))
printf "Generating VCF file ${COUNT} / ${SAMPLE_NB} \\r"
SAMPLE_FILE="${DATA_DIR}/${sample}.vcf.gz"
if compgen -G "${DATA_DIR}/${sample}-*.vcf.gz" > /dev/null; then
echo "VCF for ${sample} already exists."
continue
fi
bcftools view -c1 -Oz -s $sample -o $SAMPLE_FILE $FILE
done
echo "${SAMPLE_NB} sample VCF files generated."