This repository has been archived by the owner on Apr 13, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mk_allele_matrix.sh
174 lines (155 loc) · 8.25 KB
/
mk_allele_matrix.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/bin/bash
# Author: Yu Wan <[email protected]>
# Copyright 2017 Yu Wan
# Licensed under the Apache License, Version 2.0
# Creation: 5 Sep 2016, latest edition: 1 Nov 2020
display_usage() {
echo "
This pipeline modifies a combined gene table that is made by SRST2 to make an allele table in accordance with sequence clustering. Use this pipeline on an gene table after reliability assessment of its allele calls. In other words, the input gene table should not have any question marks (laid by SRST2 to denote uncertain allele calls).
Dependencies: CD-HIT, Python 3 and R.
Arguments:
-p: the program you want to run for sequence clustering, including its path when it is not in the current working directory
-d: (optional) the directory where other scripts of this pipeline are located. Default: the current script directory.
-o: (optional) the output directory name not followed by a forward slash. Default: clusters
-a: arguments for CD-HIT-EST
-f: input FASTA files for clustering (consensus allele sequences from SRST2)
-g: a gene table from SRST2
-c: an optional tab-delimited file of allele-hit scores. It is produced by filter_allele_table.R in directory \'reliability\'.
-s: stages to run. -s=all or -s=\'1,3,4\' (maximum: 5) etc. Comma-delimited, no whitespace is allowed.
-e: whether allele names contain unique identifiers. 0: no; 1 (default): yes. For example, set -e=1 when allele names look like dfrA12_1.
-r: whether replace allele names with the representative allele name when their consensus sequences belong to the same cluster.
Usage:
bash mk_allele_matrix.sh -p='apps/seq/cd-hit-est' -a='-c 1 -d 0 -s 1 -aL 1 -aS 1 -A 1 -uL 0 -uS 0 -p 1 -g 1' -f=*.fasta -g='profiles_res_genes.txt' -s=all -e=0
bash mk_allele_matrix.sh -p='apps/seq/cd-hit' -d='../PAMmaker' -o='clrst' -a='-c 1 -d 0 -s 1 -aL 1 -aS 1 -A 1 -uL 0 -uS 0 -p 1 -g 1' -f=*.fasta -g='profiles_res_genes.txt' -s='1,3,4' -e=1
Warning:
Sequence headers in original input FASTA files will be changed by this program, where whitespaces are replaced with '|'. So you may want to compress and backup your raw data before running this pipeline.
"
}
# Display usage information if a null argument, "-h" or "--help" is encountered.
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
display_usage
exit
fi
# Defaults
code_path=$(dirname "$0") # get the directory name of this script
out_path="clusters" # default name of the output directory under $PWD
id_ext=1
rename=false # Do not rename the alleles when their consensus sequences belong to the same cluster.
# Read arguments from the command line ##########
for i in "$@"; do # loop through every argument
case $i in
-p=*)
program="${i#*=}"
;;
-d=*)
code_path="${i#*=}"
;;
-o=*)
out_path="${i#*=}"
;;
-a=*)
arguments="${i#*=}" # get the string after the equal sign in $i
;;
-f=*)
fasta="${i#*=}" # a wildcard statement, such as *.fasta
;;
-g=*)
allele_table="${i#*=}" # an allele table from SRST2 after reliability assessment
;;
-c=*)
score_file="${i#*=}" # an optional tab-delimited file of allele-hit scores
;;
-s=*)
stages="${i#*=}" # get stages (comma-delimited, no whitespaces) to run
;;
-e=*)
id_ext="${i#*=}" # "0" or "1"
;;
-r)
rename=true # enable the allele name replacement
;;
esac
done
# Parse stages ##########
if [[ "$stages" -eq "all" ]]; then
stages=(1, 2, 3, 4, 5)
else
IFS="," read -r -a stages <<< $stages # split
fi
# make an output directory if it does not exist
if [ ! -d ${out_path} ]; then
mkdir $out_path
fi
# 1. Append a sample name to every sequence name in each FASTA file ##########
if [[ "${stages[@]}" =~ "1" ]]; then # if the array contains 1
echo "Appending sample names to sequences."
# replace whitespaces with '|' in-place because CD-HIT does not read information beyond the first space
# sed -i 's/ /\|/g' *.fasta works
sed -i 's/ /\|/g' ${fasta}
fi
# 2. Cluster consensus sequences ##########
if [[ "${stages[@]}" =~ "2" ]]; then
echo "Clustering nucleotide sequences with CD-HIT-EST."
cat $fasta > fasta.tmp
${program} -i fasta.tmp -o ${out_path}/clusters.fna $arguments > ${out_path}/$(basename "${program}").log # run either cd-hit or cd-hit-est
rm fasta.tmp
fi
# 3. Append suffices to allele names as extended identifiers ##########
if [[ "${stages[@]}" =~ "3" ]]; then
# First, convert CD-HIT-EST outputs into a table.
if [ ! -f "${out_path}/cluster_table.txt" ]; then
echo "Tabulating CD-HIT-EST outputs."
# As it is better to keep the cluster_table.txt for additional analyses, I do not make a longer pipe here.
python3 ${code_path}/utility/tabulate_cdhit.py ${out_path}/clusters.fna.clstr > ${out_path}/cluster_table.txt # run the next command as long as this command successed
else
echo "Skip tabulating CD-HIT-EST outputs because it has been done already. Delete the file if you want to rerun this step."
fi
# Next, append a cluster index to every member to make an extended allele ID.
# However, there may be multiple allele names present in the same cluster, and they do not have to belong to the same gene.
# There are two causes of this problem: a nucleotide identity < 100% for sequence clustering, or alleles are so close that
# the same consensus sequence can be equally aligned to different alleles in the reference database at scores influenced by
# the mapping quality. In this case, when it is enabled by a user, the pipeline renames the alleles using the name of the
# representative sequence of the cluster and create a table mapping the old and new names.
echo "Assigning allele identifiers into the genotype table."
if [[ ${id_ext} = 1 ]]; then # Allele names contain the unique identifiers.
if [ "$rename" = true ]; then
python ${code_path}/utility/clustering_allele_variants.py -i $allele_table -c ${out_path}/cluster_table.txt -r -o ${out_path}/modified_gene_table.txt -m ${out_path}/allele_name_replacement.txt
else
python ${code_path}/utility/clustering_allele_variants.py -i $allele_table -c ${out_path}/cluster_table.txt -o ${out_path}/modified_gene_table.txt
fi
else
echo "No unique allele identifiers are present."
if [ "$rename" = true ]; then
python ${code_path}/utility/clustering_allele_variants.py -i $allele_table -c ${out_path}/cluster_table.txt -r -n -o ${out_path}/modified_gene_table.txt -m ${out_path}/allele_name_replacement.txt
else
python ${code_path}/utility/clustering_allele_variants.py -i $allele_table -c ${out_path}/cluster_table.txt -n -o ${out_path}/modified_gene_table.txt
fi
fi
AM_present=`[ -f "${out_path}/modified_gene_table.txt" ]`
if [ ! -f allele_db.fna ] && ${AM_present}; then
echo "Creating a database of representative sequences named by allele names."
if [ "$rename" = true ]; then
python ${code_path}/utility/mk_allele_db.py -a ${out_path}/modified_gene_table.txt -r ${out_path}/allele_name_replacement.txt -c ${fasta} -o ${out_path}/allele_db.fna
else
python ${code_path}/utility/mk_allele_db.py -a ${out_path}/modified_gene_table.txt -c ${fasta} -o ${out_path}/allele_db.fna
fi
fi
fi
# 4. Creating the allele matrix
if [[ "${stages[@]}" =~ "4" ]] && ${AM_present}; then
echo "Converting the genotype table into an allelic presence-absence matrix."
Rscript ${code_path}/utility/convert_matrix.R ${out_path}/modified_gene_table.txt ${out_path}
fi
# Modify allele names in the score file and produce summary statistics of maxMAF for each allele call
# Please ensure the score file is accessible.
# This step does not require every allele in the score file to be present in the allelic PAM. Only those in the PAM
# will be retained in the new score file.
aPAM="${out_path}/allele_paMatrix.txt" # output of convert_matrix.R
if [[ "${stages[@]}" =~ "5" ]] && [ ! -z "$score_file" ] && [ -f "${aPAM}" ]; then
echo "Summarising maximum minor-allele-frequencies (MAF) for each allele call."
if [ "$rename" = true ]; then
Rscript ${code_path}/unicity/maxMAFstats.R -a ${out_path}/allele_paMatrix.txt -s ${score_file} -r ${out_path}/allele_name_replacement.txt -o ${out_path}/allele
else
Rscript ${code_path}/unicity/maxMAFstats.R -a ${out_path}/allele_paMatrix.txt -s ${score_file} -o ${out_path}/allele
fi
fi