-
Notifications
You must be signed in to change notification settings - Fork 1
/
filter_vars.py
188 lines (155 loc) · 5.64 KB
/
filter_vars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
'''
This script will put the eight results files from Javi's script for each
sample (one per segment) together. Positions with coverage below the
user-set minimum will then be removed.It will then filter for positions
where at least one nucleotide (not matching the reference) is present at
a frequency above another user-defined minimum.
The parametersFile should be the arguments for this script. An example is
example.filt_args.
The aa_ref file header also needs to include the reading frame (1, 2 or 3) and offset
of the sequence in amino acids with respect to the nucleotide reference sequence
for that gene. An example of how to format this is shown in the reference for PB1
included in the repository. PB1_2_3 indicates that when the nucleotide reference
sequence of PB1 was translated (using the ExPASy online tool) that the protein
sequence was in reading frame 2 and there was a gap of 3 amino acids at the start before
the start of the ORF i.e. the M residue was in position 4.
'''
import sys
import pandas as pd
import numpy as np
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
parametersFile = sys.argv[1]
for line in open(parametersFile):
exec(line.strip())
class ref_gene:
def __init__(self, name):
self.name = name
self.nt_seq = ""
self.aa_seq = ""
self.aa_off = 0
self.aa_rf = 0
def add_nt_seq(self, seq):
self.nt_seq = seq
def add_aa_seq(self, seq, ref_frame, offset):
self.aa_seq = seq
self.aa_rf = int(ref_frame)
self.aa_off = int(offset)
def get_nt_refs(nt_ref_dir, gene_list):
if not nt_ref_dir.endswith("/"):
nt_ref_dir += "/"
gene_paths = [nt_ref_dir + gene for gene in gene_list]
gene_seqs = ["".join(open(path).read().split("\n")[1:-1]) for path in gene_paths]
ref_dict = {gene: seq for gene, seq in zip(gene_list, gene_seqs)}
return ref_dict
def get_aa_refs(aa_ref_dir, gene_list):
if not aa_ref_dir.endswith("/"):
aa_ref_dir += "/"
aa_paths = [aa_ref_dir + gene for gene in gene_list]
aa_data = [open(path).read().split("\n")[:-1] for path in aa_paths]
aa_info = [data[0].split()[-1].split("_")[1:] for data in aa_data]
aa_seqs = ["".join(open(path).read().split("\n")[1:-1]) for path in aa_paths]
comb = [[seq] + info for seq, info in zip(aa_seqs, aa_info)]
ref_dict = {gene: comb_info for gene, comb_info in zip(gene_list, comb)}
return ref_dict
def init_ref_inst(name, nt_ref_dict, aa_ref_dict):
aa_seq = aa_ref_dict[name][0]
aa_rf = aa_ref_dict[name][1]
aa_off = aa_ref_dict[name][2]
class_inst = ref_gene(name)
class_inst.add_nt_seq(nt_ref_dict[name])
class_inst.add_aa_seq(aa_seq, aa_rf, aa_off)
return class_inst
def get_results(res_path, gene_list):
res_paths = [res_path.replace("###",gene) for gene in gene_list]
res_list = [pd.read_csv(path) for path in res_paths]
comb_res = pd.concat(res_list)
return comb_res
def fill_df(row, ref_dict, min_freq):
#Filling codon positions and ref AAs
gene = row["gene"]
pos = row["pos"] - 1
rf = ref_dict[gene].aa_rf - 1
offset = ref_dict[gene].aa_off
codon_pos = ((pos - rf) // 3) - offset
if codon_pos < 0:
codon_pos = np.nan
row["codon"] = codon_pos + 1
try:
ref_AA = ref_dict[gene].aa_seq[int(codon_pos)]
except:
ref_AA = np.nan
row["ref_AA"] = ref_AA
#Filling ref nts
pos = row["pos"] - 1
ref_nt_seq = ref_dict[gene].nt_seq
ref_base = ref_nt_seq[pos]
row["ref_nt"] = ref_base
#Ordering variant nts, filling nt freqs,
#translating variant codons and testing if variants
#are synonymous
cov = row["cov"]
bases = ["A","C","G","T"]
if cov > 0:
base_freqs = {base: row[base]/cov for base in bases}
else:
base_freqs = {base: 0 for base in bases}
sorted_base_freqs = sorted(base_freqs.items(), reverse=True, key=lambda x: x[1])
pos_in_codon = (pos - rf) % 3
codon = list(ref_nt_seq[pos - pos_in_codon : pos + (3 - pos_in_codon)])
for num in range(1,5):
i = num - 1
base = sorted_base_freqs[i][0]
freq = round(sorted_base_freqs[i][1], 3)
row["nt_" + str(num)] = base
row["freq_" + str(num)] = freq
try:
codon[pos_in_codon] = base
except:
None
s_codon = Seq("".join(codon), IUPAC.ambiguous_dna)
if len(s_codon) == 3:
var_AA = str(s_codon.translate())
if float(freq) >= min_freq and type(ref_AA) == str:
if var_AA == ref_AA:
row["syn_" + str(num)] = "syn"
else:
row["syn_" + str(num)] = "non-syn"
else:
var_AA = np.nan
row["AA_" + str(num)] = var_AA
return row
#Reading in reference sequences
genes = ["HA", "MA", "NA", "NP", "NS", "PA", "PB1", "PB2"]
nt_refs = get_nt_refs(nt_ref_path, genes)
aa_refs = get_aa_refs(aa_ref_path, genes)
refs = {gene: init_ref_inst(gene, nt_refs, aa_refs) for gene in genes}
#Reading in inputs
results = get_results(results_path, genes)
#this is because pandas interprets the gene name "NA" as empty
results["seq"] = results["seq"].fillna("NA")
results = results[results["afterFilt"] >= min_cov]
#Creating new dataframe for results
cols = ["gene","pos","codon","cov","ref_nt","ref_AA","",
"freq_1","nt_1","AA_1","syn_1","",
"freq_2","nt_2","AA_2","syn_2","",
"freq_3","nt_3","AA_3","syn_3","",
"freq_4","nt_4","AA_4","syn_4","",
"A","C","G","T"]
df = pd.DataFrame(columns = cols)
df["gene"] = results["seq"]
df["pos"] = results["pos"]
df["cov"] = results["afterFilt"]
bases = ["A", "C", "G", "T"]
for base in bases:
df[base] = results[base]
df = df.apply(fill_df, axis = 1, ref_dict = refs, min_freq = min_var_freq)
for base in bases:
del df[base]
df = df[(df["ref_nt"] != df["nt_1"]) | (df["freq_2"] >= min_var_freq)]
#Saving outputs
sample_name = results_path.split("/")[-1].split("_")[0]
if not output_path.endswith("/"):
output_path += "/"
all_filtered_res_path = output_path + sample_name + "_filtered_variants.csv"
df.to_csv(all_filtered_res_path)