-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathheuristics_PSSM.py
243 lines (162 loc) · 7.39 KB
/
heuristics_PSSM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#!/usr/bin/env python
# coding: utf-8
# # Heuristic sequence weighting
import numpy as np
import pandas as pd
from time import time
import math
import os
# ### Functions
def load_data_utils():
data_dir = "/Users/laurasansc/github/algorithms_bioinformatics/data/"
# Load the alphabet file
alphabet_file = data_dir + "Matrices/alphabet"
alphabet = np.loadtxt(alphabet_file, dtype=str)
# Load the matrices file
blosum62_file = data_dir + "Matrices/blosum62.freq_rownorm"
_blosum62 = np.loadtxt(blosum62_file, dtype=float).T
blosum62 = {}
for i, letter_1 in enumerate(alphabet):
blosum62[letter_1] = {}
for j, letter_2 in enumerate(alphabet):
blosum62[letter_1][letter_2] = _blosum62[i, j]
# Load the bg information
bg_file = data_dir + "Matrices/bg.freq.fmt"
_bg = np.loadtxt(bg_file, dtype=float)
bg = {}
for i in range(0, len(alphabet)):
bg[alphabet[i]] = _bg[i]
return alphabet, blosum62, bg
def initialize_matrix(peptide_len, alphabet):
init_matrix = [0]*peptide_len
for i in range(0, peptide_len):
row = {}
for letter in alphabet:
row[letter] = 0.0
init_matrix[i] = row
return init_matrix
def c_matrix(peptides, peptide_len, alphabet):
cmatrix = initialize_matrix(peptide_len, alphabet)
for position in range(0, peptide_len):
for peptide in peptides:
cmatrix[position][peptide[position]] += 1
return cmatrix
def heuristic_weighting(peptides, cmatrix, sequence_weighting, peptide_len, alphabet):
# w = 1 / r * s
# where
# r = number of different amino acids in column
# s = number of occurrence of amino acid in column
weights = {}
for peptide in peptides:
# apply sequence weighting
if sequence_weighting:
w = 0.0
neff = 0.0
for position in range(0, peptide_len):
r = 0
for letter in alphabet:
if cmatrix[position][letter] != 0:
r += 1
s = cmatrix[position][peptide[position]]
w += 1.0/(r * s)
neff += r
neff = neff / peptide_len
# do not apply sequence weighting
else:
w = 1
neff = len(peptides)
weights[peptide] = w
return weights,neff
def f_matrix(peptides, peptide_len, alphabet, weights):
fmatrix = initialize_matrix(peptide_len, alphabet)
for position in range(0, peptide_len):
n = 0;
for peptide in peptides:
fmatrix[position][peptide[position]] += weights[peptide]
n += weights[peptide]
for letter in alphabet:
fmatrix[position][letter] = fmatrix[position][letter]/n
return fmatrix
def g_matrix(fmatrix, peptide_len, alphabet, blosum62):
gmatrix = initialize_matrix(peptide_len, alphabet)
for position in range(0, peptide_len):
for letter_1 in alphabet:
for letter_2 in alphabet:
gmatrix[position][letter_1] += fmatrix[position][letter_2] * blosum62[letter_1][letter_2]
return gmatrix
def p_matrix(fmatrix, gmatrix, peptide_len, alphabet, beta, neff):
pmatrix = initialize_matrix(peptide_len, alphabet)
alpha = neff - 1
for position in range(0, peptide_len):
for a in alphabet:
pmatrix[position][a] = (alpha*fmatrix[position][a] + beta*gmatrix[position][a]) / (alpha + beta)
return pmatrix
def w_matrix(pmatrix, bg, peptide_len, alphabet):
wmatrix = initialize_matrix(peptide_len, alphabet)
for position in range(0, peptide_len):
for letter in alphabet:
if pmatrix[position][letter] > 0:
wmatrix[position][letter] = 2 * math.log(pmatrix[position][letter]/bg[letter])/math.log(2)
else:
wmatrix[position][letter] = -999.9
return wmatrix
def to_psi_blast(matrix):
header = ["", "A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
print ('{:>4} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8}'.format(*header))
letter_order = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
for i, row in enumerate(matrix):
scores = []
scores.append(str(i+1) + " A")
for letter in letter_order:
score = row[letter]
scores.append(round(score, 4))
print('{:>4} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8}'.format(*scores))
def to_psi_blast_file(matrix, file_name):
with open(file_name, 'w') as file:
header = ["", "A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
file.write ('{:>4} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8}\n'.format(*header))
letter_order = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
for i, row in enumerate(matrix):
scores = []
scores.append(str(i+1) + " A")
for letter in letter_order:
score = row[letter]
scores.append(round(score, 4))
file.write('{:>4} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8}\n'.format(*scores))
# ### Data directory
def main():
print("Calculating the PSSMs with an heuristics algorithms")
data_dir = "/Users/laurasansc/github/algorithms_bioinformatics/data/"
# ### Main
alphabet, blosum62, bg = load_data_utils()
sequence_weighting = True
beta = 50
peptide_len = 9
train_dir = data_dir + 'train/'
# Iterate through data directory to find train files
for peptides_file in os.listdir(train_dir):
if peptides_file.endswith("train.csv"):
allele = peptides_file.replace("_train.csv","")
# Read file
peptides_df = pd.read_csv(train_dir+peptides_file)
raw_peptides = peptides_df["sequence"].to_list()
peptides = []
for i in range(0, len(raw_peptides)):
if len(raw_peptides[i]) == peptide_len:
peptides.append(raw_peptides[i])
#else:
# print("Peptide length too short discard", raw_peptides[i])
### get count matrices
cmatrix = c_matrix(peptides, peptide_len, alphabet)
### weights
weights, neff = heuristic_weighting(peptides, cmatrix, sequence_weighting, peptide_len, alphabet)
### f, g, p and w matrices
fmatrix = f_matrix(peptides, peptide_len, alphabet, weights)
gmatrix = g_matrix(fmatrix, peptide_len, alphabet, blosum62)
pmatrix = p_matrix(fmatrix, gmatrix, peptide_len, alphabet, beta, neff)
wmatrix = w_matrix(pmatrix, bg, peptide_len, alphabet)
# Write out PSSM in Psi-Blast format to file
file_name = data_dir + "Heuristics_PSSM/w_matrix_" + allele + '.csv'
to_psi_blast_file(wmatrix, file_name)
if __name__ == "__main__":
main()