-
Notifications
You must be signed in to change notification settings - Fork 1
/
pairwise_global_module.py
98 lines (78 loc) · 3.43 KB
/
pairwise_global_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import numpy as np
def global_alignment(seqs, subs_mat, gap_penalty, sequence_type):
'''Global alignment method aka Needleman-Wunsch alignment'''
# unpack dict
names_l, seqs_l = zip(*seqs.items())
seq1 = seqs_l[0]
seq2 = seqs_l[1]
# create two matrices (one for storing the values and the other for traceback)
nrow = len(seq1) + 1
ncol = len(seq2) + 1
rownames = ['']
rownames.extend(list(seq1))
colnames = ['']
colnames.extend(list(seq2))
df1 = pd.DataFrame(np.zeros((nrow, ncol)), index=rownames, columns=colnames)
df2 = pd.DataFrame(np.zeros((nrow, ncol)), index=rownames, columns=colnames)
# fill the first row and the first column
df1.iloc[0, 0] = 0
for i in range(1, nrow):
df1.iloc[i, 0] = df1.iloc[i - 1, 0] + gap_penalty
for j in range(1, ncol):
df1.iloc[0, j] = df1.iloc[0, j - 1] + gap_penalty
# make trace df
df2.iloc[0, 0] = '•'
for i in range(1, nrow):
df2.iloc[i, 0] = '↑'
for j in range(1, ncol):
df2.iloc[0, j] = '←'
# fill the rest of the table
for j in range(1, ncol):
for i in range(1, nrow):
if sequence_type == 'protein':
subs_score = subs_mat.loc[seq1[i - 1], seq2[j - 1]]
else:
if seq1[i - 1] == seq2[j - 1]:
subs_score = 1
else: subs_score = -1
df1.iloc[i, j] = max(df1.iloc[i - 1, j] + gap_penalty,
df1.iloc[i, j - 1] + gap_penalty,
df1.iloc[i - 1, j - 1] + subs_score)
if df1.iloc[i, j] == (df1.iloc[i - 1, j - 1] + subs_score):
df2.iloc[i, j] = '↖'
elif df1.iloc[i, j] == (df1.iloc[i - 1, j] + gap_penalty):
df2.iloc[i, j] = '←'
elif df1.iloc[i, j] == (df1.iloc[i, j - 1] + gap_penalty):
df2.iloc[i, j] = '↑'
final_score = df1.iloc[nrow - 1, ncol - 1]
# print('{0} \n\n {1}'.format(df1, df2))
# traceback function (function inside a function to avoid causing problems with a scope)
def traceback(df, seq1, seq2, names_l):
reconst_seq1 = [seq1[-1]]
reconst_seq2 = [seq2[-1]]
# i for row number and j for column number
j = df.shape[1] - 1
i = df.shape[0] - 1
while i > 1 and j > 1:
# if score in the diagonal is the highest
if max(df.iloc[i - 1, j - 1], df.iloc[i - 1, j], df.iloc[i, j - 1]) == df.iloc[i - 1, j - 1]:
reconst_seq1.append(seq1[i - 2])
reconst_seq2.append(seq2[j - 2])
i -= 1
j -= 1
# if score in the left is the highest
elif max(df.iloc[i - 1, j - 1], df.iloc[i - 1, j - 1], df.iloc[i, j - 1]) == df.iloc[i, j - 1]:
reconst_seq1.append('_')
reconst_seq2.append(seq2[j - 2])
j -= 1
# if score above is the highest
else:
reconst_seq1.append(seq1[i - 2])
reconst_seq2.append('_')
i -= 1
return '\n'.join([names_l[0] + ': ' + ''.join(reconst_seq1[::-1]),
names_l[1] + ': ' + ''.join(reconst_seq2[::-1])])
# print('Sequence alignment score:', final_score, '\n')
traceback_seqs = traceback(df1, seq1, seq2, names_l)
return traceback_seqs, '{0} \n\n {1}'.format(df1, df2), final_score