-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathext_mut_profile.py
90 lines (78 loc) · 3.29 KB
/
ext_mut_profile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import re
import os
import subprocess
import sys
class Mut_Profile():
def __init__(self):
self.genes = {}
def add_gene(self, name):
self.genes[name] = {"miss_matched": {}, "count": 0 }
def mut(self, gene_name, read, start_index, mutation):
pattern = re.compile('\d+|[\^CGTA]+')
tokens = pattern.findall(mutation)
index = start_index
missed = self.genes[gene_name]["miss_matched"]
self.genes[gene_name]["count"] += 1
for token in tokens:
delete_flag = False
if '0' <= token[0] <= '9':
index += int(token)
continue
if token[0] == '^':
delete_flag = True
token = token[1:]
for nt in token:
if index not in missed:
missed[index] = {"count": 0}
try:
read_nt = read[index - start_index]
except:
pass
if delete_flag:
read_nt = "*"
diff = (nt, read_nt)
if diff not in missed[index]:
missed[index][diff] = [0, 0, 0]
missed[index][diff][0] += 1
# missed[index]["count"] += 1
if not delete_flag:
missed[index]["count"] += 1
index += 1
def set_ratio(self):
for gene in self.genes:
missed = self.genes[gene]["miss_matched"]
for index in missed:
for diff in missed[index]:
if diff == "count":
continue
missed[index][diff][1] = '{0:.10f}'.format((missed[index][diff][0] / missed[index]["count"]))
missed[index][diff][2] = '{0:.10f}'.format(missed[index][diff][0] / self.genes[gene]["count"])
def read_from_file(self, file):
file = file
for line in file:
data = line.strip().split('\t')
if data[0].startswith('@SQ'):
self.add_gene(data[1].split(':')[1])
elif '@' not in data[0] and data[2] != '*':
gene_name = data[2]
read = data[9]
start_index = int(data[3])
mutation = data[12].split(':')[2]
self.mut(gene_name, read, start_index, mutation)
def export(self, out=False):
if (not out):
out=sys.stdout
for gene in self.genes:
out.write(gene+"\t"+str(self.genes[gene]["count"])+"\n")
for index in self.genes[gene]["miss_matched"]:
if self.genes[gene]["miss_matched"][index]["count"] ==0:
continue
#print(index,"\t", self.genes[gene]["miss_matched"][index]) #{('G', '-'): [1, 0.5, 0.1111111111111111], 'count': 2, ('G', 'C'): [1, 0.5, 0.1111111111111111]}
out.write(str(index)+"\t"+str(self.genes[gene]["miss_matched"][index])+"\n") #{('G', '-'): [1, 0.5, 0.1111111111111111], 'count': 2, ('G', 'C'): [1, 0.5, 0.1111111111111111]}
def main():
MutP = Mut_Profile()
a = sys.stdin
MutP.read_from_file(a)
MutP.export()
if __name__ == "__main__":
main()