-
Notifications
You must be signed in to change notification settings - Fork 0
/
DnaPlot.py
executable file
·458 lines (419 loc) · 18 KB
/
DnaPlot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
import re
import tkinter as tk
import os.path
from matplotlib import pyplot as plt
from matplotlib import style
import numpy as np
from tkinter import messagebox
from pathlib import Path
#GlobalVariable to store names of all the Different Genome in the raw text file
Filenamelist=[]
#The genome class
class genome(object):
#Object constructor
def __init__(self,inp):
#if else block for constructor overloading
if len(inp)==2: #If reading from the raw file (First read)
self.Name=inp[0] #stores the Name
self.DNAseq=inp[1] #stores the DNA Sequence
self.whaterror={0:'No Error',1:"'Stop' In Middle of sequence",2:'DNA Sequence Not A Multiple of Three'} #An error dict used for storing DNA Sequence Errors
self.CodonCountDict={ #Initializing Dictionary map to store Codon counts to calculate required values
'ATA':0, 'ATC':0, 'ATT':0, 'ATG':0,
'ACA':0, 'ACC':0, 'ACG':0, 'ACT':0,
'AAC':0, 'AAT':0, 'AAA':0, 'AAG':0,
'AGC':0, 'AGT':0, 'AGA':0, 'AGG':0,
'CTA':0, 'CTC':0, 'CTG':0, 'CTT':0,
'CCA':0, 'CCC':0, 'CCG':0, 'CCT':0,
'CAC':0, 'CAT':0, 'CAA':0, 'CAG':0,
'CGA':0, 'CGC':0, 'CGG':0, 'CGT':0,
'GTA':0, 'GTC':0, 'GTG':0, 'GTT':0,
'GCA':0, 'GCC':0, 'GCG':0, 'GCT':0,
'GAC':0, 'GAT':0, 'GAA':0, 'GAG':0,
'GGA':0, 'GGC':0, 'GGG':0, 'GGT':0,
'TCA':0, 'TCC':0, 'TCG':0, 'TCT':0,
'TTC':0, 'TTT':0, 'TTA':0, 'TTG':0,
'TAC':0, 'TAT':0, 'TAA':0, 'TAG':0,
'TGC':0, 'TGT':0, 'TGA':0, 'TGG':0,}
self.ProtienSeq=self.translate() #calls the member function transate() to get the protien seq and store it in a instance variable
self.ProtienCodonCount=self.GetCodonDistribution() #get the codon count distribution
if self.errorno==0: #check if any errors were set during protien sequence generation if yes set Nc to 0 else
self.Nc=self.calcNcValue() #Compute Nc Value
else:
self.Nc=0
else: #if reading from a processed file/db table get individual info and discard unnecessary info
self.Name=inp[0]
self.DNAseq=inp[1]
self.ProtienSeq=inp[2]
self.Nc=float(inp[4])
self.errorno=int(inp[3])
self.whaterror={0:'No Error',1:"'Stop' In Middle of sequence",2:'DNA Sequence Not A Multiple of Three'}
#function to generate the protien sequence from DNA sequence
def translate(self):
table = { #A map to translate individual codons into corresponding protien
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'Stop', 'TAG':'Stop',
'TGC':'C', 'TGT':'C', 'TGA':'Stop', 'TGG':'W',
}
protein ="" #temp variable to store the protien seq
length=len(self.DNAseq) #check DNA seq Length for errors
if length%3 == 0:
for i in range(0, length, 3): #take set of three genes from the start and translate it to corresponding protien
codon = self.DNAseq[i:i + 3]
if i<(length-3) and table[codon] == 'Stop' : #check of "stop in middle" error
self.errorno=1
return 'N/A'
protein+= table[codon]
self.storeCodonCount(codon)
else:
self.errorno=2
return 'N/A'
self.errorno=0
return protein
#similar to tostring() in java, a string representation of an object
def __str__(self):
return ('\nGenome Name : ' + self.Name + '\nGenome DNA sequence :\n' + self.DNAseq + '\nGenome Protien Sequence :\n'+ self.ProtienSeq + '\nError if any : ' + self.whaterror[self.errorno] + '\nNc Value of the Codon : ' + str(self.Nc))
#increment the codon counts of each gene as an when the occur
def storeCodonCount(self,codname):
if (codname in self.CodonCountDict):
self.CodonCountDict[codname]+=1
#Function to get the codon counts (number only) grouped by the protien they generate
def GetCodonDistribution(self):
self.CodonDist=[
[self.CodonCountDict['ATG']],
[self.CodonCountDict['TGG']],
[self.CodonCountDict['TTT'],self.CodonCountDict['TTC']],
[self.CodonCountDict['TAT'],self.CodonCountDict['TAC']],
[self.CodonCountDict['CAT'],self.CodonCountDict['CAC']],
[self.CodonCountDict['CAA'],self.CodonCountDict['CAG']],
[self.CodonCountDict['AAT'],self.CodonCountDict['AAC']],
[self.CodonCountDict['AAA'],self.CodonCountDict['AAG']],
[self.CodonCountDict['GAT'],self.CodonCountDict['GAC']],
[self.CodonCountDict['GAA'],self.CodonCountDict['GAG']],
[self.CodonCountDict['TGT'],self.CodonCountDict['TGC']],
[self.CodonCountDict['ATT'],self.CodonCountDict['ATC'],self.CodonCountDict['ATA']],
[self.CodonCountDict['GTT'],self.CodonCountDict['GTC'],self.CodonCountDict['GTA'],self.CodonCountDict['GTG']],
[self.CodonCountDict['CCT'],self.CodonCountDict['CCC'],self.CodonCountDict['CCA'],self.CodonCountDict['CCG']],
[self.CodonCountDict['ACT'],self.CodonCountDict['ACC'],self.CodonCountDict['ACA'],self.CodonCountDict['ACG']],
[self.CodonCountDict['GCT'],self.CodonCountDict['GCC'],self.CodonCountDict['GCA'],self.CodonCountDict['GCG']],
[self.CodonCountDict['GGT'],self.CodonCountDict['GGC'],self.CodonCountDict['GGA'],self.CodonCountDict['GGG']],
[self.CodonCountDict['TTA'],self.CodonCountDict['TTG'],self.CodonCountDict['CTT'],self.CodonCountDict['CTC'],self.CodonCountDict['CTA'],self.CodonCountDict['CTG']],
[self.CodonCountDict['TCT'],self.CodonCountDict['TCC'],self.CodonCountDict['TCA'],self.CodonCountDict['TCG'],self.CodonCountDict['AGT'],self.CodonCountDict['AGC']],
[self.CodonCountDict['CGT'],self.CodonCountDict['CGC'],self.CodonCountDict['CGA'],self.CodonCountDict['CGG'],self.CodonCountDict['AGA'],self.CodonCountDict['AGG']]
]
#map to store the codon counts grouped by the protiens they generate
ProtienDist={
'Met':self.CodonDist[0], #eg this is a 'list' of counts of all individual codons that makes up the protien Met
'Trp':self.CodonDist[1],
'Phe':self.CodonDist[2],
'Tyr':self.CodonDist[3],
'His':self.CodonDist[4],
'Gln':self.CodonDist[5],
'Asn':self.CodonDist[6],
'Lys':self.CodonDist[7],
'Asp':self.CodonDist[8],
'Glu':self.CodonDist[9],
'Cys':self.CodonDist[10],
'Lle':self.CodonDist[11],
'Val':self.CodonDist[12],
'Pro':self.CodonDist[13],
'Thr':self.CodonDist[14],
'Ala':self.CodonDist[15],
'Gly':self.CodonDist[16],
'Leu':self.CodonDist[17],
'Ser':self.CodonDist[18],
'Arg':self.CodonDist[19]}
return ProtienDist
#get the Pi squared value of the different individual 'types'of same protien,
def getpisqvec(self):
subtotal=0
temppi2vec=[]
for i in range(len(self.CodonDist)):
temp=[]
for j in range(len(self.CodonDist[i])):
subtotal+=self.CodonDist[i][j]
for j in range(len(self.CodonDist[i])):
if subtotal == 0 or self.CodonDist[i][j]== 0:
temp.append(0)
else:
temp.append((self.CodonDist[i][j]/subtotal)**2)
temppi2vec.append(temp)
subtotal=0
return temppi2vec
#generate a list of Fk values of each protien
def getfk(self):
subtotal=0
pi2sumvec=[]
pi2vec=self.getpisqvec()
for i in range(len(pi2vec)):
for j in range(len(pi2vec[i])):
subtotal+=pi2vec[i][j]
if subtotal == 0:
pi2sumvec.append(0)
else:
pi2sumvec.append(subtotal)
subtotal=0
for i in range(len(pi2sumvec)):
if pi2sumvec[i]!=0:
pi2sumvec[i]=1/pi2sumvec[i]
return pi2sumvec
#compute Nc value of this genome instance (sum of fk vlaues)
def calcNcValue(self):
subtotal=0
pi2sumvec=self.getfk()
fk=0
for i in range(len(pi2sumvec)):
fk+=pi2sumvec[i]
return fk
#function to write the compute heavy processed data to a file (later to be upgraded to work with a database backend)
def writetofile(self):
if (not os.path.isdir('GenomeObjFiles')): #checks if output folder exists and makes one if it doesn't
os.mkdir('GenomeObjFiles')
filetoopen=os.path.join('GenomeObjFiles' , self.Name[-11:])
filepath=Path(filetoopen+'.txt')
#if filepath.is_file():
#return
fileopen=open(filetoopen + ".txt", 'w')
fileopen.write(self.Name + '\n')
fileopen.write(self.DNAseq + '\n')
fileopen.write(self.ProtienSeq + '\n')
fileopen.write(str(self.errorno) +'\n')
fileopen.write(str(self.Nc) + '\n')
fileopen.close
class fullGenome(object):
def __init__(self,fname):
dna_str=[]
self.check = 0
self.No_A=0
self.No_T=0
self.No_G=0
self.No_C=0
self.ATarr=[]
self.GCarr=[]
self.ATcumu=[]
self.GCcumu=[]
self.ATarr.append(0)
self.GCarr.append(0)
self.ATcumu.append(0)
self.GCcumu.append(0)
dnaArr=[]
seqstr=''
eofcheck=False
my_file = Path(fname+".txt")
if not my_file.is_file():
messagebox.showinfo("ERROR!!!", "Wrong Input File Name Given, Please Give correct Filename")
with open(fname + ".txt",'r') as dna:
if self.check==0:
dna_str=dna.readline().replace("\n","")
dna1st=''.join(str(e) for e in dna_str)
dna1stS=dna1st.lower()
self.check=1
if not dna1stS[-18:] == "complete sequence.":
messagebox.showinfo("ERROR!!!", "Wrong type of file uploaded, Please Give correct File")
return
while eofcheck==False:
for var in range(18):
dna_str=dna.readline().replace("\n","")
if dna_str=="":
eofcheck=True
dna_str = dna_str.replace("\r", "")
dna_str = dna_str.replace("u", "t")
dnaArr.append(dna_str)
seqstr = "".join(str(e) for e in dnaArr)
seqstr2 = seqstr.upper()
self.calc_skew(seqstr2)
dnaArr=[]
self.get_skewPlots()
def calc_skew(self,dnastr):
A=self.No_A
T=self.No_T
C=self.No_C
G=self.No_G
for i in dnastr:
if i == 'A':
A=A+1
elif i == 'T':
T=T+1
elif i == 'G':
G=G+1
else:
C=C+1
self.No_A=A
self.No_T=T
self.No_G=G
self.No_C=C
self.ATarr.append((self.No_A-self.No_T))
self.GCarr.append((self.No_G-self.No_C))
#self.AT.append(self.No_A-self.No_T)
#self.GC.append(self.No_G-self.No_C)
def get_skewPlots(self):
style.use('ggplot')
x1 = [int(i*1080) for i in range(len(self.ATarr))]
x2 = [int(i*1080) for i in range(len(self.GCarr))]
y1=[self.ATarr[int(i/1080)] for i in x1]
y2=[self.GCarr[int(i/1080)] for i in x2]
#y3=[self.AT[i] for i in x]
plt.subplot(2,1,1)
plt.plot(x1,y1,'g',label='AT-Skew', linewidth=1)
plt.subplot(2,1,2)
plt.plot(x2,y2,'c',label='GC-Skew',linewidth=1)
#plt.plot(x,y3,'r',label='A-T cumulative', linewidth=1)
plt.title('Cumulative DNA Skew Graph')
plt.ylabel('Skewness')
plt.xlabel('No of Nucleotides In the DNA')
plt.legend()
plt.grid(True,color='k')
#plt.savefig('OutputGraphs\\AT and GC Skew_' + self.Name[-11:-1] + '.png')
plt.show()
def readfromfile(Name):
if (not os.path.isdir('GenomeObjFiles')):
messagebox.showinfo('ERROR', 'NO GENERATED FILES FOUND, PLEASE UPLOAD FILE FIRST')
return
filetoopen=os.path.join('GenomeObjFiles' ,Name[-11:])
inpfile=open(filetoopen + '.txt' , 'r')
Genomename=inpfile.readline()[:-1]
GenomeDnaseq=inpfile.readline()[:-1]
GenomeProtienseq=inpfile.readline()[:-1]
Genomeerrno=inpfile.readline()[:-1]
GenomeNcVal=inpfile.readline()[:-1]
inpvec=[Genomename,GenomeDnaseq,GenomeProtienseq,Genomeerrno,GenomeNcVal]
TempGenome=genome(inpvec)
inpfile.close()
return TempGenome
def getDnaList(fname):
dna_str=[]
matchObj=re.compile(r'>\w{4}_\w{2}\d{4}:\|\d{1,4}\|\w{6}_\d{4}')
my_file = Path(fname+".txt")
if not my_file.is_file():
messagebox.showinfo("ERROR!!!", "Wrong Input File Name Given, Please Give correct Filename")
with open(fname + ".txt",'r') as dna:
dna_str=dna.read().replace("\n","")
dna_str = dna_str.replace("\r", "")
dna_str = dna_str.replace("u", "t")
seqstr = ''.join(str(e) for e in dna_str)
seqstr2 = seqstr.upper()
MatchList=matchObj.finditer(seqstr2)
span_list=[]
for match in MatchList:
span_list.append(list(match.span()))
if span_list==[]:
messagebox.showinfo("ERROR!!!", "Wrong Type of file, Please use correct Format for the file")
return
dna_list=[]
last=0
for var in range(len(span_list)-1):
dna_list.append(seqstr2[span_list[var][0]:span_list[var][1]])
dna_list.append(seqstr2[span_list[var][1]:span_list[var+1][0]])
last=var+1
dna_list.append(seqstr2[span_list[last][0]:span_list[last][1]])
dna_list.append(seqstr2[span_list[last][1]:])
return dna_list
def preprocessfile():
fname=filenameVar.get()
dna_list=getDnaList(fname)
for var in range(0,len(dna_list)-1,2):
inplist=[dna_list[var],dna_list[var+1]]
GenomeObj=genome(inplist)
GenomeObj.writetofile()
if var%2==0:
Filenamelist.append(dna_list[var])
#insert code to remove previous files entries
dropDownMenu.delete(0, 'end')
for var in range(len(Filenamelist)):
dropDownMenu.insert(var,Filenamelist[var])
del Filenamelist[:]
def printtable():
data=gendata.get()
headers=['Genome Name','DNA Sequence','Generated Protien Sequence', 'Input error if any', 'Nc Value']
def fetchShow():
if dropDownMenu.get(tk.ACTIVE)=="":
messagebox.showinfo("ERROR!!!", "No Gene Sequence Selected. Please select gene sequence or upload the genome file first.")
return
tempvar=dropDownMenu.get(tk.ACTIVE)
gen=readfromfile(tempvar)
Outputbox.config(state="normal")
Outputbox.delete('1.0', tk.END)
Outputbox.insert(tk.INSERT,"Name:\n")
Outputbox.insert(tk.INSERT,gen.Name)
Outputbox.insert(tk.INSERT,"\nDNA sequence:\n")
Outputbox.insert(tk.INSERT,gen.DNAseq)
Outputbox.insert(tk.INSERT,"\nProtien Sequence:\n")
Outputbox.insert(tk.INSERT,gen.ProtienSeq)
Outputbox.insert(tk.INSERT,"\nInput Error if any:\n")
Outputbox.insert(tk.INSERT,gen.whaterror[gen.errorno])
Outputbox.insert(tk.INSERT,"\nNc Value:\n")
Outputbox.insert(tk.INSERT,gen.Nc)
Outputbox.config(state="disabled")
def returnShow():
tempvar=dropDownMenu.get(tk.ACTIVE)
gen=readfromfile(tempvar)
return gen
def genGraph():
if filenamePlotVar.get()=="":
messagebox.showinfo("ERROR!!!", "Filename Doesn't match with any file in the local directory. Please Check if you entered the correct Filename and it is present in the current Directory.")
return
Com_gene_filename=filenamePlotVar.get()
fullGen=fullGenome(Com_gene_filename)
if __name__=="__main__":
root=tk.Tk()
root.geometry("1200x1000")
root.title('DNA sequence Processor')
frame1=tk.Frame(root)
l=tk.Label(frame1,text="Upload the DNA sequence File Name :")
l.grid(row=0,column=0,padx=10,pady=10)
filenameVar=tk.StringVar(frame1)
entryb=tk.Entry(frame1,textvariable=filenameVar).grid(row=0,column=1,padx=10,pady=10)
fileb=tk.Button(frame1,text= "Process",command=preprocessfile)
fileb.grid(row=0,column=2,padx=10,pady=10)
frame1.grid(row=0,column=0,padx=10,pady=10)
frame2=tk.Frame(root)
l2=tk.Label(frame2,text="Choose which file info is needed")
l2.grid(row=0,column=0,padx=10,pady=10)
fr=tk.Frame(frame2)
dropDownMenu = tk.Listbox(fr,font=("Verdana",10),width=100)
dropDownMenu.pack(side='left',fill="both",expand=True)
sbr=tk.Scrollbar(fr)
sbr.pack(side='right',fill='y')
sbr.config(command=dropDownMenu.yview)
dropDownMenu.config(yscrollcommand=sbr.set)
fr.grid(row=1,column=0,padx=10,pady=10)
frame2.grid(row=1,column=0,padx=10,pady=10)
frame3=tk.Frame(root)
dispbtn=tk.Button(frame3,text="Get Details",command=fetchShow)
dispbtn.grid(row=0,column=1,padx=10,pady=10)
btnl=tk.Label(frame3,text="Get The Details of Selected Genome :")
btnl.grid(row=0,column=0,padx=10,pady=10)
frame3.grid(row=2,column=0)
frame4=tk.Frame(root)
l3=tk.Label(frame4,text="Details of the selected Geneome Data :")
l3.grid(row=0,column=0,padx=10,pady=10)
fr2=tk.Frame(frame4)
Outputbox = tk.Text(fr2,font=("Verdana",10),height=15,width=100)
Outputbox.pack(side='left',fill="both",expand=True)
sbr2=tk.Scrollbar(fr2)
sbr2.pack(side='right',fill='y')
sbr2.config(command=Outputbox.yview)
Outputbox.config(yscrollcommand=sbr2.set)
fr2.grid(row=1,column=0,padx=10,pady=10)
frame4.grid(row=3,column=0,padx=10,pady=10)
frame5=tk.Frame(root)
l2=tk.Label(frame5,text="Give the Name of the File with the Complete genome Sequence: ")
l2.grid(row=0,column=0,padx=10,pady=10)
filenamePlotVar=tk.StringVar(frame5)
entryb=tk.Entry(frame5,textvariable=filenamePlotVar).grid(row=0,column=1,padx=10,pady=10)
graphbtn=tk.Button(frame5,text = "Generate Graph",command=genGraph)
graphbtn.grid(row=0,column=2,padx=10,pady=10)
frame5.grid(row=4,column=0,padx=10,pady=10)
root.mainloop()