-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathParsePlasmidGenomes.py
151 lines (117 loc) · 5.32 KB
/
ParsePlasmidGenomes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# parse accession numbers of genomes with plasmids into txt file
import csv
fname = "plasmid_finder_results_putonti.tsv"
#parse the info from phage results to an easier to use file
print("Parsing genome accessions with plasmids to file plasmidAccessions.txt...")
with open(fname) as f_in, open('plasmidAccessions.txt', 'w') as f_out:
rd = csv.reader(f_in, delimiter="\t", quotechar='"')
headers = next(rd)
for row in rd:
f_out.write(row[1]+','+row[2]+','+row[3]+'\n')
f_in.close(); f_out.close();
plasmidResultsFileName = 'plasmidCompareResults.csv'
print("Running plasmid analysis... (output will be in " + plasmidResultsFileName + ")")
#read the plasmid results, putting the information into a dictionary for each accession
accessionDict = {}
for line in open('plasmidAccessions.txt'):
accessionInfo = line.split(',')
accessionDict[accessionInfo[0]] = []
accessionDict[accessionInfo[0]].append(accessionInfo[1].split(" ")[0])
accessionDict[accessionInfo[0]].append(accessionInfo[2].strip())
#key = accession, value = [genus, plasmid yes/no], RM system type will be appended later
#initialize a list for accessions with BLAST info: if it doesn't have BLAST info, it won't be used in the analysis
accessionsWithBlastInfo = []
#read all lines of the CSV (blast info) file, skipping the first line
for line in open('RMBlastCSV.csv').readlines()[1:]:
#extract the accession number from the file name
blastInfo = line.split(',')
#get the E value for the accession's hit
#if it is over 0.01, it will not be used
evalue = float(blastInfo[4])
if evalue <= 0.01:
accession = blastInfo[0]
accession = accession[:accession.index('_ASM')]
accessionsWithBlastInfo.append(accession)
#extract the enzyme type that was found
enzType = blastInfo[3]
accessionDict[accession].append(enzType) #RM system type is now in the information for each accession (also contains genus and yes/no for plasmid)
#open an output file for the plasmid results
plasmidResults = open(plasmidResultsFileName,'w')
#initialize two dictionaries: one to holds the number of 'yes' for type and one that holds all for type
typeYes = {}
typeAll = {}
#go through each accession that was ran in the BLAST analysis, counting the number of each
#RM system type, and the number of that type that have plasmids as well
for accession in accessionsWithBlastInfo:
rmtype = accessionDict[accession][2]
hasPlasmid = (accessionDict[accession][1] == 'yes')
#add to total of RM System type
if rmtype not in typeAll:
typeAll[rmtype] = 1
else:
typeAll[rmtype] += 1
#add to number that has plasmids if applicable
if hasPlasmid:
if rmtype not in typeYes:
typeYes[rmtype] = 1
else:
typeYes[rmtype] += 1
#write headers for the type information to be written
plasmidResults.write('RM System Type,Total # of System Type,# with Plasmid,% with Plasmid\n')
for rmType in typeAll.keys():
plasmidResults.write(rmType + ',') #write the type name
plasmidResults.write(str(typeAll[rmType]) + ',') #write the total number of that type
#write the number of 'yes' values for that type
try:
plasmidResults.write(str(typeYes[rmType]) + ',')
except KeyError: #if it has no 'yes' values, write 0
typeYes[rmType] = 0
plasmidResults.write(str(typeYes[rmType]) + ',')
#write the % of type that have plasmid
plasmidResults.write(str(float((typeYes[rmType]/typeAll[rmType]))*100) + '\n')
plasmidResults.write('\n')
#for each type (already written to the file), analyze which genera containing that type also have plasmids
#are certain genera more likely to have or not have plasmids?
for type in typeYes:
#initialize dictionaries to hold the number of 'yes' for each genera and the total for each genera
genusYes = {}
genusTotal = {}
#write the type and headers to the file
plasmidResults.write(type + '\n')
plasmidResults.write('Genus,Total # of Genus,# with Plasmid,% with Plasmid\n')
#go through the accessions
for acc in accessionsWithBlastInfo:
accInfo = accessionDict[acc]
#extract the RM type that was found from that accession
#and check if it matches the current type
rmtype = accInfo[2]
if rmtype == type:
#if it matches the current type, extract the genus and check if it has a plasmid
genus = accInfo[0]
hasPlasmid = (accInfo[1] == 'yes')
#add the genus to the total
if genus not in genusTotal:
genusTotal[genus] = 1
else:
genusTotal[genus] += 1
#add to the number with plasmids, if applicable
if hasPlasmid:
if genus not in genusYes:
genusYes[genus] = 1
else:
genusYes[genus] += 1
#for each genus under the current type, write its info to the file
for genus in genusTotal.keys():
#write the genus
plasmidResults.write(genus + ',')
#write the total number of that genus with the RM system type
plasmidResults.write(str(genusTotal[genus]) + ',')
#write the total number of that genus with plasmid
try:
plasmidResults.write(str(genusYes[genus]) + ',')
except KeyError:
genusYes[genus] = 0
plasmidResults.write(str(genusYes[genus]) + ',')
#write the % of genus with plasmid
plasmidResults.write(str(float((genusYes[genus]/genusTotal[genus]))*100) + '\n')
plasmidResults.write('\n')