-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset_statistics.py
120 lines (97 loc) · 3.93 KB
/
dataset_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/python3
# usage: dataset_statistics.py <path to tsv file> <path to sequence.fasta file>
import csv
import sys
import re
# Default input locations, used when a path is not given on the command line.
# Forward slashes are accepted by open() on Windows as well as on POSIX; the
# previous 'dataset\\training.tsv' default only resolved on Windows.
dataset_file_path = 'dataset/training.tsv'
seq_file_path = 'dataset/training_sequences.fasta'

# Captures the single character that follows the LAST ':' of a segment; that
# character is used as the chain identifier.
# NOTE(review): inside a character class '|' is a literal pipe, not an
# alternation, so '|' is accepted as a chain id and lowercase ids are not.
# The pattern is kept unchanged so existing results stay the same - confirm
# against the data before tightening it.
_CHAIN_PATTERN = re.compile(r'.*[:]{1}([0-9|A-Z])')

# Leading label component -> key of the tally dict. Only these classes are
# counted (there is no entry for class 5).
_CATH_CLASS_BY_DIGIT = {'1': 1, '2': 2, '3': 3, '4': 4, '6': 6}


def collect_chain_labels(rows):
    """Build the pdb id -> chain -> set-of-labels mapping from tsv rows.

    Args:
        rows: iterable of rows, each a list of column strings (for example a
            ``csv.reader``). Column 0 is the pdb id; column 2 holds one or
            more parenthesised segments joined by '),('.

    Returns:
        dict mapping pdb id -> dict mapping chain character -> set of labels.
        A pdb id whose segments contain no recognisable chain maps to an
        empty dict.

    Raises:
        IndexError: if a non-empty, non-header row has fewer than 3 columns.
    """
    pdb_2_chain_2_label = {}
    for row in rows:
        # Skip blank rows and the header row.
        if not row or row[0] == 'pdb_id':
            continue
        chain_2_label = pdb_2_chain_2_label.setdefault(row[0], {})
        for bracket in row[2].split('),('):
            found = _CHAIN_PATTERN.match(bracket)
            if found is None:
                continue
            # The label is the last comma-separated piece; only the final
            # segment of a row still carries the closing parenthesis.
            cath_label = bracket.split(',')[-1]
            if cath_label.endswith(')'):
                cath_label = cath_label[:-1]
            chain_2_label.setdefault(found.group(1), set()).add(cath_label)
    return pdb_2_chain_2_label


def count_cath_classes(fasta_lines, pdb_2_chain_2_label):
    """Count sequence headers and tally their labels by leading class number.

    Args:
        fasta_lines: iterable of text lines; lines starting with '>' are
            headers of the form '<pdb id>:<chain>'.
        pdb_2_chain_2_label: mapping as returned by ``collect_chain_labels``.

    Returns:
        ``(total, counts)`` where ``total`` is the number of header lines and
        ``counts`` maps each of 1, 2, 3, 4 and 6 to the number of labels
        starting with that number followed by '.'. A chain with several
        labels contributes once per label.
    """
    total = 0
    counts = {cath_class: 0 for cath_class in _CATH_CLASS_BY_DIGIT.values()}
    for line in fasta_lines:
        if not line.startswith('>'):
            continue  # sequence data or blank line
        total += 1
        header = line[1:].rstrip().split(':')
        if len(header) < 2:
            # A header without a chain part cannot be looked up; it is still
            # counted in the total instead of raising IndexError.
            continue
        labels = pdb_2_chain_2_label.get(header[0], {}).get(header[1], ())
        for label in labels:
            leading, dot, _rest = label.partition('.')
            # Requiring the '.' keeps this identical to startswith('1.') etc.
            if dot and leading in _CATH_CLASS_BY_DIGIT:
                counts[_CATH_CLASS_BY_DIGIT[leading]] += 1
    return total, counts


def main(argv=None):
    """Print the header total and the per-class tallies.

    Args:
        argv: optional argument list (without the program name); defaults to
            ``sys.argv[1:]``. The first item overrides the tsv path and the
            second the fasta path; missing items fall back to the module
            defaults.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    tsv_path = args[0] if len(args) > 0 else dataset_file_path
    fasta_path = args[1] if len(args) > 1 else seq_file_path

    # newline='' lets the csv module do its own newline handling.
    with open(tsv_path, 'r', newline='') as dataset_file:
        pdb_2_chain_2_label = collect_chain_labels(
            csv.reader(dataset_file, delimiter='\t'))
    with open(fasta_path, 'r') as seq_file:
        total, counts = count_cath_classes(seq_file, pdb_2_chain_2_label)

    print(total)
    print(counts)


if __name__ == '__main__':
    main()
# Disabled earlier version of the statistics, kept as a bare string literal so
# that it is never executed. It gathers the labels of each tsv row
# (de-duplicated within a row - "version 2"), tallies them by leading class
# number, and prints the tallies and the number of distinct pdb ids.
'''
pdb_ids = []
cath_labels = []
with open(dataset_file_path, 'r') as dataset_file:
d = csv.reader(dataset_file, delimiter='\t')
for line in d:
#print(line) # test
if line != [] and line[0] == 'pdb_id': # header
continue
if line == []: # empty line
continue
# entries with content
if line[0] not in pdb_ids:
pdb_ids.append(line[0])
cath_info_in_one_protein = line[2].split('),(')
#print(cath_info_in_one_protein) # test
labels_in_one_protein = []
for seg in cath_info_in_one_protein:
if seg.split(',')[-1][-1] == ')':
cath_label = seg.split(',')[-1][:-1]
else:
cath_label = seg.split(',')[-1]
if cath_label not in labels_in_one_protein: # version 2
labels_in_one_protein.append(cath_label) # version 2
#labels_in_one_protein.append(cath_label) # version 1
for label in labels_in_one_protein:
cath_labels.append(label)
counts = {1:0, 2:0, 3:0, 4:0, 6:0}
for code in cath_labels:
if code.startswith('1.'):
counts[1] += 1
if code.startswith('2.'):
counts[2] += 1
if code.startswith('3.'):
counts[3] += 1
if code.startswith('4.'):
counts[4] += 1
if code.startswith('6.'):
counts[6] += 1
print(counts)
print(len(pdb_ids))
'''