forked from xuyk/Rosalind
-
Notifications
You must be signed in to change notification settings - Fork 0
/
062_LREP.py
65 lines (55 loc) · 1.94 KB
/
062_LREP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.
Problem Title: Finding the Longest Multiple Repeat
Rosalind ID: LREP
Rosalind #: 062
URL: http://rosalind.info/problems/lrep/
'''
with open('data/rosalind_lrep.txt') as input_data:
raw_data = input_data.readlines()
# Parse the data.
s = raw_data[0].strip()
k = int(raw_data[1].strip())
node_info = [line.strip().split() for line in raw_data[2:]]
node_info = [[[line[0],line[1]], int(line[2]),int(line[3])] for line in node_info]
edges = [info[0] for info in node_info]
# Construct a previous node dictionary for quick backtracking.
previous_node = {'node1':'ROOT'}
for edge in edges:
previous_node[edge[1]] = edge[0]
# Construct a dictionary relating a node to its substring.
node_str = {'node1':''}
for info in node_info:
node_str[info[0][1]] = s[info[1]-1:info[1]+info[2]-1].strip('$')
# Determine the leaves.
heads = set([edge[0] for edge in edges])
tails = set([edge[1] for edge in edges])
leaves = tails - heads
# Count the number of descendants for each node.
num_nodes = max([int(node[4:]) for node in leaves])
descendants = [0]*num_nodes
for leaf in leaves:
descendants[int(leaf[4:])-1] += 1
temp_node = previous_node[leaf]
while temp_node != 'ROOT':
descendants[int(temp_node[4:])-1] += 1
temp_node = previous_node[temp_node]
# Find the candidate substrings.
candidate_nodes = []
for i, num in enumerate(descendants[1:]): # Skip node1, as it is the empty string.
if num >= k:
candidate_nodes.append('node'+str(i+2)) # Plus two since we're starting at at index 0 = node 2.
candidate_strings = []
for node in candidate_nodes:
temp_str = ''
temp_node = node
while temp_node != 'ROOT':
temp_str = node_str[temp_node] + temp_str
temp_node = previous_node[temp_node]
candidate_strings.append(temp_str)
# Print and save a longest candidate substring.
lrep = max(candidate_strings, key=len)
print lrep
with open('output/062_LREP.txt', 'w') as output_file:
output_file.write(lrep)