forked from JensTimmerman/scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphilosophy.py
108 lines (93 loc) · 2.56 KB
/
philosophy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/bin/python
import urllib2
import sys
from BeautifulSoup import BeautifulSoup
def strip_brackets(string):
    """
    Blank out parenthesised text in a piece of HTML markup.

    Every character from an opening "(" up to and including its closing ")"
    is replaced by one space, so the result is exactly as long as the input.
    Parentheses that appear inside a tag (between "<" and ">", for example
    in an href value) are left in place.  A tag that starts inside a
    parenthesised group is blanked out together with the rest of the group.

    string: the markup to process; it is passed through str() first, so any
        object whose str() form is markup can be given
    returns: the processed markup as a str
    """
    text = str(string)
    paren_depth = 0  # nesting level of "(" ... ")" seen outside of tags
    tag_depth = 0    # greater than zero while between "<" and ">"
    out = []
    for ch in text:
        # Only track tags when not in parentheses mode, so that tags inside
        # a parenthesised group are blanked like any other character.
        if paren_depth == 0:
            if ch == '>':
                # Clamp at zero: a stray ">" in plain text must not make the
                # next real tag look like ordinary text.
                if tag_depth > 0:
                    tag_depth -= 1
            elif ch == '<':
                tag_depth += 1
        if tag_depth > 0:
            # Inside a tag: copy verbatim, parentheses included.
            out.append(ch)
            continue
        if ch == '(':
            paren_depth += 1
        out.append(' ' if paren_depth > 0 else ch)
        # Clamp at zero: an unmatched ")" must not stop later groups from
        # being blanked.
        if ch == ')' and paren_depth > 0:
            paren_depth -= 1
    return ''.join(out)
class PhilosophyGame():
    """
    This class follows the trace of a wikipedia article to the Philosophy page,
    each time following the first link in the content, skipping links between parentheses
    based on http://imgur.com/mqlKD
    depends on BeautifulSoup and on the strip_brackets function
    still has some utf8 problems
    """

    def __init__(self,prefix="http://en.wikipedia.org",userAgent='Mozilla/5.0'):
        """
        prefix: scheme and host put in front of site-relative links
        userAgent: value of the User-agent header sent with every request
        """
        self.opener = urllib2.build_opener()
        self.opener.addheaders = [('User-agent', userAgent)]
        self.prefix = prefix

    def trace(self,article):
        """
        trace the first link in each article, that's not between parentheses

        article: URL of the page to start from
        Prints every URL that is visited.  Stops when a link titled
        "Philosophy" is found, when a page offers no usable link, or when a
        URL comes up a second time (a loop between articles).
        """
        #TODO: use dynamic computing (caching of results)
        visited = set()
        # Iterate instead of recursing so a long trace cannot hit the
        # interpreter's recursion limit.
        while article is not None:
            print(article)
            if article in visited:
                print("Loop detected, giving up")
                return
            visited.add(article)
            article = self._next_article(article)

    def _fetch(self, url):
        """Return the body of url, closing the response even if reading fails."""
        resource = self.opener.open(url)
        try:
            return resource.read()
        finally:
            resource.close()

    def _next_article(self, article):
        """
        Return the URL of the first usable link on the page at article,
        or None when the trace ends there.
        """
        soup = BeautifulSoup(self._fetch(article))
        content = soup.find('div',id="bodyContent")
        if content is None:
            print("No article content found")
            return None
        for section in content.findAll({'ul' : True, 'p' : True},recursive=False):
            #find first link here that isn't in parenthesis
            section = BeautifulSoup(strip_brackets(section))
            for anchor in section.findAll('a'):
                # Read the attributes of this anchor only, so values from an
                # earlier anchor can never leak into this one.
                attrs = dict(anchor.attrs)
                title = attrs.get("title")
                if title is None: #citations or something, no title, skip
                    continue
                if title == "Philosophy":
                    print("You have arrived")
                    return None
                target = attrs.get("href")
                if target is None:
                    continue
                if target.startswith(("http://", "https://")):
                    #prefix sometimes switches between wiktionary and wikipedia
                    host = target.rpartition("/wiki/")[0]
                    # rpartition yields "" when "/wiki/" is absent; keep the
                    # current prefix in that case.
                    if host:
                        self.prefix = host
                else:
                    target = self.prefix + target
                return target
        return None
if __name__ == "__main__":
    game = PhilosophyGame()
    # Without arguments start from a random article; otherwise treat every
    # argument as an article name and trace each one in turn.
    if len(sys.argv) == 1:
        start_urls = ["http://en.wikipedia.org/wiki/Special:Random"]
    else:
        start_urls = ["http://en.wikipedia.org/wiki/" + name for name in sys.argv[1:]]
    for url in start_urls:
        game.trace(url)