-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_timeline.py
55 lines (42 loc) · 1.56 KB
/
parse_timeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#coding: utf-8
import re
import json
import requests
from pyquery import PyQuery as pq
from redis import StrictRedis
# Shared Redis client, created with StrictRedis' default connection settings.
redis = StrictRedis()
# Redis key of the set that holds the JSON-encoded timeline events.
REDIS_KEY = 'schedge:timeline:biology'
# Unit in which event resolutions are expressed; presumably the mean length of
# a Gregorian year in days (TODO confirm against the consumers of the data).
YEAR = 365.2425
# Entries whose date is not a plain number are recognised by a phrase in their
# description: (phrase, median year, resolution in years). Order matters: the
# first phrase found wins.
_UNDATED_EVENTS = (
    ('Naturalis', 60, 20),
    ('Galen', 165, 70),
    ('Jan Baptist van Helmont', 1625, 20),
)


def _parse_event(html, pattern):
    """Convert the markup of one timeline list item into an event dict.

    Args:
        html: Inner markup of a single list item, as text.
        pattern: Compiled regex exposing the ``year_rep``, ``circa``,
            ``year``, ``bc`` and ``description`` named groups.

    Returns:
        A dict with ``description`` and ``year`` keys, plus ``median`` and
        ``resolution`` when a date could be determined, or ``None`` when
        *html* does not match *pattern*.
    """
    matches = pattern.match(html)
    if not matches:
        return None

    match = matches.groupdict()
    description = match['description']
    event = dict(
        # Turn site-relative article links into absolute ones.
        description=description.replace(
            '/wiki/', 'http://en.wikipedia.org/wiki/'),
        year=match['year_rep'],
    )

    if match['year']:
        year = int(match['year'])
        # A BC year N is stored as -N + 1, so that 1 BC becomes year 0.
        event['median'] = -year + 1 if match['bc'] else year
        # Approximate ("c.") dates get a five times coarser resolution.
        event['resolution'] = 5 * YEAR if match['circa'] else YEAR
    else:
        # No numeric year: fall back to the hand-dated entries. Anything not
        # listed there is still returned, just without median/resolution.
        for phrase, median, span in _UNDATED_EVENTS:
            if phrase in description:
                event['median'] = median
                event['resolution'] = span * YEAR
                break
    return event


def getEvents(timeout=10):
    """Scrape the Wikipedia biology timeline and store its events in Redis.

    Downloads the timeline page, parses every top-level list item of the
    article body into an event and replaces the contents of the Redis set
    ``REDIS_KEY`` with the JSON-encoded events. Items that do not match the
    expected "<year> — <description>" layout are reported on stdout and
    skipped. If the download fails, the error body is printed and Redis is
    left untouched.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None.
    """
    url = 'http://en.wikipedia.org/wiki/Timeline_of_biology_and_organic_chemistry'
    # Without a timeout a stalled connection would block the script forever.
    r = requests.get(url, timeout=timeout)
    if not r.ok:
        print('Error: %s' % r.text)
        return

    # Compiled once, outside the loop. The em dash separator is spelled as an
    # escape so the pattern does not depend on the file's encoding
    # declaration; the pieces concatenate to the same pattern text on both
    # Python 2 and Python 3.
    pattern = re.compile(
        r'\s*(?P<year_rep>(?P<circa>c\.)?\s*((?P<year>\d+)|.+)(?P<bc>\s*BCE?)?)\s*'
        + u'\u2014'
        + r'\s*(?P<description>.+)',
        re.UNICODE,
    )

    members = []
    for element in pq(r.text).find('#mw-content-text > ul > li'):
        # Guard against items for which no markup is returned, so the match
        # below always receives a string.
        html = pq(element).html() or u''
        event = _parse_event(html, pattern)
        if event is None:
            print('Failed to match pattern: %s' % html)
            continue
        members.append(json.dumps(event))

    # Replace the stored set only after the whole page parsed successfully, so
    # a failure above cannot leave the key empty or half-filled.
    redis.delete(REDIS_KEY)
    if members:
        redis.sadd(REDIS_KEY, *members)
# Refresh the stored timeline only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    getEvents()