forked from nkokkalis/UniFeed
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunifeed.py
executable file
·107 lines (88 loc) · 3.01 KB
/
unifeed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
import urllib2
from HTMLParser import HTMLParser
import datetime
import re
import xml.etree.ElementTree as et
class UnipiParser(HTMLParser):
    """Collect article links from an HTML page.

    An ``href`` is recorded in ``articles`` (in document order) for every
    ``<a>`` tag that is opened inside a ``<font>`` element which is itself
    inside a ``<table class="norm">``.  Nested tables and nested fonts are
    tracked with depth counters so that the state is only left when the
    outermost matching element is closed.
    """

    def __init__(self):
        # Explicit base-class call, matching the other parser in this file.
        HTMLParser.__init__(self)
        self.innorm = False    # currently inside a <table class="norm">
        self.ina = False       # currently inside an <a> opened within a font
        self.infont = False    # currently inside a <font> within the table
        self.table_depth = 0   # open <table> tags, counted from the norm one
        self.font_depth = 0    # open <font> tags counted inside the table
        self.articles = []     # href values collected so far

    def handle_starttag(self, tag, attrs):
        """Update the table/font/anchor state and record article hrefs."""
        if tag == 'table':
            if self.innorm:
                # A nested table: remember it so its </table> does not end
                # the norm section prematurely.
                self.table_depth += 1
            elif any(name == 'class' and value == 'norm'
                     for name, value in attrs):
                self.table_depth = 1
                self.innorm = True
        if tag == 'font' and self.innorm:
            self.infont = True
            self.font_depth += 1
        if tag == 'a' and self.infont:
            self.ina = True
            for name, value in attrs:
                # A bare ``href`` attribute has value None; there is no link
                # to record in that case.
                if name == 'href' and value is not None:
                    self.articles.append(value)

    def handle_endtag(self, tag):
        """Leave the table/font/anchor state when the matching tag closes.

        Counters are only decremented for tags whose start was counted.
        Otherwise a stray ``</font>`` or ``</table>`` seen outside the norm
        table would push the counter below zero, it would never return to
        exactly 0, and ``infont`` would stay set for the rest of the page.
        """
        if tag == 'table' and self.innorm:
            self.table_depth -= 1
            if self.table_depth == 0:
                self.innorm = False
        if tag == 'font' and self.font_depth > 0:
            self.font_depth -= 1
            if self.font_depth == 0:
                self.infont = False
        if tag == 'a':
            self.ina = False
class Stripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text.

    Every chunk of character data reported by the parser is appended to the
    string attribute ``ob``, which starts out empty.
    """

    def __init__(self):
        # Kept as an explicit base-class call rather than super().
        HTMLParser.__init__(self)
        self.ob = ''

    def handle_data(self, data):
        """Append one chunk of text to the accumulated output."""
        self.ob = ''.join((self.ob, data))
def strip(html):
    """Return the text of *html* with all markup removed.

    The fragment is run through a ``Stripper`` and the text it accumulated
    is returned.
    """
    s = Stripper()
    s.feed(html)
    # feed() may hold back input it cannot classify yet (for example a
    # trailing "<"); close() tells the parser the input is complete so that
    # remainder is delivered as text instead of being silently dropped.
    s.close()
    return s.ob
# ---------------------------------------------------------------------------
# Script body: scrape the front page for article links, fetch every article
# and write the result to 'miou.atom' as an Atom feed.
# ---------------------------------------------------------------------------

_BASE_URL = 'http://www.unipi.gr'
_PAGE_ENCODING = 'cp1253'
# Atom date constructs must be RFC 3339 timestamps: the time fields are
# separated by ':' (the previous '-' separators produced invalid dates).
_ATOM_TIMESTAMP = '%Y-%m-%dT%H:%M:%SZ'
_ATOM_MIDNIGHT = '%Y-%m-%dT00:00:00Z'

# Compiled once instead of being re-specified for every article.  All three
# use DOTALL because the matched elements span several lines.
_TITLE_RE = re.compile(r'<p class="smaplev2">(.*?)</p>', re.S)
_DATE_RE = re.compile(
    r'<p class="smaplev2">.*?</p>.*?<p>.*?<strong>(.*?)</strong>.*?</p>',
    re.S)
_CONTENT_RE = re.compile(
    r'<td class="norm"><p class="smaplev2">.*?</p>.*?<p>.*?</p>(.*?)</td>',
    re.S)


def _fetch(address):
    """Return the raw body of *address*, closing the connection afterwards."""
    response = urllib2.urlopen(address)
    try:
        return response.read()
    finally:
        response.close()


html = _fetch(_BASE_URL)
parser = UnipiParser()
parser.feed(html)

# Feed-level metadata.
root = et.Element('feed')
root.set('xmlns', 'http://www.w3.org/2005/Atom')
global_title_node = et.SubElement(root, 'title')
global_title_node.text = 'Unipi feed'
icon_node = et.SubElement(root, 'icon')
icon_node.text = 'http://www.unipi.gr/favicon.ico'
global_id_node = et.SubElement(root, 'id')
global_id_node.text = 'urn:students.cs.unipi.gr-feed'
global_updated_node = et.SubElement(root, 'updated')
# The timestamp carries a 'Z' (UTC) suffix, so it has to be taken from the
# UTC clock rather than from local time.
global_updated_node.text = datetime.datetime.utcnow().strftime(_ATOM_TIMESTAMP)

# One <entry> per article link found on the front page.  A page that does not
# match the expected layout raises (IndexError / ValueError) and aborts the
# run before the output file is touched.
for url in parser.articles:
    article_url = '%s/%s' % (_BASE_URL, url)
    html = _fetch(article_url)
    # Cut the page just inside its first 'smaplev2' occurrence so that the
    # patterns below can only match a later one.
    # NOTE(review): presumably the first occurrence is page chrome - confirm.
    html = html[html.find('smaplev2') + 5:]

    title = strip(_TITLE_RE.findall(html)[0].decode(_PAGE_ENCODING))
    datestr = _DATE_RE.findall(html)[0].strip().decode(_PAGE_ENCODING)
    date = datetime.datetime.strptime(datestr, '%d/%m/%Y')
    content = _CONTENT_RE.findall(html)[0].strip().decode(_PAGE_ENCODING)
    # The article identifier is whatever follows the last '=' in the link.
    article_id = url.split('=')[-1]

    entry_node = et.SubElement(root, 'entry')
    title_node = et.SubElement(entry_node, 'title', type="html")
    title_node.text = title
    link_node = et.SubElement(entry_node, 'link')
    link_node.set('href', article_url)
    id_node = et.SubElement(entry_node, 'id')
    id_node.text = 'urn:article-%s' % article_id
    updated_node = et.SubElement(entry_node, 'updated')
    # Only a date is available on the page, so the time is fixed at midnight.
    # NOTE(review): the page date is presumably local time, yet it is
    # labelled 'Z' here - confirm whether an offset is needed.
    updated_node.text = date.strftime(_ATOM_MIDNIGHT)
    summary_node = et.SubElement(entry_node, 'content', mode="escaped",
                                 type="text/html")
    summary_node.text = content

tree = et.ElementTree(root)
tree.write('miou.atom', encoding="utf-8")