-
Notifications
You must be signed in to change notification settings - Fork 0
/
VagalumeParser.py
72 lines (56 loc) · 2.62 KB
/
VagalumeParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# -*- Mode: python; coding: utf-8; tab-width: 8; indent-tabs-mode: t; -*-
#
# Copyright (C) 2010 Jonnas Fonini <[email protected]>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# The Rhythmbox authors hereby grant permission for non-GPL compatible
# GStreamer plugins to be used and distributed together with GStreamer
# and Rhythmbox. This permission is above and beyond the permissions granted
# by the GPL license by which Rhythmbox is covered. If you modify this code
# you may extend this exception to your version of the code, but you are not
# obligated to do so. If you do not wish to do so, delete this exception
# statement from your version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
import urllib
import re
import rb
import HTMLParser
from unicodedata import normalize
class VagalumeParser (object):
def __init__(self, artist, title):
self.artist = artist
self.title = title
def search(self, callback, *data):
artist = normalize('NFKD', self.artist.decode('utf-8')).encode('ASCII','ignore')
title = normalize('NFKD', self.title.decode('utf-8')).encode('ASCII','ignore')
artist = urllib.quote(artist.replace('(', '').replace(')', '').replace(' ', '-').replace('\'', ''))
title = urllib.quote(title.replace('(', '').replace(')', '').replace(' ', '-').replace('\'', ''))
url = 'http://vagalume.uol.com.br/%s/%s.html' % (artist, title)
loader = rb.Loader()
loader.get_url (url, self.got_lyrics, callback, *data)
def got_lyrics(self, lyric, callback, *data):
lyric = re.sub('<[Bb][Rr][^>]*>', '', lyric)
if re.search('<div class="tab_original">([^<]*)', lyric) is not None:
body = re.split('<div class="tab_original">([^<]*)', lyric)[1]
pars = HTMLParser.HTMLParser()
body = pars.unescape(body)
title = re.split('<h1>(.*)</h1>', lyric)[1]
artist = re.split('<h2 class="head"><a href=(.*)>(.*)</a></h2>', lyric)[2]
title = "%s - %s\n\n" % (artist, title)
lyric = title + body
lyric += "\n\nLyrics provided by vagalume.com.br"
callback(lyric, *data)
else:
callback (None, *data)