-
Notifications
You must be signed in to change notification settings - Fork 0
/
adinebook.py
106 lines (93 loc) · 3.65 KB
/
adinebook.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#! /usr/bin/python
# -*- coding: utf-8 -*-
"""All things that are specifically related to adinebook website"""
import re
import logging
import requests
from bs4 import BeautifulSoup as BS
import commons
import isbn
class Response(commons.BaseResponse):

    """Response object for a book page on the adinebook website."""

    def __init__(self, adinebook_url, date_format='%Y-%m-%d'):
        """Collect the book data for adinebook_url, then call self.generate().

        The date format and the URL are stored on the instance, the page is
        converted to a dictionary by url2dictionary(), and a language is
        detected from the title when the dictionary does not carry one.
        """
        self.date_format = date_format
        self.url = adinebook_url
        book_data = url2dictionary(adinebook_url)
        self.dictionary = book_data
        # NOTE(review): url2dictionary() returns None when the page cannot
        # be fetched or is a "not found" page; the membership test below
        # raises TypeError in that case -- confirm callers expect this.
        if 'language' not in book_data:
            # The language is assumed to be either Persian or English.
            # todo: warn the user that this assumption was made
            candidate_languages = {'en', 'fa'}
            self.detect_language(book_data['title'], candidate_languages)
        self.generate()
def isbn2url(isbn):
    """Return the adinebook product URL (a string) for an ISBN string.

    Hyphens and spaces are removed first. Adinebook apparently identifies
    its books by a 10-character code, so when 13 characters remain the
    first three are dropped before the code is appended to the URL.
    """
    code = isbn
    for separator in ('-', ' '):
        code = code.replace(separator, '')
    if len(code) == 13:
        # ISBN-13: leave out the three leading characters.
        code = code[3:]
    return 'http://www.adinebook.com/gp/product/' + code
def url2dictionary(adinebook_url):
    """Fetch an adinebook product page and return its details as a dict.

    The result always contains ``'type': 'book'``. Depending on what is
    found in the page it may also contain ``'title'``, the name lists
    ``'authors'``, ``'others'``, ``'editors'`` and ``'translators'``
    (lists of ``commons.Name`` objects; a list is only present when it is
    non-empty), and the strings ``'publisher'``, ``'month'``, ``'year'``
    and ``'isbn'``.

    Return None when the page cannot be fetched or decoded (the exception
    is logged), or when the page contains adinebook's "page not found"
    message.
    """
    try:
        # Failures must not propagate from here: if adinebook is not
        # available then ottobib should continue its work in isbn.py.
        headers = {'User-agent':
                   'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:29.0)' +
                   ' Gecko/20100101 Firefox/29.0'}
        # Without a timeout an unresponsive server would block this call
        # indefinitely; a timeout error is handled like any other failure.
        r = requests.get(adinebook_url, headers=headers, timeout=10)
        adinebook_html = r.content.decode('utf-8')
    except Exception:
        logger.exception(adinebook_url)
        return None
    # This message is matched verbatim against the text of the page.
    if 'صفحه مورد نظر پبدا نشد.' in adinebook_html:
        return None
    d = {}
    d['type'] = 'book'
    bs = BS(adinebook_html)
    if bs.title:
        # The page title holds the book title and, after '~', the names.
        pattern = r'آدینه بوک:\s*(?P<title>.*?)\s*~(?P<names>.*?)\s*$'
        m = re.search(pattern, bs.title.text)
        if m:
            d['title'] = m.group('title')
            authors = []
            others = []
            editors = []
            translators = []
            for name in m.group('names').split('،'):
                if not name.strip():
                    # An empty names field (or a stray separator) yields a
                    # blank fragment; there is no name to record. Looking
                    # up the uninitialised lists used to raise KeyError.
                    continue
                if '(ويراستار)' in name:
                    editors.append(
                        commons.Name(name.split('(ويراستار)')[0]))
                elif '(مترجم)' in name:
                    translators.append(
                        commons.Name(name.split('(مترجم)')[0]))
                elif '(' in name:
                    # Any other parenthesised role: the Name is built from
                    # the part before the parentheses, and the whole
                    # fragment is kept as its fullname.
                    other = commons.Name(re.split(r'\(.*\)', name)[0])
                    other.fullname = name
                    others.append(other)
                else:
                    authors.append(commons.Name(name))
            # Only non-empty lists become keys of the result.
            for key, people in (('authors', authors),
                                ('others', others),
                                ('editors', editors),
                                ('translators', translators)):
                if people:
                    d[key] = people
    # The remaining fields are read from the raw HTML of the page.
    m = re.search(r'نشر:</b>\s*(.*?)\s*\(.*</li>', adinebook_html)
    if m:
        d['publisher'] = m.group(1)
    m = re.search(r'نشر:</b>.*\([\d\s]*(.*?)،.*', adinebook_html)
    if m:
        d['month'] = m.group(1)
    m = re.search(r'نشر:</b>.*?\(.*?(\d\d\d\d)\)</li>', adinebook_html)
    if m:
        d['year'] = m.group(1)
    m = re.search(r'شابک:.*?([\d-]*)</span></li>', adinebook_html)
    if m:
        d['isbn'] = m.group(1)
    return d
# Module-level logger; url2dictionary() uses it to log failed page fetches.
logger = logging.getLogger(__name__)