#!/usr/bin/env python
# Import the needed libraries
from Writer import Writer
from Browser import Browser
from bs4 import BeautifulSoup
from Tools import id_from_url, read_books, get_digits


# A class to search for, then scrape, lists and books from GoodReads.com
class Books:
    def __init__(self, path=None, arabic=True):
        self.arabic = arabic
        # Browsing and writing managers
        self.br = Browser()
        self.wr = Writer(path) if path else Writer()
        # An array for the scraped books ids
        self._books_ids = []

    # Append an external books ids array to the local array
    def append_books(self, books_ids):
        # Loop through the sent books ids
        for book_id in books_ids:
            # Only append an id if it's not stored already
            if book_id not in self._books_ids:
                self._books_ids.append(book_id)

    # Scrape books and write them to a file (browse is: list, lists, author or shelf)
    def output_books(self, keyword=None, browse="list", file_name="books"):
        self.wr.open(file_name, "w+")
        # Get books if a keyword is provided, otherwise output the stored books
        books_ids = self.get_books(keyword, browse) if keyword else self._books_ids
        # Loop through book ids and write them
        for book_id in books_ids:
            self.wr.write(book_id)
        self.wr.close()

    def output_books_editions(self, books_ids=None, file_name="editions"):
        # Skip the books ids that are already written to the file
        skip = len(read_books(file_name))
        self.wr.open(file_name, "a+")
        # Loop through book ids and write their editions id
        for book_id in (books_ids or self._books_ids)[skip:]:
            editions_id = self.get_book_editions_id(book_id)
            # Editions id is None when the page refuses to load
            if editions_id is None:
                return self.wr.close()
            # Write the editions id to the file if the page loads correctly
            self.wr.write(editions_id or "-" * 7)
            # Display the book id and its editions id
            print(f"Book ID:\t{book_id:<15}Book Editions ID:\t{editions_id or ''}")
        self.wr.close()
        return True

    def output_books_edition_by_language(self, editions_ids, lang="Arabic", file_name="ara_books"):
        # Skip the editions ids that are already written to the file
        skip = len(read_books(file_name))
        self.wr.open(file_name, "a+")
        # Loop through editions ids and write their books ids
        for editions_id in editions_ids[skip:]:
            books_ids = self.get_book_edition_by_language(editions_id, lang) if editions_id.isdigit() else ''
            # Books ids is None when the page refuses to load
            if books_ids is None:
                return self.wr.close()
            # Write the books ids to the file if the page loads correctly
            self.wr.write(books_ids or "-" * 7)
            # Display the editions id and its books ids
            print(f"Book Editions ID:\t{editions_id:<15}Books IDs:\t{books_ids or ''}")
        self.wr.close()
        # Open a new file to move the done list to it
        self.wr.open(file_name + "_list")
        # Loop through the previously scraped editions ids
        for line in read_books(file_name):
            # Skip the placeholder lines written for editions with no matching books
            if line != "-" * 7:
                # Write each book edition id on a separate line
                for id_ in line.split(','):
                    self.wr.write(id_)
        self.wr.close()
        return True
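
    # For illustration only: assuming two matching editions were scraped, a line
    # in "ara_books" could look like "3554772,2929" (comma-joined books ids) or
    # "-------" (the placeholder), and "ara_books_list" would then hold
    # "3554772" and "2929" on separate lines; the ids here are made up.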

    # Main function to scrape books ids
    def get_books(self, keyword, browse="list"):
        # Get the lists found when searching for lists
        if browse == "lists":
            keywords = self._get_lists(keyword.replace(' ', '+'))
            browse = "list"
        # Otherwise, normalize the given keyword(s) into a list of strings
        else:
            keywords = [
                str(key) for key in (
                    keyword if isinstance(keyword, list) else [keyword]
                )]
        try:
            # Loop through all lists
            for keyword in keywords:
                # Open each list url
                self.br.open_page(keyword, browse)
                # Scrape pages until there's no next page
                while True:
                    self._scrape_list("book", self._books_ids)
                    if not self.br.goto_next_page():
                        break
        except Exception as e:
            print("Couldn't go to next page:", e)
        return self._books_ids

    def get_book_editions_id(self, book_id):
        self.br.open("/book/show/", book_id)
        return self.br.editions_id()

    def get_book_edition_by_language(self, editions_id, lang):
        self.br.open_book_editions(editions_id)
        soup = BeautifulSoup(self.br.page_source, "lxml").find(class_="workEditions")
        # Signal a loading failure with None
        if not soup:
            return None
        editions = []
        # Loop through the work's editions
        for details in soup.find_all(class_="editionData"):
            # The language and rating rows are the third and second from the end
            language, rating = [row.find(class_="dataValue") for row in details.find_all(class_="dataRow")[-3:-1]]
            if language.text.strip() == lang:
                # Only keep editions rated by more than 50 reviewers
                reviewers = get_digits(rating.find("span").text)
                if reviewers > 50:
                    editions.append(id_from_url.match(details.find(class_="bookTitle")["href"]).group(1))
        return ','.join(editions)
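
    # A minimal sketch of the id extraction used above, assuming `id_from_url`
    # is a compiled regex roughly like the one below (the real pattern lives in
    # Tools.py), with edition links shaped like "/book/show/2929-al-ayyam":
    #
    #   import re
    #   id_from_url = re.compile(r".*/show/(\d+)")
    #   id_from_url.match("/book/show/2929-al-ayyam").group(1)  # -> "2929"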

    # Main function to scrape lists ids
    def _get_lists(self, keyword):
        lists = []
        # Open GoodReads' lists search url
        self.br.open_list_search(keyword)
        # Scrape all result pages
        while True:
            self._scrape_list("list", lists)
            # Go to next page if there's one, otherwise break
            if not self.br.goto_next_page():
                break
        return lists

    # Scrape a single search results page
    def _scrape_list(self, title, array):
        soup = BeautifulSoup(self.br.page_source, "lxml").find(class_="tableList")
        if not soup:
            return None
        for book in soup.find_all("tr"):
            # Unless Arabic mode is on, skip books with 1000 ratings or fewer
            if self.arabic or get_digits(book.find(class_="minirating").text.split("—")[1]) > 1000:
                try:  # Get the id from the url
                    id_ = id_from_url.match(book.find(class_=title + "Title")["href"]).group(1)
                except Exception:
                    print("Couldn't extract Book Id from URL")
                    continue
                # Store the extracted id if it's not stored already
                if id_ not in array:
                    array.append(id_)
                    print(f"{title.capitalize()} {id_:<10}count:\t{len(array)}")