-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
180 lines (138 loc) · 5.9 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""
Script for scraping product info from www.osta.ee.
Running this script from command line will scrape all valid product data from
specified subcategory at https://www.osta.ee.
Category can be specified as first command-line argument.
Category nesting is supported.
Scraped results will be written to a JSON output file.
Output filename can be specified as second command-line argument, default filename
is 'products.json'.
This script runs synchronously and may take several seconds depending on category
size and internet connection.
Minimal user input verification is provided, entering invalid arguments will break
the script.
Depends on: BeautifulSoup4.
"""
from bs4 import BeautifulSoup
import json
import sys
import requests
import urllib.request
import urllib.error
def scrape_product_category(category_url, filename):
"""
Scrape a specified Osta.ee product category and write results to a JSON file.
:param category_url: url to osta.ee product category
:param filename: desired output filename
"""
print(f"Scraping category at url: {category_url}\n")
entry_page_object = get_page_object_from_url(category_url)
page_count = get_sub_page_count(entry_page_object)
items_per_page = 60
print(f"Category has roughly {page_count * items_per_page} products:"
f" {page_count} pages with {items_per_page} products per page\n")
print(f"Scraping page 1...")
scraped_products = get_product_list_from_page_object(entry_page_object)
for page_number in range(2, page_count + 1):
print(f"Scraping page {page_number}...")
url = f"{category_url}/page-{page_number}"
scraped_products += get_product_list_from_page_object(get_page_object_from_url(url))
print(f"\nScraping done, products scraped: {len(scraped_products)}")
print(f"\nWriting results to {filename}...")
write_product_list_to_json_file(scraped_products, filename)
print(f"Scrape results successfully written to file.")
def get_product_list_from_page_object(page_object):
"""
Get list of dictionary objects containing product information from page object.
:param page_object: BeautifulSoup4 page object
:return:
"""
product_objects = get_product_objects_from_page(page_object)
products = []
for product_object in product_objects:
product = {
"title": get_product_title(product_object),
"price": get_product_price(product_object),
"img_href": get_product_img_href(product_object)
}
products.append(product)
return products
def get_sub_page_count(page_object) -> int:
"""
Get number of sub pages in a category from a category page object.
:param page_object: BeautifulSoup4 page object.
:return: number of sub pages in category.
"""
return int(page_object.find("span", {"class": "page-count"}).text)
def get_page_object_from_url(url):
"""
Get BeautifulSoup4 page object from url.
:param url: web page url
:return: BeautifulSoup4 page object representing web page with specified url.
"""
request = requests.get(url)
return BeautifulSoup(request.text, "html.parser")
def get_product_objects_from_page(page_object):
"""
Get sub page-objects containing product information from main page object.
:param page_object: BeautifulSoup4 page object
:return: List of BeautifulSoup4 page objects
"""
main_product_lists = page_object.findAll("ul", {"class": "js-main-offers-list"})
return main_product_lists[1].findAll("figure", {"class": "offer-thumb"})
def get_product_title(product_page_object):
"""
Get product title string from product page-object.
:param product_page_object: BeautifulSoup4 page object
:return: string representation of product title
"""
return product_page_object.find("p", {"class": "offer-thumb__title"}).get("title")
def get_product_price(product_page_object):
"""
Get product price string from product page-object.
:param product_page_object: BeautifulSoup4 page object
:return: string representation of product price
"""
return product_page_object.find("span", {"class": "price-cp"}).text
def get_product_img_href(product_page_object) -> str:
"""
Get product image href link from product page-object.
:param product_page_object: BeautifulSoup4 page object
:return: string representation of product image href
"""
anchor_element = product_page_object.find("figure", {"class": "offer-thumb__image"}).find("a", {"class": "lazy"})
return anchor_element.get("data-original")
def write_product_list_to_json_file(product_list, filename: str):
"""
Write list of product items into .json file with specified filename.
:param product_list: List of product items to write.
:param filename: Name of output file
"""
with open(filename, 'w') as file:
json.dump(product_list, file, indent=4)
def check_url(url: str):
print(f"Checking URL {url}")
try:
connection = urllib.request.urlopen(url)
status_code = connection.getcode()
print(f"Server responded with status code {status_code}\n")
except urllib.error.HTTPError as error:
print(f"URL checking failed, url {url} raised {error}")
print("This is most likely due to the provided category not being valid or Osta.ee servers are not "
"responding.")
sys.exit()
if __name__ == "__main__":
base_url = "https://www.osta.ee/kategooria/"
output_filename = "products.json"
category = ""
if len(sys.argv) >= 2:
category = sys.argv[1].lstrip("/")
if len(sys.argv) >= 3:
output_filename = sys.argv[2]
if not category:
print("Scraping failed, please enter a valid category to scrape as a command line argument,"
" for example arvutid/sulearvutid")
else:
full_url = base_url + category
check_url(full_url)
scrape_product_category(full_url, output_filename)