"""
This is an example web scraper for etsy.com.
To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export $SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import math
import json
from typing import Dict, List, Optional

from loguru import logger as log
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])

BASE_CONFIG = {
    # bypass Etsy.com web scraping blocking
    "asp": True,
    # set the proxy location to US
    "country": "US",
}


def strip_text(text):
    """remove extra spaces while handling None values"""
    if text is not None:
        text = text.strip()
    return text


def parse_search(response: ScrapeApiResponse) -> Dict:
    """parse data from Etsy search pages"""
    selector = response.selector
    data = []
    script = json.loads(selector.xpath("//script[@type='application/ld+json']/text()").get())
    # get the total number of pages (Etsy renders 64 listings per search page)
    total_listings = script["numberOfItems"]
    total_pages = math.ceil(total_listings / 64)
    for product in selector.xpath("//div[@data-search-results-lg]/ol/li"):
        link = product.xpath(".//a[contains(@class, 'listing-link')]/@href").get()
        rate = product.xpath(".//input[@name='rating']/@value").get()
        number_of_reviews = strip_text(product.xpath(".//span[contains(@class, 'wt-text-caption') and contains(., '(')]/text()").get())
        price = product.xpath(".//span[@class='currency-value']/text()").get()
        original_price = product.xpath(".//span[contains(text(),'Original Price')]/text()").get()
        discount = strip_text(product.xpath(".//span[contains(text(),'off')]/text()").get())
        seller = product.xpath(".//span[contains(text(),'From shop')]/text()").get()
        data.append({
            "productLink": '/'.join(link.split('/')[:5]) if link else None,
            "productTitle": strip_text(product.xpath(".//h3[contains(@class, 'text-caption')]/text()").get()),
            # use a relative selector (.//) so each product gets its own image, not the page's first
            "productImage": product.xpath(".//img[@data-listing-card-listing-image]/@src").get(),
            "seller": seller.replace("From shop ", "") if seller else None,
            "estimatedArrival": product.xpath(".//span[@class='wt-no-wrap']/text()").get(),
            "listingType": "Paid listing" if product.xpath(".//span[@data-ad-label='Ad by Etsy seller']") else "Free listing",
            "productRate": float(rate) if rate else None,
            "numberOfReviews": number_of_reviews,
            "freeShipping": "Yes" if product.xpath(".//span[contains(text(),'Free shipping')]/text()").get() else "No",
            "productPrice": float(price.replace(",", "")) if price else None,
            "priceCurrency": product.xpath(".//span[@class='currency-symbol']/text()").get(),
            "originalPrice": float(original_price.split("$")[-1].replace(",", "").strip()) if original_price else "No discount",
            "discount": discount if discount else "No discount",
        })
    return {
        "search_data": data,
        "total_pages": total_pages
    }


def parse_product_page(response: ScrapeApiResponse) -> Dict:
    """parse hidden product data from product pages"""
    selector = response.selector
    script = selector.xpath("//script[contains(text(),'offers')]/text()").get()
    data = json.loads(script)
    return data


def parse_shop_page(response: ScrapeApiResponse) -> Dict:
    """parse hidden shop data from shop pages"""
    selector = response.selector
    script = selector.xpath("//script[contains(text(),'itemListElement')]/text()").get()
    data = json.loads(script)
    return data


async def scrape_search(url: str, max_pages: Optional[int] = None) -> List[Dict]:
    """scrape product listing data from Etsy search pages"""
    log.info("scraping the first search page")
    # etsy search pages are dynamic, requiring render_js enabled
    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, wait_for_selector="//div[@data-search-pagination]", render_js=True, **BASE_CONFIG))
    data = parse_search(first_page)
    search_data = data["search_data"]
    # get the number of total pages to scrape
    total_pages = data["total_pages"]
    if max_pages and max_pages < total_pages:
        total_pages = max_pages
    log.info(f"scraping search pagination ({total_pages - 1} more pages)")
    # add the remaining search pages to a scraping list
    other_pages = [
        ScrapeConfig(url + f"&page={page_number}", wait_for_selector="//div[@data-search-pagination]", render_js=True, **BASE_CONFIG)
        for page_number in range(2, total_pages + 1)
    ]
    # scrape the remaining search pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(other_pages):
        data = parse_search(response)
        search_data.extend(data["search_data"])
    log.success(f"scraped {len(search_data)} product listings from search")
    return search_data


async def scrape_product(urls: List[str]) -> List[Dict]:
    """scrape Etsy product pages"""
    products = []
    # add the product page URLs to a scraping list
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    # scrape all the product pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        data = parse_product_page(response)
        products.append(data)
    log.success(f"scraped {len(products)} product listings from product pages")
    return products


async def scrape_shop(urls: List[str]) -> List[Dict]:
    """scrape Etsy shop pages"""
    shops = []
    # add the shop page URLs to a scraping list
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    # scrape all the shop pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        data = parse_shop_page(response)
        shops.append(data)
    log.success(f"scraped {len(shops)} shops from shop pages")
    return shops
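

# A minimal usage sketch (not part of the original scraper): one way to drive the
# coroutines above with asyncio. The URLs below are hypothetical placeholders;
# substitute real Etsy search, listing, and shop URLs before running.
if __name__ == "__main__":
    import asyncio

    async def run():
        # scrape the first 2 pages of search results for an example query
        search = await scrape_search("https://www.etsy.com/search?q=ceramic+mug", max_pages=2)
        # scrape individual product and shop pages (placeholder URLs)
        products = await scrape_product(["https://www.etsy.com/listing/1234567890"])
        shops = await scrape_shop(["https://www.etsy.com/shop/ExampleShopName"])
        print(json.dumps({"search": search, "products": products, "shops": shops}, indent=2))

    asyncio.run(run())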