"""
This is an example web scraper for fashionphile.com.
To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export $SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
from typing import Dict, List
from pathlib import Path
from loguru import logger as log
from urllib.parse import parse_qs, urlencode, urlparse
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])

BASE_CONFIG = {
    # bypass fashionphile.com web scraping blocking
    "asp": True,
    # set the proxy country to US
    "country": "US",
}

output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


def find_hidden_data(result: ScrapeApiResponse) -> dict:
    """extract hidden NEXT_DATA from page html"""
    data = result.selector.css("script#__NEXT_DATA__::text").get()
    data = json.loads(data)
    return data


async def scrape_products(urls: List[str]) -> List[Dict]:
    """scrape fashionphile product pages for product data"""
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    products = []
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        data = find_hidden_data(response)
        product = data["props"]["pageProps"]["initialState"]["productPageReducer"]["productData"]
        products.append(product)
    log.success(f"scraped {len(products)} product listings from product pages")
    return products


def update_url_parameter(url, **params):
    """update the query parameters of a URL with new values"""
    current_params = parse_qs(urlparse(url).query)
    updated_query_params = urlencode({**current_params, **params}, doseq=True)
    return f"{url.split('?')[0]}?{updated_query_params}"
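

# For illustration (not part of the original file), using a hypothetical URL:
# update_url_parameter("https://example.com/shop?sort=price", page=2)
# returns "https://example.com/shop?sort=price&page=2"; existing query
# parameters are preserved and new ones are appended or overwritten.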


async def scrape_search(url: str, max_pages: int = 10) -> List[Dict]:
    """scrape fashionphile search pages for product listing data"""
    log.info(f"scraping search page {url}")
    # scrape the first page
    result_first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
    data_first_page = find_hidden_data(result_first_page)
    data_first_page = data_first_page["props"]["pageProps"]["initialState"]["listingPageReducer"]["listingData"]
    results = data_first_page["results"]
    # find the total page count
    total_pages = data_first_page["pages"]
    if max_pages and max_pages < total_pages:
        total_pages = max_pages
    # scrape the remaining pages concurrently
    log.info(f"scraping search pagination ({total_pages-1} more pages)")
    to_scrape = [
        ScrapeConfig(update_url_parameter(url, page=page), **BASE_CONFIG)
        for page in range(2, total_pages + 1)
    ]
    async for result in SCRAPFLY.concurrent_scrape(to_scrape):
        data = find_hidden_data(result)
        data = data["props"]["pageProps"]["initialState"]["listingPageReducer"]["listingData"]
        results.extend(data["results"])
    log.success(f"scraped {len(results)} product listings from search pages")
    return results
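

# A minimal usage sketch (not part of the original file): the product and
# search URLs below are hypothetical placeholders; swap in real
# fashionphile.com pages before running.
if __name__ == "__main__":
    import asyncio

    async def run():
        # scrape a couple of product pages and save the results as JSON
        products = await scrape_products(
            ["https://www.fashionphile.com/p/example-product-123456"]
        )
        output.joinpath("products.json").write_text(json.dumps(products, indent=2))
        # scrape the first two pages of a search query and save the listings
        listings = await scrape_search(
            "https://www.fashionphile.com/shop?search=chanel", max_pages=2
        )
        output.joinpath("search.json").write_text(json.dumps(listings, indent=2))

    asyncio.run(run())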