forked from scrapfly/scrapfly-scrapers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
trustpilot.py
107 lines (90 loc) · 3.98 KB
/
trustpilot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
This is an example web scraper for trustpilot.com.
To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export $SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
from typing import Dict, List
from loguru import logger as log
SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])
BASE_CONFIG = {
# bypass trustpilot web scraping blocking
"asp": True,
# set the poxy location to US
"country": "US",
}
def parse_hidden_data(response: ScrapeApiResponse):
"""parse JSON data from script tags"""
selector = response.selector
script = selector.xpath("//script[@id='__NEXT_DATA__']/text()").get()
data = json.loads(script)
return data
def parse_company_data(data: Dict) -> Dict:
"""parse company data from JSON and execlude the web app details"""
data = data["props"]["pageProps"]
return {
"pageUrl": data["pageUrl"],
"companyDetails": data["businessUnit"],
"reviews": data["reviews"],
}
async def scrape_company(urls: List[str]) -> List[Dict]:
"""scrape trustpilot company pages"""
companies = []
# add the company pages to a scraping list
to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
# scrape all the company pages concurrently
async for response in SCRAPFLY.concurrent_scrape(to_scrape):
data = parse_hidden_data(response)
data = parse_company_data(data)
companies.append(data)
log.success(f"scraped {len(companies)} company listings from company pages")
return companies
async def scrape_search(url: str, max_pages: int = None) -> List[Dict]:
"""scrape trustpilot search pages"""
# scrape the first search page first
log.info("scraping the first search page")
first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
data = parse_hidden_data(first_page)["props"]["pageProps"]["businessUnits"]
search_data = data["businesses"]
# get the number of pages to scrape
total_pages = data["totalPages"]
if max_pages and max_pages < total_pages:
total_pages = max_pages
log.info(f"scraping search pagination ({total_pages - 1} more pages)")
# add the remaining search pages in a scraping list
other_pages = [
ScrapeConfig(url + f"?page={page_number}", **BASE_CONFIG)
for page_number in range(2, total_pages + 1)
]
# scrape the remaining search pages concurrently
async for response in SCRAPFLY.concurrent_scrape(other_pages):
data = parse_hidden_data(response)["props"]["pageProps"]["businessUnits"]["businesses"]
search_data.extend(data)
log.success(f"scraped {len(search_data)} company listings from search")
return search_data
async def scrape_reviews(url: str, max_pages: int = None) -> List[Dict]:
"""parse review data from the API"""
# send a POST request to the first review page and get the result directly in JSON
first_page = await SCRAPFLY.async_scrape(
ScrapeConfig(url, method="POST", **BASE_CONFIG)
)
data = json.loads(first_page.scrape_result["content"])["pageProps"]
reviews_data = data["reviews"]
# get the number of review pages to scrape
total_pages = data["filters"]["pagination"]["totalPages"]
if max_pages and max_pages < total_pages:
total_pages = max_pages
log.info(f"scraping reviews pagination ({total_pages - 1} more pages)")
# add the remaining search pages in a scraping list
other_pages = [
ScrapeConfig(url + f"&page={page_number}", method="POST", **BASE_CONFIG)
for page_number in range(2, total_pages + 1)
]
# scrape the remaining search pages concurrently
async for response in SCRAPFLY.concurrent_scrape(other_pages):
data = json.loads(response.scrape_result["content"])["pageProps"]["reviews"]
reviews_data.extend(data)
log.success(f"scraped {len(reviews_data)} company reviews")
return reviews_data