main.py
import json
import re

import scrapy
from scrapy.crawler import CrawlerProcess


class BeachesScraper(scrapy.Spider):
    name = 'BeachesScraper'

    # load start URLs, selectors and the output path from config.json
    with open("config.json") as file:
        scraper_config = json.load(file)
    start_urls = scraper_config.get("start_urls")

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'RETRY_TIMES': 10,
        # export as JSON; 'FEEDS' supersedes the deprecated FEED_FORMAT/FEED_URI pair
        'FEEDS': {
            scraper_config.get("output_file"): {'format': 'json'},
        },
    }
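    # Shape of the config.json this spider expects, inferred from the keys read
    # above and in parse() below; the values shown are illustrative placeholders,
    # not taken from the repository:
    # {
    #     "start_urls": ["https://beachsearcher.com/en/<country-page>"],
    #     "output_file": "beaches.json",
    #     "selectors": {"view_all_beaches_css": "<CSS selector for the 'view all beaches' link>"}
    # }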
    def parse(self, response):
        # pull the country object_id out of the "view all beaches" link
        view_all_link = response.css(self.scraper_config.get("selectors").get("view_all_beaches_css")).extract_first()
        object_id = re.search(r'object_id=(\d+)', view_all_link).group(1)
        url = "https://beachsearcher.com/en/searchMain?object=country&object_id={}&smart=0&pagesize=18&offset=0"
        yield scrapy.Request(url=url.format(object_id), callback=self.beaches_api_parse)
    def beaches_api_parse(self, response):
        data = response.json()
        all_count = data.get("AllCount")
        beaches = data.get("Beaches")
        offset = data.get("Offset")

        # paginate in steps of 18 (the pagesize) until every beach has been listed;
        # str.rstrip strips a set of characters rather than a suffix, so rewrite the
        # offset query parameter explicitly instead
        new_offset = offset + 18
        if new_offset < all_count:
            next_url = re.sub(r"offset=\d+", f"offset={new_offset}", response.url)
            yield scrapy.Request(next_url, callback=self.beaches_api_parse)

        for beach in beaches:
            _id = beach.get("id")
            slug = beach.get("slug")
            beach_url = f"https://beachsearcher.com/en/beach/{_id}/{slug}"
            features = json.loads(beach.get("features"))
            try:
                beach_tags = [features.get("audience").get("label")]
            except AttributeError:  # beach has no "audience" feature
                beach_tags = []
            map_url = beach_url + "/map"
            # "point" holds the two coordinates in parentheses, separated by a space
            coordinates = re.search(r"\((.*)\)", beach.get("point")).group(1).split()
            yield scrapy.Request(beach_url, callback=self.beach_parsed, meta={
                "beach_tags": beach_tags,  # already a list; wrapping it again would nest it
                "map_url": map_url,
                "coordinates": coordinates
            })
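    # Assumed shape of the searchMain JSON response, inferred from the keys read
    # above (not taken from any API documentation):
    # {
    #     "AllCount": <total number of beaches>,
    #     "Offset": <offset of this page>,
    #     "Beaches": [
    #         {"id": ..., "slug": ..., "features": "<JSON string>", "point": "... (<coord> <coord>)"},
    #         ...
    #     ]
    # }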
    def beach_parsed(self, response):
        def labelled_text(label):
            # text of the link starting with e.g. "Width - ", label prefix stripped; "" if absent
            text = response.xpath(f"//a[contains(text(), '{label} - ')]/text()").extract_first()
            return text.replace(f"{label} - ", "") if text else ""

        width = labelled_text("Width")
        shore_shape = labelled_text("Shore shape")
        length = labelled_text("Length")
        yield {
            "beach_url": response.url,
            "beach_supertitle": response.css("div.beach-desc__booking-city::text").extract_first(),
            "beach_title": response.css("div.beach-desc__title > h1::text").extract_first().strip(),
            "beach_subtitle": response.css("div.beach-desc__title > h3::text").extract_first().strip(),
            "beach_tags": response.meta.get("beach_tags"),
            "beach_rating": response.css("div.beach-desc__points::text").extract_first(),
            "beach_rank": " ".join([item.strip() for item in response.css("div.beach-desc__out::text").extract()]),
            "map_url": response.meta.get("map_url"),
            "coordinates": response.meta.get("coordinates"),
            "key_features": response.css("li.beach-top-special__item span::text").extract(),
            # scores are rendered as "<score>/<max>"; keep only the score
            "water_score": response.xpath("//div[text()='Water']/../following-sibling::div[1]/div[2]/text()").extract_first().strip().split('/')[0],
            "cover_score": response.xpath("//div[text()='Cover']/../following-sibling::div[1]/div[2]/text()").extract_first().strip().split('/')[0],
            "cleanliness": response.xpath("//div[text()='Cleanliness']/../following-sibling::div[1]/div[2]/text()").extract_first().strip().split('/')[0],
            "amenities": response.xpath("//div[text()='Amenities']/../following-sibling::div[1]/div[2]/text()").extract_first().strip().split('/')[0],
            "natural_features": response.xpath("//h3[text()='Natural features']/following-sibling::ul/div/li/a/text()").extract(),
            "general_features": response.xpath("//div[text()='ownership']/../following-sibling::div[1]/li/a/text()").extract(),
            "occupancy": response.xpath("//div[text()='occupancy']/../following-sibling::div[1]/li/a/text()").extract_first(),
            "sharks": response.xpath("//div[text()='sharks']/../following-sibling::div[1]/li/a/text()").extract_first(),
            "size_and_shape": {
                "width": width,
                "shore_shape": shore_shape,
                "length": length
            },
            "location_text": response.xpath("//div[text()='location']/../following-sibling::div[1]/li/a/text()").extract_first(),
            "access": response.xpath("//h3[text()='Access']/following-sibling::ul/div/li/a/text()").extract(),
            "cover_and_water": response.xpath("//div[text()='cover']/../following-sibling::div[1]/li/a/text()").extract(),
            "swimming_details": response.xpath("//div[text()='swimming details']/../following-sibling::div[1]/li/a/text()").extract(),
            "water": response.xpath("//div[text()='water']/../following-sibling::div[1]/li/a/text()").extract(),
            "rental": response.xpath("//h3[text()='Rental']/following-sibling::ul/div/li/a/text()").extract(),
            "audience": response.xpath("//h3[text()='Audience']/following-sibling::ul/div/li/a/text()").extract(),
            "infrastructure": response.xpath("//h3[text()='Infrastructure']/following-sibling::ul/div/li/a/text()").extract()
        }
if __name__ == '__main__':
    # run the spider directly: python main.py
    process = CrawlerProcess()
    process.crawl(BeachesScraper)
    process.start()