
Commit

Support new style amazon list
Change-Id: I0fba1b23d0c913ed44d7ac683e8eb9b6d84db651
刘晨星 committed Nov 3, 2020
1 parent beaba2c commit 8165b03
Showing 2 changed files with 15 additions and 7 deletions.
1 change: 1 addition & 0 deletions amazon/items.py
@@ -13,6 +13,7 @@ class BookItem(scrapy.Item):
     title = scrapy.Field()
     date = scrapy.Field()
     author = scrapy.Field()
+    author_date = scrapy.Field()
     price = scrapy.Field()
     rating = scrapy.Field()
     rating_num = scrapy.Field()
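
For reference, a sketch of what BookItem looks like after this change. The first seven fields are taken from the hunk above; url and category are not visible in this diff and are assumed from their use in list.py below, and the real file may declare more fields.

import scrapy

class BookItem(scrapy.Item):
    title = scrapy.Field()
    date = scrapy.Field()
    author = scrapy.Field()
    author_date = scrapy.Field()  # new: raw author/date text from the redesigned list page
    price = scrapy.Field()
    rating = scrapy.Field()
    rating_num = scrapy.Field()
    # Assumed from their use in list.py; not shown in this diff.
    url = scrapy.Field()
    category = scrapy.Field()
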
21 changes: 14 additions & 7 deletions amazon/spiders/list.py
@@ -42,28 +42,35 @@ def start_requests(self):
         ) for cat, url in self.start_urls.items()]
 
     def parse_book_follow_next_page(self, response):
-        lis = response.xpath('//ul[contains(@class, "s-result-list")]/li')
+        lis = response.xpath('//ul[contains(@class, "s-result-list")]/li') or \
+            response.xpath('//div[contains(@class, "s-result-list")]/div[contains(@class, "s-result-item")]')
         for li in lis:
             item = BookItem()
-            item['title'] = safe_list_get(li.xpath('.//h2/@data-attribute').extract(), 0, '')
+            item['title'] = safe_list_get(li.xpath('.//h2/@data-attribute').extract() or \
+                                          li.xpath('.//h2//span/text()').extract(),
+                                          0, '')
             if item['title'] == '':
                 continue
             item['date'] = safe_list_get(li.xpath('.//div[@class="a-row a-spacing-none"][1]/span/text()').extract(), 0, 'Unknown')
             item['author'] = safe_list_get(li.xpath('.//div[@class="a-row a-spacing-none"][2]/span/text()').extract(), 0, 'Unknown')
+            item['author_date'] = ''.join(li.xpath('.//div[@class="a-row a-size-base a-color-secondary"][1]/span/text()').extract())
             # price = li.xpath('.//span[contains(@class, "s-price")]/text()').extract()
             # if len(price) == 0:
             #     price = li.xpath('.//span[contains(@class, "a-color-price")]/text()').extract()
             # item['price'] = price[-1] if len(price) > 0 else '-1.0'
+            item['price'] = ''.join(li.xpath('.//span[contains(@class, "price")]/text()')[-3:].extract())
             item['rating'] = float(safe_list_get(li.xpath('.//i[contains(@class, "a-icon-star")]/span/text()').re('[\d\.]+'), 0, 0.0))
-            item['rating_num'] = int(safe_list_get(li.xpath('.//a[contains(@class, "a-size-small")]/text()').re('\d+'), 0, 0))
-            item['url'] = safe_list_get(li.xpath('.//a[contains(@class, "s-access-detail-page")]/@href').extract(), 0, '')
+            item['rating_num'] = int(safe_list_get(li.xpath('.//a[contains(@class, "a-size-small")]/text()').re('\d+') or \
+                                                   li.xpath('.//div[contains(@class,"a-size-small")]/span[2]//span/text()').re('\d+'), 0, 0))
+            item['url'] = safe_list_get(li.xpath('.//a[contains(@class, "s-access-detail-page")]/@href').extract() or \
+                                        li.xpath('.//a[contains(@class, "a-link-normal")]/@href').extract(), 0, '')
             if self.allowed_domains[0] not in item['url']:
                 item['url'] = self.allowed_domains[0] + item['url']
             item['category'] = response.meta['category']
             yield item
 
-        next_page = response.xpath('//li[contains(@class, "a-last")]/a/@href')
-        if not next_page:
-            next_page = response.xpath('//a[@id="pagnNextLink"]/@href')
+        next_page = response.xpath('//li[contains(@class, "a-last")]/a/@href') or \
+            response.xpath('//a[@id="pagnNextLink"]/@href')
+        self.logger.debug(next_page)
         if next_page:
             url = response.urljoin(next_page[0].extract())
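
The hunk above relies on two things it takes for granted: Scrapy selector lists (and the plain lists returned by .extract() and .re()) are falsy when empty, so writing old_xpath or new_xpath only falls through to the redesigned markup when the legacy selector matches nothing; and safe_list_get, whose definition is not part of this diff, presumably returns list[index] with a default for short lists. A minimal, self-contained sketch of both assumptions:

from scrapy.selector import Selector

def safe_list_get(lst, index, default):
    # Hypothetical stand-in for the helper imported by list.py;
    # the real definition lives elsewhere in the repository.
    try:
        return lst[index]
    except IndexError:
        return default

# New-style result markup only (no ul.s-result-list), so the first
# selector matches nothing and the fallback after "or" is used.
html = '<div class="s-result-list"><div class="s-result-item"><h2><span>Example Title</span></h2></div></div>'
response = Selector(text=html)

lis = response.xpath('//ul[contains(@class, "s-result-list")]/li') or \
      response.xpath('//div[contains(@class, "s-result-list")]/div[contains(@class, "s-result-item")]')

for li in lis:
    title = safe_list_get(li.xpath('.//h2/@data-attribute').extract() or
                          li.xpath('.//h2//span/text()').extract(), 0, '')
    print(title)  # prints "Example Title" via the new-layout fallback

Keeping both selectors lets the spider handle old- and new-style list pages with the same callback.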
