Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Bump pygments from 2.7.2 to 2.7.4 #6

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
10 changes: 5 additions & 5 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ verify_ssl = true
name = "pypi"

[packages]
scrapy = "*"
selenium = "*"
ipython = "*"
pymongo = "*"
simplejson = "*"
scrapy = ">=1.6.0"
selenium = ">=3.141.0"
ipython = ">=7.2.0"
pymongo = ">=3.7.2"
simplejson = ">=3.16"

[dev-packages]

Expand Down
618 changes: 356 additions & 262 deletions Pipfile.lock

Large diffs are not rendered by default.

File renamed without changes.
1 change: 1 addition & 0 deletions book/amazon/items.py → amazon/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class BookItem(scrapy.Item):
title = scrapy.Field()
date = scrapy.Field()
author = scrapy.Field()
author_date = scrapy.Field()
price = scrapy.Field()
rating = scrapy.Field()
rating_num = scrapy.Field()
Expand Down
File renamed without changes.
6 changes: 3 additions & 3 deletions book/amazon/settings.py → amazon/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'book'
BOT_NAME = 'amazon'

SPIDER_MODULES = ['book.spiders']
NEWSPIDER_MODULE = 'book.spiders'
SPIDER_MODULES = ['amazon.spiders']
NEWSPIDER_MODULE = 'amazon.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
Expand Down
File renamed without changes.
32 changes: 19 additions & 13 deletions book/amazon/spiders/amazon.py → amazon/spiders/list.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import datetime
import scrapy
from items import BookItem
from amazon.items import BookItem


def safe_list_get(l, idx, default=''):
Expand All @@ -21,8 +22,9 @@ class AmazonSpider(scrapy.Spider):
}

def __init__(self, cat=None, url=None, node=None):
if cat and (node or url):
self.cat = cat
if cat is None:
self.cat = datetime.datetime.today().strftime('%Y%m%d')
if node or url:
if url:
self.start_url = url
else:
Expand All @@ -35,33 +37,37 @@ def start_requests(self):
meta={'category': self.cat},
callback=self.parse_book_follow_next_page
)]
return [scrapy.Request(
url,
meta={'category': cat},
callback=self.parse_book_follow_next_page
) for cat, url in self.start_urls.items()]

def parse_book_follow_next_page(self, response):
lis = response.xpath('//ul[contains(@class, "s-result-list")]/li')
lis = response.xpath('//ul[contains(@class, "s-result-list")]/li') or \
response.xpath('//div[contains(@class, "s-result-list")]/div[contains(@class, "s-result-item")]')
for li in lis:
item = BookItem()
item['title'] = safe_list_get(li.xpath('.//h2/@data-attribute').extract(), 0, '')
item['title'] = safe_list_get(li.xpath('.//h2/@data-attribute').extract() or \
li.xpath('.//h2//span/text()').extract(),
0, '')
if item['title'] == '':
continue
item['date'] = safe_list_get(li.xpath('.//div[@class="a-row a-spacing-none"][1]/span/text()').extract(), 0, 'Unknown')
item['author'] = safe_list_get(li.xpath('.//div[@class="a-row a-spacing-none"][2]/span/text()').extract(), 0, 'Unknown')
item['author_date'] = ''.join(li.xpath('.//div[@class="a-row a-size-base a-color-secondary"][1]/span/text()').extract())
# price = li.xpath('.//span[contains(@class, "s-price")]/text()').extract()
# if len(price) == 0:
# price = li.xpath('.//span[contains(@class, "a-color-price")]/text()').extract()
# item['price'] = price[-1] if len(price) > 0 else '-1.0'
item['price'] = ''.join(li.xpath('.//span[contains(@class, "price")]/text()')[-3:].extract())
item['rating'] = float(safe_list_get(li.xpath('.//i[contains(@class, "a-icon-star")]/span/text()').re('[\d\.]+'), 0, 0.0))
item['rating_num'] = int(safe_list_get(li.xpath('.//a[contains(@class, "a-size-small")]/text()').re('\d+'), 0, 0))
item['url'] = safe_list_get(li.xpath('.//a[contains(@class, "s-access-detail-page")]/@href').extract(), 0, '')
item['rating_num'] = int(safe_list_get(li.xpath('.//a[contains(@class, "a-size-small")]/text()').re('\d+') or \
li.xpath('.//div[contains(@class,"a-size-small")]/span[2]//span/text()').re('\d+'), 0, 0))
item['url'] = safe_list_get(li.xpath('.//a[contains(@class, "s-access-detail-page")]/@href').extract() or \
li.xpath('.//a[contains(@class, "a-link-normal")]/@href').extract(), 0, '')
if self.allowed_domains[0] not in item['url']:
item['url'] = self.allowed_domains[0] + item['url']
item['category'] = response.meta['category']
yield item

next_page = response.xpath('//a[@id="pagnNextLink"]/@href')
next_page = response.xpath('//li[contains(@class, "a-last")]/a/@href') or \
response.xpath('//a[@id="pagnNextLink"]/@href')
self.logger.debug(next_page)
if next_page:
url = response.urljoin(next_page[0].extract())
Expand Down
Binary file removed book/duokan/duokan/spiders/.list.py.swp
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 2 additions & 0 deletions scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[settings]
default = amazon.settings