Skip to content

Commit

Permalink
Update Pipfile
Browse files Browse the repository at this point in the history
  • Loading branch information
morefreeze authored and [email protected] committed Sep 30, 2023
1 parent 5fd9939 commit 2320110
Show file tree
Hide file tree
Showing 27 changed files with 444 additions and 329 deletions.
10 changes: 5 additions & 5 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ verify_ssl = true
name = "pypi"

[packages]
scrapy = "*"
selenium = "*"
ipython = "*"
pymongo = "*"
simplejson = "*"
scrapy = ">=1.6.0"
selenium = ">=3.141.0"
ipython = ">=7.2.0"
pymongo = ">=3.7.2"
simplejson = ">=3.16"

[dev-packages]

Expand Down
618 changes: 373 additions & 245 deletions Pipfile.lock

Large diffs are not rendered by default.

File renamed without changes.
1 change: 1 addition & 0 deletions book/amazon/items.py → amazon/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class BookItem(scrapy.Item):
# NOTE(review): fields of a scrapy Item; the class header is outside this view
# (rendered diff collapses it). The commit adds `author_date` to the existing
# fields. Attribute docs as `#` comments since scrapy.Field() takes none here.
title = scrapy.Field()        # book title as scraped from the listing
date = scrapy.Field()         # publication date -- exact format not visible here; TODO confirm
author = scrapy.Field()
author_date = scrapy.Field()  # new in this commit; presumably combined author/date text -- verify against the spider
price = scrapy.Field()
rating = scrapy.Field()
rating_num = scrapy.Field()   # number of ratings; populated elsewhere -- TODO confirm it is an int
Expand Down
File renamed without changes.
6 changes: 3 additions & 3 deletions book/amazon/settings.py → amazon/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# NOTE(review): this span is a rendered diff WITHOUT +/- markers, so the old
# and the new value of each setting appear as consecutive assignments; if this
# text were executed as-is, the later assignment would win. The commit renames
# the project module from 'book' to 'amazon'.
BOT_NAME = 'book'
BOT_NAME = 'amazon'

SPIDER_MODULES = ['book.spiders']
NEWSPIDER_MODULE = 'book.spiders'
SPIDER_MODULES = ['amazon.spiders']
NEWSPIDER_MODULE = 'amazon.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
Expand Down
File renamed without changes.
56 changes: 56 additions & 0 deletions amazon/spiders/list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
import datetime
import locale
import scrapy
from amazon.items import BookItem


def safe_list_get(l, idx, default=''):
    """Return ``l[idx]`` when ``idx`` is below ``len(l)``, else ``default``.

    Mirrors ``dict.get`` for sequences: an out-of-range positive index falls
    back to ``default`` instead of raising ``IndexError``. (Negative indices
    always pass the length check and index from the end, exactly as in the
    original conditional-expression form.)
    """
    if idx < len(l):
        return l[idx]
    return default


# NOTE(review): the rendered diff strips indentation, so the class body below
# appears at column 0; code lines are kept byte-identical.
# Scrapy spider for amazon.cn book category listing pages.
class AmazonSpider(scrapy.Spider):
name = "amazon"                 # spider name used by `scrapy crawl amazon`
allowed_domains = ["amazon.cn"]
cat = None                      # category label; overwritten per-instance in __init__
start_url = None                # listing URL to crawl; overwritten per-instance in __init__
# NOTE(review): scrapy expects `start_urls` to be an iterable of URL strings;
# iterating this dict yields the Chinese labels, not the URLs. It appears to
# be used as a label->URL reference table (start_url is built explicitly in
# __init__) rather than as real start URLs -- confirm before relying on it.
start_urls = {
'文学巨匠': 'https://www.amazon.cn/s/?node=1851470071&ie=UTF8',
'外国文学': 'https://www.amazon.cn/s/?node=1851471071&ie=UTF8',
'秋乏冬眠': 'https://www.amazon.cn/s/?node=1851472071&ie=UTF8',
'文艺青年': 'https://www.amazon.cn/s/?node=1851473071&ie=UTF8',
'诺贝尔奖': 'https://www.amazon.cn/s/?node=1851474071&ie=UTF8',
}

def __init__(self, cat=None, url=None, node=None, *args, **kwargs):
    """Accept the crawl target from ``scrapy crawl -a`` style arguments.

    Args:
        cat: Category label attached to the crawl; defaults to today's date
            as ``YYYYMMDD`` so untagged runs are still distinguishable.
        url: Full listing URL to start from (used only when ``node`` is absent).
        node: amazon.cn browse-node id; when given, the start URL is built
            from it and ``url`` is ignored.
        *args, **kwargs: Forwarded to ``scrapy.Spider.__init__`` -- scrapy
            passes extra crawler arguments through here, and the original
            signature silently dropped them and skipped the super call.
    """
    super().__init__(*args, **kwargs)
    self.cat = cat if cat else datetime.datetime.today().strftime('%Y%m%d')
    # A browse-node id takes precedence over an explicit URL.
    self.start_url = 'https://www.amazon.cn/s/?node=%s' % (node) if node else url

def start_requests(self):
    """Start the crawl with the single configured listing URL.

    Returns:
        A one-element list with the initial ``scrapy.Request`` carrying the
        category label in ``meta``, or an empty list when no category/URL is
        configured. The original implicitly returned ``None`` in that case,
        which makes scrapy raise ``TypeError`` when iterating the result.
    """
    if not (self.cat and self.start_url):
        return []
    return [scrapy.Request(
        self.start_url,
        meta={'category': self.cat},  # carried through to every scraped item
        callback=self.parse_book_follow_next_page,
    )]

def parse_book_follow_next_page(self, response):
# NOTE(review): the rendered diff appears truncated here -- the body only
# selects the result nodes and never uses them, while the file is declared as
# 56 lines with collapsed sections. Confirm against the real file before
# editing; do not treat this fragment as the complete method.
# The two XPath variants cover the old <ul>-based and the newer <div>-based
# Amazon search-result markup respectively.
lis = response.xpath('//ul[contains(@class, "s-result-list")]/li') or \
response.xpath('//div[contains(@class, "s-result-list")]/div[contains(@class, "s-result-item")]')

def parse_first_page(self, response):
# NOTE(review): this span is visibly truncated by the diff renderer -- `item`
# and `li` are referenced without ever being defined, so the loop that
# creates them (presumably `for li in lis:` with a BookItem) is missing from
# this view. Do not treat this fragment as runnable; code lines are kept
# byte-identical below.
lis = response.xpath('//ul[contains(@class, "s-result-list")]/li')
# Rating count: try the old anchor markup first, then the newer span markup;
# default to 0 when neither yields digits.
item['rating_num'] = int(safe_list_get(li.xpath('.//a[contains(@class, "a-size-small")]/text()').re('\d+') or \
li.xpath('.//div[contains(@class,"a-size-small")]/span[2]//span/text()').re('\d+'), 0, 0))
# Detail-page URL, again with old-markup and new-markup fallbacks.
item['url'] = safe_list_get(li.xpath('.//a[contains(@class, "s-access-detail-page")]/@href').extract() or \
li.xpath('.//a[contains(@class, "a-link-normal")]/@href').extract(), 0, '')
# Relative hrefs get the bare domain prepended. NOTE(review): no scheme is
# added, so the stored URL looks like 'amazon.cn/...' -- confirm intended.
if self.allowed_domains[0] not in item['url']:
item['url'] = self.allowed_domains[0] + item['url']
item['category'] = response.meta['category']
yield item

# Next-page link: newer "a-last" markup first, then the legacy pagnNextLink.
next_page = response.xpath('//li[contains(@class, "a-last")]/a/@href') or \
response.xpath('//a[@id="pagnNextLink"]/@href')

# NOTE(review): `next_page` is immediately reassigned (fallback order
# reversed) and never used in this view -- likely further truncation.
next_page = response.xpath('//a[@id="pagnNextLink"]/@href') or response.xpath('//li[@class="a-last"]/a/@href')

68 changes: 0 additions & 68 deletions book/amazon/spiders/amazon.py

This file was deleted.

Binary file removed book/duokan/duokan/spiders/.list.py.swp
Binary file not shown.
4 changes: 2 additions & 2 deletions caoporn/caoporn/spiders/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ def convert_time_len(tl):

class ListSpider(scrapy.Spider):
# NOTE(review): rendered diff without +/- markers -- the old and new values of
# `allowed_domains` and `start_url` appear as consecutive assignments (the
# later one would win if executed as-is). The commit adds the 'gao2.cc'
# mirror and switches the start URL to it.
name = "list"
allowed_domains = ['caoporn.com', 'caomoo.com', 'caomee.com', '51.caoxee.com', 'cao.vgao.xyz']
allowed_domains = ['caoporn.com', 'caomoo.com', 'caomee.com', '51.caoxee.com', 'cao.vgao.xyz', 'gao2.cc']
# find from mongo
start_url = 'https://gao2.cc/videos?page=1'
max_page = 5000

def __init__(self, *args, **kwargs):
Expand Down
8 changes: 2 additions & 6 deletions caoporn/get_mp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,10 @@ def get_mp4(url):
# NOTE(review): diff fragment of get_mp4 -- the `def get_mp4(url):` header is
# fused into the hunk line above, and the DELETED guard lines
# (`if not m: return ''`) still appear inline below. With those guards
# removed, `m.group(1)` raises AttributeError whenever a regex fails to
# match; confirm that is acceptable before relying on the new version.
r = requests.get(url)
content = r.content.decode('utf-8')
# Pull the embedded-player URL out of the page's video_embed_code textarea.
m = re.search(r'textarea name="video_embed_code[^>]+>([^<]+)</textarea>', content)
if not m:
return ''
emb_url = m.group(1).strip()
r = requests.get(emb_url)
content = r.content.decode('utf-8')
# Old pattern (kept by the diff view) required trailing characters after
# ".mp4"; the new pattern two lines down stops at ".mp4" itself.
m = re.search(r'<source src="([^"]+.mp4[^"]+)"', content)
if not m:
return ''
m = re.search(r'<source src="([^"]+.mp4)"', content)
mp4_url = m.group(1).strip()
return mp4_url

Expand All @@ -44,7 +40,7 @@ def main():
# NOTE(review): fragment of main() -- the `def main():` header is fused into
# the hunk line above. The commit switches generated video URLs from
# 51.caoxee.com/video/<code> to gao2.cc/video60/<code>; the diff view shows
# both the removed and the added append line back to back.
if args.code:
urls = []
for code in args.code:
urls.append('https://51.caoxee.com/video/%s' % code)
urls.append('https://gao2.cc/video60/%s' % (code))
else:
urls = args.url
# Resolve every page URL to its direct .mp4 URL via get_mp4.
true_urls = list(map(get_mp4, urls))
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 2 additions & 0 deletions scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[settings]
default = amazon.settings

0 comments on commit 2320110

Please sign in to comment.