-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscrapy初级实战1:链家爬取一个交易.py
38 lines (31 loc) · 1.31 KB
/
scrapy初级实战1:链家爬取一个交易.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# NOTE: the Rule callback must NOT be named "parse" — CrawlSpider uses
# parse() internally to drive the rules, so overriding it silently breaks
# link following and no data is scraped.
# The regex in the Rule matches district/sub-district listing links;
# parse1 records the first transaction on each page, and follow=True
# makes the crawl cover every district and sub-district of Nanjing.
# Observed via redis: two districts have no transaction records; with 11
# districts in total, list 'y' ends up with 96 values (87 unique) and
# list 'x' with 96 unique values: 11 district URLs + 85 sub-district URLs.
from scrapy import Spider, Request, FormRequest
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import logging
from redis import Redis
# Module-level Redis client used by the spider to record crawled URLs and
# transaction titles (lists 'x' and 'y').
# NOTE(review): hard-coded password and implicit localhost:6379 — move the
# credentials to configuration/environment before sharing this code.
red=Redis(password='asd123')
# Module-level logger named after this module (standard logging convention).
logger = logging.getLogger(__name__)
class LJSpider(CrawlSpider):
    """Crawl sold-transaction listing pages on nj.lianjia.com.

    For every district/sub-district listing page matched by ``rules``, the
    ``parse1`` callback pushes the page URL to Redis list ``'x'`` and the
    title of the first transaction to Redis list ``'y'`` (via the
    module-level ``red`` client).

    NOTE: the callback is deliberately NOT named ``parse`` — CrawlSpider
    uses ``parse()`` internally to drive the rules, and overriding it
    disables link following.
    """

    # Running count of listing pages processed (class attribute; the first
    # ``self.i += 1`` turns it into a per-instance counter).
    i = 0
    name = "Hupu"
    start_urls = ["https://nj.lianjia.com/chengjiao/"]
    rules = (
        # Match district/sub-district listing URLs such as
        # https://nj.lianjia.com/chengjiao/jianye/ ; follow=True keeps
        # extracting links from matched pages so every district and
        # sub-district is visited.
        Rule(LinkExtractor(
            allow=(r'/chengjiao/[a-z]{4,}\d{0,2}/$',)),
            callback='parse1', follow=True),  # was follow=1: use a real bool
    )

    def parse1(self, response):
        """Record the first transaction title of a listing page in Redis.

        Pages with no transactions (some districts have none) are skipped
        instead of raising IndexError as the original ``[0]`` index did.
        """
        entries = response.xpath('//ul[@class="listContent"]/li')
        if not entries:
            # Two districts are known to have no transaction records.
            logger.info('no transactions on %s', response.url)
            return
        title = entries[0].xpath('string(div/div[@class="title"])').extract_first()
        red.lpush('x', response.url)
        red.lpush('y', title)
        self.i += 1
        # Routine progress message: info level (was error) with lazy
        # %-formatting instead of eager string concatenation.
        logger.info('%s %s #%d', response.url, title, self.i)