# the code is partially copied from https://github.com/windcode/zhihu-crawler-people
import json
import time
from multiprocessing import Pool

from bs4 import BeautifulSoup as BS

from haipproxy.utils import get_redis_conn
from examples.zhihu.crawler import Crawler

per_page = 20
info_max_process_num = 50
list_max_process_num = 10
host = 'https://www.zhihu.com'
waiting_set = 'zhihu:seeds:to_crawl'
seeds_all = 'zhihu:seeds:all'
info_set = 'zhihu:info:user'
# concurrency safety of this shared crawler instance is not considered here
common_crawler = Crawler()
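
# How the three Redis sets are used (inferred from the code in this file, not
# from upstream documentation): `waiting_set` is the crawl frontier that
# start() pops seeds from, `seeds_all` records every url_token ever enqueued
# so a follower is scheduled at most once, and `info_set` accumulates the
# crawled user profiles.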


def init_db():
    redis_client = get_redis_conn(db=1)
    return redis_client


def get_info(url_token):
    """Get a user's profile info from their answers page."""
    url = '%s/people/%s/answers' % (host, url_token)
    html = common_crawler.get(url)
    print("parsing page's HTML……")
    if not html:
        return None
    s = BS(html, 'html.parser')
    try:
        data = s.find('div', attrs={'id': 'data'})['data-state']
        data = json.loads(data)
        data = data['entities']['users'][url_token]
    except Exception:
        return None
    # filter data according to userType
    if data['userType'] != 'people':
        return None
    return data
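

# Shape of the embedded page state that get_info() and get_per_followers()
# rely on (an assumption inferred only from the keys accessed in this file;
# Zhihu's actual payload contains many more fields and may change at any time):
#
#   <div id="data" data-state='{
#       "entities": {"users": {"<url_token>": {"userType": "people", ...}}},
#       "people": {"followersByUser": {"<url_token>": {"ids": [...]}}}
#   }'>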


def get_per_followers(url_token, page, sum_page):
    """Crawl one page of a user's followers."""
    print('crawling page %d/%d ……' % (page, sum_page))
    followers = list()
    url = '%s/people/%s/followers?page=%d' % (host, url_token, page)
    html = common_crawler.get(url)
    if not html:
        return followers
    s = BS(html, 'html.parser')
    try:
        data = s.find('div', attrs={'id': 'data'})['data-state']
        data = json.loads(data)
        items = data['people']['followersByUser'][url_token]['ids']
    except (AttributeError, TypeError, KeyError):
        return list()
    for item in items:
        # the ids list mixes real url_tokens with boolean/None placeholders
        # and the anonymous account name '知乎用户'; keep only real tokens
        if item is not None and not isinstance(item, bool) and item != '知乎用户':
            print(item)
            followers.append(item)
    return followers


def get_followers(url_token, follower_count):
    """Get all followers of the specified url_token; return [] if the user has none."""
    if follower_count == 0:
        return []
    sum_page = (follower_count - 1) // per_page + 1
    pool = Pool(processes=list_max_process_num)
    results = []
    for page in range(1, sum_page + 1):
        results.append(pool.apply_async(get_per_followers, (url_token, page, sum_page)))
    pool.close()
    pool.join()
    follower_list = []
    for result in results:
        follower_list += result.get()
    return follower_list
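

# Pagination example: with per_page = 20, a user with 45 followers yields
# sum_page = (45 - 1) // 20 + 1 = 3, so pages 1..3 are fetched concurrently
# by the pool and their results are concatenated back in page order.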


def start():
    redis_client = init_db()
    while not redis_client.scard(waiting_set):
        # block while there is no seed in waiting_set
        print('no seeds in waiting set {}'.format(waiting_set))
        time.sleep(0.1)
    # fetch a seed from waiting_set
    url_token = redis_client.spop(waiting_set).decode()
    print("crawling %s's user info……" % url_token)
    user = get_info(url_token)
    if user:
        # redis set members must be strings/bytes, so serialize the profile dict
        redis_client.sadd(info_set, json.dumps(user))
    print("crawling %s's followers list……" % url_token)
    try:
        follower_list = get_followers(url_token, user['followerCount'])
    except (TypeError, AttributeError, KeyError):
        return
    for follower in follower_list:
        if not redis_client.sismember(seeds_all, follower):
            pipe = redis_client.pipeline(False)
            pipe.sadd(waiting_set, follower)
            pipe.sadd(seeds_all, follower)
            pipe.execute()
    print("user {}'s info has been crawled".format(url_token))


if __name__ == '__main__':
    init_seeds = ['resolvewang', 'excited-vczh']
    redis_conn = init_db()
    redis_conn.sadd(waiting_set, *init_seeds)
    redis_conn.sadd(seeds_all, *init_seeds)
    while True:
        start()