-
Notifications
You must be signed in to change notification settings - Fork 0
/
boss.py
311 lines (264 loc) · 11.2 KB
/
boss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
import sqlite3
import requests
import os
import decode
import time
import json
import re
from selenium_boss import Browser
# 获取 Chrome 浏览器的 cookie 数据库路径
'''
此接口获取城市
https://www.zhipin.com/wapi/zpgeek/common/data/citysites.json
职位接口
https://www.zhipin.com/wapi/zpCommon/data/position.json
个人信息接口
https://www.zhipin.com/wapi/zpuser/wap/getUserInfo.json
首页
https://www.zhipin.com/web/geek/job?query=&page=6
工作列表 boss主要的反爬就在工作列表上
https://www.zhipin.com/wapi/zpgeek/search/joblist.json
'''
class Boss:
# 设置 User-Agent
headers = {
"accept": "application/json, text/plain, */*",
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
"accept-language": "zh-CN,zh;q=0.9",
"sec-ch-ua": "\"Chromium\";v=\"110\", \"Not A(Brand\";v=\"24\", \"Google Chrome\";v=\"110\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"x-requested-with": "XMLHttpRequest",
}
browser = Browser()
cookies = {}
def __init__(self):
self.session = requests.Session()
self.session.verify = False
# 设置user-agent
self.session.headers.update(self.headers)
self.init_cookies()
print('cookies@', self.session.cookies.get_dict())
print('请求头@', self.session.headers)
self.boss_ids = self.open_boss_data()
pass
# 初始化token
def init_cookies(self):
# 连接Chrome浏览器的cookie数据库
self.conn = sqlite3.connect(self.get_cookie_file())
# 从本地浏览器储存中获取cookie
self.cookies = self.get_cookies('zhipin.com')
self.session.cookies.update(self.cookies)
self.session.headers.update({"token": self.get_use_token(),
"zp_token": self.session.cookies.get('geek_zp_token')})
def fresh_cookie_header(self):
self.cookies = self.browser.fresh_cookie()
# 更新检测的token字段
self.session.cookies.update(
{'__zp_stoken__': self.cookies.get('__zp_stoken__'),
'geek_zp_token': self.cookies.get('geek_zp_token'),
'__a': self.cookies.get('__a'), })
self.session.headers.update({"token": self.get_use_token(),
"zp_token": self.session.cookies.get('geek_zp_token')})
def get_cookie_file(self):
# 谷歌浏览器数据储存位置 执行start.bat后会自动生成
home_path = 'E:\dataChrome'
# home_path = os.path.expanduser('~')
# cookie_path = os.path.join(home_path, 'AppData', 'Local', 'Google', 'Chrome',
# 'User Data', 'Default', 'Network', 'Cookies')
cookie_path = os.path.join(home_path, 'Default', 'Network', 'Cookies')
print(cookie_path)
return cookie_path
# 读取 Chrome 浏览器的 cookie 数据库,并获取指定网站的 cookie
def get_cookies(self, url):
c = self.conn.cursor()
c.execute(
'SELECT name, encrypted_value FROM cookies WHERE host_key LIKE ?', ('%'+url+'%',))
cookies = c.fetchall()
c.close()
self.conn.close()
cookie_dict = {}
for cookie in cookies:
# cookie 解密
cookie_dict[cookie[0]] = decode.decode_cookies(cookie[1])
return cookie_dict
# 打开存储id的json数据
def open_boss_data(self, json_file_path="send.json"):
if os.path.exists(json_file_path):
# 如果文件存在则读取文件内容
with open(json_file_path, "r") as f:
data = json.load(f)
else:
# 如果文件不存在则创建文件
data = {}
with open(json_file_path, "w") as f:
json.dump(data, f)
return data
# 保存数据到本地
def save_boss_data(self, json_file_path="send.json"):
with open(json_file_path, "w") as f:
json.dump(self.boss_ids, f)
pass
# 获取用户token 放在请求头中
def get_use_token(self):
response = self.session.get(
f'https://www.zhipin.com/wapi/zpuser/wap/getUserInfo.json')
user_json = response.json()
token = user_json['zpData']['token']
return token
# 访问首页 更新ref
def get_head(self, page='1'):
url = f'https://www.zhipin.com/web/geek/job?query=&page={page}'
header = {
"referer": url
}
self.session.headers.update(header)
self.session.get(url=url)
pass
# 更新token
def update_token(self):
# 毫秒级的时间戳
now_time = int(time.time()*1000)
response = self.session.get(
f'https://www.zhipin.com/wapi/zppassport/get/zpToken?v={now_time}')
time_json = response.json()
# print('新的token出现@', time_json)
token = str(time_json['zpData']['token'])
print('token@', token)
# 更新请求头和cookie
self.session.cookies.update({"geek_zp_token": token})
self.session.headers.update({"zp_token": token})
# 获取指定城市的code
def get_city_code(self, city_name="深圳"):
response = self.session.get(
f'https://www.zhipin.com/wapi/zpgeek/common/data/citysites.json')
city_json = response.json()
city_list = city_json['zpData']
for city in city_list:
if (city['name'] == city_name):
print('城市@', city['name'])
return city['code']
pass
# 获取指定职位的code
def get_position_code(self, position_first="前端开发", position_last="前端开发工程师"):
response = self.session.get(
f'https://www.zhipin.com/wapi/zpCommon/data/position.json')
position_json = response.json()
position_list = position_json['zpData'][0]['subLevelModelList']
for position in position_list:
if (position['name'] == position_first):
for po in position['subLevelModelList']:
if (po['name'] == position_last):
return po['code']
pass
# https://www.zhipin.com/wapi/zpgeek/search/joblist.json?scene=1&query=&city=101210100&experience=104°ree=203&industry=&scale
# =303,304,305,306,302&stage=&position=100901&jobType=&salary=404&multiBusinessDistrict=&multiSubway=&page=1&pageSize=30
# 获取公司列表
def get_company(self, city='101210100', experience={'102', '101', '103'}, degree='0', scale={'302', '303', '304', '305', '306'}, position='100901', salary='0', page='1'):
scale_str = str(','.join(scale))
experience_str = str(','.join(experience))
while True:
response = self.session.get(
f'https://www.zhipin.com/wapi/zpgeek/search/joblist.json?scene=1&query=&city={city}&experience={experience_str}°ree={degree}&industry=&scale={scale_str}& \
stage=&position={position}&jobType=&salary={salary}&multiBusinessDistrict=&multiSubway=&page={page}&pageSize=30')
company_json = response.json()
print(company_json['code'])
if (str(company_json['code']) != '37'):
break
else:
print('爬虫被监测了,更新cookie@', company_json['code'])
self.fresh_cookie_header()
company_list = company_json['zpData']['jobList']
return company_list
# 发送post请求打招呼
def hello(self, securityId='', jobId='', lid=''):
url = f'https://www.zhipin.com/wapi/zpgeek/friend/add.json?securityId={securityId}&jobId={jobId}&lid={lid}'
response = self.session.post(url=url)
hello_json = response.json()
print('打招呼@@@@@@@@', hello_json)
pass
# 打招呼线程
def start_hello(self):
try:
position_code = self.get_position_code()
city_code = self.get_city_code()
time.sleep(2)
i = 1
while (i <= 10):
company_list = self.get_company(
city=city_code, position=position_code, page=str(i))
for item in company_list:
if 'encryptBossId' in self.boss_ids:
continue
self.boss_ids[item['encryptBossId']] = 1
self.hello(securityId=item['securityId'],
jobId=item['encryptJobId'],
lid=item['lid'])
time.sleep(0.8)
print(f'完成:{i}')
self.save_boss_data()
i = i+1
except Exception as e:
print('异常退出,保存数据:', e)
self.save_boss_data()
raise
# 进行中断操作也会走这里 KeyboardInterrupt
finally:
self.save_boss_data()
pass
# 把公司规模大于1000的过滤出来方便以后官网投递
def get_big_brand(self):
position_code = self.get_position_code()
city_code = self.get_city_code()
time.sleep(0.8)
# 把员工规模大于1000的存储在本地
def than_thousand(item):
min = str(item['brandScaleName']).split('-')[0]
min = re.findall(r'\d*', min)[0]
if int(min) >= 1000:
print(item['brandScaleName'], item['brandName'])
return True
return False
# 公司信息保存到本地
def save_brand_data(json_file_path="brand.json", brand_list=[]):
brand = {
str(city_code): brand_list
}
with open(json_file_path, "w", encoding="utf-8") as f:
json.dump(brand, f, ensure_ascii=False)
brand_list = []
i = 1
while (i <= 10):
try:
company_list = self.get_company(
city=city_code, position=position_code, page=str(i))
filter_list = list(filter(than_thousand, company_list))
for item in filter_list:
brand_list.append({
'name': item['brandName'],
'salary': item['brandScaleName']
})
except Exception as e:
save_brand_data(brand_list=brand_list)
print('异常退出,保存数据:', e)
exit(0)
# 进行中断操作也会走这里 KeyboardInterrupt
finally:
save_brand_data(brand_list=brand_list)
print(f'完成:{i}')
i = i+1
time.sleep(2)
# old = self.session.cookies.get('__zp_stoken__')
# self.fresh_cookie_header()
# new = self.session.cookies.get('__zp_stoken__')
# if old == new:
# print('两次一样的@@@@@-------------------@@@@@@@@@@', new)
# else:
# print('两次__zp_stoken__不一样@@@@@@@@@@@@@@@@@@@@@@@@@@@\n', new)
# time.sleep(random.uniform(1, 2))
pass
boss = Boss()
boss.start_hello()