QzoneCrawler.py
#-*- coding: UTF-8 -*-
import time
import os
import re
import gzip
import json
from socket import *
# httpx module from -> https://github.com/thisforeda/socket-http
import httpx
#-------------------------------------------------------------------------------------#
# (C) 2015 by ZhangYiDa <http://www.loogi.cn> #
# All Rights Reserved. #
#-------------------------------------------------------------------------------------#
MINIMUM = 255
STR_JSONDEC_ERROR = "JSON decode failed"
STR_DECODE_ERROR = "Unicode decode failed"
STR_ENCODE_ERROR = "Unicode encode failed"
CRLF = b'\r\n\r\n'
class Crawler:
def __init__(self):
self.basedir = ''
self.__log_file = self.__new_file_name('crawler-')
self.cookie = ''
        # {uin: name} mapping filled in by __parse_callback_json()
        self.dicts = {}
        self.main_page_cgi_url = 'http://user.qzone.qq.com/cgi-bin/main_page_cgi?uin=%s&param=15|16&g_tk=%s'
self.feeds_html_act_url = 'http://ic2.qzone.qq.com/cgi-bin/feeds/feeds_html_act_all?hostuin=%s&start=%s&count=%s&g_tk=%s'
self.emotion_cgi_msglist_url = 'http://taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=%s&pos=%s&num=%s&g_tk=%s'
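        # Rough roles of the three endpoints, inferred from the code below rather than
        # from any official documentation: main_page_cgi returns per-module counters
        # (including "SS", the shuoshuo count), feeds_html_act_all returns feed HTML
        # pages, and emotion_cgi_msglist_v6 returns the _Callback(...) JSON that holds
        # the shuoshuo list itself.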
    def __write(self,text):
        '''
        A simple logging helper: append a timestamped line to the log file '''
        with open(self.__log_file,'a+') as hfile:
            hfile.write( time.asctime() + ':' + text + os.linesep )
    def calc_GTK(self,cookie):
        if type(cookie) is not str:
            if type(cookie) is not bytes:
                return None
            cookie = self.__safedec(cookie)
        '''
        In the JS g_tk constructor, p_skey seems to be preferred for computing g_tk,
        and skey is used when p_skey is absent; my reading of the JS may well be off.
        The code below looks up that key's value in the cookie.
        '''
        match = re.search('(?i)p_skey=(.*?);',cookie)
        if match is None:
            match = re.search('(?i)skey=(.*?);',cookie)
        if match is None:
            match = re.search('(?i)rv2=(.*?);',cookie)
        if match is None:
            return None
        '''Convert the value to bytes and hash it with getACSRFToken() to get the g_tk integer'''
        try :
            skey = match.groups()[0].encode('utf-8')
            return str(self.getACSRFToken(skey))
        except Exception as err:
            self.__write(str(err))
            return None
def getACSRFToken(self,skey):
var_hash = 5381
for i in range(0,len(skey)):
var_hash += (var_hash<<5) + skey[i]
return var_hash & 2147483647
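    # Minimal usage sketch (the cookie value is made up, not a real skey):
    #   crawler = Crawler()
    #   crawler.calc_GTK('uin=o123456; p_skey=AbCdEf;')
    # calc_GTK() extracts "AbCdEf", passes it to getACSRFToken() as bytes, and returns
    # the resulting 31-bit DJB2-style hash as a string, which is what the g_tk query
    # parameter in the CGI URLs above expects.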
def __new_file_name(self,prefix):
ctime = time.localtime()
stime = str(ctime.tm_year) + '_' \
+ str(ctime.tm_mon) + '_' \
+ str(ctime.tm_mday) + '_' \
+ str(ctime.tm_hour) + '_' \
+ str(ctime.tm_min) + '_' \
+ str(ctime.tm_sec) + '.EDA'
if type(prefix) is not str:
if type(prefix) is not int:
return None
prefix = str(prefix)
return self.basedir + prefix + '@' + stime
def __fmtqum(self,qnum):
if type(qnum) is not str:
if type(qnum) is not int:
return None
return str(qnum)
    def how_many_shuoshuo(self,qnum):
        cookie = self.cookie
        if type(cookie) is not str and cookie:
            cookie = str(cookie)
        g_tk = self.calc_GTK(cookie)
        if g_tk is not None:
            qnum = self.__fmtqum(qnum)
            '''The URL that reports the shuoshuo count is not the same as the one that returns the shuoshuo JSON'''
            ret = httpx.urlopen(
                self.main_page_cgi_url%(qnum,g_tk),
                custom_header={'Cookie':cookie}
            )
            if not hasattr(ret,'data'):
                return -2
            '''Return the shuoshuo count on success, -1 if access is denied, -2 if the request itself failed'''
            match = re.search(r'"SS":(\d+)',self.__safedec(ret.data))
            if match is not None:
                return int(match.groups()[0])
            return -1
        return -2
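    # Usage sketch (the QQ number is illustrative, not a real account):
    #   count = crawler.how_many_shuoshuo(123456789)
    #   count >= 0 : the profile is readable and has `count` shuoshuo entries
    #   count == -1: the cookie has no permission to read this profile
    #   count == -2: the request failed or g_tk could not be computed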
    def get_shuoshuo_by_feeds_html_act(self,qnum):
        '''Offset to start from'''
        startpos = 0
        '''Number of feeds to request each time'''
        distance = 10
        cookie = self.cookie
        if type(cookie) is not str and cookie:
            cookie = str(cookie)
        g_tk = self.calc_GTK(cookie)
        if g_tk is not None:
            '''Name of the output file'''
            outname = self.__new_file_name(qnum)
            if outname is not None:
                while True:
                    ''' Feeds start at offset 0 and each request returns at least 6 entries,
                    never fewer than asked for.
                    start=0;cnt=6
                    '''
                    #print("startpos=%d distance=%d"%(startpos,distance))
                    ret = httpx.urlopen(
                        self.feeds_html_act_url%(qnum,startpos,distance,g_tk),
                        custom_header = {'Cookie':cookie}
                    )
                    if hasattr(ret,'data'):
                        with open(outname,'ab') as streamout:
                            streamout.write(ret.data + CRLF*2)
                        '''If this marker appears in the JSON there are no more feeds to fetch'''
                        if re.search('(?i)hasMoreFeeds_0:false',self.__safedec(ret.data)):
                            break
                        startpos += distance
                    else:
                        return False
                return True
        return False
    def __parse_callback_json(self,data):
        '''Strip the surrounding _Callback( ... ); wrapper from the data'''
        data = data[10:-2]
        if type(data) is bytes:
            data = self.__safedec(data)
        '''
        Walk the JSON and collect everyone who commented on the shuoshuo, saving them
        into self.dicts in {QQ: NAME} form. This is not exhaustive; it only covers the
        JSON relationships I found while looking at three users' shuoshuo.
        '''
        try :
            Jsondata = json.loads(data)
        except Exception as err:
            self.__write(STR_JSONDEC_ERROR)
            return False
        if Jsondata.get('msglist'):
            for msg in Jsondata['msglist']:
                if msg.get('rtlist'):
                    for rt in msg['rtlist']:
                        if rt.get('name'):
                            self.dicts.update({rt['uin']:rt['name']})
                        if rt.get('conlist'):
                            for at in rt['conlist']:
                                if at.get('nick'):
                                    self.dicts.update({at['uin']:at['nick']})
                elif msg.get('commentlist'):
                    for comment in msg['commentlist']:
                        if comment.get('name'):
                            self.dicts.update({comment['uin']:comment['name']})
                        if comment.get('list_3'):
                            for li in comment['list_3']:
                                if li.get('name'):
                                    self.dicts.update({li['uin']:li['name']})
                else:
                    continue
                #self.__write(str(msg.keys()))
            return True
        return False
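    # Sketch of the mapping this builds (the uin values and names are invented examples):
    #   self.dicts == {10001: 'Alice', 10002: 'Bob'}
    # i.e. every forwarder or commenter whose 'name' / 'nick' shows up in the msglist
    # JSON, keyed by QQ number.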
    def get_shuoshuo_emotion_cgi_msglist(self,qnum):
        '''Offset to start from'''
        startpos = 0
        '''Number of entries to request each time'''
        distance = 10
        cookie = self.cookie
        if type(cookie) is not str and cookie:
            cookie = str(cookie)
        g_tk = self.calc_GTK(cookie)
        if g_tk is not None:
            '''Name of the output file'''
            outname = self.__new_file_name(qnum)
            isFirstRun = True
            if outname is not None:
                while True:
                    #print("startpos=%d distance=%d"%(startpos,distance))
                    ret = httpx.urlopen(
                        self.emotion_cgi_msglist_url%(qnum,startpos,distance,g_tk),
                        custom_header = {'Cookie':cookie}
                    )
                    if hasattr(ret,'data'):
                        '''------------ response size check -----------'''
                        if len(ret.data) < MINIMUM:
                            self.__write(self.__safedec(ret.data))
                            print("[WARN] response is smaller than the minimum size, check the log first")
                            while True:
                                cmd = input("command:")
                                if cmd == "continue":
                                    break
                                elif cmd == "exit":
                                    return False
                                elif cmd == "cookie":
                                    self.cookie = input("Cookie:")
                                    # refresh the local cookie and g_tk so later requests use the new value
                                    cookie = self.cookie
                                    g_tk = self.calc_GTK(cookie)
                                    break
                        '''----------------------------------'''
                        if isFirstRun:
                            match = re.search(r'(?i)"msgnum":(\d+)',self.__safedec(ret.data))
                            if match is not None:
                                shuoshuo_cnt = int(match.groups()[0])
                                if shuoshuo_cnt == 0:
                                    return False
                                isFirstRun = False
                                print("[INFO] user %s has %d shuoshuo, now crawling."%(qnum,shuoshuo_cnt))
                            else: return False
                        with open(outname,'ab') as streamout:
                            streamout.write(ret.data + CRLF*2)
                        self.__parse_callback_json(ret.data)
                        startpos += distance
                        if startpos >= shuoshuo_cnt:
                            break
                    else:
                        return False
                return True
        return False
    def __safedec(self,what):
        try :
            return what.decode('utf-8')
        except UnicodeDecodeError:
            try :
                return what.decode('gbk')
            except UnicodeDecodeError:
                self.__write(STR_DECODE_ERROR)
                return ''
def __safeenc(self,what):
try :
return what.encode('utf-8')
except UnicodeEncodeError:
try :
return what.encode('gbk')
except UnicodeEncodeError:
self.__write(STR_ENCODE_ERROR)
return b''
def main_thread():
    crawler = Crawler()
    crawler.basedir = os.path.join(os.getcwd(),'crawler') + os.sep
    # make sure the output directory exists before anything is written into it
    os.makedirs(crawler.basedir,exist_ok=True)
    crawler.cookie = 'YOUR COOKIE HERE'
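    # friends.dat is expected to hold one entry per line in the form "QQ#anything"
    # (e.g. "123456789#remark", an illustrative example); only the numeric part
    # before the first '#' is used below.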
with open('friends.dat','rb') as hfriend:
for line in hfriend:
            if not line.strip():
continue
try :
QQ = int(line.decode('utf-8').split('#')[0])
except Exception as err:
continue
            startT = time.perf_counter()
crawler.dicts = {}
ret = crawler.get_shuoshuo_emotion_cgi_msglist(QQ)
if ret == False:
print("[WARN] failed craw %s, access denied or no shuoshuo."%QQ)
continue
print("[TIME] time used= %d"%(time.clock() - startT))
with open(crawler.basedir + str(QQ) + '.dat','wb') as hf:
for (key,value) in crawler.dicts.items():
key = str(key).encode('utf-8')
value = value.encode('utf-8')
data = key + b' # ' + value + b'\r\n'
hf.write(data)
if __name__ == '__main__':
    main_thread()