-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspider.py
153 lines (111 loc) · 5.08 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: UTF-8 -*-
import time
import requests
import numpy as np
from bs4 import BeautifulSoup
import functools
import re
from urllib.parse import urlparse, parse_qs
import psycopg2
# Pool of User-Agent strings; Spider picks one at random per request so the
# crawler looks less like a single automated client to the target site.
headers=[
{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
{'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
{'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}
]
class Spider(object):
    """Crawler for www.todayonhistory.com "today in history" events.

    For each (month, day) it fetches the event list (page 0 is scraped
    from the HTML landing page, later pages come from the site's JSON
    API), then fetches every event's detail page, and stores each event
    into a local PostgreSQL ``events`` table.
    """

    # Page size used by the site's JSON list API; also the signal that a
    # short page is the last one.
    pageSize = 40

    def __init__(self):
        # NOTE(review): hard-coded local DB credentials — consider moving
        # to environment variables/config before deploying anywhere.
        self.db = psycopg2.connect(database="tih", user="postgres", password="123456", host="127.0.0.1", port="5432")
        print("connect db success.")

    def fetchListData(self, month, day, page=0):
        """Fetch one page of the event list for ``month``/``day``.

        Returns a list of dicts with keys
        ``url``/``title``/``thumb``/``solaryear``/``description``.
        """
        htmlURL = "http://www.todayonhistory.com/" + str(month) + '/' + str(day)
        apiURL = "http://www.todayonhistory.com/index.php?m=content&c=index&a=json_event&page=" + str(page) + "&pagesize=" + str(self.pageSize) + "&month=" + str(month) + "&day=" + str(day)
        res = requests.get(
            htmlURL if page == 0 else apiURL,
            # Rotate user agents to reduce the chance of being blocked.
            headers=headers[np.random.randint(0, len(headers))]
        )
        result = []
        if page == 0:
            # Page 0 only exists as HTML; scrape the <li> event entries.
            soup = BeautifulSoup(res.text, "html5lib").find(id="container").find_all("li")
            for item in soup:
                txtLink = item.select('.text > a, a.txt')
                if not txtLink:
                    # Skip list items that are not event entries.
                    continue
                txtLink = txtLink[0]
                img = item.find('img')
                year = item.select('.time .moh b')
                description = item.select('.text > p')
                result.append({
                    'url': txtLink.get('href'),
                    'title': txtLink.text,
                    # Thumbnails are lazy-loaded; the real URL lives in
                    # the data-original attribute.
                    'thumb': img.get('data-original') if img else '',
                    'solaryear': year[0].text if year else '',
                    'description': description[0].text if description else '',
                })
        else:
            # Later pages are served as JSON with the same field names.
            for item in res.json():
                result.append({
                    'url': item['url'],
                    'title': item['title'],
                    'thumb': item['thumb'],
                    'solaryear': item['solaryear'],
                    'description': item['description'],
                })
        return result

    def fetchAllListData(self, month, day):
        """Fetch every list page for ``month``/``day`` and concatenate them."""
        page = 0
        result = []
        while True:
            _result = self.fetchListData(month, day, page)
            result = result + _result
            # A short (or empty) page means we have reached the last page.
            if len(_result) < self.pageSize:
                break
            page = page + 1
        return result

    def fetchDetailData(self, url):
        """Fetch an event detail page; return its body HTML and site id."""
        res = requests.get(url, headers=headers[np.random.randint(0, len(headers))])
        res.encoding = 'utf-8'
        if res.status_code >= 400:
            # BUG FIX: the original did ``print(err)`` with ``err`` never
            # defined, raising NameError on any HTTP error. Report the
            # failing URL and status code instead.
            print('request failed: %s (status %d)' % (url, res.status_code))
        soup = BeautifulSoup(res.text, "html5lib")
        body = soup.select('.body')
        # The page embeds a script tag whose src carries the site's own
        # numeric id for the event as an ``id`` query parameter.
        idElm = soup.select('script[src^="http://www.todayonhistory.com/api.php"]')
        return {
            "body": body[0].prettify() if body else '',
            'id': parse_qs(urlparse(idElm[0].get('src')).query)['id'][0] if idElm else '',
        }

    def fetchDayAllData(self, month, day):
        """Fetch list + detail for every event on ``month``/``day`` and save each."""
        _list = self.fetchAllListData(month, day)
        print('已获取%d月%d日全部列表数据,共计%d条,开始获取详情数据。' % (month, day, len(_list)))
        for index, item in enumerate(_list):
            # Random 0-3s delay between detail requests to stay polite.
            time.sleep(np.random.rand() * 3)
            print('开始获取第%d条数据:%s页面。' % (index + 1, item['url']))
            _detail = self.fetchDetailData(item['url'])
            item['id'] = _detail['id']
            item['body'] = _detail['body']
            item['month'] = month
            item['day'] = day
            print('已获取第%d条数据:第三方id为%s。' % (index + 1, item['id']))
            self.saveData(item)
        print('已获取%d月%d日全部数据。' % (month, day))
        return _list

    def saveData(self, data):
        """Insert one event row into the ``events`` table and commit."""
        cur = self.db.cursor()
        now = int(time.time())
        try:
            # Parameterized query — values are never interpolated into SQL.
            # NOTE(review): 'www.todayinhistory.com' looks like a typo for
            # 'todayonhistory' but is kept as-is to match existing rows.
            cur.execute("insert into events (title, description, body, month, day, target, target_id, target_detail_url, create_time, update_time, status) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", (data['title'], data['description'], data['body'], data['month'], data['day'], 'www.todayinhistory.com', data['id'], data['url'], now, now, 1))
            self.db.commit()
        finally:
            # BUG FIX: the original never closed the cursor, leaking one
            # per saved row over the crawl's lifetime.
            cur.close()
        print("数据已保存。")

    def getAllData(self):
        """Crawl every day of the year, Jan 1 through Dec 31."""
        for month in range(1, 13):  # BUG FIX: was range(1, 12) — December was never crawled
            maxDay = 30
            if month in (1, 3, 5, 7, 8, 10, 12):
                maxDay = 31
            if month == 2:
                maxDay = 29  # include Feb 29; the site lists leap-day events
            # BUG FIX: was range(1, maxDay) — the last day of every month
            # (28th/30th/31st) was silently skipped.
            for day in range(1, maxDay + 1):
                self.fetchDayAllData(month, day)
        print('已获取全部数据。')
if __name__ == "__main__":
    # Script entry point: crawl the whole year into the local database.
    crawler = Spider()
    crawler.getAllData()