html_parser.py (forked from CoolWell/wechat_spider)
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
import re
import urlparse
import datetime
from bs4 import BeautifulSoup
class HtmlParser(object):

    @staticmethod
    def parse_list_url(response, name):
        """Parse the Sogou WeChat search result page and return the URL of the
        first matching official account's profile (history) page."""
        if response is None:
            return
        soup = BeautifulSoup(response, 'html.parser', from_encoding='utf-8')
        if soup.find(id="noresult_part1_container"):
            # No account found for this name: record it and give up.
            with open(r'no_wechat.txt', 'a') as f:
                f.write(name.encode('utf-8'))
                f.write('\n')
            return
        url = soup.find(id='sogou_vr_11002301_box_0').get('href')
        return url
    def parse_list(self, page_url, html_cont):
        """Parse the account's history page and collect URLs of articles
        pushed yesterday."""
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        push_date = soup.find_all('div', class_="weui_msg_card_hd", limit=10)
        # Date comparison: only keep cards pushed yesterday.
        oneday = datetime.timedelta(days=1)
        today = str(datetime.date.today() - oneday)  # yesterday's date in 'YYYY-MM-DD' form
        for a in push_date:
            push_date1 = str(datetime.datetime.strptime(a.get_text().encode('utf-8'), "%Y年%m月%d日"))[:10]
            if today == push_date1:
                new_urls = self._get_new_urls(page_url, soup, today)
                # print(new_urls)
                return new_urls
        return
        # new_urls = self._get_new_urls(page_url, soup)
        # new_data = self._get_new_data(page_url, soup)
        # return new_urls, new_data
    def _get_new_urls(self, page_url, soup, today):
        # new_urls = set()
        # links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        # for link in links:
        #     new_url = link['href']
        #     new_full_url = urlparse.urljoin(page_url, new_url)
        #     new_urls.add(new_full_url)
        # article = soup.find('div', class_='weui_msg_card_bd')
        # article1 = article.find_all('h4', class_='weui_media_title')
        # links = [link.get('hrefs') for link in article1]
        links = []
        article1 = soup.find_all('h4', class_='weui_media_title')  # all article titles
        push_data = soup.find_all('p', class_="weui_media_extra_info")  # article dates
        for date, article in zip(push_data, article1):
            data1 = str(datetime.datetime.strptime(date.get_text().encode('utf-8'), "%Y年%m月%d日"))[:10]
            if data1 == today:
                links.append(article.get('hrefs'))
            if data1 < today:
                break
        # print(links)
        new_urls = []
        for link in links:
            if link:  # skip missing or empty 'hrefs' attributes
                full_url = urlparse.urljoin(page_url, link)
                new_urls.append(full_url)
        return new_urls
    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
    def _replace_html(self, s):
        """Replace HTML escape sequences such as &quot; with their normal characters.

        Args:
            s: text content
        Returns:
            s: the unescaped text
        """
        s = s.replace('&#39;', '\'')
        s = s.replace('&quot;', '"')
        s = s.replace('&amp;', '&')
        s = s.replace('&gt;', '>')
        s = s.replace('&lt;', '<')
        s = s.replace('&yen;', '¥')
        s = s.replace('amp;', '')
        s = s.replace('&lt;', '<')
        s = s.replace('&gt;', '>')
        s = s.replace('&nbsp;', ' ')
        s = s.replace('\\', '')
        return s
    def parse_article(self, html):
        """Parse a single article page and extract its title, account name,
        date, body text, read/like counts, and comments."""
        # <div class="rich_media_content " id="js_content">
        if html is None:
            return
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        # today = str(datetime.date.today())
        # post_date = soup.find('em', id='post-date')
        # if post_date == today:
        # get_text() returns unicode
        try:
            title = soup.find('h2', class_='rich_media_title').get_text().strip(' \n').encode('utf-8')
            wname = soup.find('a', id='post-user').get_text().encode('utf-8')
            date = soup.find('em', id='post-date').get_text().encode('utf-8')
            content = soup.find('div', class_='rich_media_content ').get_text().strip('\n').encode('utf-8')  # article body
            readNum = soup.find('span', id='sg_readNum3').get_text().encode('utf-8')
            praise_num = soup.find('span', id='sg_likeNum3').get_text().encode('utf-8')
            discuss_list = soup.find_all('li', class_='discuss_item')
            discuss_content = [a.find('div', class_='discuss_message_content').get_text().strip().encode('utf-8') for a in discuss_list]
            discuss_praise = [a.find('span', class_='praise_num').get_text().encode('utf-8') for a in discuss_list]
        except Exception:
            return None
        return title, wname, date, content, readNum, praise_num, discuss_content, discuss_praise
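

# Minimal usage sketch (not part of the original module). It assumes a
# hypothetical `html_downloader.HtmlDownloader` class with a `download(url)`
# method exists elsewhere in this project; the downloader name, the query URL,
# and the overall flow below are illustrative only, showing how the parser
# stages might chain together.
if __name__ == '__main__':
    from html_downloader import HtmlDownloader  # hypothetical downloader module

    downloader = HtmlDownloader()
    parser = HtmlParser()

    # 1. Search Sogou for the account and get its profile (history) page URL.
    search_page = downloader.download('http://weixin.sogou.com/weixin?type=1&query=python')
    list_url = HtmlParser.parse_list_url(search_page, u'python')

    if list_url:
        # 2. Parse the history page for yesterday's article URLs.
        article_urls = parser.parse_list(list_url, downloader.download(list_url)) or []

        # 3. Parse each article page into its fields.
        for url in article_urls:
            result = parser.parse_article(downloader.download(url))
            if result:
                title, wname, date, content, read_num, praise_num, comments, comment_likes = result
                print(title)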