tieba.py
import re
import time
from urllib.parse import urljoin

import mysql.connector
import requests
from bs4 import BeautifulSoup

# MySQL connection used to store the scraped articles
conn = mysql.connector.connect(
    user='root',
    password='123456',
    host='127.0.0.1',
    port=3306,
    database='test'
)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
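
# The INSERT in getASection below targets a table along these lines. This is
# only a sketch inferred from the column names used in the query; the real
# schema (types, lengths, key column) is an assumption:
#
#   CREATE TABLE yq_article (
#       article_id            INT AUTO_INCREMENT PRIMARY KEY,  -- assumed key column
#       article_title         VARCHAR(255),
#       article_site          VARCHAR(64),
#       article_content       TEXT,
#       article_download_time DATETIME
#   );
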
def getASection(url, bookName):
    cursor = conn.cursor()
    try:
        bookName += ".txt"
        # f = open(bookName, "a", encoding='utf-8')  # open in append mode so later chapters do not overwrite earlier ones
        rsp = requests.get(url, headers=headers)
        rsp.encoding = 'utf-8'
        bs = BeautifulSoup(rsp.text, 'html.parser')
        title = bs.select('h1')[0]
        # f.write(title.text)
        # f.write("\n")
        body = bs.select('div.TRS_Editor')[0]
        paragraphs = body.find_all('p')
        content = str(body)
        times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        site = '百度贴吧'
        # parameterized query, so quotes in the article text cannot break the SQL
        query = ("INSERT INTO yq_article(article_title, article_site, article_content, article_download_time) "
                 "VALUES (%s, %s, %s, %s)")
        data = (title.text, site, content, times)
        try:
            cursor.execute(query, data)
            conn.commit()
        except Exception as err:
            print("SQL execution failed:", err)
            conn.rollback()
        cursor.close()
        # f.writelines(content)
        # f.close()
    except IndexError as e:
        print("======", e)
    finally:
        print('finally...')
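
# Usage sketch for getASection: it expects a page with an <h1> title and a
# div.TRS_Editor body (as on the moe.gov.cn pages referenced in the comments at
# the bottom of this file, where the URL below also comes from). The name
# 'article1' is just a placeholder:
# getASection('http://www.moe.gov.cn/s5987/202101/t20210114_509847.html', 'article1')
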
def getBooks(url):
    bookUrls = dict()  # store the book info in a dict: key = book name, value = link
    rsp = requests.get(url, headers=headers)
    rsp.encoding = 'utf-8'
    bs = BeautifulSoup(rsp.text, 'html.parser')
    # try:
    bookList = bs.select('ul #thread_list')[0]
    sorts = bookList.select('a')
    # except Exception as err:
    #     print("======", err)
    for sort in sorts:
        book = sort.findNext('a')
        if book.attrs['href'] is not None:
            urlhref = book.attrs['href']
            href = urljoin(url, urlhref)
            # href = 'http://www.moe.gov.cn' + urlhref[1:]
            # href = href.replace('book', 'list')  # replace 'book' with 'list' in the URL to go straight to the chapter page
            bookName = book.text
            if bookName not in bookUrls:
                bookUrls[bookName] = href
                # print("{}:{}".format(bookName, href))
                print(bookUrls[bookName], '===========', bookName)
    # for bookName in bookUrls.keys():
    #     getASection(bookUrls[bookName], bookName)
def getHtml(url):
    # fetch the page first; BeautifulSoup parses markup, not a URL string
    rsp = requests.get(url, headers=headers)
    rsp.encoding = 'utf-8'
    bs_xml = BeautifulSoup(rsp.text, 'html.parser')
    print(bs_xml.prettify())
    div = bs_xml.findAll('div', {'class': 'nav'})
    return div[0].contents
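
# getHtml is not called anywhere in this script. A usage sketch (assuming the
# target page actually contains a <div class="nav">, otherwise this raises
# IndexError):
# nav_children = getHtml('https://tieba.baidu.com/p/7210039130')
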
# Strip emoji from a string
def filter_emoji(desstr, restr=''):
    # filter emoji: characters outside the BMP, or surrogate pairs on narrow builds
    try:
        co = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return co.sub(restr, desstr)
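
# Usage sketch for filter_emoji; it is defined but never called above. Wiring
# it in before the database INSERT is only a suggestion:
#   content = filter_emoji(content)   # e.g. filter_emoji('nice 😀') returns 'nice '
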
getBooks('https://tieba.baidu.com/p/7210039130')
# 'http://www.moe.gov.cn/s5987/202101/t20210114_509847.html'
# url = "http://www.moe.gov.cn/jyb_xwfb/gzdt_gzdt/"
# uri = "./s5987/202101/t20210105_508744.html"
# print(urljoin(url,uri))
# 'http://www.moe.gov.cn/jyb_xwfb/gzdt_gzdt/s5987/202101/t20210105_508744.html'