# test.py: scrape article pages from www.moe.gov.cn and store them in a local MySQL database.
import time

import mysql.connector
import requests
from bs4 import BeautifulSoup
from requests.compat import urljoin
# Local MySQL connection (credentials are the ones hard-coded in the original script).
conn = mysql.connector.connect(
    user='root',
    password='123456',
    host='127.0.0.1',
    port='3306',
    database='test'
)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
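# NOTE: the INSERT in getASection() below assumes a table roughly like the following.
# This schema is an assumption (it is not part of the original script); adjust the
# column types and lengths to match the real yq_article table.
#
# CREATE TABLE IF NOT EXISTS yq_article (
#     id                    INT AUTO_INCREMENT PRIMARY KEY,
#     article_title         VARCHAR(255),
#     article_site          VARCHAR(64),
#     article_content       LONGTEXT,
#     article_download_time DATETIME
# );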
# Fetch a single article page and insert its title/body into the yq_article table.
def getASection(url, bookName):
    cursor = conn.cursor()
    try:
        bookName += ".txt"
        # f = open(bookName, "a", encoding='utf-8')  # open in append mode so later sections do not overwrite earlier ones
        rsp = requests.get(url, headers=headers)
        rsp.encoding = 'utf-8'
        bs = BeautifulSoup(rsp.text, 'html.parser')
        title = bs.select('h1')[0]
        # f.write(title.text)
        # f.write("\n")
        body = bs.select('div.TRS_Editor')[0]
        paragraphs = body.find_all('p')  # currently unused; the body is stored as raw HTML below
        content = str(body)
        times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        site = '中国政府网'
        data = (title.text, site, content, times)
        # Parameterized query, so quotes in the title or content cannot break the SQL.
        query = ("INSERT INTO yq_article"
                 "(article_title, article_site, article_content, article_download_time) "
                 "VALUES (%s, %s, %s, %s)")
        try:
            cursor.execute(query, data)
            conn.commit()
        except Exception as err:
            print("SQL execution error:", err)
            conn.rollback()
        cursor.close()
        # f.writelines(content)
        # f.close()
    except IndexError as e:
        print("======", e)
    finally:
        print('finally...')
# def getSections(url, bookName):
#     rsp = requests.get(url, headers=headers)
#     rsp.encoding = 'utf-8'
#     bs = BeautifulSoup(rsp.text, 'html.parser')
#     sections = bs.select('div')[0]
#     links = sections.select('a')
#     # del links[0]  # the first <a> tag is the "collapse" link, so drop it
#     for link in links:
#         if link.attrs["href"] is not None:
#             newUrl = urljoin(url, link.attrs['href'])
#             getASection(newUrl, bookName)
# Crawl the listing page, collect article links, then fetch and store each article.
def getBooks(url):
    bookUrls = dict()  # articles are kept in a dict: key = title, value = link URL
    rsp = requests.get(url, headers=headers)
    rsp.encoding = 'utf-8'
    bs = BeautifulSoup(rsp.text, 'html.parser')
    bookList = bs.select('div.moe-list.scy_lbsj-right-nr')[0]
    sorts = bookList.select('a')
    for sort in sorts:
        book = sort.findNext('a')  # take the <a> that follows the current one
        if book is not None and book.get('href') is not None:
            urlhref = book.get('href')
            href = urljoin(url, urlhref)
            # href = 'http://www.moe.gov.cn' + urlhref[1:]
            # href = href.replace('book', 'list')  # replace 'book' with 'list' in the URL to go straight to the section page
            bookName = book.text
            if bookName not in bookUrls:
                bookUrls[bookName] = href
            # print("{}:{}".format(bookName, href))
    for bookName in bookUrls.keys():
        getASection(bookUrls[bookName], bookName)
# Debug helper (not called below): fetch a page, pretty-print it,
# and return the contents of the first <div class="nav">.
def getHtml(url):
    rsp = requests.get(url, headers=headers)
    rsp.encoding = 'utf-8'
    bs_xml = BeautifulSoup(rsp.text, 'html.parser')
    print(bs_xml.prettify())
    div = bs_xml.findAll('div', {'class': 'nav'})
    return div[0].contents if div else None
# Entry point: crawl the MoE news listing and store each article in MySQL.
getBooks('http://www.moe.gov.cn/jyb_xwfb/gzdt_gzdt/')
conn.close()
# 'http://www.moe.gov.cn/s5987/202101/t20210114_509847.html'
# url = "http://www.moe.gov.cn/jyb_xwfb/gzdt_gzdt/"
# uri = "./s5987/202101/t20210105_508744.html"
# print(urljoin(url,uri))
# 'http://www.moe.gov.cn/jyb_xwfb/gzdt_gzdt/s5987/202101/t20210105_508744.html'
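# A quick way to verify that rows were stored (a sketch, assuming the hypothetical
# yq_article schema noted above; run before conn.close() or open a fresh connection):
# check = mysql.connector.connect(user='root', password='123456', host='127.0.0.1', port='3306', database='test')
# cur = check.cursor()
# cur.execute("SELECT article_title, article_download_time FROM yq_article "
#             "ORDER BY article_download_time DESC LIMIT 5")
# for row in cur.fetchall():
#     print(row)
# cur.close()
# check.close()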