-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 460fc37
Showing
18 changed files
with
642 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import json | ||
from status_raise import StatusError | ||
|
||
def _safe_file_name(name):
    """Return *name* with characters that are awkward in file names replaced by '_'."""
    return name.translate(str.maketrans(dict.fromkeys(' \\/?*:"<>|\'', '_')))


def add_into_dir(base_url, url, name, show_content):
    """Register a saved page in ``dir.json`` and ``map.json`` under *base_url*.

    A new entry is appended to the ``dir`` list of ``dir.json`` and the entry's
    file name is mapped to its index in the ``map`` object of ``map.json``.

    Args:
        base_url: Directory prefix (string, concatenated as-is) holding both
            JSON files.
        url: Source URL; its last ``/``-separated segment is appended to the
            sanitised name to build the file name.
        name: Display name of the page.
        show_content: Preview text; ``'......'`` is appended before storing.

    Raises:
        StatusError: code ``-3`` wrapping any exception raised while reading,
            building or writing the two files.
    """
    dir_path = base_url + 'dir.json'
    map_path = base_url + 'map.json'
    try:
        # Read and update both documents in memory first. Opening a file in
        # 'w' mode truncates it immediately, so nothing is opened for writing
        # until every step that can fail on bad input has succeeded.
        with open(dir_path, 'r', encoding="utf-8") as f:
            listing = json.load(f)
        with open(map_path, 'r', encoding="utf-8") as f:
            mapping = json.load(f)

        new_file = {
            'index': len(listing['dir']),
            'file_name': _safe_file_name(name) + url.split('/')[-1],
            'show_name': name,
            'show_content': show_content + '......',
        }
        listing['dir'].append(new_file)
        mapping['map'][new_file['file_name']] = new_file['index']

        # Serialise before opening so a serialisation error cannot leave a
        # half-written file behind.
        listing_text = json.dumps(listing, ensure_ascii=False)
        mapping_text = json.dumps(mapping, ensure_ascii=False)

        with open(dir_path, 'w', encoding="utf-8") as f:
            f.write(listing_text)
        with open(map_path, 'w', encoding="utf-8") as f:
            f.write(mapping_text)
    except Exception as e:
        raise StatusError(-3, '文件下载失败', e) from e
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import requests | ||
import re | ||
from bs4 import BeautifulSoup as bf | ||
from initial_data import initial_data | ||
from status_raise import StatusError, Status | ||
|
||
# Fetches the question title and the answer HTML for one answer page and
# reshapes the answer markup into a standalone fragment.
def get_answer_info(url,content):
    """Download an answer page and build the HTML fragment to be saved.

    Args:
        url: Address of the answer page; its last ``/`` segment is appended to
            the sanitised question title to form the per-answer folder name.
        content: Object handed to ``initial_data.get_url_list`` to pick the
            base path / fetch prefix.

    Returns:
        A 5-tuple ``(status, answer_html, question_name, base_url, get_way)``.
        When the page contains a ``div.ErrorPage`` the status has code 3 and
        the other four items are empty strings.

    Raises:
        StatusError: code 0 when the page title is the security-check title.
    """
    base_url, get_way, push_way = initial_data.get_url_list(content)
    response = requests.request("GET", url)
    page = bf(response.text, 'html.parser')

    if page.find_all('title', text="安全验证 - 知乎"):
        raise StatusError(0, '触发反爬了', '触发反爬了')
    if page.find_all('div', class_='ErrorPage'):
        return Status(3, '知识荒原了', '知识荒原了'), '', '', '', ''

    question_info_src = page.find('div', class_='QuestionPage')
    #question_id = question_info_src.find('meta', itemprop='url').get('content').split('/')[-1]
    question_name = question_info_src.find('meta', itemprop='name').get('content')

    # Folder name: title with file-system-hostile characters replaced by '_',
    # followed by the last URL segment and a trailing slash.
    safe_title = question_name.translate(str.maketrans(dict.fromkeys(' \\/?*:"<>|\'', '_')))
    folder = safe_title + url.split('/')[-1] + '/'
    base_url = base_url + folder
    get_way = get_way + folder

    answer = str(question_info_src.find('div', class_="RichContent--unescapable"))
    # Cut everything from the action bar to the end of that line and close the div.
    answer = re.sub(
        r'(<div class="ContentItem-actions RichContent-actions">.*$)',
        '</div>',
        answer,
        flags=re.MULTILINE,
    )

    wrapper = bf('<div class="CollectionItem"><div>', 'html.parser')
    staging = wrapper.new_tag('')

    heading = wrapper.new_tag('h1')
    heading.string = question_name
    staging.append(heading)
    staging.append(bf(answer, 'html.parser').select('span[itemprop]')[0])

    # Parsed from a fresh tree on purpose: the append above moved a node out
    # of the previous parse.
    time_block = bf(answer, 'html.parser').select('.ContentItem-time')[0]
    time_block.find_all('a')[0]['href'] = url

    staging.append(wrapper.new_tag('link', rel="stylesheet", href="../style.css"))
    wrapper.div.contents = staging.contents
    wrapper.div.append(time_block)
    answer = str(wrapper)

    return Status(0, '一切正常', '一切正常'),answer, question_name, base_url, get_way
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from initial_data import initial_data | ||
from query_all_needs import return_needs | ||
from status_raise import StatusError | ||
|
||
|
||
class Classifier(object):
    """Carries the labels, routing paths and page content of one item.

    Class-level defaults mark an unclassified, non-private object; ``__init__``
    overrides them on the instance.
    """

    is_classfied = False
    is_pri = False

    def __init__(self, labels):
        """Classify *labels* and look up the matching path triple.

        Raises:
            StatusError: code 404 when ``initial_data.is_initial`` is falsy.
        """
        if not initial_data.is_initial:
            raise StatusError(404, '传递未进行必需的前序操作的信息','')
        self.labels = labels
        self.is_pri = classify(self.labels)
        self.is_classfied = True
        # get_url_list reads self.is_pri, so it has to run after classification.
        url_triple = initial_data.get_url_list(self)
        self.base_url, self.get_way, self.push_way = url_triple

    def set_content(self, content, base_url, get_way):
        """Store the content and its paths, then record the media needs.

        ``return_needs`` is called with the stored content and its two results
        are kept as ``img_need`` and ``video_need``.
        """
        self.base_url = base_url
        self.get_way = get_way
        self.content = content
        needs = return_needs(self.content)
        self.img_need, self.video_need = needs
|
||
|
||
|
||
|
||
def classify(labels):
    """Return True when the marker '私人' is contained in *labels*, else False."""
    return '私人' in labels
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
from pathlib import Path | ||
from status_raise import StatusError,Status | ||
|
||
def create_dir(img_need, comment_need, video_need, base_url, stop = False):
    """Create the folder for one saved page plus the requested sub-folders.

    Args:
        img_need: When truthy, create ``img`` under *base_url*.
        comment_need: When truthy, create ``text``, ``text/comment`` and
            ``text/child_comment`` under *base_url*.
        video_need: When truthy, create ``video`` under *base_url*.
        base_url: Folder path as a string ending in ``/`` (sub-folder names
            are concatenated directly onto it).
        stop: Selects how an already-existing folder is reported.

    Returns:
        ``Status`` with code 0 on success, or code 2 when a folder already
        exists and *stop* is truthy.

    Raises:
        StatusError: code -2 when a folder already exists and *stop* is falsy.
    """
    # Creation order matters: parents before children.
    wanted = [Path(base_url)]
    if img_need:
        wanted.append(Path(base_url + 'img'))
    if video_need:
        wanted.append(Path(base_url + 'video'))
    if comment_need:
        wanted.append(Path(base_url + 'text'))
        wanted.append(Path(base_url + 'text/' + 'comment'))
        wanted.append(Path(base_url + 'text/' + 'child_comment'))

    try:
        for folder in wanted:
            folder.mkdir()
        return Status(0, '目录创建成功', '目录创建成功')
    except FileExistsError as e:
        # Hand over the caught exception instance (it carries the offending
        # path) rather than the bare exception class.
        if stop:
            return Status(2, base_url.split('/')[-2] + '已存在', e)
        raise StatusError(-2, '目录创建失败,该目录已存在', e) from e
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import os | ||
from status_raise import StatusError | ||
from initial_data import initial_data | ||
|
||
def git(content):
    """Commit everything under the configured base folder and run the push command.

    The path triple comes from ``initial_data.get_url_list(content)``. When the
    push command is ``None`` a notice is printed and nothing else happens.
    The process working directory is changed to the base folder.
    """
    base_url, get_way, push_way = initial_data.get_url_list(content)
    if push_way is None:
        print('未配置git命令')
        return

    print('正在上传到github...')
    os.chdir(base_url)
    # NOTE(review): exit codes are ignored, and TLS verification is switched
    # off for this repository before pushing.
    commands = (
        'git init',
        'git add -A',
        'git commit -m "update"',
        'git config http.sslVerify "false"',
        push_way,
    )
    for command in commands:
        os.system(command)
    print('上传成功')
|
||
def trans_pull(local_way, pull_way):
    """Change into *local_way* and run the shell command *pull_way* there.

    Raises:
        StatusError: code 1 when *local_way* is ``None``.
    """
    if local_way is None:
        raise StatusError(1, '未配置git命令', '配置文件错误')

    print('正在从远程仓库拉取...')
    print('cd ' + local_way)
    os.chdir(local_way)
    # NOTE(review): TLS verification is disabled for this repository.
    os.system('git config http.sslVerify "false"')
    print(pull_way)
    os.system(pull_way)
    print('拉取成功')
|
||
def trans_push(local_way, push_way):
    """Change into *local_way*, commit all changes and run *push_way*.

    Raises:
        StatusError: code 1 when *local_way* is ``None``.
    """
    if local_way is None:
        raise StatusError(1, '未配置git命令', '配置文件错误')

    print('正在往远程仓库推送...')
    os.chdir(local_way)
    for command in ('git add -A', 'git commit -m "already"', push_way):
        os.system(command)
    print('推送成功')
|
||
def proxy_start():
    """Launch the fastgithub executable through the shell ``start`` command."""
    # NOTE(review): the executable path is hard-coded to one user's desktop.
    launch_command = "start C:/Users/1/Desktop/工具/fastgithub_win-x64/fastgithub.exe"
    os.system(launch_command)
|
||
def proxy_stop():
    """Run ``taskkill`` on fastgithub.exe via ``runas`` as the administrator user."""
    kill_command = 'runas /user:administrator "taskkill /f /t /im fastgithub.exe"'
    os.system(kill_command)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
from pathlib import Path | ||
from PIL import Image | ||
from bs4 import BeautifulSoup as bf | ||
|
||
from status_raise import StatusError,Status | ||
|
||
from initial_data import initial_data | ||
from answer import get_answer_info | ||
from p import get_p_info | ||
from dir import create_dir | ||
from classify import Classifier | ||
from img import get_set_img | ||
from video import get_set_video | ||
from save_html import save_html | ||
from get_text import get_all_text | ||
from add_into_dir import add_into_dir | ||
|
||
def execute(p_url, answer_url, label,comment_need = False, stop = False):
    """Fetch one page, download its media, and save it locally.

    Args:
        p_url: Article URL; when it is the empty string *answer_url* is used.
        answer_url: Answer URL, used only when *p_url* is ``''``.
        label: Labels handed to ``Classifier``; ``None`` is treated as no
            labels.
        comment_need: Forwarded to ``create_dir`` to request the comment
            folders.
        stop: When truthy, a missing page (status code 3) or an existing
            folder (status code 2) is returned as a status instead of being
            raised.

    Returns:
        ``(status, file_url, content)``. ``file_url`` is ``'file://'`` plus the
        value returned by ``save_html`` on success, and ``''`` on the early
        returns.

    Raises:
        StatusError: code 3 when the fetch status has code 3 and *stop* is
            falsy. Errors from the helpers propagate unchanged.
    """
    # The caller's labels are passed through. They were previously overwritten
    # with an empty list, so every item was classified as non-private.
    labels = [] if label is None else label
    content = Classifier(labels)

    if p_url == '':
        url = answer_url
        status, got_content, name, base_url, get_way = get_answer_info(answer_url,content)
    else:
        url = p_url
        status ,got_content, name, base_url, get_way = get_p_info(p_url,content)

    if status.code == 3:
        if stop:
            return status, '',content
        raise StatusError(3, '知识荒原了', '知识荒原了')

    content.set_content(got_content, base_url, get_way)
    img_need, video_need = content.img_need, content.video_need

    dir_status = create_dir(img_need,comment_need,video_need,base_url, stop)
    if dir_status.code == 2:
        return dir_status, '',content

    if img_need:
        content.content = get_set_img(content).content
    if video_need:
        content.content = get_set_video(content).content

    show_content = get_all_text(content,base_url)
    # The index files live in the root base folder, not the per-page folder.
    add_into_dir(initial_data.get_url_list(content)[0],url,name,show_content)
    file = 'file://' + save_html(base_url, content)
    return Status(0, '一切正常', '一切正常'),file,content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import requests | ||
|
||
# The two functions below fetch the other answers listed under a question.
def get_answers_respones(question_id):
    """Request the first feed page of a question and return the decoded JSON.

    Side effect: the module globals ``next_page_url`` and ``is_end`` are set
    from the response's ``paging`` object.
    """
    global next_page_url
    global is_end

    url = ''.join((
        'https://www.zhihu.com/api/v4/questions/',
        question_id,
        '/feeds?include=%2Ccontent&limit=40',
    ))
    response = requests.request("GET", url).json()
    paging = response['paging']
    next_page_url = paging['next']
    is_end = paging['is_end']
    return response
|
||
def get_answers_content(frist_response ,answer_id):
    """Find the content of the answer with id *answer_id*, following pages.

    Args:
        frist_response: Decoded JSON of the first feed page, with a ``data``
            list of items shaped ``{'target': {'id': ..., 'content': ...}}``
            and a ``paging`` object holding ``next`` and ``is_end``.
        answer_id: Answer id as a string; item ids are compared via ``str()``.

    Returns:
        The matching item's ``content``, or ``None`` when the last page has
        been searched without a match.

    Side effect: the module globals ``next_page_url`` and ``is_end`` are
    updated from each page that did not contain the answer.
    """
    global next_page_url
    global is_end

    response = frist_response
    while True:
        for item in response['data']:
            if str(item['target']['id']) == answer_id:
                return item['target']['content']

        # Advance paging once per page, whether or not the page had items.
        # Doing this only per item left the state stale on an empty page and
        # re-requested the same URL forever.
        next_page_url = response['paging']['next']
        is_end = response['paging']['is_end']
        if is_end:
            return None
        response = requests.request("GET", next_page_url).json()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from bs4 import BeautifulSoup as bf | ||
from status_raise import StatusError | ||
|
||
def get_all_text(res, base_url):
    """Write the plain text of the page body to ``text.txt`` and return a preview.

    The body is the first element of ``res.content`` with class
    ``css-1g0fqss`` and attribute ``options="[object Object]"``.

    Returns:
        The first 20 characters of the extracted text.

    Raises:
        StatusError: code -2 wrapping any error raised while extracting or
            writing the text.
    """
    soup = bf(res.content, 'html.parser')
    body = soup.find_all(class_='css-1g0fqss', options="[object Object]")[0]
    preview = ''
    try:
        with open(base_url + "text.txt", 'w', encoding='utf-8') as out:
            plain = body.get_text()
            preview = plain[:20]
            out.write(plain)
    except Exception as e:
        raise StatusError(-2, '纯文本写入失败', e)

    return preview
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import requests | ||
import re | ||
from bs4 import BeautifulSoup as bf | ||
from status_raise import StatusError, Status | ||
|
||
class get_set_img(object):
    """Download the images of a page and swap each ``<figure>`` for an ``<img>``.

    Working state is kept on the class, not the instance. After construction
    the rewritten HTML is available as ``self.content``.
    """

    is_initial = False
    img_list = []        # queue of replacement <img> tags, in document order
    img_name_index = 0   # running number used as the next image file name
    base_url = ''
    get_way = ''

    def __init__(self, content_classed):
        """Process ``content_classed.content``.

        Raises:
            StatusError: code 404 when ``content_classed.is_classfied`` is
                falsy; code -3 when an image cannot be downloaded or written.
        """
        if not content_classed.is_classfied:
            raise StatusError(404, '传递了未进行前序必要操作的信息', '程序bug')
        get_set_img.base_url, get_set_img.get_way = content_classed.base_url, content_classed.get_way
        get_set_img.is_initial = True
        # The queue lives on the class, so start every document with an empty
        # one. Otherwise leftovers from an earlier document would be
        # substituted into this one.
        get_set_img.img_list = []
        get_set_img.set_img_list(content_classed.content)
        self.content = get_set_img.change_img_attr(content_classed.content)

    # Build the queue of replacement <img> tags.
    @staticmethod
    def set_img_list(res):
        """Download the image of every ``<figure>`` in *res* and queue its new tag."""
        for figure in bf(res, 'html.parser').find_all('figure'):
            img = figure.find('img')
            img_src = get_set_img.get_img_url(img['src'])
            img_height = img['data-rawheight']
            img_width = img['data-rawwidth']
            img_aspect_ratio = int(img_width) / int(img_height)
            # The local file is the primary source; onerror falls back to the
            # same relative path under the remote prefix.
            img_att = '<img loading="lazy" onerror="this.src=\'' + get_set_img.get_way + img_src + '\';this.onerror=null;" src="' + './' + img_src + '" aspect-ratio ="' + str(img_aspect_ratio) + '">'
            get_set_img.img_list.append(img_att)

    # Download one image and return its path relative to the page folder.
    @staticmethod
    def get_img_url(img_url):
        """Save *img_url* as ``img/<index>.jpg`` under ``base_url``.

        Returns:
            The relative path ``'img/<index>.jpg'`` of the stored file.

        Raises:
            StatusError: code -3 wrapping any download or write error.
        """
        # Request the 1440w rendition where the URL names the 720w one.
        img_url = img_url.replace('720w.jpg?','1440w.jpg?')

        relative_path = 'img/' + str(get_set_img.img_name_index) + '.jpg'
        try:
            with open(get_set_img.base_url + relative_path, 'wb') as f:
                f.write(requests.get(img_url).content)
        except Exception as e:
            raise StatusError(-3, '文件下载失败', e) from e
        get_set_img.img_name_index += 1
        return relative_path

    # Replace each lazy-loading <figure> with its queued <img>.
    @staticmethod
    def change_img_attr(content):
        """Return *content* with each ``<figure>...</figure>`` replaced in order.

        A figure is left unchanged when the queue has run out.
        """
        def replace_img(m):
            if not get_set_img.img_list:
                return m.group(0)
            return get_set_img.img_list.pop(0)

        # DOTALL lets '.' cross newlines so figures spanning several lines
        # match. MULTILINE only changes '^' and '$', which this pattern does
        # not use.
        return re.sub(r"(<figure[^>]*>.*?<\/figure>)", replace_img, content, flags=re.DOTALL)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import json | ||
from status_raise import StatusError, Status | ||
|
||
class initial_data(object):
    """Holds the path and command settings loaded from ``./data.json``.

    Everything is stored on the class. ``__pub`` and ``__pri`` are six-item
    lists in the order base_url, get_way, push_way, trans_local_way,
    trans_pull_way, trans_push_way. The functions are static so they work
    both through the class and through an instance.
    """

    is_initial = False
    __pri = []
    __pub = []

    @staticmethod
    def initial():
        """Load ``./data.json`` into the class.

        Returns:
            ``Status`` code 0 when both the ``pub_*`` and ``pri_*`` settings
            were read; code 1 when loading failed after the ``pub_*`` settings
            were stored but the ``pri_*`` ones were not.

        Raises:
            StatusError: code 1 when loading failed and no ``pub_*`` settings
                are stored; code -1 when loading failed although both lists
                are already populated.
        """
        try:
            with open('./data.json') as f:
                data = json.load(f)
            initial_data.__pub = [data['pub_base_url'], data['pub_get_way'], data['pub_push_way'], data['pub_trans_local_way'], data['pub_trans_pull_way'], data['pub_trans_push_way']]
            initial_data.__pri = [data['pri_base_url'], data['pri_get_way'], data['pri_push_way'], data['pri_trans_local_way'], data['pri_trans_pull_way'], data['pri_trans_push_way']]
        except Exception as e:
            # Which list is still empty tells how far loading got.
            if not initial_data.__pub:
                raise StatusError(1, 'data.json文件未正确配置,缺少必要的公开/默认路径 pub 设置。', '配置文件错误') from e
            if not initial_data.__pri:
                initial_data.is_initial = True
                return Status(1, 'data.json文件正确配置,但缺少可选的私人路径 pri 设置。', '配置文件不完整')

            raise StatusError(-1, 'data.json文件读取出错。\n' + str(e.args), '未知系统错误') from e

        initial_data.is_initial = True
        return Status(0,'data.json文件读取成功,数据完整。', '配置文件完整读取')

    @staticmethod
    def get_url_list(classed_obj):
        """Return ``[base_url, get_way, push_way]`` for *classed_obj*.

        The private list is used when ``classed_obj.is_pri`` is truthy,
        otherwise the public one.

        Raises:
            StatusError: code 1 when private settings are requested but none
                are stored.
        """
        if not classed_obj.is_pri:
            return initial_data.__pub[0:3]
        if not initial_data.__pri:
            raise StatusError(1, '未提供对应功能所需信息,请检查data.json文件是否正确进行私人路径 pri 设置。', '配置文件错误')
        return initial_data.__pri[0:3]

    @staticmethod
    def get_trans(is_pri):
        """Return the ``(local_way, pull_way, push_way)`` transfer settings.

        Raises:
            StatusError: code 1 when private settings are requested but none
                are stored.
        """
        if not is_pri:
            return initial_data.__pub[3], initial_data.__pub[4], initial_data.__pub[5]
        if not initial_data.__pri:
            raise StatusError(1, '未提供对应功能所需信息,请检查data.json文件是否正确进行私人路径 pri 设置。', '配置文件错误')
        return initial_data.__pri[3], initial_data.__pri[4], initial_data.__pri[5]
|
||
|
Oops, something went wrong.