', 'html.parser')
+ inner_tag = a.new_tag('')
+ title = a.new_tag('h1')
+ title.string = question_name
+ inner_tag.append(title)
+ inner_tag.append(bf(answer, 'html.parser').select('span[itemprop]')[0])
+ b = bf(answer, 'html.parser').select('.ContentItem-time')[0]
+ b.find_all('a')[0]['href'] = url
+ inner_tag.append(a.new_tag('link', rel="stylesheet", href="../style.css"))
+ a.div.contents= inner_tag.contents
+ a.div.append(b)
+ answer = str(a)
+
+ return Status(0, '一切正常', '一切正常'),answer, question_name, base_url, get_way
\ No newline at end of file
diff --git a/classify.py b/classify.py
new file mode 100644
index 0000000..78a2929
--- /dev/null
+++ b/classify.py
@@ -0,0 +1,32 @@
+from initial_data import initial_data
+from query_all_needs import return_needs
+from status_raise import StatusError
+
+
class Classifier(object):
    """Hold one piece of content together with its private/public routing.

    ``is_classfied`` and ``is_pri`` have class-level defaults of ``False`` so
    they can be read before ``__init__`` has completed.
    """

    is_classfied = False
    is_pri = False

    def __init__(self, labels):
        """Classify ``labels`` and fetch the matching URL settings.

        Raises:
            StatusError: code 404 when ``initial_data`` is not initialised.
        """
        if not initial_data.is_initial:
            raise StatusError(404, '传递未进行必需的前序操作的信息','')
        self.labels = labels
        self.is_pri = classify(self.labels)
        self.is_classfied = True
        # get_url_list reads self.is_pri, so it must run after classification.
        url_settings = initial_data.get_url_list(self)
        self.base_url, self.get_way, self.push_way = url_settings

    def set_content(self, content, base_url, get_way):
        """Store the fetched content and record whether it needs images/videos."""
        self.content = content
        self.base_url = base_url
        self.get_way = get_way
        needs = return_needs(self.content)
        self.img_need, self.video_need = needs
+
+
+
+
def classify(labels):
    """Return True when the private marker is contained in ``labels``."""
    return '私人' in labels
+
\ No newline at end of file
diff --git a/dir.py b/dir.py
new file mode 100644
index 0000000..b29d4ed
--- /dev/null
+++ b/dir.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+from status_raise import StatusError,Status
+
def create_dir(img_need, comment_need, video_need, base_url, stop = False):
    """Create the directory ``base_url`` and the sub-directories that are needed.

    Args:
        img_need: when truthy, ``img`` is created under ``base_url``.
        comment_need: when truthy, ``text``, ``text/comment`` and
            ``text/child_comment`` are created under ``base_url``.
        video_need: when truthy, ``video`` is created under ``base_url``.
        base_url: directory path as a string. Sub-directory names are appended
            by plain concatenation, so it is expected to end with ``/``.
        stop: when truthy, an already existing directory is reported through
            the returned ``Status`` instead of an exception.

    Returns:
        ``Status`` with code 0 on success, or code 2 when the directory
        already exists and ``stop`` is truthy.

    Raises:
        StatusError: code -2 when the directory already exists and ``stop``
            is falsy.
    """
    sub_dirs = []
    if img_need:
        sub_dirs.append('img')
    if video_need:
        sub_dirs.append('video')
    if comment_need:
        # Parent first: mkdir() is called without parents=True.
        sub_dirs.extend(['text', 'text/comment', 'text/child_comment'])
    try:
        Path(base_url).mkdir()
        for sub_dir in sub_dirs:
            Path(base_url + sub_dir).mkdir()
        return Status(0, '目录创建成功', '目录创建成功')
    except FileExistsError as e:
        if stop:
            # Path.name yields the last path component with or without a
            # trailing slash, unlike split('/')[-2].
            return Status(2, Path(base_url).name + '已存在', e)
        raise StatusError(-2, '目录创建失败,该目录已存在', e) from e
+
\ No newline at end of file
diff --git a/do_git.py b/do_git.py
new file mode 100644
index 0000000..813b346
--- /dev/null
+++ b/do_git.py
@@ -0,0 +1,46 @@
+import os
+from status_raise import StatusError
+from initial_data import initial_data
+
def git(content):
    """Commit everything under the content's base directory and push it.

    The base directory and the push command come from
    ``initial_data.get_url_list(content)``. When no push command is
    configured the function only prints a notice.

    The working directory is switched to the base directory for the duration
    of the git commands and restored afterwards. Exit codes of the
    preparatory commands are ignored, as before; only the result of the push
    command decides which message is printed.
    """
    base_url, get_way, push_way = initial_data.get_url_list(content)
    if push_way is None:
        print('未配置git命令')
        return
    print('正在上传到github...')
    previous_dir = os.getcwd()
    os.chdir(base_url)
    try:
        os.system('git init')
        os.system('git add -A')
        os.system('git commit -m "update"')
        os.system('git config http.sslVerify "false"')
        pushed = os.system(push_way) == 0
    finally:
        # Do not leak the directory change to the rest of the program.
        os.chdir(previous_dir)
    if pushed:
        print('上传成功')
    else:
        print('上传失败')
+
def trans_pull(local_way, pull_way):
    """Run the configured pull command inside the local transfer repository.

    Args:
        local_way: path of the local repository; ``None`` means not configured.
        pull_way: shell command that performs the pull.

    Raises:
        StatusError: code 1 when ``local_way`` is ``None``.

    The working directory is restored afterwards, and the success message is
    printed only when the pull command exits with 0.
    """
    if local_way is None:
        raise StatusError(1, '未配置git命令', '配置文件错误')
    print('正在从远程仓库拉取...')
    print ('cd ' + local_way )
    previous_dir = os.getcwd()
    os.chdir(local_way)
    try:
        os.system('git config http.sslVerify "false"')
        print(pull_way)
        pulled = os.system(pull_way) == 0
    finally:
        # Keep relative paths used by callers valid after this call.
        os.chdir(previous_dir)
    if pulled:
        print('拉取成功')
    else:
        print('拉取失败')
+
def trans_push(local_way, push_way):
    """Commit all changes in the local transfer repository and push them.

    Args:
        local_way: path of the local repository; ``None`` means not configured.
        push_way: shell command that performs the push.

    Raises:
        StatusError: code 1 when ``local_way`` is ``None``.

    The working directory is restored afterwards, and the success message is
    printed only when the push command exits with 0.
    """
    if local_way is None:
        raise StatusError(1, '未配置git命令', '配置文件错误')
    print('正在往远程仓库推送...')
    previous_dir = os.getcwd()
    os.chdir(local_way)
    try:
        os.system('git add -A')
        os.system('git commit -m "already"')
        pushed = os.system(push_way) == 0
    finally:
        os.chdir(previous_dir)
    if pushed:
        print('推送成功')
    else:
        print('推送失败')
+
def proxy_start(exe_path="C:/Users/1/Desktop/工具/fastgithub_win-x64/fastgithub.exe"):
    """Launch the fastgithub executable through the Windows ``start`` command.

    Args:
        exe_path: location of ``fastgithub.exe``. The default is the path that
            used to be hard-coded, so existing zero-argument calls run exactly
            the same command.
    """
    os.system("start " + exe_path)
+
def proxy_stop():
    """Run ``taskkill`` on fastgithub.exe through ``runas`` as the administrator user."""
    kill_command = 'runas /user:administrator "taskkill /f /t /im fastgithub.exe"'
    os.system(kill_command)
\ No newline at end of file
diff --git a/execute.py b/execute.py
new file mode 100644
index 0000000..0dafe85
--- /dev/null
+++ b/execute.py
@@ -0,0 +1,45 @@
+from pathlib import Path
+from PIL import Image
+from bs4 import BeautifulSoup as bf
+
+from status_raise import StatusError,Status
+
+from initial_data import initial_data
+from answer import get_answer_info
+from p import get_p_info
+from dir import create_dir
+from classify import Classifier
+from img import get_set_img
+from video import get_set_video
+from save_html import save_html
+from get_text import get_all_text
+from add_into_dir import add_into_dir
+
def execute(p_url, answer_url, label,comment_need = False, stop = False):
    """Fetch one article or answer, store it locally and return where it was saved.

    Args:
        p_url: article URL; the empty string selects ``answer_url`` instead.
        answer_url: answer URL, used only when ``p_url`` is empty.
        label: labels handed to ``Classifier``; ``None`` is treated as no
            labels. (Previously this argument was overwritten with an empty
            list, so labels supplied by the caller never took effect.)
        comment_need: forwarded to ``create_dir``.
        stop: when truthy, status codes 2 and 3 are returned to the caller
            instead of being raised.

    Returns:
        A tuple ``(status, file, content)``: ``file`` is a ``file://`` URL of
        the saved html on success and ``''`` when processing stopped early;
        ``content`` is the ``Classifier`` instance.

    Raises:
        StatusError: code 3 when the fetch reports status 3 and ``stop`` is
            falsy. Errors raised by the helpers propagate unchanged.
    """
    labels = [] if label is None else label
    content = Classifier(labels)
    if p_url == '':
        url = answer_url
        status, got_content, name, base_url, get_way = get_answer_info(answer_url,content)
    else:
        url = p_url
        status ,got_content, name, base_url, get_way = get_p_info(p_url,content)
    if status.code == 3:
        if stop:
            return status, '',content
        raise StatusError(3, '知识荒原了', '知识荒原了')
    content.set_content(got_content, base_url, get_way)
    img_need, video_need = content.img_need, content.video_need
    dir_status = create_dir(img_need,comment_need,video_need,base_url, stop)
    if dir_status.code == 2:
        # Only returned by create_dir when stop is truthy.
        return dir_status, '',content
    if img_need:
        content.content = get_set_img(content).content
    if video_need:
        content.content = get_set_video(content).content
    show_content = get_all_text(content,base_url)
    add_into_dir(initial_data.get_url_list(content)[0],url,name,show_content)
    file = 'file://' + save_html(base_url, content)
    return Status(0, '一切正常', '一切正常'),file,content
\ No newline at end of file
diff --git a/get_all_answers.py b/get_all_answers.py
new file mode 100644
index 0000000..ad853d9
--- /dev/null
+++ b/get_all_answers.py
@@ -0,0 +1,33 @@
+import requests
+
+# 这俩是用来拿问题下其他回答的
def get_answers_respones(question_id):
    """Request the first feed page of a question and return the decoded JSON.

    Also stores the page's ``paging.next`` and ``paging.is_end`` values in the
    module globals ``next_page_url`` and ``is_end``.
    """
    global next_page_url
    global is_end

    feed_url = (
        'https://www.zhihu.com/api/v4/questions/'
        + question_id
        + '/feeds?include=%2Ccontent&limit=40'
    )
    first_page = requests.request("GET", feed_url).json()
    paging = first_page['paging']
    next_page_url = paging['next']
    is_end = paging['is_end']
    return first_page
+
def get_answers_content(frist_response ,answer_id):
    """Search the feed pages of a question for one answer and return its content.

    Args:
        frist_response: decoded JSON of the first feed page; it must provide
            ``data`` (items with ``target.id`` and ``target.content``) and
            ``paging`` (``next`` and ``is_end``).
        answer_id: id of the wanted answer; compared as a string.

    Returns:
        The ``content`` of the matching item, or ``None`` when the last page
        has been searched without a match.

    The module globals ``next_page_url`` and ``is_end`` are updated from each
    page that is inspected. Paging is read once per page, so a page with an
    empty ``data`` list can no longer leave the loop state stale.
    """
    global next_page_url
    global is_end
    wanted_id = str(answer_id)
    page = frist_response
    while True:
        next_page_url = page['paging']['next']
        is_end = page['paging']['is_end']
        for item in page['data']:
            if str(item['target']['id']) == wanted_id:
                return item['target']['content']
        if is_end:
            return None
        page = requests.request("GET", next_page_url).json()
\ No newline at end of file
diff --git a/get_text.py b/get_text.py
new file mode 100644
index 0000000..d9eb01e
--- /dev/null
+++ b/get_text.py
@@ -0,0 +1,17 @@
+from bs4 import BeautifulSoup as bf
+from status_raise import StatusError
+
def get_all_text(res, base_url):
    """Write the plain text of the content to ``text.txt`` and return a preview.

    The text is taken from the first element of ``res.content`` that has class
    ``css-1g0fqss`` and the attribute ``options="[object Object]"``.

    Returns:
        The first 20 characters of the extracted text.

    Raises:
        StatusError: code -2 when extracting or writing the text fails.
    """
    soup = bf(res.content, 'html.parser')
    body = soup.find_all(class_='css-1g0fqss', options="[object Object]")[0]
    preview = ''
    try:
        # The with-block closes the file; no explicit close() is needed.
        with open(base_url + "text.txt",'w' ,encoding='utf-8') as out:
            plain_text = body.get_text()
            preview = plain_text[0:20]
            out.write(plain_text)
    except Exception as e:
        raise StatusError(-2, '纯文本写入失败', e)

    return preview
+
\ No newline at end of file
diff --git a/img.py b/img.py
new file mode 100644
index 0000000..6967596
--- /dev/null
+++ b/img.py
@@ -0,0 +1,53 @@
+import requests
+import re
+from bs4 import BeautifulSoup as bf
+from status_raise import StatusError, Status
+
+class get_set_img(object):
+ is_initial = False
+ img_list = []
+ img_name_index = 0
+ base_url = ''
+ get_way = ''
+
+ def __init__(self, content_classed):
+ if content_classed.is_classfied:
+ get_set_img.base_url, get_set_img.get_way = content_classed.base_url, content_classed.get_way
+ get_set_img.is_initial = True
+ get_set_img.set_img_list(content_classed.content)
+ self.content = get_set_img.change_img_attr(content_classed.content)
+ else:
+ raise StatusError(404, '传递了未进行前序必要操作的信息', '程序bug')
+
+
+ # 设置
![]()
的列表
+ def set_img_list(res):
+ for i in bf(res, 'html.parser').find_all('figure'):
+ img_src = get_set_img.get_img_url(i.find('img')['src'])
+ print(i.find('img'))
+ img_height = i.find('img')['data-rawheight']
+ img_width = i.find('img')['data-rawwidth']
+ img_aspect_ratio = int(img_width) / int(img_height)
+ img_att = '

'
+ get_set_img.img_list.append(img_att)
+
+ # 下载图片,返回图片的相对路径
+ def get_img_url(img_url):
+ img_url = img_url.replace('720w.jpg?','1440w.jpg?')
+
+ try:
+ with open(get_set_img.base_url +'img/' + str(get_set_img.img_name_index) + '.jpg','wb') as f:
+ f.write(requests.get(img_url).content)
+ f.close()
+ except Exception as e:
+ raise StatusError(-3, '文件下载失败', e)
+ get_set_img.img_name_index += 1
+ return 'img/' + str(get_set_img.img_name_index - 1) + '.jpg'
+
+ # 把懒加载的
替换成
+ def change_img_attr(content):
+ def replace_img(m):
+ return get_set_img.img_list.pop(0)
+ content = re.sub(r"(]*>.*?<\/figure>)", replace_img, (content), 0, flags=re.MULTILINE)
+ return content
+
diff --git a/initial_data.py b/initial_data.py
new file mode 100644
index 0000000..8374e21
--- /dev/null
+++ b/initial_data.py
@@ -0,0 +1,46 @@
+import json
+from status_raise import StatusError, Status
+
class initial_data(object):
    """Holder for the settings read from ``./data.json``.

    All state lives on the class and every method is a static method, so the
    class is used directly (``initial_data.initial()``) and never needs an
    instance.
    """

    is_initial = False
    # Each list is either empty (not loaded) or holds, in this order:
    # base_url, get_way, push_way, trans_local_way, trans_pull_way, trans_push_way
    __pri = []
    __pub = []

    @staticmethod
    def initial():
        """Load ``./data.json`` into the class-level settings.

        Returns:
            ``Status`` with code 0 when both the ``pub`` and ``pri`` settings
            were read, or code 1 when only the ``pub`` settings are available.

        Raises:
            StatusError: code 1 when loading failed and no ``pub`` settings
                are available; code -1 when loading failed although both
                settings lists are already populated.
        """
        try :
            with open('./data.json') as f:
                data = json.load(f)
            # pub is assigned first so that a failure on the pri keys leaves
            # the pub settings in place.
            initial_data.__pub = [data['pub_base_url'], data['pub_get_way'], data['pub_push_way'], data['pub_trans_local_way'], data['pub_trans_pull_way'], data['pub_trans_push_way']]
            initial_data.__pri = [data['pri_base_url'], data['pri_get_way'], data['pri_push_way'], data['pri_trans_local_way'], data['pri_trans_pull_way'], data['pri_trans_push_way']]
        except Exception as e:
            if initial_data.__pub == []:
                raise StatusError(1, 'data.json文件未正确配置,缺少必要的公开/默认路径 pub 设置。', '配置文件错误') from e
            elif initial_data.__pri == []:
                initial_data.is_initial = True
                return Status(1, 'data.json文件正确配置,但缺少可选的私人路径 pri 设置。', '配置文件不完整')

            raise StatusError(-1, 'data.json文件读取出错。\n' + str(e.args), '未知系统错误') from e

        initial_data.is_initial = True
        return Status(0,'data.json文件读取成功,数据完整。', '配置文件完整读取')

    @staticmethod
    def get_url_list(classed_obj):
        """Return ``[base_url, get_way, push_way]`` for ``classed_obj``.

        The ``pri`` settings are used when ``classed_obj.is_pri`` is truthy,
        the ``pub`` settings otherwise.

        Raises:
            StatusError: code 1 when ``pri`` settings are requested but were
                not loaded.
        """
        if classed_obj.is_pri:
            if initial_data.__pri != []:
                return initial_data.__pri[0:3]
            else:
                raise StatusError(1, '未提供对应功能所需信息,请检查data.json文件是否正确进行私人路径 pri 设置。', '配置文件错误')
        else :
            return initial_data.__pub[0:3]

    @staticmethod
    def get_trans(is_pri):
        """Return ``(trans_local_way, trans_pull_way, trans_push_way)``.

        Raises:
            StatusError: code 1 when ``pri`` settings are requested but were
                not loaded.
        """
        if is_pri:
            if initial_data.__pri != []:
                return initial_data.__pri[3], initial_data.__pri[4], initial_data.__pri[5]
            else:
                raise StatusError(1, '未提供对应功能所需信息,请检查data.json文件是否正确进行私人路径 pri 设置。', '配置文件错误')
        else:
            return initial_data.__pub[3], initial_data.__pub[4], initial_data.__pub[5]
+
+
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..73a0fb9
--- /dev/null
+++ b/main.py
@@ -0,0 +1,73 @@
+from pathlib import Path
+from PIL import Image
+from bs4 import BeautifulSoup as bf
+
+from status_raise import StatusError
+
+from run_from_trans_station import run_from_trans_station
+from initial_data import initial_data
+from execute import execute
+from do_git import git, proxy_start, proxy_stop
+
+
def collect():
    """Ask for one link, archive it with ``execute`` and push the result.

    The second-to-last ``/``-separated part of the link selects the type:
    ``answer`` or ``p``. Comments are not supported yet, so ``comment_need``
    is always ``False``; a "yes" answer only prints a notice.

    Raises:
        StatusError: code 2 when the link is neither an answer nor an article
            link. (Previously such input was passed on with both URLs empty.)
    """
    answer_url = ''
    p_url = ''

    url = input('请输入链接:').strip()
    parts = url.split('/')
    # A link without '/' has no second-to-last part.
    kind = parts[-2] if len(parts) >= 2 else ''
    if kind == 'answer':
        answer_url = url
    elif kind == 'p':
        p_url = url
    else:
        raise StatusError(2, '未正确输入信息', '用户输入不在预期内')

    reply = input('是否需要附上评论:')
    if reply in ('y', 'Y', 'yes', 'Yes', 'YES'):
        print('暂不支持附上评论哦')
    comment_need = False
    label = []
    status ,file_url,content = execute(p_url ,answer_url, label, comment_need)
    print('\n\n\n点击这里访问:\n\n\n', file_url)
    git(content)
+
if __name__ == '__main__':
    # Load the configuration first; without it nothing else can work.
    try:
        initial_data.initial()
    except StatusError:
        print('\n\n\n初始化失败')
        print('请先在同目录下正确配置data.json文件')
        print('格式如下:\n\n')
        print('{')
        print(' "pri_base_url": "x:/xx/xx", -----(OPTIONAL)')
        print(' "pri_push_way": "git push xxx xxx", -----(OPTIONAL)')
        print(' "pri_get_way": "https://raw.githubusercontent.com/your_user_name/file/main/" -----(OPTIONAL)' )
        print(' "pub_base_url": "x:/xx/xx",')
        print(' "pub_push_way": "git push xxx xxx", ')
        print(' "pub_get_way": "https://raw.githubusercontent.com/your_user_name/file/main/" -----(Your picturebed address)' )
        print('}\n\n')
        input('按任意键退出')
        # Actually exit: continuing would run the menu without configuration.
        raise SystemExit(1)
    print('\n\n\n选择:\n')
    print('1. 从单个知乎链接获取;\n')
    print('2. 从中转站批量获取.\n\n\n')
    choice = input('请输入:')
    proxy_start()
    try:
        if choice == '1':
            collect()
        elif choice == '2':
            print('\n\n\n选择:\n')
            print('1. pri;\n')
            print('2. pub.\n\n\n')
            choice = input('请输入: ')
            # The menu is numbered, so accept the number as well as the name.
            if choice in ('1', 'pri'):
                run_from_trans_station(True)
            elif choice in ('2', 'pub'):
                run_from_trans_station(False)
            else :
                raise StatusError(2, '未正确输入信息', '用户输入不在预期内')
        else :
            raise StatusError(2, '未正确输入信息', '用户输入不在预期内')
    finally:
        # Stop the proxy even when the selected action fails.
        proxy_stop()
+
+
+
diff --git a/p.py b/p.py
new file mode 100644
index 0000000..87f3aad
--- /dev/null
+++ b/p.py
@@ -0,0 +1,36 @@
+import requests
+from bs4 import BeautifulSoup as bf
+from initial_data import initial_data
+from status_raise import StatusError, Status
+
+# 返回: 1. 状态,状态[0] 为0 则正常;其后为需求。
+def get_p_info(p_url,content):
+ base_url, get_way, push_way = initial_data.get_url_list(content)
+ res = requests.get(p_url).text
+
+ if bf(res, 'html.parser').find_all('title', text="安全验证 - 知乎") != []:
+ raise StatusError(0, '触发反爬了', '触发反爬了')
+
+ p_name = bf(res, 'html.parser').select('.Post-Title')[0].text
+ base_url = base_url + p_name.replace(' ', '_').replace("\\", '_').replace('?','_').replace('/', '_').replace('?', '_').replace('*', '_').replace(':', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_').replace('\'', '_')+ p_url.split('/')[-1] + '/'
+ get_way = get_way + p_name.replace(' ', '_').replace("\\", '_').replace('?','_').replace('/', '_').replace('?', '_').replace('*', '_').replace(':', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_').replace('\'', '_') + p_url.split('/')[-1]+ '/'
+ a = bf('')
+ new_tag = bf('', 'html.parser').div
+ title = a.new_tag('h1')
+ title.string = p_name
+ address = a.new_tag('a')
+ address['href'] = p_url
+ new_tag.append(title)
+ new_tag.append(bf(res, 'html.parser').select('div[options]')[0])
+ print(bf(res, 'html.parser').select('.ContentItem-time')[0].contents)
+ address.contents = [bf(res, 'html.parser').select('.ContentItem-time')[0].contents[0]]
+ ip = a.new_tag('span')
+ ip_contents = bf(res, 'html.parser').select('.ContentItem-time')[0].contents
+ if len(ip_contents) == 3:
+ ip.contents = [bf(res, 'html.parser').select('.ContentItem-time')[0].contents[2]]
+ new_end = bf('
'+ str(address) + str(ip) +'
', 'html.parser')
+ new_tag.append(a.new_tag('link', rel="stylesheet", href="../style.css"))
+ new_tag.append(new_end)
+ p = str(new_tag)
+
+ return Status(0, '一切正常', '一切正常'),p, p_name, base_url, get_way
diff --git a/query_all_needs.py b/query_all_needs.py
new file mode 100644
index 0000000..3d6f898
--- /dev/null
+++ b/query_all_needs.py
@@ -0,0 +1,12 @@
+from bs4 import BeautifulSoup as bf
+
def return_needs(res):
    """Report whether the html in ``res`` contains images and/or videos.

    Returns:
        ``(img_need, video_need)``: ``img_need`` is True when at least one
        ``figure`` tag is present, ``video_need`` when at least one ``a`` tag
        with class ``video-box`` is present.
    """
    soup = bf(res, 'html.parser')
    figures = soup.find_all('figure')
    video_links = soup.find_all('a', class_='video-box')
    return figures != [], video_links != []
+
\ No newline at end of file
diff --git a/run_from_trans_station.py b/run_from_trans_station.py
new file mode 100644
index 0000000..b383234
--- /dev/null
+++ b/run_from_trans_station.py
@@ -0,0 +1,46 @@
+import json
+from do_git import trans_pull, trans_push, git
+from execute import execute
+from initial_data import initial_data
+from status_raise import StatusError
+
def run_from_trans_station(is_pri):
    """Archive every URL queued in the transfer repository's ``collection.json``.

    Steps: pull the transfer repository, read the queued URLs, reset the
    queue file to an empty list, run ``execute`` on each URL, push the
    archived content once, and finally push the emptied queue.

    Args:
        is_pri: selects the ``pri`` (truthy) or ``pub`` transfer settings.

    Raises:
        StatusError: code -3 when pulling, reading or resetting the queue
            fails. Errors from ``execute`` propagate unchanged.
    """
    trans_local_way, trans_pull_way, trans_push_way = initial_data.get_trans(is_pri)
    try:
        trans_pull(trans_local_way, trans_pull_way)
        collection_path = trans_local_way + '/collection.json'
        with open(collection_path,'r',encoding="utf-8") as f:
            data = json.load(f)
        with open(collection_path,'w',encoding="utf-8") as f:
            json.dump({"url": []},f,ensure_ascii=False)
    except Exception as e:
        raise StatusError(-3, '获取远端url失败', e)
    print(data['url'])
    content = None
    for url in data['url']:
        parts = url.split('/')
        kind = parts[-2] if len(parts) >= 2 else ''
        if kind == 'answer':
            p_url, answer_url = '', url
        elif kind == 'p':
            p_url, answer_url = url, ''
        else:
            # Skip instead of letting one malformed entry abort the batch
            # after the queue has already been cleared.
            print('\n\n\n', '无法识别的链接,已跳过: ' + url + '\n\n\n')
            continue

        status, file_url,content = execute(p_url ,answer_url, [], False,stop=True)
        if status.code in (2, 3):
            print('\n\n\n', status.tips + '\n\n\n')
    # With an empty queue there is no classified content to push.
    if content is not None:
        git(content)
    input('\n完成,即将清空序列......\n')
    trans_push(trans_local_way, trans_push_way)
diff --git a/save_html.py b/save_html.py
new file mode 100644
index 0000000..1cd316a
--- /dev/null
+++ b/save_html.py
@@ -0,0 +1,10 @@
+from status_raise import StatusError
+
def save_html(base_url,content_ed):
    """Write ``content_ed.content`` to ``index.html`` under ``base_url``.

    Returns:
        The path of the written file (``base_url + 'index.html'``).

    Raises:
        StatusError: code -2 when the file cannot be written.
    """
    target = base_url +'index.html'
    try:
        # The with-block closes the file on exit.
        with open(target,'w',encoding='utf-8') as html_file:
            html_file.write(content_ed.content)
    except Exception:
        raise StatusError(-2, 'html保存失败', '文件创建失败')
    return target
diff --git a/status_code.md b/status_code.md
new file mode 100644
index 0000000..bf98a62
--- /dev/null
+++ b/status_code.md
@@ -0,0 +1,27 @@
++ Status
+ + code 0
+ + 一切正常
+ + code 1
+ + data.json 配置不完整
+ + code 2
+ + 该文件已存在
+ + code 3
+ + 该文件已知识荒原
+
++ StatusError
+ + code -1
+ + 文件读取错误
+ + code -2
+ + 文件创建失败
+ + code -3
+ + 文件下载失败
+ + code 0
+ + 触发反爬了
+ + code 1
+ + 未提供对应功能所需信息
+ + code 2
+ + 未正确输入信息
+ + code 3
+ + 知识荒原
+ + code 404
+ + 程序bug
\ No newline at end of file
diff --git a/status_raise.py b/status_raise.py
new file mode 100644
index 0000000..c3181bc
--- /dev/null
+++ b/status_raise.py
@@ -0,0 +1,16 @@
class StatusError(Exception):
    """Exception carrying a status code, a hint for the user and a response.

    Attributes are stored as given: ``code``, ``tips`` and ``response``.
    """

    def __init__(self, code, tips, response):
        # Initialise the base class explicitly so ``args`` is set through the
        # normal Exception protocol.
        super().__init__(code, tips, response)
        self.code = code
        self.tips = tips
        self.response = response

    def __str__(self):
        # Every field goes through str(), so a non-string ``tips`` no longer
        # raises TypeError while the error is being printed.
        return '\n\n\ncode: {!s}\n\ntips: {!s}\n\nresponse: {!s}\n\n\n'.format(
            self.code, self.tips, self.response
        )
+
class Status(object):
    """Plain result object with a status code, a hint and a response."""

    def __init__(self, code, tips, response):
        self.code, self.tips, self.response = code, tips, response

    def show_status(self):
        """Return the three fields as a dict keyed ``code``/``tips``/``response``."""
        return dict(code=self.code, tips=self.tips, response=self.response)
diff --git a/video.py b/video.py
new file mode 100644
index 0000000..7d67125
--- /dev/null
+++ b/video.py
@@ -0,0 +1,50 @@
+import requests
+import re
+from bs4 import BeautifulSoup as bf
+from status_raise import StatusError
+
+class get_set_video(object):
+ is_initial = False
+ video_list = []
+ video_name_index = 0
+ base_url = ''
+ get_way = ''
+
+ def __init__(self, content_classed):
+ if content_classed.is_classfied:
+ get_set_video.base_url, get_set_video.get_way = content_classed.base_url, content_classed.get_way
+ get_set_video.is_initial = True
+ get_set_video.set_video_list(content_classed.content)
+ self.content = get_set_video.change_video_attr(content_classed.content)
+ else:
+ raise StatusError(404, '传递了未进行前序必要操作的信息', '程序bug')
+
+ # 设置