# ======================================================================
# add_into_dir.py
# ======================================================================
import json
from status_raise import StatusError

# Characters replaced by "_" when a title is turned into a file name.
# NOTE(review): the original replace-chain listed '?' twice; one entry was
# presumably a full-width question mark that did not survive extraction.
# Confirm against the real file and add it here if so.
_UNSAFE_NAME_CHARS = str.maketrans({ch: '_' for ch in ' \\/?*:"<>|\''})


def add_into_dir(base_url, url, name, show_content):
    """Register a saved page in ``dir.json`` and ``map.json`` under *base_url*.

    A new entry (index, file name, display name, preview text) is appended to
    the ``dir`` list of ``dir.json``; ``map.json`` receives a
    ``file_name -> index`` entry in its ``map`` object.

    Args:
        base_url: Directory prefix (including trailing separator) of both files.
        url: Source URL; its last path segment is appended to the file name.
        name: Page title, used as display name and (sanitised) as file name.
        show_content: Preview text; '......' is appended to it.

    Raises:
        StatusError: code -3, wrapping any error raised while reading or
            writing the two JSON files.
    """
    dir_path = base_url + 'dir.json'
    map_path = base_url + 'map.json'
    try:
        # Read and update both documents in memory first. Opening with 'w'
        # truncates the file, so nothing is opened for writing until the
        # complete replacement text exists.
        with open(dir_path, 'r', encoding="utf-8") as f:
            dir_data = json.load(f)
        with open(map_path, 'r', encoding="utf-8") as f:
            map_data = json.load(f)

        new_file = {
            'index': len(dir_data['dir']),
            'file_name': name.translate(_UNSAFE_NAME_CHARS) + url.split('/')[-1],
            'show_name': name,
            'show_content': show_content + '......',
        }
        dir_data['dir'].append(new_file)
        map_data['map'][new_file['file_name']] = new_file['index']

        dir_text = json.dumps(dir_data, ensure_ascii=False)
        map_text = json.dumps(map_data, ensure_ascii=False)

        with open(dir_path, 'w', encoding="utf-8") as f:
            f.write(dir_text)
        with open(map_path, 'w', encoding="utf-8") as f:
            f.write(map_text)
    except Exception as e:
        # Code and message are kept as-is for callers that inspect them.
        raise StatusError(-3, '文件下载失败', e) from e


# ======================================================================
# answer.py
# ======================================================================
import requests
import re
from bs4 import BeautifulSoup as bf
from initial_data import initial_data
from status_raise import StatusError, Status

# Same sanitising table as in add_into_dir.py (see the NOTE there about the
# duplicated '?').
_UNSAFE_NAME_CHARS = str.maketrans({ch: '_' for ch in ' \\/?*:"<>|\''})


# Fetches the question title and the answer body, and normalises the answer
# markup into a standalone HTML fragment.
def get_answer_info(url, content):
    """Download a Zhihu answer page and build the HTML to be archived.

    Args:
        url: Answer URL; its last path segment becomes part of the folder name.
        content: Classified object passed to ``initial_data.get_url_list``.

    Returns:
        ``(status, answer_html, question_name, base_url, get_way)``. When the
        page contains an ``ErrorPage`` div, status code 3 is returned with
        four empty strings.

    Raises:
        StatusError: code 0 when the anti-crawler verification page is served.
    """
    base_url, get_way, push_way = initial_data.get_url_list(content)
    response = requests.request("GET", url)
    # One parse is enough for the read-only lookups below.
    page = bf(response.text, 'html.parser')

    if page.find_all('title', text="安全验证 - 知乎"):
        raise StatusError(0, '触发反爬了', '触发反爬了')
    if page.find_all('div', class_='ErrorPage'):
        return Status(3, '知识荒原了', '知识荒原了'), '', '', '', ''

    question_info_src = page.find('div', class_='QuestionPage')
    question_name = question_info_src.find('meta', itemprop='name').get('content')
    folder = question_name.translate(_UNSAFE_NAME_CHARS) + url.split('/')[-1] + '/'
    base_url = base_url + folder
    get_way = get_way + folder

    answer = str(question_info_src.find('div', class_="RichContent--unescapable"))
    # NOTE(review): both string literals on the next line lost their HTML
    # markup in the reviewed copy (only a line break remained). They are
    # reproduced as seen; restore the real pattern/replacement from the
    # original file before relying on this statement.
    answer = re.sub(r'(\n.*$)', '\n', answer, 0, flags=re.MULTILINE)
    # The wrapper literal was stripped as well; '<div></div>' is restored
    # because `a.div` is dereferenced below. Confirm any attributes it had.
    a = bf('<div></div>', 'html.parser')
    inner_tag = a.new_tag('')
    title = a.new_tag('h1')
    title.string = question_name
    inner_tag.append(title)
    # Separate parses on purpose: append() moves a node out of its tree, so
    # sharing one soup between these two selections could change the result.
    inner_tag.append(bf(answer, 'html.parser').select('span[itemprop]')[0])
    b = bf(answer, 'html.parser').select('.ContentItem-time')[0]
    b.find_all('a')[0]['href'] = url
    inner_tag.append(a.new_tag('link', rel="stylesheet", href="../style.css"))
    a.div.contents = inner_tag.contents
    a.div.append(b)
    answer = str(a)

    return Status(0, '一切正常', '一切正常'), answer, question_name, base_url, get_way


# ======================================================================
# classify.py
# ======================================================================
from initial_data import initial_data
from query_all_needs import return_needs
from status_raise import StatusError


class Classifier(object):
    """Carries the labels, target paths and content of one page being saved."""

    # Spelling kept: other modules read ``is_classfied``.
    is_classfied = False
    is_pri = False

    def __init__(self, labels):
        """Classify *labels* and resolve the matching path configuration.

        Raises:
            StatusError: code 404 when ``initial_data`` is not initialised.
        """
        if not initial_data.is_initial:
            raise StatusError(404, '传递未进行必需的前序操作的信息', '')
        self.labels = labels
        self.is_pri = classify(self.labels)
        self.is_classfied = True
        self.base_url, self.get_way, self.push_way = initial_data.get_url_list(self)

    def set_content(self, content, base_url, get_way):
        """Attach the fetched HTML and record whether it has images/videos."""
        self.content = content
        self.base_url = base_url
        self.get_way = get_way
        self.img_need, self.video_need = return_needs(self.content)


def classify(labels):
    """Return True when the '私人' label is present in *labels*."""
    return '私人' in labels


# ======================================================================
# dir.py
# ======================================================================
from pathlib import Path
from status_raise import StatusError, Status


def create_dir(img_need, comment_need, video_need, base_url, stop = False):
    """Create the page directory and the sub-directories it needs.

    Args:
        img_need: Create ``img`` when true.
        comment_need: Create ``text``, ``text/comment`` and
            ``text/child_comment`` when true.
        video_need: Create ``video`` when true.
        base_url: Page directory (with trailing '/').
        stop: When true, an existing directory is reported through a
            ``Status`` with code 2 instead of an exception.

    Returns:
        ``Status`` with code 0 on success, or code 2 (see *stop*).

    Raises:
        StatusError: code -2 when a directory already exists and *stop* is
            false.
    """
    wanted = [Path(base_url)]
    if img_need:
        wanted.append(Path(base_url + 'img'))
    if video_need:
        wanted.append(Path(base_url + 'video'))
    if comment_need:
        wanted.append(Path(base_url + 'text'))
        wanted.append(Path(base_url + 'text/' + 'comment'))
        wanted.append(Path(base_url + 'text/' + 'child_comment'))
    try:
        for directory in wanted:
            directory.mkdir()
        return Status(0, '目录创建成功', '目录创建成功')
    except FileExistsError as e:
        # Hand over the caught error (which names the offending path) rather
        # than the bare exception class.
        if stop:
            return Status(2, base_url.split('/')[-2] + '已存在', e)
        raise StatusError(-2, '目录创建失败,该目录已存在', e) from e


# ======================================================================
# do_git.py
# ======================================================================
import os
from status_raise import StatusError
from initial_data import initial_data


def git(content):
    """Commit everything under the configured base directory and push it.

    ``push_way`` is a complete shell command taken from data.json and is run
    through the shell as-is; it must come from a trusted configuration.
    """
    base_url, get_way, push_way = initial_data.get_url_list(content)
    if push_way is None:
        print('未配置git命令')
        return
    print('正在上传到github...')
    os.chdir(base_url)
    os.system('git init')
    os.system('git add -A')
    # A non-zero status here is normal when there is nothing to commit.
    os.system('git commit -m "update"')
    os.system('git config http.sslVerify "false"')
    # Only claim success when the push command itself succeeded.
    if os.system(push_way) == 0:
        print('上传成功')
    else:
        print('上传失败')


def trans_pull(local_way, pull_way):
    """Run the configured pull command inside *local_way*.

    Raises:
        StatusError: code 1 when *local_way* is not configured.
    """
    if local_way is None:
        raise StatusError(1, '未配置git命令', '配置文件错误')
    print('正在从远程仓库拉取...')
    print ('cd ' + local_way )
    os.chdir(local_way)
    os.system('git config http.sslVerify "false"')
    print(pull_way)
    if os.system(pull_way) == 0:
        print('拉取成功')
    else:
        print('拉取失败')


def trans_push(local_way, push_way):
    """Commit everything inside *local_way* and run the configured push command.

    Raises:
        StatusError: code 1 when *local_way* is not configured.
    """
    if local_way is None:
        raise StatusError(1, '未配置git命令', '配置文件错误')
    print('正在往远程仓库推送...')
    os.chdir(local_way)
    os.system('git add -A')
    os.system('git commit -m "already"')
    if os.system(push_way) == 0:
        print('推送成功')
    else:
        print('推送失败')


def proxy_start():
    """Launch the local fastgithub proxy (hard-coded Windows path)."""
    os.system("start C:/Users/1/Desktop/工具/fastgithub_win-x64/fastgithub.exe")
    return


def proxy_stop():
    """Kill the fastgithub proxy process through ``runas``/``taskkill``."""
    os.system('runas /user:administrator "taskkill /f /t /im fastgithub.exe"')
    return


# ======================================================================
# execute.py
# ======================================================================
from pathlib import Path
from PIL import Image
from bs4 import BeautifulSoup as bf

from status_raise import StatusError,Status

from initial_data import initial_data
from answer import get_answer_info
from p import get_p_info
from dir import create_dir
from classify import Classifier
from img import get_set_img
from video import get_set_video
from save_html import save_html
from get_text import get_all_text
from add_into_dir import add_into_dir


def execute(p_url, answer_url, label,comment_need = False, stop = False):
    """Fetch one page, store its assets and HTML, and index it.

    Args:
        p_url: Column-article URL, or '' to use *answer_url*.
        answer_url: Answer URL, used when *p_url* is ''.
        label: Labels handed to ``Classifier``; ``None`` means no labels.
        comment_need: Forwarded to ``create_dir``.
        stop: When true, "already exists" (code 2) and "page gone" (code 3)
            are returned as statuses instead of raised.

    Returns:
        ``(status, file_url, content)``; ``file_url`` is '' for codes 2 and 3.

    Raises:
        StatusError: code 3 when the page is gone and *stop* is false, plus
            whatever the called helpers raise.
    """
    # The caller's labels are honoured; previously they were overwritten with
    # an empty list, so a page could never be classified as private.
    label = [] if label is None else label
    content = Classifier(label)
    if p_url == '':
        url = answer_url
        status, got_content, name, base_url, get_way = get_answer_info(answer_url, content)
    else:
        url = p_url
        status, got_content, name, base_url, get_way = get_p_info(p_url, content)
    if status.code == 3:
        if stop:
            return status, '', content
        raise StatusError(3, '知识荒原了', '知识荒原了')
    content.set_content(got_content, base_url, get_way)
    img_need, video_need = content.img_need, content.video_need
    dir_status = create_dir(img_need, comment_need, video_need, base_url, stop)
    if dir_status.code == 2:
        return dir_status, '', content
    if img_need:
        content.content = get_set_img(content).content
    if video_need:
        content.content = get_set_video(content).content
    show_content = get_all_text(content, base_url)
    add_into_dir(initial_data.get_url_list(content)[0], url, name, show_content)
    file = 'file://' + save_html(base_url, content)
    return Status(0, '一切正常', '一切正常'), file, content


# ======================================================================
# get_all_answers.py
# ======================================================================
import requests


def _remember_paging(response):
    """Store the paging cursor of *response* in the module-level globals."""
    global next_page_url
    global is_end
    next_page_url = response['paging']['next']
    is_end = response['paging']['is_end']


def _find_answer(response, answer_id):
    """Return the content of the entry whose target id equals *answer_id*, else None."""
    for item in response['data']:
        if str(item['target']['id']) == answer_id:
            return item['target']['content']
    return None


# These two functions fetch the other answers under a question.
def get_answers_respones(question_id):
    """Request the first feed page of *question_id* and return the parsed JSON."""
    url = 'https://www.zhihu.com/api/v4/questions/'+ question_id +'/feeds?include=%2Ccontent&limit=40'
    response = requests.request("GET", url).json()
    _remember_paging(response)
    return response


def get_answers_content(frist_response ,answer_id):
    """Walk the feed pages until the answer *answer_id* is found.

    Args:
        frist_response: First feed page, as returned by
            ``get_answers_respones`` (parameter spelling kept for callers).
        answer_id: Answer id as a string.

    Returns:
        The answer's ``content`` value, or ``None`` when the last page is
        reached without a match.
    """
    response = frist_response
    while True:
        found = _find_answer(response, answer_id)
        if found is not None:
            return found
        # Take the cursor from the page just scanned, so an empty page cannot
        # leave stale (or never-assigned) paging state behind.
        _remember_paging(response)
        if is_end:
            return None
        response = requests.request("GET", next_page_url).json()


# ======================================================================
# get_text.py
# ======================================================================
from bs4 import BeautifulSoup as bf
from status_raise import StatusError


def get_all_text(res, base_url):
    """Write the plain text of the page body to ``text.txt``.

    Args:
        res: Object whose ``content`` attribute holds the page HTML.
        base_url: Page directory (with trailing separator).

    Returns:
        The first 20 characters of the extracted text.

    Raises:
        StatusError: code -2 when the text file cannot be written.
        IndexError: when the expected body element is not in the HTML.
    """
    content = bf(res.content, 'html.parser').find_all(class_='css-1g0fqss', options="[object Object]")[0]
    text = content.get_text()
    try:
        with open(base_url + "text.txt",'w' ,encoding='utf-8') as f:
            f.write(text)
    except Exception as e:
        raise StatusError(-2, '纯文本写入失败', e) from e

    return text[0:20]


# ======================================================================
# img.py
# ======================================================================
import requests
import re
from bs4 import BeautifulSoup as bf
from status_raise import StatusError, Status


class get_set_img(object):
    """Downloads the images of a page and rewrites its <figure> blocks.

    The working state lives on the class (as in the original design), so it
    is reset every time a new page is processed.
    """

    is_initial = False
    img_list = []
    img_name_index = 0
    base_url = ''
    get_way = ''

    def __init__(self, content_classed):
        """Process ``content_classed.content``; the result is ``self.content``.

        Raises:
            StatusError: code 404 when *content_classed* was not classified;
                code -3 when an image cannot be downloaded or stored.
        """
        if content_classed.is_classfied:
            get_set_img.base_url, get_set_img.get_way = content_classed.base_url, content_classed.get_way
            # Start clean for every page: leftovers from a previous page would
            # otherwise be substituted into this one and skew the numbering.
            get_set_img.img_list = []
            get_set_img.img_name_index = 0
            get_set_img.is_initial = True
            get_set_img.set_img_list(content_classed.content)
            self.content = get_set_img.change_img_attr(content_classed.content)
        else:
            raise StatusError(404, '传递了未进行前序必要操作的信息', '程序bug')

    # Build the list of replacement markup, one entry per <figure>.
    @staticmethod
    def set_img_list(res):
        """Download every figure image and queue its replacement markup."""
        for i in bf(res, 'html.parser').find_all('figure'):
            img_src = get_set_img.get_img_url(i.find('img')['src'])
            print(i.find('img'))
            img_height = i.find('img')['data-rawheight']
            img_width = i.find('img')['data-rawwidth']
            img_aspect_ratio = int(img_width) / int(img_height)
            # NOTE(review): this literal is empty in the reviewed copy, most
            # likely because an <img ...> template (presumably using img_src
            # and img_aspect_ratio) was stripped. Restore it from the original
            # file; as written every figure is replaced by an empty string.
            img_att = ''
            get_set_img.img_list.append(img_att)

    # Download one image and return its path relative to the page directory.
    @staticmethod
    def get_img_url(img_url):
        """Fetch *img_url* in its 1440w variant and save it as ``img/<n>.jpg``.

        Raises:
            StatusError: code -3 when the download or the write fails.
        """
        img_url = img_url.replace('720w.jpg?','1440w.jpg?')
        relative_path = 'img/' + str(get_set_img.img_name_index) + '.jpg'

        try:
            # Download before opening the target, so a failed request does not
            # leave an empty file behind.
            payload = requests.get(img_url).content
            with open(get_set_img.base_url + relative_path, 'wb') as f:
                f.write(payload)
        except Exception as e:
            raise StatusError(-3, '文件下载失败', e) from e
        get_set_img.img_name_index += 1
        return relative_path

    # Replace each lazily-loaded <figure> block with the queued markup.
    @staticmethod
    def change_img_attr(content):
        """Return *content* with every <figure> block replaced, in order."""
        def replace_img(m):
            return get_set_img.img_list.pop(0)
        # NOTE(review): the pattern read r"(]*>.*?<\/figure>)" in the reviewed
        # copy; the leading "<figure[^>" is restored here on the evidence of
        # find_all('figure') above. Confirm against the original file.
        content = re.sub(r"(<figure[^>]*>.*?<\/figure>)", replace_img, (content), 0, flags=re.MULTILINE)
        return content


# ======================================================================
# initial_data.py
# ======================================================================
import json
from status_raise import StatusError, Status


class initial_data(object):
    """Process-wide holder of the paths and commands read from data.json.

    Each group is stored as
    ``[base_url, get_way, push_way, trans_local_way, trans_pull_way,
    trans_push_way]``.
    """

    is_initial = False
    __pri = []
    __pub = []

    @staticmethod
    def _load_group(data, prefix):
        """Build one settings list from the *prefix*-ed keys of *data*.

        ``base_url`` and ``get_way`` are required (``KeyError`` if missing).
        The command/transfer entries default to ``None``, which is the value
        the git helpers already treat as "not configured".
        """
        return [
            data[prefix + '_base_url'],
            data[prefix + '_get_way'],
            data.get(prefix + '_push_way'),
            data.get(prefix + '_trans_local_way'),
            data.get(prefix + '_trans_pull_way'),
            data.get(prefix + '_trans_push_way'),
        ]

    @staticmethod
    def initial():
        """Read ``./data.json`` and populate the public/private settings.

        Returns:
            ``Status`` code 0 when both groups were read, code 1 when only
            the public group is available.

        Raises:
            StatusError: code 1 when the public group could not be read;
                code -1 when reading failed although both groups were
                already populated.
        """
        try :
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open('./data.json', encoding='utf-8') as f:
                data = json.load(f)
            initial_data.__pub = initial_data._load_group(data, 'pub')
            initial_data.__pri = initial_data._load_group(data, 'pri')
        except Exception as e:
            if initial_data.__pub == []:
                raise StatusError(1, 'data.json文件未正确配置,缺少必要的公开/默认路径 pub 设置。', '配置文件错误')
            elif initial_data.__pri == []:
                initial_data.is_initial = True
                return Status(1, 'data.json文件正确配置,但缺少可选的私人路径 pri 设置。', '配置文件不完整')

            raise StatusError(-1, 'data.json文件读取出错。\n' + str(e.args), '未知系统错误')

        initial_data.is_initial = True
        return Status(0,'data.json文件读取成功,数据完整。', '配置文件完整读取')

    @staticmethod
    def get_url_list(classed_obj):
        """Return ``[base_url, get_way, push_way]`` for *classed_obj*.

        Raises:
            StatusError: code 1 when the object is private but no private
                settings were loaded.
        """
        if classed_obj.is_pri:
            if initial_data.__pri != []:
                return initial_data.__pri[0:3]
            raise StatusError(1, '未提供对应功能所需信息,请检查data.json文件是否正确进行私人路径 pri 设置。', '配置文件错误')
        return initial_data.__pub[0:3]

    @staticmethod
    def get_trans(is_pri):
        """Return ``(trans_local_way, trans_pull_way, trans_push_way)``.

        Raises:
            StatusError: code 1 when *is_pri* is true but no private settings
                were loaded.
        """
        if is_pri:
            if initial_data.__pri != []:
                return initial_data.__pri[3], initial_data.__pri[4], initial_data.__pri[5]
            raise StatusError(1, '未提供对应功能所需信息,请检查data.json文件是否正确进行私人路径 pri 设置。', '配置文件错误')
        return initial_data.__pub[3], initial_data.__pub[4], initial_data.__pub[5]
# ======================================================================
# main.py
# ======================================================================
from pathlib import Path
from PIL import Image
from bs4 import BeautifulSoup as bf

from status_raise import StatusError

from run_from_trans_station import run_from_trans_station
from initial_data import initial_data
from execute import execute
from do_git import git, proxy_start, proxy_stop


def _split_url(url):
    """Return ``(answer_url, p_url)`` for *url*; exactly one is non-empty.

    Raises:
        StatusError: code 2 when the second-to-last path segment is neither
            'answer' nor 'p'.
    """
    parts = url.split('/')
    kind = parts[-2] if len(parts) >= 2 else ''
    if kind == 'answer':
        return url, ''
    if kind == 'p':
        return '', url
    raise StatusError(2, '未正确输入信息', '用户输入不在预期内')


def collect():
    """Ask for one link, archive it, then upload the result with git."""
    url = input('请输入链接:')
    answer_url, p_url = _split_url(url)

    comment_need = input('是否需要附上评论:')
    if comment_need in ('y', 'Y', 'yes', 'Yes', 'YES'):
        print('暂不支持附上评论哦')
    # Comments are not supported yet, whatever the answer was.
    comment_need = False
    label = []
    status ,file_url,content = execute(p_url ,answer_url, label, comment_need)
    print('\n\n\n点击这里访问:\n\n\n', file_url)
    git(content)


def main():
    """Interactive entry point: initialise, show the menu, dispatch."""
    try:
        initial_data.initial()
    except StatusError:
        print('\n\n\n初始化失败')
        print('请先在同目录下正确配置data.json文件')
        print('格式如下:\n\n')
        print('{')
        print(' "pri_base_url": "x:/xx/xx", -----(OPTIONAL)')
        print(' "pri_push_way": "git push xxx xxx", -----(OPTIONAL)')
        print(' "pri_get_way": "https://raw.githubusercontent.com/your_user_name/file/main/" -----(OPTIONAL)' )
        print(' "pub_base_url": "x:/xx/xx",')
        print(' "pub_push_way": "git push xxx xxx", ')
        print(' "pub_get_way": "https://raw.githubusercontent.com/your_user_name/file/main/" -----(Your picturebed address)' )
        print('}\n\n')
        input('按任意键退出')
        # Actually leave: nothing below can work without a configuration.
        raise SystemExit(1)
    print('\n\n\n选择:\n')
    print('1. 从单个知乎链接获取;\n')
    print('2. 从中转站批量获取.\n\n\n')
    choice = input('请输入:')
    proxy_start()
    try:
        if choice == '1':
            collect()
        elif choice == '2':
            print('\n\n\n选择:\n')
            print('1. pri;\n')
            print('2. pub.\n\n\n')
            choice = input('请输入: ')
            # Accept the number shown in the menu as well as the keyword.
            if choice in ('1', 'pri'):
                run_from_trans_station(True)
            elif choice in ('2', 'pub'):
                run_from_trans_station(False)
            else :
                raise StatusError(2, '未正确输入信息', '用户输入不在预期内')
        else :
            raise StatusError(2, '未正确输入信息', '用户输入不在预期内')
    finally:
        # Stop the proxy even when the run ends with an error.
        proxy_stop()


if __name__ == '__main__':
    main()


# ======================================================================
# p.py
# ======================================================================
import requests
from bs4 import BeautifulSoup as bf
from initial_data import initial_data
from status_raise import StatusError, Status

# Characters replaced by "_" when a title is turned into a folder name.
# NOTE(review): the original replace-chain listed '?' twice; one entry was
# presumably a full-width question mark that did not survive extraction.
_UNSAFE_NAME_CHARS = str.maketrans({ch: '_' for ch in ' \\/?*:"<>|\''})


# Returns a status first (code 0 means success), followed by the page data.
def get_p_info(p_url,content):
    """Download a Zhihu column article and build the HTML to be archived.

    Args:
        p_url: Article URL; its last path segment becomes part of the folder.
        content: Classified object passed to ``initial_data.get_url_list``.

    Returns:
        ``(status, html, title, base_url, get_way)``.

    Raises:
        StatusError: code 0 when the anti-crawler verification page is served.
    """
    base_url, get_way, push_way = initial_data.get_url_list(content)
    res = requests.get(p_url).text
    # Shared parse for read-only lookups.
    page = bf(res, 'html.parser')

    if page.find_all('title', text="安全验证 - 知乎"):
        raise StatusError(0, '触发反爬了', '触发反爬了')

    p_name = page.select('.Post-Title')[0].text
    folder = p_name.translate(_UNSAFE_NAME_CHARS) + p_url.split('/')[-1] + '/'
    base_url = base_url + folder
    get_way = get_way + folder

    a = bf('', 'html.parser')
    # NOTE(review): the wrapper literal was stripped in the reviewed copy;
    # '<div></div>' is restored because `.div` is dereferenced right away.
    # Confirm any attributes it had.
    new_tag = bf('<div></div>', 'html.parser').div
    title = a.new_tag('h1')
    title.string = p_name
    address = a.new_tag('a')
    address['href'] = p_url
    new_tag.append(title)
    # Taken from its own parse: append() moves the node out of its tree, and
    # `page` must stay intact for the lookups that follow.
    new_tag.append(bf(res, 'html.parser').select('div[options]')[0])

    time_contents = page.select('.ContentItem-time')[0].contents
    print(time_contents)
    address.contents = [time_contents[0]]
    ip = a.new_tag('span')
    if len(time_contents) == 3:
        ip.contents = [time_contents[2]]
    # NOTE(review): the opening/closing wrapper literals around the footer
    # were stripped (only line breaks remained); reproduced as seen. Restore
    # the real tags from the original file.
    new_end = bf('\n' + str(address) + str(ip) + '\n', 'html.parser')
    new_tag.append(a.new_tag('link', rel="stylesheet", href="../style.css"))
    new_tag.append(new_end)
    p = str(new_tag)

    return Status(0, '一切正常', '一切正常'),p, p_name, base_url, get_way


# ======================================================================
# query_all_needs.py
# ======================================================================
from bs4 import BeautifulSoup as bf


def return_needs(res):
    """Return ``(img_need, video_need)`` for the HTML string *res*.

    ``img_need`` is true when the markup contains a <figure>; ``video_need``
    when it contains an <a> with class ``video-box``.
    """
    soup = bf(res, 'html.parser')
    img_need = bool(soup.find_all('figure'))
    video_need = bool(soup.find_all('a', class_='video-box'))
    return img_need, video_need


# ======================================================================
# run_from_trans_station.py
# ======================================================================
import json
from do_git import trans_pull, trans_push, git
from execute import execute
from initial_data import initial_data
from status_raise import StatusError


def run_from_trans_station(is_pri):
    """Archive every URL queued in the transfer repository's collection.json.

    The repository is pulled, each queued URL is archived, the results are
    uploaded, and only then is the queue emptied and pushed back, so a
    failure part-way through leaves the queued URLs in place.

    Args:
        is_pri: Use the private settings when true, the public ones otherwise.

    Raises:
        StatusError: code -3 when pulling or reading the queue fails; code -2
            when the emptied queue cannot be written; plus whatever
            ``execute`` raises.
    """
    trans_loacal_way, trans_pull_way, trans_push_way = initial_data.get_trans(is_pri)
    try:
        trans_pull(trans_loacal_way, trans_pull_way)
        collection_path = trans_loacal_way + '/collection.json'
        with open(collection_path,'r',encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        raise StatusError(-3, '获取远端url失败', e) from e
    print(data['url'])

    content = None
    for url in data['url']:
        parts = url.split('/')
        kind = parts[-2] if len(parts) >= 2 else ''
        if kind not in ('answer', 'p'):
            # Skip entries that are not answer/article links instead of
            # aborting the whole batch on them.
            print('\n\n\n', '未正确输入信息' + '\n\n\n')
            continue
        answer_url = url if kind == 'answer' else ''
        p_url = url if kind == 'p' else ''

        comment_need = False
        label = []
        status, file_url,content = execute(p_url ,answer_url, label, comment_need,stop=True)
        if status.code in (2, 3):
            print('\n\n\n', status.tips + '\n\n\n')

    # With an empty (or fully skipped) queue there is nothing to upload.
    if content is not None:
        git(content)
    input('\n完成,即将清空序列......\n')
    try:
        with open(collection_path,'w',encoding="utf-8") as f:
            json.dump({"url": []}, f, ensure_ascii=False)
    except OSError as e:
        raise StatusError(-2, '文件创建失败', e) from e
    trans_push(trans_loacal_way, trans_push_way)


# ======================================================================
# save_html.py
# ======================================================================
from status_raise import StatusError


def save_html(base_url,content_ed):
    """Write ``content_ed.content`` to ``index.html`` and return its path.

    Raises:
        StatusError: code -2 when the file cannot be written; the underlying
            error is attached as ``__cause__``.
    """
    target = base_url +'index.html'
    try:
        with open(target,'w',encoding='utf-8') as f:
            f.write(content_ed.content)
    except Exception as e:
        raise StatusError(-2, 'html保存失败', '文件创建失败') from e
    return target


# ======================================================================
# status_code.md  (documentation file; original text kept, English added)
# ======================================================================
# + Status
#   + code 0
#     + 一切正常  (everything is fine)
#   + code 1
#     + data.json 配置不完整  (data.json is incompletely configured)
#   + code 2
#     + 该文件已存在  (the file already exists)
#   + code 3
#     + 该文件已知识荒原  (the page has been removed)
#
# + StatusError
#   + code -1
#     + 文件读取错误  (file read error)
#   + code -2
#     + 文件创建失败  (file creation failed)
#   + code -3
#     + 文件下载失败  (file download failed)
#   + code 0
#     + 触发反爬了  (anti-crawler check triggered)
#   + code 1
#     + 未提供对应功能所需信息  (information required by the feature is missing)
#   + code 2
#     + 未正确输入信息  (invalid user input)
#   + code 3
#     + 知识荒原  (the page has been removed)
#   + code 404
#     + 程序bug  (program bug)


# ======================================================================
# status_raise.py
# ======================================================================
class StatusError(Exception):
    """Error carrying a numeric code, a user-facing hint and a raw response."""

    def __init__(self, code, tips, response):
        # Populate Exception.args so the error behaves like any other
        # exception (repr, pickling, logging).
        super().__init__(code, tips, response)
        self.code = code
        self.tips = tips
        self.response = response

    def __str__(self):
        # str() on tips keeps formatting from failing on a non-string hint.
        return '\n\n\ncode: ' + str(self.code) + '\n\ntips: ' + str(self.tips) + '\n\nresponse: ' + str(self.response) + '\n\n\n'


class Status(object):
    """Non-exceptional result with the same three fields as ``StatusError``."""

    def __init__(self, code, tips, response):
        self.code = code
        self.tips = tips
        self.response = response

    def show_status(self):
        """Return the three fields as a dict."""
        return {'code': self.code, 'tips': self.tips, 'response': self.response}


# ======================================================================
# video.py  (definition runs past the end of the reviewed excerpt; the
# visible part is reproduced unchanged)
# ======================================================================
import requests
import re
from bs4 import BeautifulSoup as bf
from status_raise import StatusError

class get_set_video(object):
    is_initial = False
    video_list = []
    video_name_index = 0
    base_url = ''
    get_way = ''

    def __init__(self, content_classed):
        if content_classed.is_classfied:
            get_set_video.base_url, get_set_video.get_way = content_classed.base_url, content_classed.get_way
            get_set_video.is_initial = True
            get_set_video.set_video_list(content_classed.content)
            self.content = get_set_video.change_video_attr(content_classed.content)
        else:
            raise StatusError(404, '传递了未进行前序必要操作的信息', '程序bug')

    # (the remainder of this class lies beyond the reviewed excerpt)