git.

sduoooh · Jul 27, 2023 · 460fc37 · 460fc37
commit 460fc37
Show file tree

Hide file tree

Showing 18 changed files with 642 additions and 0 deletions.
diff --git a/add_into_dir.py b/add_into_dir.py
@@ -0,0 +1,32 @@
+import json
+from status_raise import StatusError
+
+def add_into_dir(base_url, url, name,show_content):
+    try:    
+            data = ''
+            index = 0
+            file_name = ''
+            with open(base_url + 'dir.json','r',encoding="utf-8") as f:
+                data = json.load(f)
+                f.close()
+            with open(base_url + 'dir.json','w',encoding="utf-8") as f:
+                new_file = {
+                      'index': len(data['dir']),
+                      'file_name' : name.replace(' ', '_').replace("\\", '_').replace('/', '_').replace('？','_').replace('?', '_').replace('*', '_').replace(':', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_').replace('\'', '_') + url.split('/')[-1],
+                      'show_name': name,
+                      'show_content':  show_content + '......'        
+                }
+                index = new_file['index']
+                file_name = new_file['file_name']
+                data['dir'].append(new_file)
+                json.dump(data,f,ensure_ascii=False)
+                f.close()
+            with open(base_url + 'map.json','r',encoding="utf-8") as f:
+                data = json.load(f)
+                f.close()
+            with open(base_url + 'map.json','w',encoding="utf-8") as f:
+                data['map'][file_name] = index
+                json.dump(data,f,ensure_ascii=False)
+                f.close()
+    except Exception as e:
+            raise StatusError(-3, '文件下载失败', e)
diff --git a/answer.py b/answer.py
@@ -0,0 +1,39 @@
+import requests
+import re
+from bs4 import BeautifulSoup as bf
+from initial_data import initial_data
+from status_raise import StatusError, Status
+
+# 拿问题名字、id，和回答id、内容的，顺便处理了回答的格式
+
+def get_answer_info(url,content):
+    base_url, get_way, push_way = initial_data.get_url_list(content)
+    response = requests.request("GET", url)
+
+    if bf(response.text, 'html.parser').find_all('title', text="安全验证 - 知乎") != []: 
+        raise StatusError(0, '触发反爬了', '触发反爬了')
+    if bf(response.text, 'html.parser').find_all('div', class_='ErrorPage') != []: 
+        return Status(3, '知识荒原了', '知识荒原了'), '', '', '', ''
+    question_info_src = bf(response.text, 'html.parser').find('div', class_='QuestionPage')
+    #question_id = question_info_src.find('meta', itemprop='url').get('content').split('/')[-1]
+    question_name = question_info_src.find('meta', itemprop='name').get('content')
+    base_url = base_url + question_name.replace(' ', '_').replace("\\", '_').replace('/', '_').replace('？','_').replace('?', '_').replace('*', '_').replace(':', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_').replace('\'', '_')
+    base_url = base_url + url.split('/')[-1] + '/'
+    get_way = get_way + question_name.replace(' ', '_').replace("\\", '_').replace('/', '_').replace('？','_').replace('?', '_').replace('*', '_').replace(':', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_').replace('\'', '_')
+    get_way = get_way + url.split('/')[-1] + '/'
+    answer = str(question_info_src.find('div', class_="RichContent--unescapable"))
+    answer = re.sub(r'(<div class="ContentItem-actions RichContent-actions">.*$)', '</div>', answer, 0,  flags=re.MULTILINE)
+    a = bf('<div class="CollectionItem"><div>', 'html.parser')
+    inner_tag = a.new_tag('')
+    title = a.new_tag('h1')
+    title.string = question_name
+    inner_tag.append(title)
+    inner_tag.append(bf(answer, 'html.parser').select('span[itemprop]')[0])
+    b = bf(answer, 'html.parser').select('.ContentItem-time')[0]
+    b.find_all('a')[0]['href'] = url
+    inner_tag.append(a.new_tag('link', rel="stylesheet", href="../style.css"))
+    a.div.contents= inner_tag.contents
+    a.div.append(b)
+    answer = str(a)
+
+    return Status(0, '一切正常', '一切正常'),answer, question_name, base_url, get_way
diff --git a/classify.py b/classify.py
@@ -0,0 +1,32 @@
+from initial_data import initial_data
+from query_all_needs import return_needs
+from status_raise import StatusError
+
+
+class Classifier(object):
+    is_classfied = False
+    is_pri = False
+    def __init__(self, labels):
+        if not initial_data.is_initial:
+            raise StatusError(404, '传递未进行必需的前序操作的信息','')
+        self.labels = labels
+        self.is_pri = classify(self.labels)
+        self.is_classfied = True
+        self.base_url, self.get_way, self.push_way = initial_data.get_url_list(self)
+
+    def set_content(self, content, base_url, get_way):
+        self.content = content
+        self.base_url = base_url
+        self.get_way = get_way
+        self.img_need,self.video_need = return_needs(self.content)
+        pass
+
+
+
+
+def classify(labels):
+    if '私人' in labels:
+        return True
+    else :
+        return False
+
diff --git a/dir.py b/dir.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+from status_raise import StatusError,Status
+
+def create_dir(img_need, comment_need, video_need, base_url, stop = False):
+    if img_need:
+        img_dir = Path(base_url + 'img')
+    if video_need:
+        video_dir = Path(base_url + 'video')
+    if comment_need:
+        text_dir = Path(base_url + 'text')
+        comment_dir = Path(base_url + 'text/' + 'comment')
+        child_comment_dir = Path(base_url + 'text/' + 'child_comment')
+    try :
+        Path(base_url).mkdir()
+        if img_need:
+            img_dir.mkdir()
+        if video_need:
+            video_dir.mkdir()
+        if comment_need:
+            text_dir.mkdir()
+            comment_dir.mkdir()
+            child_comment_dir.mkdir()
+        return Status(0, '目录创建成功', '目录创建成功')
+    except FileExistsError:
+        if stop:
+            return Status(2, base_url.split('/')[-2] + '已存在', FileExistsError)
+        else :
+            raise StatusError(-2, '目录创建失败，该目录已存在', FileExistsError)
+
diff --git a/do_git.py b/do_git.py
@@ -0,0 +1,46 @@
+import os
+from status_raise import StatusError
+from initial_data import initial_data
+
+def git(content):
+    base_url, get_way, push_way = initial_data.get_url_list(content)
+    if push_way == None:
+        print('未配置git命令')
+        return
+    print('正在上传到github...')
+    os.chdir(base_url)
+    os.system('git init')
+    os.system('git add -A')
+    os.system('git commit -m "update"')
+    os.system('git config http.sslVerify "false"')
+    os.system(push_way)
+    print('上传成功')
+
+def trans_pull(local_way, pull_way):
+    if local_way == None:
+        raise StatusError(1, '未配置git命令', '配置文件错误')
+    print('正在从远程仓库拉取...')
+    print ('cd ' + local_way )
+    os.chdir(local_way)
+    os.system('git config http.sslVerify "false"')
+    print(pull_way)
+    os.system(pull_way)
+    print('拉取成功')
+
+def trans_push(local_way, push_way):
+    if local_way == None:
+        raise StatusError(1, '未配置git命令', '配置文件错误')
+    print('正在往远程仓库推送...')
+    os.chdir(local_way)
+    os.system('git add -A')
+    os.system('git commit -m "already"')
+    os.system(push_way)
+    print('推送成功')
+
+def proxy_start():
+    os.system("start C:/Users/1/Desktop/工具/fastgithub_win-x64/fastgithub.exe")
+    return
+
+def proxy_stop():
+    os.system('runas /user:administrator "taskkill /f /t /im fastgithub.exe"')
+    return
diff --git a/execute.py b/execute.py
@@ -0,0 +1,45 @@
+from pathlib import Path
+from PIL import Image
+from bs4 import BeautifulSoup as bf
+
+from status_raise import StatusError,Status
+
+from initial_data import initial_data
+from answer import get_answer_info
+from p import get_p_info
+from dir import create_dir
+from classify import Classifier
+from img import get_set_img
+from video import get_set_video
+from save_html import save_html
+from get_text import get_all_text
+from add_into_dir import add_into_dir
+
+def execute(p_url, answer_url, label,comment_need = False, stop = False):
+    label= []
+    url = ''
+    content = Classifier(label)
+    if p_url == '':
+        url = answer_url
+        status, got_content, name, base_url, get_way = get_answer_info(answer_url,content)
+    else:
+        url = p_url
+        status ,got_content, name, base_url, get_way = get_p_info(p_url,content)
+    if status.code == 3:
+        if stop:
+            return status, '',content
+        else :
+            raise StatusError(3, '知识荒原了', '知识荒原了')
+    content.set_content(got_content, base_url, get_way)
+    img_need, video_need = content.img_need, content.video_need
+    dir_status = create_dir(img_need,comment_need,video_need,base_url, stop)
+    if dir_status.code == 2:
+        return dir_status, '',content
+    if img_need:
+        content.content = get_set_img(content).content
+    if video_need:
+        content.content = get_set_video(content).content
+    show_content =  get_all_text(content,base_url)
+    add_into_dir(initial_data.get_url_list(content)[0],url,name,show_content)
+    file = 'file://' + save_html(base_url, content)
+    return Status(0, '一切正常', '一切正常'),file,content
diff --git a/get_all_answers.py b/get_all_answers.py
@@ -0,0 +1,33 @@
+import requests
+
+# 这俩是用来拿问题下其他回答的
+def get_answers_respones(question_id):
+    global next_page_url
+    global is_end
+
+    url = 'https://www.zhihu.com/api/v4/questions/'+ question_id +'/feeds?include=%2Ccontent&limit=40'
+    response = requests.request("GET", url).json()
+    next_page_url = response['paging']['next']
+    is_end = response['paging']['is_end']
+    return response
+
+def get_answers_content(frist_response ,answer_id):
+    global next_page_url
+    global is_end
+    for i in frist_response['data']:
+        if str(i['target']['id']) == answer_id:
+            return i['target']['content']
+        else :
+            next_page_url = frist_response['paging']['next']
+            is_end = frist_response['paging']['is_end']
+            continue
+    while not is_end:
+        response = requests.request("GET", next_page_url).json()
+        for i in response['data']:
+            if str(i['target']['id']) == answer_id:
+                return i['target']['content']
+            else :
+                next_page_url = response['paging']['next']
+                is_end = response['paging']['is_end']
+                continue
+    return None
diff --git a/get_text.py b/get_text.py
@@ -0,0 +1,17 @@
+from bs4 import BeautifulSoup as bf
+from status_raise import StatusError
+
+def get_all_text(res, base_url):
+    content = bf(res.content, 'html.parser').find_all(class_='css-1g0fqss', options="[object Object]")[0]
+    show_content = ''
+    try:
+        with open(base_url + "text.txt",'w' ,encoding='utf-8') as f:
+                text = content.get_text()
+                show_content = text[0:20]
+                f.write(text)
+                f.close()
+    except Exception as e:
+        raise StatusError(-2, '纯文本写入失败', e)
+
+    return show_content
+
diff --git a/img.py b/img.py
@@ -0,0 +1,53 @@
+import requests
+import re
+from bs4 import BeautifulSoup as bf
+from status_raise import StatusError, Status
+
+class get_set_img(object): 
+    is_initial = False
+    img_list = []
+    img_name_index = 0
+    base_url = ''
+    get_way = ''
+
+    def __init__(self, content_classed):
+        if content_classed.is_classfied:
+            get_set_img.base_url, get_set_img.get_way = content_classed.base_url, content_classed.get_way
+            get_set_img.is_initial = True
+            get_set_img.set_img_list(content_classed.content)
+            self.content = get_set_img.change_img_attr(content_classed.content)
+        else:
+            raise StatusError(404, '传递了未进行前序必要操作的信息', '程序bug')
+
+
+    # 设置<img>的列表
+    def set_img_list(res):
+        for i in bf(res, 'html.parser').find_all('figure'):
+            img_src = get_set_img.get_img_url(i.find('img')['src'])
+            print(i.find('img'))
+            img_height = i.find('img')['data-rawheight']
+            img_width = i.find('img')['data-rawwidth']
+            img_aspect_ratio = int(img_width) / int(img_height) 
+            img_att = '<img loading="lazy" onerror="this.src=\'' + get_set_img.get_way + img_src + '\';this.onerror=null;" src="' + './' + img_src + '" aspect-ratio ="' + str(img_aspect_ratio) + '">'
+            get_set_img.img_list.append(img_att)
+
+    # 下载图片，返回图片的相对路径
+    def get_img_url(img_url):
+        img_url = img_url.replace('720w.jpg?','1440w.jpg?')
+
+        try:
+            with open(get_set_img.base_url +'img/' + str(get_set_img.img_name_index) + '.jpg','wb') as f:
+                f.write(requests.get(img_url).content)
+                f.close()
+        except Exception as e:
+            raise StatusError(-3, '文件下载失败', e)
+        get_set_img.img_name_index += 1
+        return 'img/' + str(get_set_img.img_name_index - 1) + '.jpg'
+
+    # 把懒加载的<figure>替换成<img>
+    def change_img_attr(content):
+        def replace_img(m):
+            return get_set_img.img_list.pop(0)
+        content =  re.sub(r"(<figure[^>]*>.*?<\/figure>)", replace_img, (content), 0,  flags=re.MULTILINE)
+        return content
+
diff --git a/initial_data.py b/initial_data.py
@@ -0,0 +1,46 @@
+import json
+from status_raise import StatusError, Status
+
+class initial_data(object):
+    is_initial = False
+    __pri = []
+    __pub = []
+
+    def initial():
+        try : 
+            with open('./data.json') as f:
+                data = json.load(f)
+                f.close()
+                initial_data.__pub = [data['pub_base_url'], data['pub_get_way'], data['pub_push_way'], data['pub_trans_local_way'], data['pub_trans_pull_way'], data['pub_trans_push_way']]
+                initial_data.__pri = [data['pri_base_url'], data['pri_get_way'], data['pri_push_way'], data['pri_trans_local_way'], data['pri_trans_pull_way'], data['pri_trans_push_way']]     
+        except Exception as e:
+            if initial_data.__pub == []:
+                raise StatusError(1, 'data.json文件未正确配置，缺少必要的公开/默认路径 pub 设置。', '配置文件错误')
+            elif initial_data.__pri == []:
+                initial_data.is_initial = True
+                return Status(1, 'data.json文件正确配置，但缺少可选的私人路径 pri 设置。', '配置文件不完整')
+
+            raise StatusError(-1, 'data.json文件读取出错。\n' + str(e.args), '未知系统错误')
+
+        initial_data.is_initial = True
+        return Status(0,'data.json文件读取成功，数据完整。', '配置文件完整读取')
+
+    def get_url_list(classed_obj):
+        if classed_obj.is_pri:
+            if initial_data.__pri != []:
+                return initial_data.__pri[0:3]
+            else: 
+                raise StatusError(1, '未提供对应功能所需信息，请检查data.json文件是否正确进行私人路径 pri 设置。', '配置文件错误')
+        else :
+            return initial_data.__pub[0:3]
+
+    def get_trans(is_pri):
+        if is_pri:
+            if initial_data.__pri != []:
+                return initial_data.__pri[3], initial_data.__pri[4], initial_data.__pri[5]
+            else: 
+                raise StatusError(1, '未提供对应功能所需信息，请检查data.json文件是否正确进行私人路径 pri 设置。', '配置文件错误')
+        else:
+            return initial_data.__pub[3], initial_data.__pub[4], initial_data.__pub[5]
+
+