Commit

git.

sduoooh committed Jul 27, 2023
0 parents commit 460fc37
Showing 18 changed files with 642 additions and 0 deletions.
32 changes: 32 additions & 0 deletions add_into_dir.py
@@ -0,0 +1,32 @@
import json
from status_raise import StatusError

def add_into_dir(base_url, url, name, show_content):
    try:
        data = ''
        index = 0
        file_name = ''
        with open(base_url + 'dir.json', 'r', encoding="utf-8") as f:
            data = json.load(f)
        with open(base_url + 'dir.json', 'w', encoding="utf-8") as f:
            new_file = {
                'index': len(data['dir']),
                # Strip characters that are unsafe in file names, then append the answer id from the URL.
                'file_name': name.replace(' ', '_').replace("\\", '_').replace('/', '_').replace('?', '_').replace('?', '_').replace('*', '_').replace(':', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_').replace('\'', '_') + url.split('/')[-1],
                'show_name': name,
                'show_content': show_content + '......'
            }
            index = new_file['index']
            file_name = new_file['file_name']
            data['dir'].append(new_file)
            json.dump(data, f, ensure_ascii=False)
        with open(base_url + 'map.json', 'r', encoding="utf-8") as f:
            data = json.load(f)
        with open(base_url + 'map.json', 'w', encoding="utf-8") as f:
            data['map'][file_name] = index
            json.dump(data, f, ensure_ascii=False)
    except Exception as e:
        raise StatusError(-3, '文件下载失败', e)
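
For reference, a rough sketch of the shapes of dir.json and map.json that this function maintains, inferred from the fields written above; every value here is illustrative, not taken from a real run.

# Illustrative index-file shapes only; keys mirror what add_into_dir writes.
sample_dir = {"dir": [{"index": 0,
                       "file_name": "Some_question_title12345678",  # sanitised title + answer id
                       "show_name": "Some question title",
                       "show_content": "first twenty characters......"}]}
sample_map = {"map": {"Some_question_title12345678": 0}}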
39 changes: 39 additions & 0 deletions answer.py
@@ -0,0 +1,39 @@
import requests
import re
from bs4 import BeautifulSoup as bf
from initial_data import initial_data
from status_raise import StatusError, Status

# Fetches the question name and the answer content, and tidies up the answer markup along the way.

def get_answer_info(url, content):
    base_url, get_way, push_way = initial_data.get_url_list(content)
    response = requests.request("GET", url)
    page = bf(response.text, 'html.parser')

    if page.find_all('title', text="安全验证 - 知乎") != []:
        raise StatusError(0, '触发反爬了', '触发反爬了')
    if page.find_all('div', class_='ErrorPage') != []:
        return Status(3, '知识荒原了', '知识荒原了'), '', '', '', ''
    question_info_src = page.find('div', class_='QuestionPage')
    # question_id = question_info_src.find('meta', itemprop='url').get('content').split('/')[-1]
    question_name = question_info_src.find('meta', itemprop='name').get('content')
    # Build the local save path and the published path from the sanitised question title plus the answer id.
    safe_name = question_name.replace(' ', '_').replace("\\", '_').replace('/', '_').replace('?', '_').replace('?', '_').replace('*', '_').replace(':', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_').replace('\'', '_')
    base_url = base_url + safe_name + url.split('/')[-1] + '/'
    get_way = get_way + safe_name + url.split('/')[-1] + '/'
    # Cut the action bar off the rich-text block, then rebuild a minimal standalone page.
    answer = str(question_info_src.find('div', class_="RichContent--unescapable"))
    answer = re.sub(r'(<div class="ContentItem-actions RichContent-actions">.*$)', '</div>', answer, 0, flags=re.MULTILINE)
    a = bf('<div class="CollectionItem"><div>', 'html.parser')
    inner_tag = a.new_tag('')
    title = a.new_tag('h1')
    title.string = question_name
    inner_tag.append(title)
    inner_tag.append(bf(answer, 'html.parser').select('span[itemprop]')[0])
    b = bf(answer, 'html.parser').select('.ContentItem-time')[0]
    b.find_all('a')[0]['href'] = url
    inner_tag.append(a.new_tag('link', rel="stylesheet", href="../style.css"))
    a.div.contents = inner_tag.contents
    a.div.append(b)
    answer = str(a)

    return Status(0, '一切正常', '一切正常'), answer, question_name, base_url, get_way
32 changes: 32 additions & 0 deletions classify.py
@@ -0,0 +1,32 @@
from initial_data import initial_data
from query_all_needs import return_needs
from status_raise import StatusError


class Classifier(object):
    is_classfied = False
    is_pri = False

    def __init__(self, labels):
        if not initial_data.is_initial:
            raise StatusError(404, '传递未进行必需的前序操作的信息', '')
        self.labels = labels
        # Labels containing '私人' mark the content as private and switch to the private paths.
        self.is_pri = classify(self.labels)
        self.is_classfied = True
        self.base_url, self.get_way, self.push_way = initial_data.get_url_list(self)

    def set_content(self, content, base_url, get_way):
        self.content = content
        self.base_url = base_url
        self.get_way = get_way
        self.img_need, self.video_need = return_needs(self.content)


def classify(labels):
    return '私人' in labels

29 changes: 29 additions & 0 deletions dir.py
@@ -0,0 +1,29 @@
from pathlib import Path
from status_raise import StatusError, Status

def create_dir(img_need, comment_need, video_need, base_url, stop=False):
    if img_need:
        img_dir = Path(base_url + 'img')
    if video_need:
        video_dir = Path(base_url + 'video')
    if comment_need:
        text_dir = Path(base_url + 'text')
        comment_dir = Path(base_url + 'text/' + 'comment')
        child_comment_dir = Path(base_url + 'text/' + 'child_comment')
    try:
        Path(base_url).mkdir()
        if img_need:
            img_dir.mkdir()
        if video_need:
            video_dir.mkdir()
        if comment_need:
            text_dir.mkdir()
            comment_dir.mkdir()
            child_comment_dir.mkdir()
        return Status(0, '目录创建成功', '目录创建成功')
    except FileExistsError:
        if stop:
            return Status(2, base_url.split('/')[-2] + '已存在', FileExistsError)
        else:
            raise StatusError(-2, '目录创建失败,该目录已存在', FileExistsError)
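
A minimal usage sketch for create_dir, assuming a writable relative path './demo/'; the path and flag values are illustrative.

from dir import create_dir

# With stop=True an existing target directory yields Status(2, ...) instead of raising StatusError.
status = create_dir(img_need=True, comment_need=False, video_need=False,
                    base_url='./demo/', stop=True)
print(status.code)  # 0 on the first run, 2 if './demo/' already exists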

46 changes: 46 additions & 0 deletions do_git.py
@@ -0,0 +1,46 @@
import os
from status_raise import StatusError
from initial_data import initial_data

# Commits the saved pages and pushes them to GitHub with the configured push command.
def git(content):
    base_url, get_way, push_way = initial_data.get_url_list(content)
    if push_way is None:
        print('未配置git命令')
        return
    print('正在上传到github...')
    os.chdir(base_url)
    os.system('git init')
    os.system('git add -A')
    os.system('git commit -m "update"')
    os.system('git config http.sslVerify "false"')
    os.system(push_way)
    print('上传成功')

def trans_pull(local_way, pull_way):
    if local_way is None:
        raise StatusError(1, '未配置git命令', '配置文件错误')
    print('正在从远程仓库拉取...')
    print('cd ' + local_way)
    os.chdir(local_way)
    os.system('git config http.sslVerify "false"')
    print(pull_way)
    os.system(pull_way)
    print('拉取成功')

def trans_push(local_way, push_way):
    if local_way is None:
        raise StatusError(1, '未配置git命令', '配置文件错误')
    print('正在往远程仓库推送...')
    os.chdir(local_way)
    os.system('git add -A')
    os.system('git commit -m "already"')
    os.system(push_way)
    print('推送成功')

def proxy_start():
    os.system("start C:/Users/1/Desktop/工具/fastgithub_win-x64/fastgithub.exe")
    return

def proxy_stop():
    os.system('runas /user:administrator "taskkill /f /t /im fastgithub.exe"')
    return
45 changes: 45 additions & 0 deletions execute.py
@@ -0,0 +1,45 @@
from status_raise import StatusError, Status

from initial_data import initial_data
from answer import get_answer_info
from p import get_p_info
from dir import create_dir
from classify import Classifier
from img import get_set_img
from video import get_set_video
from save_html import save_html
from get_text import get_all_text
from add_into_dir import add_into_dir

def execute(p_url, answer_url, label, comment_need=False, stop=False):
    url = ''
    content = Classifier(label)
    # A post URL takes priority over an answer URL.
    if p_url == '':
        url = answer_url
        status, got_content, name, base_url, get_way = get_answer_info(answer_url, content)
    else:
        url = p_url
        status, got_content, name, base_url, get_way = get_p_info(p_url, content)
    if status.code == 3:
        if stop:
            return status, '', content
        else:
            raise StatusError(3, '知识荒原了', '知识荒原了')
    content.set_content(got_content, base_url, get_way)
    img_need, video_need = content.img_need, content.video_need
    dir_status = create_dir(img_need, comment_need, video_need, base_url, stop)
    if dir_status.code == 2:
        return dir_status, '', content
    if img_need:
        content.content = get_set_img(content).content
    if video_need:
        content.content = get_set_video(content).content
    show_content = get_all_text(content, base_url)
    add_into_dir(initial_data.get_url_list(content)[0], url, name, show_content)
    file = 'file://' + save_html(base_url, content)
    return Status(0, '一切正常', '一切正常'), file, content
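
A minimal end-to-end sketch, assuming data.json is configured and initial_data.initial() has been called first; the answer URL below is a placeholder, not a real one.

from initial_data import initial_data
from execute import execute

initial_data.initial()  # must run before a Classifier can be built
status, file_url, content = execute(
    p_url='',
    answer_url='https://www.zhihu.com/question/123/answer/456',  # placeholder URL
    label=[],
    comment_need=False,
    stop=True)
print(status.code, file_url)  # file:// path of the saved page on success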
33 changes: 33 additions & 0 deletions get_all_answers.py
@@ -0,0 +1,33 @@
import requests

# These two functions fetch the other answers under a question via Zhihu's paginated feed API.

def get_answers_respones(question_id):
    global next_page_url
    global is_end

    url = 'https://www.zhihu.com/api/v4/questions/' + question_id + '/feeds?include=%2Ccontent&limit=40'
    response = requests.request("GET", url).json()
    next_page_url = response['paging']['next']
    is_end = response['paging']['is_end']
    return response

def get_answers_content(frist_response, answer_id):
    global next_page_url
    global is_end
    # Scan the first page, then keep following the 'next' links until the answer is found or the feed ends.
    for i in frist_response['data']:
        if str(i['target']['id']) == answer_id:
            return i['target']['content']
        else:
            next_page_url = frist_response['paging']['next']
            is_end = frist_response['paging']['is_end']
            continue
    while not is_end:
        response = requests.request("GET", next_page_url).json()
        for i in response['data']:
            if str(i['target']['id']) == answer_id:
                return i['target']['content']
            else:
                next_page_url = response['paging']['next']
                is_end = response['paging']['is_end']
                continue
    return None
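
A small usage sketch, assuming a question id and an answer id taken from a Zhihu URL; both ids below are placeholders.

from get_all_answers import get_answers_respones, get_answers_content

first_page = get_answers_respones('123456')            # question id (placeholder)
content = get_answers_content(first_page, '7890123')   # answer id (placeholder)
if content is None:
    print('answer not found under this question')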
17 changes: 17 additions & 0 deletions get_text.py
@@ -0,0 +1,17 @@
from bs4 import BeautifulSoup as bf
from status_raise import StatusError

def get_all_text(res, base_url):
    content = bf(res.content, 'html.parser').find_all(class_='css-1g0fqss', options="[object Object]")[0]
    show_content = ''
    try:
        # Save the plain text and keep the first 20 characters as a preview.
        with open(base_url + "text.txt", 'w', encoding='utf-8') as f:
            text = content.get_text()
            show_content = text[0:20]
            f.write(text)
    except Exception as e:
        raise StatusError(-2, '纯文本写入失败', e)

    return show_content

53 changes: 53 additions & 0 deletions img.py
@@ -0,0 +1,53 @@
import requests
import re
from bs4 import BeautifulSoup as bf
from status_raise import StatusError, Status

class get_set_img(object):
    is_initial = False
    img_list = []
    img_name_index = 0
    base_url = ''
    get_way = ''

    def __init__(self, content_classed):
        if content_classed.is_classfied:
            get_set_img.base_url, get_set_img.get_way = content_classed.base_url, content_classed.get_way
            get_set_img.is_initial = True
            get_set_img.set_img_list(content_classed.content)
            self.content = get_set_img.change_img_attr(content_classed.content)
        else:
            raise StatusError(404, '传递了未进行前序必要操作的信息', '程序bug')

    # Build the list of replacement <img> tags.
    def set_img_list(res):
        for i in bf(res, 'html.parser').find_all('figure'):
            img_src = get_set_img.get_img_url(i.find('img')['src'])
            print(i.find('img'))
            img_height = i.find('img')['data-rawheight']
            img_width = i.find('img')['data-rawwidth']
            img_aspect_ratio = int(img_width) / int(img_height)
            img_att = '<img loading="lazy" onerror="this.src=\'' + get_set_img.get_way + img_src + '\';this.onerror=null;" src="' + './' + img_src + '" aspect-ratio ="' + str(img_aspect_ratio) + '">'
            get_set_img.img_list.append(img_att)

    # Download the image and return its relative path.
    def get_img_url(img_url):
        img_url = img_url.replace('720w.jpg?', '1440w.jpg?')

        try:
            with open(get_set_img.base_url + 'img/' + str(get_set_img.img_name_index) + '.jpg', 'wb') as f:
                f.write(requests.get(img_url).content)
        except Exception as e:
            raise StatusError(-3, '文件下载失败', e)
        get_set_img.img_name_index += 1
        return 'img/' + str(get_set_img.img_name_index - 1) + '.jpg'

    # Replace each lazy-loaded <figure> with a plain <img> tag.
    def change_img_attr(content):
        def replace_img(m):
            return get_set_img.img_list.pop(0)
        content = re.sub(r"(<figure[^>]*>.*?<\/figure>)", replace_img, content, 0, flags=re.MULTILINE)
        return content

46 changes: 46 additions & 0 deletions initial_data.py
@@ -0,0 +1,46 @@
import json
from status_raise import StatusError, Status

class initial_data(object):
    is_initial = False
    __pri = []
    __pub = []

    def initial():
        try:
            with open('./data.json') as f:
                data = json.load(f)
            initial_data.__pub = [data['pub_base_url'], data['pub_get_way'], data['pub_push_way'], data['pub_trans_local_way'], data['pub_trans_pull_way'], data['pub_trans_push_way']]
            initial_data.__pri = [data['pri_base_url'], data['pri_get_way'], data['pri_push_way'], data['pri_trans_local_way'], data['pri_trans_pull_way'], data['pri_trans_push_way']]
        except Exception as e:
            if initial_data.__pub == []:
                raise StatusError(1, 'data.json文件未正确配置,缺少必要的公开/默认路径 pub 设置。', '配置文件错误')
            elif initial_data.__pri == []:
                initial_data.is_initial = True
                return Status(1, 'data.json文件正确配置,但缺少可选的私人路径 pri 设置。', '配置文件不完整')

            raise StatusError(-1, 'data.json文件读取出错。\n' + str(e.args), '未知系统错误')

        initial_data.is_initial = True
        return Status(0, 'data.json文件读取成功,数据完整。', '配置文件完整读取')

    def get_url_list(classed_obj):
        if classed_obj.is_pri:
            if initial_data.__pri != []:
                return initial_data.__pri[0:3]
            else:
                raise StatusError(1, '未提供对应功能所需信息,请检查data.json文件是否正确进行私人路径 pri 设置。', '配置文件错误')
        else:
            return initial_data.__pub[0:3]

    def get_trans(is_pri):
        if is_pri:
            if initial_data.__pri != []:
                return initial_data.__pri[3], initial_data.__pri[4], initial_data.__pri[5]
            else:
                raise StatusError(1, '未提供对应功能所需信息,请检查data.json文件是否正确进行私人路径 pri 设置。', '配置文件错误')
        else:
            return initial_data.__pub[3], initial_data.__pub[4], initial_data.__pub[5]
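
For reference, a template of the keys initial() reads from data.json; only the key names come from the code above, every value below is a placeholder, and the six pri_* keys (mirroring the pub_* keys) are optional.

import json

template = {
    "pub_base_url": "D:/zhihu_archive/pub/",           # local save root (placeholder)
    "pub_get_way": "https://example.invalid/pub/",     # published base URL (placeholder)
    "pub_push_way": "git push pub main",               # push command (placeholder)
    "pub_trans_local_way": "D:/zhihu_archive/trans/",  # placeholder
    "pub_trans_pull_way": "git pull trans main",       # placeholder
    "pub_trans_push_way": "git push trans main",       # placeholder
    # Optional: pri_base_url, pri_get_way, pri_push_way,
    # pri_trans_local_way, pri_trans_pull_way, pri_trans_push_way
}
with open('./data.json', 'w', encoding='utf-8') as f:
    json.dump(template, f, ensure_ascii=False, indent=2)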

