forked from xiaosimao/transfrom_zhihu_answer_to_md
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzhihu.py
165 lines (135 loc) · 5.65 KB
/
zhihu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-6-5
import os
import re
import json
import urllib
import urlparse
import requests
import html2text
from parse_content import parse
"""
just for study and fun
Talk is cheap
show me your code
"""
class ZhiHu(object):
    """Fetch Zhihu answers and store each one as a Markdown file.

    Two entry points are offered: ``get_all_answer_content`` walks every
    page of answers of a question, ``get_single_answer_content`` handles one
    answer URL.  Both hand the downloaded payload to ``parse`` (imported from
    ``parse_content``) and write the result with ``transform_to_markdown``.

    NOTE: the module targets Python 2 (``urlparse``, ``unicode``,
    ``urllib.urlretrieve``); the syntax used here is also valid on Python 3.
    """

    def __init__(self):
        pass

    def request(self, url, retry_times=10):
        """GET ``url`` and return the raw response body, retrying on errors.

        :param url: absolute URL to download.
        :param retry_times: maximum number of attempts.
        :return: the response body, or ``None`` when every attempt raised.
        """
        host = urlparse.urlparse(url).netloc
        header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            # Hard-coded token sent with every request.
            'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
            'Host': host,
        }
        times = 0
        while times < retry_times:
            times += 1
            print('request %s, times: %d' % (url, times))
            try:
                return requests.get(url, headers=header, timeout=10).content
            except Exception as e:
                # Deliberately broad: this is a best-effort retry loop, any
                # failure is reported and the next attempt is made.
                print(e)
        return None

    @staticmethod
    def _force_https(url):
        """Return ``url`` with a leading ``http://`` scheme upgraded to ``https://``.

        Only the scheme is touched, so URLs that are already ``https`` (or
        that contain ``http`` elsewhere) are returned unchanged.
        """
        if url.startswith('http://'):
            return 'https://' + url[len('http://'):]
        return url

    def get_all_answer_content(self, question_id, flag=2):
        """Download and convert every answer of a question.

        :param question_id: id of the question, interpolated into the API URL.
        :param flag: passed through unchanged to ``parse``.
        :raises ValueError: when a page could not be downloaded.
        """
        first_url_format = 'https://www.zhihu.com/api/v4/questions/{}/answers?include=data[*].content,voteup_count,created_time&offset=0&limit=20&sort_by=default'
        response = self.request(first_url_format.format(question_id))
        while True:
            if not response:
                raise ValueError('failed 10 times, quit......')
            contents = json.loads(response)
            paging = contents.get('paging')
            is_end = paging.get('is_end')
            # Parse the current page *before* deciding whether to stop, so
            # the answers on the last page are not skipped.
            for content in contents.get('data'):
                self.parse_content(content, flag)
            if is_end:
                break
            next_page_url = paging.get('next')
            if not next_page_url:
                # No link to follow even though the page is not flagged as
                # the last one; nothing more can be fetched.
                break
            response = self.request(self._force_https(next_page_url))

    def get_single_answer_content(self, answer_url, flag=1):
        """Download and convert one answer.

        :param answer_url: URL of the form
            ``https://www.zhihu.com/question/<id>/answer/<id>``.
        :param flag: passed through unchanged to ``parse``.
        :raises ValueError: when the URL has an unexpected shape or a
            download failed.
        """
        match = re.search(r'https://www\.zhihu\.com/question/(\d+)/answer/(\d+)', answer_url)
        if match is None:
            raise ValueError('not a zhihu answer url: %s' % answer_url)
        answer_id = match.group(2)

        html_content = self.request(answer_url)
        if not html_content:
            raise ValueError('failed 10 times, quit......')

        ajax_answer_url = 'https://www.zhihu.com/api/v4/answers/{}'.format(answer_id)
        ajax_content = self.request(ajax_answer_url)
        if not ajax_content:
            raise ValueError('failed 10 times, quit......')

        all_content = {
            'main_content': html_content,
            'ajax_content': json.loads(ajax_content),
        }
        self.parse_content(all_content, flag)

    def parse_content(self, content, flag=None):
        """Run ``parse`` on ``content`` and write the result as Markdown.

        ``parse`` is expected to return a mapping holding the keys read by
        ``transform_to_markdown``.
        """
        data = parse(content, flag)
        self.transform_to_markdown(data)

    @staticmethod
    def _clean_markdown(text):
        """Tidy the Markdown produced by html2text and return it.

        Strips the whitespace just inside ``**...**`` and ``_..._`` spans,
        drops empty ``_ _`` spans and rewrites ``_b.`` to ``_r.``.
        """
        # Bold spans (may cross line breaks).
        for inner in re.findall(r'\*\*(.*?)\*\*', text, re.S):
            if inner != " ":
                text = text.replace(inner, inner.strip())
        # Underscore-delimited spans (single line only).
        for inner in re.findall(r'_(.*)_', text):
            if inner != " ":
                text = text.replace(inner, inner.strip())
        text = text.replace('_ _', '')
        # NOTE(review): presumably swaps one image-size suffix in the image
        # URLs for another -- confirm against real answer content.
        text = text.replace('_b.', '_r.')
        return text

    def _localize_images(self, text):
        """Download the images referenced in ``text`` and point at the copies.

        Every ``![](url)`` tag gets a blank line appended; the image is saved
        under ``<cwd>/image`` and, when the download succeeds, the URL in the
        text is replaced by the local path.  Failed downloads are printed and
        the original URL is kept.
        """
        # De-duplicate while keeping order: str.replace already rewrites
        # every occurrence, so a repeated tag must only be handled once.
        image_tags = []
        for tag in re.findall(r'!\[\]\((?:.*?)\)', text):
            if tag not in image_tags:
                image_tags.append(tag)
        if not image_tags:
            return text

        image_dir = '%s/image' % (os.getcwd())
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)

        for tag in image_tags:
            text = text.replace(tag, tag + "\n\n")
            img_url = re.findall(r'\((.*)\)', tag)[0]
            save_name = img_url.split('/')[-1]
            pic_file_name = '%s/%s' % (image_dir, save_name)
            try:
                urllib.urlretrieve(img_url, pic_file_name)
            except Exception as e:
                # Best effort: keep the remote URL when the download fails.
                print(e)
            else:
                text = text.replace(img_url, pic_file_name)
        return text

    def transform_to_markdown(self, data):
        """Write one parsed answer to ``<question title>/<title>--<author>...md``.

        :param data: mapping with the keys ``content``, ``author_name``,
            ``answer_id``, ``question_id``, ``question_title``,
            ``vote_up_count`` and ``create_time``.
        """
        content = data['content']
        author_name = data['author_name'].encode('utf-8')
        answer_id = data['answer_id']
        question_id = data['question_id']
        question_title = data['question_title'].encode('utf-8')
        vote_up_count = data['vote_up_count']
        create_time = data['create_time']

        file_name = unicode('%s--%s的回答[%d].md' % (question_title, author_name, answer_id), 'utf-8')
        folder_name = unicode('%s' % (question_title), 'utf-8')
        folder_path = os.path.join(os.getcwd(), folder_name.encode('utf-8'))
        if not os.path.exists(folder_path):
            os.mkdir(folder_name)
        file_path = os.path.join(folder_path, file_name.encode('utf-8', 'ignore'))

        text = html2text.html2text(content.decode('utf-8')).encode("utf-8")
        text = self._clean_markdown(text)
        text = self._localize_images(text)

        origin_url = 'https://www.zhihu.com/question/{}/answer/{}'.format(question_id, answer_id)
        # ``with`` guarantees the file is closed even if a write fails.
        # Each header line is written separately so that byte strings and
        # unicode values are never concatenated into one expression.
        with open(file_path, "wt") as f:
            f.write("-" * 40 + "\n")
            f.write("## 本答案原始链接: " + origin_url + "\n")
            f.write("### question_title: " + question_title + "\n")
            f.write("### Author_Name: " + author_name + "\n")
            f.write("### Answer_ID: %d" % answer_id + "\n")
            f.write("### Question_ID: %d" % question_id + "\n")
            f.write("### VoteCount: %s" % vote_up_count + "\n")
            f.write("### Create_Time: " + create_time + "\n")
            f.write("-" * 40 + "\n")
            f.write(text)
if __name__ == '__main__':
    # Script entry point: dump every answer of one sample question.
    #
    # To convert a single answer instead, call
    #   ZhiHu().get_single_answer_content(
    #       'https://www.zhihu.com/question/27621722/answer/105331078')
    #
    # Another sample question id: 29470294
    sample_question = '24400664'
    crawler = ZhiHu()
    crawler.get_all_answer_content(sample_question)