Skip to content

Commit

Permalink
Add/get hot api / gzh_info => gzh (#109)
Browse files Browse the repository at this point in the history
* add hot api and doc

* test and gzh_info => gzh

* fix readme name
  • Loading branch information
chyroc authored Jul 29, 2017
1 parent 3574b89 commit 303f92f
Show file tree
Hide file tree
Showing 8 changed files with 246 additions and 12 deletions.
54 changes: 52 additions & 2 deletions readme.md → README.md
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ Out[1]:
},
...
],
'gzh_info': {
'gzh': {
'authentication': '南京航空航天大学',
'headimage': 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM4xV5PgPjK5XoPaaQoxnWJAFicibMvPAnsoybawMBFxua1g/0',
'introduction': '南航大志愿活动的领跑者,为你提供校内外的志愿资源和精彩消息。',
Expand All @@ -269,7 +269,7 @@ Out[1]:
- 数据结构
```python
{
'gzh_info': {
'gzh': {
'wechat_name': '', # 名称
'wechat_id': '', # 微信id
'introduction': '', # 简介
Expand Down Expand Up @@ -297,6 +297,56 @@ Out[1]:

```

### 解析 首页热门 页 - get_gzh_artilce_by_hot

![ws_api.get_gzh_artilce_by_hot(WechatSogouConst.hot_index.food)](https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_artilce_by_hot.png)

- 使用
```
In [1]: from pprint import pprint
...: from wechatsogou import WechatSogouAPI, WechatSogouConst
...:
...: ws_api = WechatSogouAPI()
...: gzh_articles = ws_api.get_gzh_artilce_by_hot(WechatSogouConst.hot_index.food)
...: for i in gzh_articles:
...: pprint(i)
...:
{
'article': {
'abstract': '闷热的夏天有什么事情能比吃上凉凉的甜品更惬意的呢?快一起动手做起来吧,简单方便,放冰箱冻一冻,那感觉~橙汁蒸木瓜木瓜1个(300-400克左右),橙子4个,枫糖浆20克(如果家里没有,也可以用蜂蜜、炼乳等代替),椰果适量。做法1.用削皮',
'main_img': 'http://img01.sogoucdn.com/net/a/04/link?appid=100520033&url=http%3A%2F%2Fmmbiz.qpic.cn%2Fmmbiz_jpg%2Fw9UGwFPia7QTUIadPibgW8OFkqf1ibR40xicKfzofRS0sDpaFp3CG0jkPyQKeXl44TXswztW1SJnic7tmCibjB8rIIGw%2F0%3Fwx_fmt%3Djpeg',
'open_id': 'oIWsFty9hHVI9F10amtzx5TOWIq8',
'time': 1501325220,
'title': '夏日甜品制作方法,不收藏后悔哦!',
'url': 'http://mp.weixin.qq.com/s?src=3&timestamp=1501328525&ver=1&signature=n9*oX0k4YbNFhNMsOjIekYrsha44lfBSCbG9jicAbGYrWNN8*48NzpcaHdxwUnC12syY5-ZxwcBfiJlMzdbAwWKlo26EW14w2Ax*gjLVlOX-AGXB4443obZ-GK0pw*AFZAGZD8sI4AFBZSZpyeaxN4sS7cpynxdIuw6S2h*--LI='
},
'gzh': {
'headimage': 'http://img03.sogoucdn.com/app/a/100520090/oIWsFty9hHVI9F10amtzx5TOWIq8',
'wechat_name': '甜品烘焙制作坊'
}
}
...
...
```

- 数据结构
```python
{
'gzh': {
'headimage': str, # 公众号头像
'wechat_name': str, # 公众号名称
},
'article': {
'url': str, # 文章临时链接
'title': str, # 文章标题
'abstract': str, # 文章摘要
'time': int, # 推送时间,10位时间戳
'open_id': str, # open id
'main_img': str # 封面图片
}
}
```

### 获取关键字联想词
- 使用
```
Expand Down
Binary file added screenshot/get_gzh_artilce_by_hot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions test/file/wapindex-wap-0612-wap_8-0.html

Large diffs are not rendered by default.

14 changes: 12 additions & 2 deletions test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from nose.tools import assert_equal, assert_true, assert_in, assert_greater_equal
import httpretty

from wechatsogou.const import WechatSogouConst
from wechatsogou.request import WechatSogouRequest
from wechatsogou.api import WechatSogouAPI
from test import fake_data_path, gaokao_keyword
Expand Down Expand Up @@ -50,11 +51,20 @@ def test_get_gzh_artilce_by_history_real(self):
gzh_artilce = ws_api.get_gzh_artilce_by_history(gaokao_keyword,
identify_image_callback_search=identify_image_callback_ruokuai_search,
identify_image_callback_history=identify_image_callback_ruokuai_history)
assert_in('gzh_info', gzh_artilce)
assert_in('gzh', gzh_artilce)
assert_in('article', gzh_artilce)
assert_in('wx.qlogo.cn', gzh_artilce['gzh_info']['headimage'])
assert_in('wx.qlogo.cn', gzh_artilce['gzh']['headimage'])
assert_greater_equal(len(gzh_artilce['article']), 1)

def test_get_gzh_artilce_by_hot_real(self):
    """Fetch the 'recommendation' hot list through ws_api and check each entry's shape.

    Every returned entry must carry the 'gzh' and 'article' keys and an
    article url containing the expected mp.weixin.qq.com prefix; at least
    10 entries are expected overall.
    """
    url_marker = 'http://mp.weixin.qq.com/s?src='
    hot_entries = ws_api.get_gzh_artilce_by_hot(
        WechatSogouConst.hot_index.recommendation,
        identify_image_callback=identify_image_callback_ruokuai_search)

    for entry in hot_entries:
        assert_in('gzh', entry)
        assert_in('article', entry)
        assert_in(url_marker, entry['article']['url'])

    # Checked after the per-entry assertions, as in the original ordering.
    assert_greater_equal(len(hot_entries), 10)

def test_get_sugg(self):
sugg_gaokao = ws_api.get_sugg(gaokao_keyword)
assert_equal(10, len(sugg_gaokao))
Expand Down
68 changes: 65 additions & 3 deletions test/test_structuring.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,74 @@ def test_get_article_by_history_json(self):
def test_get_gzh_info_and_article_by_history(self):
file_name = '{}/{}/{}'.format(os.getcwd(), fake_data_path, 'bitsea-history.html')
with io.open(file_name, encoding='utf-8') as f:
gzh_history = f.read()
gzh_info_and_article_by_history = f.read()

gzh_article_list = WechatSogouStructuring.get_gzh_info_and_article_by_history(gzh_history)
assert_in('gzh_info', gzh_article_list)
gzh_article_list = WechatSogouStructuring.get_gzh_info_and_article_by_history(gzh_info_and_article_by_history)
assert_in('gzh', gzh_article_list)
assert_in('article', gzh_article_list)

def test_get_gzh_artilce_by_hot(self):
    """Parse the saved hot-list fixture and compare every extracted field.

    The fixture file 'wapindex-wap-0612-wap_8-0.html' is read from the fake
    data directory, parsed with
    WechatSogouStructuring.get_gzh_artilce_by_hot, and the account names,
    avatars, article titles and push times are compared, in order, with the
    expected values below.
    """
    fixture_path = '{}/{}/{}'.format(os.getcwd(), fake_data_path, 'wapindex-wap-0612-wap_8-0.html')
    with io.open(fixture_path, encoding='utf-8') as fixture:
        hot_page_html = fixture.read()

    parsed = WechatSogouStructuring.get_gzh_artilce_by_hot(hot_page_html)

    for entry in parsed:
        assert_in('gzh', entry)
        assert_in('article', entry)
        assert_in('http://mp.weixin.qq.com/s?src=', entry['article']['url'])
    assert_greater_equal(len(parsed), 10)

    # Collect the four compared fields row by row (same key access order per
    # entry as a plain loop), then transpose the rows into one list per field.
    # The length assertion above guarantees there is at least one row.
    rows = [
        (
            entry['gzh']['wechat_name'],
            entry['gzh']['headimage'],
            entry['article']['title'],
            entry['article']['time'],
        )
        for entry in parsed
    ]
    wechat_names, headimages, titles, times = [list(column) for column in zip(*rows)]

    expected_wechat_names = [
        '全球汽车精选', '车早茶', '吴佩频道', '驾考宝典', '腾讯汽车', '新车评', '非常好车', '汽车情报所',
        '一猫汽车资讯', '资深科技控', '郎club', '科技日报', '汽车使用宝典', '名车报', '科普中国网',
    ]
    expected_headimages = [
        'http://img03.sogoucdn.com/app/a/100520090/oIWsFt1dGMefD1f8dOg2UCwQUjKs',
        'http://img04.sogoucdn.com/app/a/100520090/oIWsFtwoQX8wX7w6loDevPqLEC_I',
        'http://img03.sogoucdn.com/app/a/100520090/oIWsFt9Hbbtr9VLnfR9i_K5Z8D48',
        'http://img04.sogoucdn.com/app/a/100520090/oIWsFt3txmWu-usvUa6gU0qlyEVo',
        'http://img01.sogoucdn.com/app/a/100520090/oIWsFt8VDujUqNSCfruXtMNfekaw',
        'http://img01.sogoucdn.com/app/a/100520090/oIWsFt9YD5HWLDe5QAkuvh0JWrgw',
        'http://img01.sogoucdn.com/app/a/100520090/oIWsFt_WUnpQ7lZajAstgL8o1lWo',
        'http://img02.sogoucdn.com/app/a/100520090/oIWsFtzUnzWUMz1PMek5zjVlS42U',
        'http://img03.sogoucdn.com/app/a/100520090/oIWsFt2yk491dhhSP940JzLEameY',
        'http://img03.sogoucdn.com/app/a/100520090/oIWsFtzm9UtmgY-SkOTFwQFpGsU8',
        'http://img02.sogoucdn.com/app/a/100520090/oIWsFt7VwiM8GqYcv8DBNb-k5NBQ',
        'http://img03.sogoucdn.com/app/a/100520090/oIWsFt2tjckivF8b0MP_nNTdESkE',
        'http://img01.sogoucdn.com/app/a/100520090/oIWsFtzC2r61_riTCWp5iHX04fmo',
        'http://img02.sogoucdn.com/app/a/100520090/oIWsFt8JIY_-o7DBMxorP19hcF0Q',
        'http://img04.sogoucdn.com/app/a/100520090/oIWsFtyV5sdIXU2uy4m6oVBq77nA',
    ]
    expected_titles = [
        '不做这个动作,你的轮胎3个月就要换!',
        '新车质量最差的十个品牌?国人表示难以接受……',
        '带着米其林的指引去看古德伍德|品牌',
        '方向盘打法巧记口诀,科目二提分就靠它了!',
        '宝马“鸡腿”、奥迪“游艇”,这些奇葩的挡杆你见过几个?',
        '你没看错,我们做了期途昂和途锐的对比',
        '7成特斯拉被召回,难道是质量不过关?',
        '在中国惹不起的7种车,遇到请回避!',
        '迈腾摊上大事儿了 全新一代君威17.58万起',
        '面对这份驾享,朝廷大人都忍不住亲自上阵!',
        '外卖小哥被暴晒:底层人士的悲哀,有钱人不会懂',
        '自动驾驶还处于“新手”阶段,何时成为“老司机”?院士这样说……',
        '高速上碰到石头,是躲还是撞?',
        '装什么神秘,不就是加长版的讴歌TLX吗!',
        '一个动作,车里的人集体中毒!很多人都忽略了',
    ]
    expected_times = [
        1501328135, 1501327941, 1501326826, 1501326716, 1501326675, 1501326455, 1501326222, 1501325595,
        1501325529, 1501325521, 1501325223, 1501324531, 1501324443, 1501324310, 1501323274,
    ]

    assert_equal(expected_wechat_names, wechat_names)
    assert_equal(expected_headimages, headimages)
    assert_equal(expected_titles, titles)
    assert_equal(expected_times, times)


if __name__ == '__main__':
unittest.main()
50 changes: 49 additions & 1 deletion wechatsogou/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def get_gzh_artilce_by_history(self, keyword=None, url=None,
-------
dict
{
'gzh_info': {
'gzh': {
'wechat_name': '', # 名称
'wechat_id': '', # 微信id
'introduction': '', # 描述
Expand Down Expand Up @@ -325,6 +325,54 @@ def get_gzh_artilce_by_history(self, keyword=None, url=None,

return WechatSogouStructuring.get_gzh_info_and_article_by_history(resp.text)

def get_gzh_artilce_by_hot(self, hot_index, page=1, deblocking_callback=None, identify_image_callback=None):
    """Fetch the hot articles shown on the home page for one category.

    Parameters
    ----------
    hot_index : WechatSogouConst.hot_index
        Category of home-page hot articles (constant): WechatSogouConst.hot_index.xxx
    page : int
        Page number; must be a positive int.
    deblocking_callback : callable, optional
        Forwarded to the unblocking routine when the request is redirected
        to an 'antispider' url.
    identify_image_callback : callable, optional
        Forwarded to the unblocking routine when the request is redirected
        to an 'antispider' url.

    Returns
    -------
    list[dict]
        {
            'gzh': {
                'headimage': str,  # official-account avatar
                'wechat_name': str,  # official-account name
            },
            'article': {
                'url': str,  # temporary article link
                'title': str,  # article title
                'abstract': str,  # article abstract
                'time': int,  # push time, 10-digit timestamp
                'open_id': str,  # open id
                'main_img': str  # cover image
            }
        }

    Raises
    ------
    AssertionError
        If ``hot_index`` is not an attribute of WechatSogouConst.hot_index
        or ``page`` is not a positive int.
    WechatSogouRequestsException
        If the first response is not ok.
    """

    assert hasattr(WechatSogouConst.hot_index, hot_index)
    assert isinstance(page, int) and page > 0

    # One session is shared by the first request, the unblocking step and the retry.
    req = requests.session()

    url = WechatSogouRequest.gen_hot_url(hot_index, page)

    resp = WechatSogouRequest.get(url, req=req, headers=self.__set_cookie())
    resp.encoding = 'utf-8'

    if not resp.ok:
        raise WechatSogouRequestsException('WechatSogouAPI get_hot_article', resp)

    if 'antispider' in resp.url:
        self.__deblocking(self.__deblocking_search, url, resp, req, deblocking_callback, identify_image_callback)
        resp = WechatSogouRequest.get(url, req=req, headers=self.__set_cookie())
        # The retried response is a new object: re-apply the encoding so that
        # resp.text is decoded the same way as on the non-blocked path.
        resp.encoding = 'utf-8'

    return WechatSogouStructuring.get_gzh_artilce_by_hot(resp.text)

def get_article_content(self):
"""获取文章原文,避免临时链接失效
Expand Down
3 changes: 1 addition & 2 deletions wechatsogou/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def gen_hot_url(hot_index, page=1):
Parameters
----------
hot_index : str or unicode
hot_index : WechatSogouConst.hot_index
首页热门文章的分类(常量):WechatSogouConst.hot_index.xxx
page : int
页数
Expand All @@ -131,7 +131,6 @@ def gen_hot_url(hot_index, page=1):

assert hasattr(WechatSogouConst.hot_index, hot_index)
assert isinstance(page, int) and page > 0
hot_index = getattr(WechatSogouConst.hot_index, hot_index)

index_urls = {
WechatSogouConst.hot_index.hot: 0, # 热门
Expand Down
68 changes: 66 additions & 2 deletions wechatsogou/structuring.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def get_gzh_info_and_article_by_history(text):
-------
dict
{
'gzh_info': {
'gzh': {
'wechat_name': '', # 名称
'wechat_id': '', # 微信id
'introduction': '', # 描述
Expand All @@ -310,6 +310,70 @@ def get_gzh_info_and_article_by_history(text):
}
"""
return {
'gzh_info': WechatSogouStructuring.get_gzh_info_by_history(text),
'gzh': WechatSogouStructuring.get_gzh_info_by_history(text),
'article': WechatSogouStructuring.get_article_by_history_json(text)
}

@staticmethod
def get_gzh_artilce_by_hot(text):
    """Extract official-account info and article info from a home-page hot-list page.

    Parameters
    ----------
    text : str or unicode
        Text of one page of the home-page hot list.

    Returns
    -------
    list[dict]
        One dict per ``li`` element matched by ``/html/body/li``:
        {
            'gzh': {
                'headimage': str,  # official-account avatar
                'wechat_name': str,  # official-account name
            },
            'article': {
                'url': str,  # temporary article link
                'title': str,  # article title
                'abstract': str,  # article abstract
                'time': int,  # push time, 10-digit timestamp (raw value if not an integer)
                'open_id': str,  # open id
                'main_img': str  # cover image
            }
        }

    Raises
    ------
    IndexError
        If an ``li`` element lacks one of the nodes/attributes read below.
    """
    page = etree.HTML(text)
    lis = page.xpath('/html/body/li')
    gzh_article_list = []
    for li in lis:
        url = li.xpath('div[1]/h4/a/@href')
        title = li.xpath('div[1]/h4/a/div/text()')
        abstract = li.xpath('div[1]/p[1]/text()')

        # The second <p> carries the account info and the push time.
        xpath_time = li.xpath('div[1]/p[2]')[0]
        open_id = xpath_time.xpath('span/@data-openid')
        headimage = xpath_time.xpath('span/@data-headimage')
        gzh_name = xpath_time.xpath('span/text()')
        send_time = xpath_time.xpath('a/span/@data-lastmodified')
        main_img = li.xpath('div[2]/a/img/@src')

        raw_send_time = send_time[0]
        try:
            send_time = int(raw_send_time)
        except (TypeError, ValueError):
            # Keep the raw attribute value when it is not an integer; only
            # conversion errors are handled so unrelated exceptions
            # (e.g. KeyboardInterrupt) are no longer swallowed.
            send_time = raw_send_time

        gzh_article_list.append({
            'gzh': {
                'headimage': headimage[0],
                'wechat_name': gzh_name[0],
            },
            'article': {
                'url': url[0],
                'title': title[0],
                'abstract': abstract[0],
                'time': send_time,
                'open_id': open_id[0],
                'main_img': main_img[0]
            }
        })

    return gzh_article_list

0 comments on commit 303f92f

Please sign in to comment.