Add/get hot api / gzh => gzh_info (#109)

* add hot api and doc * test and gzh_info => gzh * fix readme name
chyroc · Jul 29, 2017 · 303f92f · 303f92f
1 parent 3574b89
commit 303f92f
Show file tree

Hide file tree

Showing 8 changed files with 246 additions and 12 deletions.
diff --git a/readme.md → README.md b/readme.md → README.md
@@ -257,7 +257,7 @@ Out[1]:
     },
   ...
   ],
-  'gzh_info': {
+  'gzh': {
     'authentication': '南京航空航天大学',
     'headimage': 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM4xV5PgPjK5XoPaaQoxnWJAFicibMvPAnsoybawMBFxua1g/0',
     'introduction': '南航大志愿活动的领跑者，为你提供校内外的志愿资源和精彩消息。',
@@ -269,7 +269,7 @@ Out[1]:
 - 数据结构
 ```python
 {
-    'gzh_info': {
+    'gzh': {
         'wechat_name': '',  # 名称
         'wechat_id': '',  # 微信id
         'introduction': '',  # 简介
@@ -297,6 +297,56 @@ Out[1]:
 
 ```
 
+### 解析 首页热门 页 - get_gzh_artilce_by_hot
+
+![ws_api.get_gzh_artilce_by_hot(WechatSogouConst.hot_index.food)](https://raw.githubusercontent.com/chyroc/wechatsogou/master/screenshot/get_gzh_artilce_by_hot.png)
+
+- 使用
+```
+In [1]: from pprint import pprint
+   ...: from wechatsogou import WechatSogouAPI, WechatSogouConst
+   ...:
+   ...: ws_api = WechatSogouAPI()
+   ...: gzh_articles = ws_api.get_gzh_artilce_by_hot(WechatSogouConst.hot_index.food)
+   ...: for i in gzh_articles:
+   ...:     pprint(i)
+   ...:
+{
+    'article': {
+        'abstract': '闷热的夏天有什么事情能比吃上凉凉的甜品更惬意的呢？快一起动手做起来吧，简单方便，放冰箱冻一冻，那感觉~橙汁蒸木瓜木瓜1个（300-400克左右），橙子4个，枫糖浆20克（如果家里没有，也可以用蜂蜜、炼乳等代替），椰果适量。做法1．用削皮',
+        'main_img': 'http://img01.sogoucdn.com/net/a/04/link?appid=100520033&url=http%3A%2F%2Fmmbiz.qpic.cn%2Fmmbiz_jpg%2Fw9UGwFPia7QTUIadPibgW8OFkqf1ibR40xicKfzofRS0sDpaFp3CG0jkPyQKeXl44TXswztW1SJnic7tmCibjB8rIIGw%2F0%3Fwx_fmt%3Djpeg',
+        'open_id': 'oIWsFty9hHVI9F10amtzx5TOWIq8',
+        'time': 1501325220,
+        'title': '夏日甜品制作方法，不收藏后悔哦!',
+        'url': 'http://mp.weixin.qq.com/s?src=3&timestamp=1501328525&ver=1&signature=n9*oX0k4YbNFhNMsOjIekYrsha44lfBSCbG9jicAbGYrWNN8*48NzpcaHdxwUnC12syY5-ZxwcBfiJlMzdbAwWKlo26EW14w2Ax*gjLVlOX-AGXB4443obZ-GK0pw*AFZAGZD8sI4AFBZSZpyeaxN4sS7cpynxdIuw6S2h*--LI='
+    },
+    'gzh': {
+        'headimage': 'http://img03.sogoucdn.com/app/a/100520090/oIWsFty9hHVI9F10amtzx5TOWIq8',
+        'wechat_name': '甜品烘焙制作坊'
+    }
+}
+...
+...
+```
+
+- 数据结构
+```python
+{
+    'gzh': {
+        'headimage': str,  # 公众号头像
+        'wechat_name': str,  # 公众号名称
+    },
+    'article': {
+        'url': str,  # 文章临时链接
+        'title': str,  # 文章标题
+        'abstract': str,  # 文章摘要
+        'time': int,  # 推送时间，10位时间戳
+        'open_id': str,  # open id
+        'main_img': str  # 封面图片
+    }
+}
+```
+
 ### 获取关键字联想词
 - 使用
 ```

diff --git a/screenshot/get_gzh_artilce_by_hot.png b/screenshot/get_gzh_artilce_by_hot.png
diff --git a/test/file/wapindex-wap-0612-wap_8-0.html b/test/file/wapindex-wap-0612-wap_8-0.html
diff --git a/test/test_api.py b/test/test_api.py
@@ -9,6 +9,7 @@
 from nose.tools import assert_equal, assert_true, assert_in, assert_greater_equal
 import httpretty
 
+from wechatsogou.const import WechatSogouConst
 from wechatsogou.request import WechatSogouRequest
 from wechatsogou.api import WechatSogouAPI
 from test import fake_data_path, gaokao_keyword
@@ -50,11 +51,20 @@ def test_get_gzh_artilce_by_history_real(self):
         gzh_artilce = ws_api.get_gzh_artilce_by_history(gaokao_keyword,
                                                         identify_image_callback_search=identify_image_callback_ruokuai_search,
                                                         identify_image_callback_history=identify_image_callback_ruokuai_history)
-        assert_in('gzh_info', gzh_artilce)
+        assert_in('gzh', gzh_artilce)
         assert_in('article', gzh_artilce)
-        assert_in('wx.qlogo.cn', gzh_artilce['gzh_info']['headimage'])
+        assert_in('wx.qlogo.cn', gzh_artilce['gzh']['headimage'])
         assert_greater_equal(len(gzh_artilce['article']), 1)
 
+    def test_get_gzh_artilce_by_hot_real(self):
+        """Call ws_api.get_gzh_artilce_by_hot for the recommendation category
+        and check the shape of every returned entry."""
+        article_url_prefix = 'http://mp.weixin.qq.com/s?src='
+        hot_entries = ws_api.get_gzh_artilce_by_hot(
+            WechatSogouConst.hot_index.recommendation,
+            identify_image_callback=identify_image_callback_ruokuai_search)
+        # Each entry must carry both halves of the structure and an article link.
+        for entry in hot_entries:
+            assert_in('gzh', entry)
+            assert_in('article', entry)
+            assert_in(article_url_prefix, entry['article']['url'])
+        # At least ten entries are expected in the result.
+        assert_greater_equal(len(hot_entries), 10)
+
     def test_get_sugg(self):
         sugg_gaokao = ws_api.get_sugg(gaokao_keyword)
         assert_equal(10, len(sugg_gaokao))

diff --git a/test/test_structuring.py b/test/test_structuring.py
@@ -229,12 +229,74 @@ def test_get_article_by_history_json(self):
     def test_get_gzh_info_and_article_by_history(self):
         file_name = '{}/{}/{}'.format(os.getcwd(), fake_data_path, 'bitsea-history.html')
         with io.open(file_name, encoding='utf-8') as f:
-            gzh_history = f.read()
+            gzh_info_and_article_by_history = f.read()
 
-        gzh_article_list = WechatSogouStructuring.get_gzh_info_and_article_by_history(gzh_history)
-        assert_in('gzh_info', gzh_article_list)
+        gzh_article_list = WechatSogouStructuring.get_gzh_info_and_article_by_history(gzh_info_and_article_by_history)
+        assert_in('gzh', gzh_article_list)
         assert_in('article', gzh_article_list)
 
+    def test_get_gzh_artilce_by_hot(self):
+        """Parse the saved hot-list fixture page and compare the extracted
+        account names, head images, titles and timestamps with recorded values."""
+        # Fixture path is resolved relative to the current working directory.
+        file_name = '{}/{}/{}'.format(os.getcwd(), fake_data_path, 'wapindex-wap-0612-wap_8-0.html')
+        with io.open(file_name, encoding='utf-8') as f:
+            gzh_artilce_by_hot = f.read()
+
+            # Parsing happens while the file is still open; the result is used after the block.
+            gzh_artilces = WechatSogouStructuring.get_gzh_artilce_by_hot(gzh_artilce_by_hot)
+
+        # Structural checks: both sub-dicts exist and the article URL has the expected prefix.
+        for gzh_artilce in gzh_artilces:
+            assert_in('gzh', gzh_artilce)
+            assert_in('article', gzh_artilce)
+            assert_in('http://mp.weixin.qq.com/s?src=', gzh_artilce['article']['url'])
+        assert_greater_equal(len(gzh_artilces), 10)
+
+        # Collect one column per field so each can be compared as an ordered list.
+        wechat_names = []
+        headimages = []
+        titles = []
+        times = []
+        for i in gzh_artilces:
+            wechat_names.append(i['gzh']['wechat_name'])
+            headimages.append(i['gzh']['headimage'])
+            titles.append(i['article']['title'])
+            times.append(i['article']['time'])
+
+        # Expected values below list 15 entries each, in page order.
+        assert_equal(
+            ['全球汽车精选', '车早茶', '吴佩频道', '驾考宝典', '腾讯汽车', '新车评', '非常好车', '汽车情报所',
+             '一猫汽车资讯', '资深科技控', '郎club', '科技日报', '汽车使用宝典', '名车报', '科普中国网'],
+            wechat_names)
+        assert_equal(['http://img03.sogoucdn.com/app/a/100520090/oIWsFt1dGMefD1f8dOg2UCwQUjKs',
+                      'http://img04.sogoucdn.com/app/a/100520090/oIWsFtwoQX8wX7w6loDevPqLEC_I',
+                      'http://img03.sogoucdn.com/app/a/100520090/oIWsFt9Hbbtr9VLnfR9i_K5Z8D48',
+                      'http://img04.sogoucdn.com/app/a/100520090/oIWsFt3txmWu-usvUa6gU0qlyEVo',
+                      'http://img01.sogoucdn.com/app/a/100520090/oIWsFt8VDujUqNSCfruXtMNfekaw',
+                      'http://img01.sogoucdn.com/app/a/100520090/oIWsFt9YD5HWLDe5QAkuvh0JWrgw',
+                      'http://img01.sogoucdn.com/app/a/100520090/oIWsFt_WUnpQ7lZajAstgL8o1lWo',
+                      'http://img02.sogoucdn.com/app/a/100520090/oIWsFtzUnzWUMz1PMek5zjVlS42U',
+                      'http://img03.sogoucdn.com/app/a/100520090/oIWsFt2yk491dhhSP940JzLEameY',
+                      'http://img03.sogoucdn.com/app/a/100520090/oIWsFtzm9UtmgY-SkOTFwQFpGsU8',
+                      'http://img02.sogoucdn.com/app/a/100520090/oIWsFt7VwiM8GqYcv8DBNb-k5NBQ',
+                      'http://img03.sogoucdn.com/app/a/100520090/oIWsFt2tjckivF8b0MP_nNTdESkE',
+                      'http://img01.sogoucdn.com/app/a/100520090/oIWsFtzC2r61_riTCWp5iHX04fmo',
+                      'http://img02.sogoucdn.com/app/a/100520090/oIWsFt8JIY_-o7DBMxorP19hcF0Q',
+                      'http://img04.sogoucdn.com/app/a/100520090/oIWsFtyV5sdIXU2uy4m6oVBq77nA'],
+                     headimages)
+        assert_equal(['不做这个动作，你的轮胎3个月就要换!',
+                      '新车质量最差的十个品牌?国人表示难以接受……',
+                      '带着米其林的指引去看古德伍德|品牌',
+                      '方向盘打法巧记口诀，科目二提分就靠它了!',
+                      '宝马“鸡腿”、奥迪“游艇”，这些奇葩的挡杆你见过几个?',
+                      '你没看错，我们做了期途昂和途锐的对比',
+                      '7成特斯拉被召回，难道是质量不过关?',
+                      '在中国惹不起的7种车,遇到请回避!',
+                      '迈腾摊上大事儿了 全新一代君威17.58万起', '面对这份驾享，朝廷大人都忍不住亲自上阵!',
+                      '外卖小哥被暴晒：底层人士的悲哀，有钱人不会懂',
+                      '自动驾驶还处于“新手”阶段，何时成为“老司机”?院士这样说……',
+                      '高速上碰到石头，是躲还是撞?', '装什么神秘，不就是加长版的讴歌TLX吗!',
+                      '一个动作，车里的人集体中毒!很多人都忽略了'],
+                     titles)
+        assert_equal(
+            [1501328135, 1501327941, 1501326826, 1501326716, 1501326675, 1501326455, 1501326222, 1501325595,
+             1501325529, 1501325521, 1501325223, 1501324531, 1501324443, 1501324310, 1501323274],
+            times)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/wechatsogou/api.py b/wechatsogou/api.py
@@ -271,7 +271,7 @@ def get_gzh_artilce_by_history(self, keyword=None, url=None,
         -------
         dict
             {
-                'gzh_info': {
+                'gzh': {
                     'wechat_name': '',  # 名称
                     'wechat_id': '',  # 微信id
                     'introduction': '',  # 描述
@@ -325,6 +325,54 @@ def get_gzh_artilce_by_history(self, keyword=None, url=None,
 
         return WechatSogouStructuring.get_gzh_info_and_article_by_history(resp.text)
 
+    def get_gzh_artilce_by_hot(self, hot_index, page=1, deblocking_callback=None, identify_image_callback=None):
+        """Fetch one page of the homepage "hot" article list for a category.
+
+        Parameters
+        ----------
+        hot_index : WechatSogouConst.hot_index
+            Category constant of the homepage hot list: WechatSogouConst.hot_index.xxx
+        page : int
+            Page number; must be a positive integer.
+        deblocking_callback : callable, optional
+            Forwarded to the deblocking routine when the request lands on the
+            antispider page.
+        identify_image_callback : callable, optional
+            Forwarded to the deblocking routine together with deblocking_callback.
+
+        Returns
+        -------
+        list[dict]
+            {
+                'gzh': {
+                    'headimage': str,  # account avatar
+                    'wechat_name': str,  # account name
+                },
+                'article': {
+                    'url': str,  # temporary article link
+                    'title': str,  # article title
+                    'abstract': str,  # article abstract
+                    'time': int,  # publish time, 10-digit timestamp
+                    'open_id': str,  # open id
+                    'main_img': str  # cover image
+                }
+            }
+
+        Raises
+        ------
+        WechatSogouRequestsException
+            If the first response is not OK.
+        """
+
+        assert hasattr(WechatSogouConst.hot_index, hot_index)
+        assert isinstance(page, int) and page > 0
+
+        # One session is shared by the first request, the deblocking step and the retry.
+        req = requests.session()
+
+        url = WechatSogouRequest.gen_hot_url(hot_index, page)
+
+        resp = WechatSogouRequest.get(url, req=req, headers=self.__set_cookie())
+        resp.encoding = 'utf-8'
+
+        if not resp.ok:
+            raise WechatSogouRequestsException('WechatSogouAPI get_hot_article', resp)
+
+        if 'antispider' in resp.url:
+            self.__deblocking(self.__deblocking_search, url, resp, req, deblocking_callback, identify_image_callback)
+            resp = WechatSogouRequest.get(url, req=req, headers=self.__set_cookie())  # req=req
+            # The retry yields a new response object, so force UTF-8 again;
+            # otherwise resp.text may be decoded with a different charset
+            # than the first response was.
+            resp.encoding = 'utf-8'
+
+        return WechatSogouStructuring.get_gzh_artilce_by_hot(resp.text)
+
     def get_article_content(self):
         """获取文章原文，避免临时链接失效
 

diff --git a/wechatsogou/request.py b/wechatsogou/request.py
@@ -118,7 +118,7 @@ def gen_hot_url(hot_index, page=1):
 
         Parameters
         ----------
-        hot_index : str or unicode
+        hot_index : WechatSogouConst.hot_index
             首页热门文章的分类（常量）：WechatSogouConst.hot_index.xxx
         page : int
             页数
@@ -131,7 +131,6 @@ def gen_hot_url(hot_index, page=1):
 
         assert hasattr(WechatSogouConst.hot_index, hot_index)
         assert isinstance(page, int) and page > 0
-        hot_index = getattr(WechatSogouConst.hot_index, hot_index)
 
         index_urls = {
             WechatSogouConst.hot_index.hot: 0,  # 热门

diff --git a/wechatsogou/structuring.py b/wechatsogou/structuring.py
@@ -283,7 +283,7 @@ def get_gzh_info_and_article_by_history(text):
         -------
         dict
             {
-                'gzh_info': {
+                'gzh': {
                     'wechat_name': '',  # 名称
                     'wechat_id': '',  # 微信id
                     'introduction': '',  # 描述
@@ -310,6 +310,70 @@ def get_gzh_info_and_article_by_history(text):
             }
         """
         return {
-            'gzh_info': WechatSogouStructuring.get_gzh_info_by_history(text),
+            'gzh': WechatSogouStructuring.get_gzh_info_by_history(text),
             'article': WechatSogouStructuring.get_article_by_history_json(text)
         }
+
+    @staticmethod
+    def get_gzh_artilce_by_hot(text):
+        """Extract account info and article info from a homepage hot-list page.
+
+        Parameters
+        ----------
+        text : str or unicode
+            Text of one page of the homepage hot list.
+
+        Returns
+        -------
+        list[dict]
+            {
+                'gzh': {
+                    'headimage': str,  # account avatar
+                    'wechat_name': str,  # account name
+                },
+                'article': {
+                    'url': str,  # temporary article link
+                    'title': str,  # article title
+                    'abstract': str,  # article abstract
+                    'time': int,  # publish time, 10-digit timestamp
+                    'open_id': str,  # open id
+                    'main_img': str  # cover image
+                }
+            }
+
+        Raises
+        ------
+        IndexError
+            If an expected node or attribute is missing from a list item.
+        """
+        page = etree.HTML(text)
+        lis = page.xpath('/html/body/li')
+        gzh_article_list = []
+        for li in lis:
+            url = li.xpath('div[1]/h4/a/@href')
+            title = li.xpath('div[1]/h4/a/div/text()')
+            abstract = li.xpath('div[1]/p[1]/text()')
+
+            # The second <p> holds the account span (open id, avatar, name)
+            # and the publish-time span.
+            xpath_time = li.xpath('div[1]/p[2]')[0]
+            open_id = xpath_time.xpath('span/@data-openid')
+            headimage = xpath_time.xpath('span/@data-headimage')
+            gzh_name = xpath_time.xpath('span/text()')
+            send_time = xpath_time.xpath('a/span/@data-lastmodified')
+            main_img = li.xpath('div[2]/a/img/@src')
+
+            # Keep the raw attribute when it is not an integer. Only ValueError
+            # is caught so KeyboardInterrupt/SystemExit are no longer swallowed.
+            raw_send_time = send_time[0]
+            try:
+                send_time = int(raw_send_time)
+            except ValueError:
+                send_time = raw_send_time
+
+            gzh_article_list.append({
+                'gzh': {
+                    'headimage': headimage[0],
+                    'wechat_name': gzh_name[0],
+                },
+                'article': {
+                    'url': url[0],
+                    'title': title[0],
+                    'abstract': abstract[0],
+                    'time': send_time,
+                    'open_id': open_id[0],
+                    'main_img': main_img[0]
+                }
+            })
+
+        return gzh_article_list