diff --git a/config.example.py b/config.example.py
index 20b80fec5..3c08cad39 100644
--- a/config.example.py
+++ b/config.example.py
@@ -46,4 +46,5 @@
     # 'setu',
     'translate',
     # 'twitter',
+    # 'weibo'
 }
diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py
new file mode 100644
index 000000000..9b4ac21d2
--- /dev/null
+++ b/hoshino/modules/weibo/__init__.py
@@ -0,0 +1,149 @@
+from .weibo import WeiboSpider
+from hoshino.service import Service, Privilege as Priv
+from hoshino.res import R
+from hoshino import util
+from .exception import *
+
+'''
+sample config.json
+
+[{
+    "service_name": "weibo-bcr",
+    "enable_on_default": true,
+    "users": [{
+        "user_id": "6603867494",
+        "alias": ["公主连接", "公主连结", "公主链接"],
+        "filter": true
+    }]
+}]
+'''
+
+lmt = util.FreqLimiter(5)
+
+def _load_config(services_config):
+    for sv_config in services_config:
+        sv.logger.debug(sv_config)
+        service_name = sv_config["service_name"]
+        enable_on_default = sv_config.get("enable_on_default", False)
+
+        users_config = sv_config["users"]
+
+        sv_spider_list = []
+        for user_config in users_config:
+            wb_spider = WeiboSpider(user_config)
+            sv_spider_list.append(wb_spider)
+            alias_list = user_config.get("alias", [])
+            for alias in alias_list:
+                if alias in alias_dic:
+                    raise DuplicateError(f"Alias {alias} is duplicate")
+                alias_dic[alias] = {
+                    "service_name": service_name,
+                    "user_id": wb_spider.get_user_id()
+                }
+
+        subService = Service(service_name, enable_on_default=enable_on_default)
+        subr_dic[service_name] = {"service": subService, "spiders": sv_spider_list}
+
+
+sv = Service('weibo-poller', manage_priv=Priv.SUPERUSER, visible=False)
+services_config = util.load_config(__file__)
+subr_dic = {}
+alias_dic = {}
+_load_config(services_config)
+
+def wb_to_message(wb):
+    msg = f'@{wb["screen_name"]}'
+    if "retweet" in wb:
+        msg = f'{msg} 转发:\n{wb["text"]}\n======================'
+        wb = wb["retweet"]
+    else:
+        msg = f'{msg}:'
+
+    msg = f'{msg}\n{wb["text"]}'
+
+    if sv.bot.config.IS_CQPRO and len(wb["pics"]) > 0:
+        images_url = wb["pics"]
+        msg = f'{msg}\n'
+        res_imgs = [R.remote_img(url).cqcode for url in images_url]
+        for img in res_imgs:
+            msg = f'{msg}{img}'
+    if len(wb["video_url"]) > 0:
+        videos = wb["video_url"]
+        res_videos = ';'.join(videos)
+        msg = f'{msg}\n视频链接:{res_videos}'
+
+    return msg
+
+weibo_url_prefix = "https://weibo.com/u"
+
+@sv.on_command('weibo-config', aliases=('查看微博服务', '微博服务', '微博配置', '查看微博配置'))
+async def weibo_config(session):
+    msg = '微博推送配置:服务名,别名,微博链接'
+    index = 1
+    for service_config in services_config:
+        service_name = service_config['service_name']
+        users_config = service_config['users']
+        for user_config in users_config:
+            weibo_id = user_config['user_id']
+            alias = user_config['alias']
+            weibo_url = f'{weibo_url_prefix}/{weibo_id}'
+            msg = f'{msg}\n{index}. {service_name}, {alias}, {weibo_url}'
+            index += 1
+    session.finish(msg)
+
+
+# @bot 看微博 alias
+@sv.on_command('看微博', only_to_me=True)
+async def get_last_5_weibo(session):
+    uid = session.ctx['user_id']
+    if not lmt.check(uid):
+        session.finish('您查询得过于频繁,请稍等片刻', at_sender=True)
+        return
+
+    lmt.start_cd(uid)
+
+    alias = session.current_arg_text
+    if alias not in alias_dic:
+        session.finish(f"未找到微博: {alias}")
+        return
+
+    service_name = alias_dic[alias]["service_name"]
+    user_id = alias_dic[alias]["user_id"]
+
+    spiders = subr_dic[service_name]["spiders"]
+    for spider in spiders:
+        if spider.get_user_id() == user_id:
+            last_5_weibos = spider.get_last_5_weibos()
+            formatted_weibos = [wb_to_message(wb) for wb in last_5_weibos]
+            for wb in formatted_weibos:
+                await session.send(wb)
+            session.finish(f"以上为 {alias} 的最新 {len(formatted_weibos)} 条微博")
+            return
+    session.finish(f"未找到微博: {alias}")
+
+@sv.scheduled_job('interval', seconds=20*60)
+async def weibo_poller():
+    for sv_name, serviceObj in subr_dic.items():
+        weibos = []
+        ssv = serviceObj["service"]
+        spiders = serviceObj["spiders"]
+        for spider in spiders:
+            latest_weibos = await spider.get_latest_weibos()
+            formatted_weibos = [wb_to_message(wb) for wb in latest_weibos]
+
+            if l := len(formatted_weibos):
+                sv.logger.info(f"成功获取@{spider.get_username()}的新微博{l}条")
+            else:
+                sv.logger.info(f"未检测到@{spider.get_username()}的新微博")
+
+            weibos.extend(formatted_weibos)
+        await ssv.broadcast(weibos, ssv.name, 0.5)
+
+@sv.scheduled_job('interval', seconds=60*60*24)
+async def clear_spider_buffer():
+    sv.logger.info("Clearing weibo spider buffer...")
+    for sv_name, serviceObj in subr_dic.items():
+        spiders = serviceObj["spiders"]
+        for spider in spiders:
+            spider.clear_buffer()
\ No newline at end of file
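For reference, the lookup tables that _load_config builds from the sample config.json above have the following shape; this is an illustrative sketch only, and sub_service / bcr_spider are hypothetical names for the Service and WeiboSpider objects created at import time:

# illustrative, not part of the diff
subr_dic = {
    "weibo-bcr": {
        "service": sub_service,    # Service("weibo-bcr", enable_on_default=True)
        "spiders": [bcr_spider],   # one WeiboSpider per entry in "users"
    },
}
alias_dic = {
    "公主连接": {"service_name": "weibo-bcr", "user_id": "6603867494"},
    "公主连结": {"service_name": "weibo-bcr", "user_id": "6603867494"},
    "公主链接": {"service_name": "weibo-bcr", "user_id": "6603867494"},
}
# '看微博 <alias>' resolves alias_dic[alias] to a service/spider pair;
# weibo_poller walks subr_dic and broadcasts each spider's new weibos to its sub-service.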
diff --git a/hoshino/modules/weibo/exception.py b/hoshino/modules/weibo/exception.py
new file mode 100644
index 000000000..bda1619b8
--- /dev/null
+++ b/hoshino/modules/weibo/exception.py
@@ -0,0 +1,24 @@
+class WeiboError(Exception):
+    def __init__(self, msg, *msgs):
+        self._msgs = [msg, *msgs]
+
+    def __str__(self):
+        return '\n'.join(self._msgs)
+
+    @property
+    def message(self):
+        return str(self)
+
+    def append(self, msg: str):
+        self._msgs.append(msg)
+
+
+class ParseError(WeiboError):
+    pass
+
+
+class NotFoundError(WeiboError):
+    pass
+
+class DuplicateError(WeiboError):
+    pass
\ No newline at end of file
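A quick illustration (not part of the diff) of the message chaining WeiboError provides: extra context can be appended and the joined text read back via the message property:

err = ParseError('filter 值应为 True 或 False')  # any WeiboError subclass works
err.append('user_id: 6603867494')               # attach extra context
print(err.message)
# filter 值应为 True 或 False
# user_id: 6603867494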
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
new file mode 100644
index 000000000..7d2e18183
--- /dev/null
+++ b/hoshino/modules/weibo/weibo.py
@@ -0,0 +1,460 @@
+# -*- coding: UTF-8 -*-
+
+import json
+import random
+import sys
+import asyncio
+from collections import OrderedDict
+from datetime import date, datetime, timedelta
+from time import sleep
+
+import httpx
+from lxml import etree
+from hoshino import logger
+from .exception import *
+
+class WeiboSpider(object):
+    def __init__(self, config):
+        """Weibo类初始化"""
+        self.validate_config(config)
+        self.filter = config['filter']
+        self.user_id = config['user_id']
+        self.received_weibo_ids = []
+        self.last_5_weibos = []
+        self.__recent = False
+        asyncio.get_event_loop().run_until_complete(self._async_init())
+
+    async def _async_init(self):
+        self.__init = True
+        self.user = await self.get_user_info(self.user_id)
+        await self.get_latest_weibos()
+        self.__init = False
+
+    async def get_json(self, params):
+        """获取网页中json数据"""
+        url = 'https://m.weibo.cn/api/container/getIndex?'
+        async with httpx.AsyncClient() as client:
+            r = await client.get(url, params=params, timeout=10.0)  # sometimes timeout
+            return r.json()
+
+    async def get_user_info(self, user_id):
+        """获取用户信息"""
+        params = {'containerid': '100505' + str(user_id)}
+        js = await self.get_json(params)
+        if js['ok']:
+            info = js['data']['userInfo']
+            user_info = OrderedDict()
+            user_info['id'] = user_id
+            user_info['screen_name'] = info.get('screen_name', '')
+            user_info['gender'] = info.get('gender', '')
+            params = {
+                'containerid':
+                '230283' + str(user_id) + '_-_INFO'
+            }
+            zh_list = [
+                u'生日', u'所在地', u'小学', u'初中', u'高中', u'大学', u'公司', u'注册时间',
+                u'阳光信用'
+            ]
+            en_list = [
+                'birthday', 'location', 'education', 'education', 'education',
+                'education', 'company', 'registration_time', 'sunshine'
+            ]
+            for i in en_list:
+                user_info[i] = ''
+            js = await self.get_json(params)
+            if js['ok']:
+                cards = js['data']['cards']
+                if isinstance(cards, list) and len(cards) > 1:
+                    card_list = cards[0]['card_group'] + cards[1]['card_group']
+                    for card in card_list:
+                        if card.get('item_name') in zh_list:
+                            user_info[en_list[zh_list.index(
+                                card.get('item_name'))]] = card.get(
+                                    'item_content', '')
+            user_info['statuses_count'] = info.get('statuses_count', 0)
+            user_info['followers_count'] = info.get('followers_count', 0)
+            user_info['follow_count'] = info.get('follow_count', 0)
+            user_info['description'] = info.get('description', '')
+            user_info['profile_url'] = info.get('profile_url', '')
+            user_info['profile_image_url'] = info.get('profile_image_url', '')
+            user_info['avatar_hd'] = info.get('avatar_hd', '')
+            user_info['urank'] = info.get('urank', 0)
+            user_info['mbrank'] = info.get('mbrank', 0)
+            user_info['verified'] = info.get('verified', False)
+            user_info['verified_type'] = info.get('verified_type', 0)
+            user_info['verified_reason'] = info.get('verified_reason', '')
+            user = self.standardize_info(user_info)
+            return user
+
+    def clear_buffer(self):
+        """
+        如果清理缓存前一分钟,该微博账号瞬间发送了 20 条微博
+        然后清理缓存仅仅保留后 10 条的微博id,因此可能会重复推送前 10 条微博
+        当然这种情况通常不会发生
+        """
+        self.received_weibo_ids = self.received_weibo_ids[-10:]
+
+    def validate_config(self, config):
+        """验证配置是否正确"""
+        exist_argument_list = ['user_id']
+        true_false_argument_list = ['filter']
+
+        for argument in true_false_argument_list:
+            if argument not in config:
+                raise NotFoundError(f'未找到参数{argument}')
+            if config[argument] != True and config[argument] != False:
+                raise ParseError(f'{argument} 值应为 True 或 False')
+
+        for argument in exist_argument_list:
+            if argument not in config:
+                raise NotFoundError(f'未找到参数{argument}')
+
+    def get_pics(self, weibo_info):
+        """获取微博原始图片url"""
+        if weibo_info.get('pics'):
+            pic_info = weibo_info['pics']
+            pic_list = [pic['large']['url'] for pic in pic_info]
+        else:
+            pic_list = []
+
+        """获取文章封面图片url"""
+        if 'page_info' in weibo_info and weibo_info['page_info']['type'] == 'article':
+            if 'page_pic' in weibo_info['page_info']:
+                pic_list.append(weibo_info['page_info']['page_pic']['url'])
+
+        return pic_list
+
+    def get_live_photo(self, weibo_info):
+        """获取live photo中的视频url"""
+        live_photo_list = []
+        live_photo = weibo_info.get('pic_video')
+        if live_photo:
+            prefix = 'https://video.weibo.com/media/play?livephoto=//us.sinaimg.cn/'
+            for i in live_photo.split(','):
+                if len(i.split(':')) == 2:
+                    url = prefix + i.split(':')[1] + '.mov'
+                    live_photo_list.append(url)
+        return live_photo_list
+    def get_video_url(self, weibo_info):
+        """获取微博视频url"""
+        video_url = ''
+        video_url_list = []
+        if weibo_info.get('page_info'):
+            if weibo_info['page_info'].get('media_info') and weibo_info[
+                    'page_info'].get('type') == 'video':
+                media_info = weibo_info['page_info']['media_info']
+                video_url = media_info.get('mp4_720p_mp4')
+                if not video_url:
+                    video_url = media_info.get('mp4_hd_url')
+                if not video_url:
+                    video_url = media_info.get('mp4_sd_url')
+                if not video_url:
+                    video_url = media_info.get('stream_url_hd')
+                if not video_url:
+                    video_url = media_info.get('stream_url')
+                if video_url:
+                    video_url_list.append(video_url)
+        live_photo_list = self.get_live_photo(weibo_info)
+        if live_photo_list:
+            video_url_list += live_photo_list
+        return video_url_list
+
+    def get_location(self, selector):
+        """获取微博发布位置"""
+        location_icon = 'timeline_card_small_location_default.png'
+        span_list = selector.xpath('//span')
+        location = ''
+        for i, span in enumerate(span_list):
+            if span.xpath('img/@src'):
+                if location_icon in span.xpath('img/@src')[0]:
+                    location = span_list[i + 1].xpath('string(.)')
+                    break
+        return location
+
+    def get_topics(self, selector):
+        """获取参与的微博话题"""
+        span_list = selector.xpath("//span[@class='surl-text']")
+        topics = ''
+        topic_list = []
+        for span in span_list:
+            text = span.xpath('string(.)')
+            if len(text) > 2 and text[0] == '#' and text[-1] == '#':
+                topic_list.append(text[1:-1])
+        if topic_list:
+            topics = ','.join(topic_list)
+        return topics
+
+    def get_at_users(self, selector):
+        """获取@用户"""
+        a_list = selector.xpath('//a')
+        at_users = ''
+        at_list = []
+        for a in a_list:
+            if '@' + a.xpath('@href')[0][3:] == a.xpath('string(.)'):
+                at_list.append(a.xpath('string(.)')[1:])
+        if at_list:
+            at_users = ','.join(at_list)
+        return at_users
+
+    def get_text(self, text_body):
+        selector = etree.HTML(text_body)
+        url_lists = selector.xpath('//a[@data-url]/@data-url')
+        url_elems = selector.xpath('//a[@data-url]/span[@class="surl-text"]')
+
+        '''
+        Add the url of each <a data-url="..."> to the text of its <span class="surl-text"> child.
+        For example:
+
+        <a data-url="http://t.cn/A622uDbW">
+            <span class="surl-text">本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻</span>
+        </a>
+
+        replace 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻
+        with 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻(http://t.cn/A622uDbW)
+        '''
+        for i in range(0, len(url_lists)):
+            url_elems[i].text = f'{url_elems[i].text}({url_lists[i]})'
+        return selector.xpath('string(.)')
+
+    def string_to_int(self, string):
+        """字符串转换为整数"""
+        if isinstance(string, int):
+            return string
+        elif string.endswith(u'万+'):
+            string = int(string[:-2] + '0000')
+        elif string.endswith(u'万'):
+            string = int(string[:-1] + '0000')
+        return int(string)
+
+    def standardize_date(self, created_at):
+        """标准化微博发布时间"""
+        if u"刚刚" in created_at:
+            created_at = datetime.now().strftime("%Y-%m-%d")
+            self.__recent = True
+        elif u"分钟" in created_at:
+            minute = created_at[:created_at.find(u"分钟")]
+            minute = timedelta(minutes=int(minute))
+            created_at = (datetime.now() - minute).strftime("%Y-%m-%d")
+            self.__recent = True
+        elif u"小时" in created_at:
+            hour = created_at[:created_at.find(u"小时")]
+            hour = timedelta(hours=int(hour))
+            created_at = (datetime.now() - hour).strftime("%Y-%m-%d")
+            if self.__init:
+                self.__recent = True
+            else:
+                self.__recent = False
+        elif u"昨天" in created_at:
+            day = timedelta(days=1)
+            created_at = (datetime.now() - day).strftime("%Y-%m-%d")
+            if self.__init:
+                self.__recent = True
+            else:
+                self.__recent = False
+        elif created_at.count('-') == 1:
+            year = datetime.now().strftime("%Y")
+            created_at = year + "-" + created_at
+            if self.__init:
+                self.__recent = True
+            else:
+                self.__recent = False
+        return created_at
+    def standardize_info(self, weibo):
+        """标准化信息,去除乱码"""
+        for k, v in weibo.items():
+            if 'bool' not in str(type(v)) and 'int' not in str(
+                    type(v)) and 'list' not in str(
+                        type(v)) and 'long' not in str(type(v)):
+                weibo[k] = v.replace(u"\u200b", "").encode(
+                    sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
+        return weibo
+
+    def parse_weibo(self, weibo_info):
+        weibo = OrderedDict()
+        if weibo_info['user']:
+            weibo['user_id'] = weibo_info['user']['id']
+            weibo['screen_name'] = weibo_info['user']['screen_name']
+        else:
+            weibo['user_id'] = ''
+            weibo['screen_name'] = ''
+        weibo['id'] = int(weibo_info['id'])
+        weibo['bid'] = weibo_info['bid']
+        text_body = weibo_info['text']
+        selector = etree.HTML(text_body)
+
+        weibo['text'] = self.get_text(text_body)
+
+        weibo['pics'] = self.get_pics(weibo_info)
+        weibo['video_url'] = self.get_video_url(weibo_info)
+        weibo['location'] = self.get_location(selector)
+        weibo['created_at'] = weibo_info['created_at']
+        weibo['source'] = weibo_info['source']
+        weibo['attitudes_count'] = self.string_to_int(
+            weibo_info.get('attitudes_count', 0))
+        weibo['comments_count'] = self.string_to_int(
+            weibo_info.get('comments_count', 0))
+        weibo['reposts_count'] = self.string_to_int(
+            weibo_info.get('reposts_count', 0))
+        weibo['topics'] = self.get_topics(selector)
+        weibo['at_users'] = self.get_at_users(selector)
+        return self.standardize_info(weibo)
+
+    def print_one_weibo(self, weibo):
+        """打印一条微博"""
+        try:
+            logger.info(u'微博id:%d' % weibo['id'])
+            logger.info(u'微博正文:%s' % weibo['text'])
+            logger.info(u'原始图片url:%s' % weibo['pics'])
+            logger.info(u'微博位置:%s' % weibo['location'])
+            logger.info(u'发布时间:%s' % weibo['created_at'])
+            logger.info(u'发布工具:%s' % weibo['source'])
+            logger.info(u'点赞数:%d' % weibo['attitudes_count'])
+            logger.info(u'评论数:%d' % weibo['comments_count'])
+            logger.info(u'转发数:%d' % weibo['reposts_count'])
+            logger.info(u'话题:%s' % weibo['topics'])
+            logger.info(u'@用户:%s' % weibo['at_users'])
+            logger.info(u'url:https://m.weibo.cn/detail/%d' % weibo['id'])
+        except OSError:
+            pass
+
+    def print_weibo(self, weibo):
+        """打印微博,若为转发微博,会同时打印原创和转发部分"""
+        if weibo.get('retweet'):
+            logger.info('*' * 100)
+            logger.info(u'转发部分:')
+            self.print_one_weibo(weibo['retweet'])
+            logger.info('*' * 100)
+            logger.info(u'原创部分:')
+        self.print_one_weibo(weibo)
+        logger.info('-' * 120)
+
+    def get_username(self):
+        return self.user["screen_name"]
+
+    def get_user_id(self):
+        return self.user_id
+
+    def get_last_5_weibos(self):
+        return self.last_5_weibos
+
+    async def get_weibo_json(self, page):
+        """获取网页中微博json数据"""
+        params = {
+            'containerid': '107603' + self.get_user_id(),
+            'page': page
+        }
+        js = await self.get_json(params)
+        return js
+
+    async def get_long_weibo(self, id):
+        """获取长微博"""
+        for i in range(5):
+            url = 'https://m.weibo.cn/detail/%s' % id
+            async with httpx.AsyncClient() as client:
+                html = await client.get(url)
+            html = html.text
+            html = html[html.find('"status":'):]
+            html = html[:html.rfind('"hotScheme"')]
+            html = html[:html.rfind(',')]
+            html = '{' + html + '}'
+            js = json.loads(html, strict=False)
+            weibo_info = js.get('status')
+            if weibo_info:
+                weibo = self.parse_weibo(weibo_info)
+                return weibo
+            await asyncio.sleep(random.randint(6, 10))
+    def print_user_info(self):
+        """打印用户信息"""
+        logger.info('+' * 100)
+        logger.info(u'用户信息')
+        logger.info(u'用户id:%s' % self.user['id'])
+        logger.info(u'用户昵称:%s' % self.user['screen_name'])
+        gender = u'女' if self.user['gender'] == 'f' else u'男'
+        logger.info(u'性别:%s' % gender)
+        logger.info(u'生日:%s' % self.user['birthday'])
+        logger.info(u'所在地:%s' % self.user['location'])
+        logger.info(u'教育经历:%s' % self.user['education'])
+        logger.info(u'公司:%s' % self.user['company'])
+        logger.info(u'阳光信用:%s' % self.user['sunshine'])
+        logger.info(u'注册时间:%s' % self.user['registration_time'])
+        logger.info(u'微博数:%d' % self.user['statuses_count'])
+        logger.info(u'粉丝数:%d' % self.user['followers_count'])
+        logger.info(u'关注数:%d' % self.user['follow_count'])
+        logger.info(u'url:https://m.weibo.cn/profile/%s' % self.user['id'])
+        if self.user.get('verified_reason'):
+            logger.info(self.user['verified_reason'])
+        logger.info(self.user['description'])
+        logger.info('+' * 100)
+
+    async def get_one_weibo(self, info):
+        """获取一条微博的全部信息"""
+        try:
+            weibo_info = info['mblog']
+            weibo_id = weibo_info['id']
+            retweeted_status = weibo_info.get('retweeted_status')
+            is_long = weibo_info.get('isLongText')
+            if retweeted_status and retweeted_status.get('id'):  # 转发
+                retweet_id = retweeted_status.get('id')
+                is_long_retweet = retweeted_status.get('isLongText')
+                if is_long:
+                    weibo = await self.get_long_weibo(weibo_id)
+                    if not weibo:
+                        weibo = self.parse_weibo(weibo_info)
+                else:
+                    weibo = self.parse_weibo(weibo_info)
+                if is_long_retweet:
+                    retweet = await self.get_long_weibo(retweet_id)
+                    if not retweet:
+                        retweet = self.parse_weibo(retweeted_status)
+                else:
+                    retweet = self.parse_weibo(retweeted_status)
+                retweet['created_at'] = self.standardize_date(
+                    retweeted_status['created_at'])
+                weibo['retweet'] = retweet
+            else:  # 原创
+                if is_long:
+                    weibo = await self.get_long_weibo(weibo_id)
+                    if not weibo:
+                        weibo = self.parse_weibo(weibo_info)
+                else:
+                    weibo = self.parse_weibo(weibo_info)
+            weibo['created_at'] = self.standardize_date(
+                weibo_info['created_at'])
+            return weibo
+        except Exception as e:
+            logger.exception(e)
+            self.__recent = False
+
+    async def get_latest_weibos(self):
+        try:
+            latest_weibos = []
+            js = await self.get_weibo_json(1)
+            if js['ok']:
+                weibos = js['data']['cards']
+                for w in weibos:
+                    if w['card_type'] == 9:
+                        wb = await self.get_one_weibo(w)
+                        if wb:
+                            if not self.__recent:
+                                continue
+                            if wb["id"] in self.received_weibo_ids:
+                                continue
+                            if (not self.filter) or (
+                                    'retweet' not in wb.keys()):
+                                if len(self.last_5_weibos) == 5:
+                                    self.last_5_weibos.pop(0)
+                                self.last_5_weibos.append(wb)
+
+                                latest_weibos.append(wb)
+                                self.received_weibo_ids.append(wb["id"])
+                                self.print_weibo(wb)
+
+            return latest_weibos
+        except Exception as e:
+            logger.exception(e)
+            return []
\ No newline at end of file
diff --git a/hoshino/res.py b/hoshino/res.py
index 436e16959..6aace99c5 100644
--- a/hoshino/res.py
+++ b/hoshino/res.py
@@ -1,5 +1,8 @@
 import os
+import asyncio
 from PIL import Image
+import httpx
+from io import BytesIO
 from urllib.request import pathname2url
 from urllib.parse import urljoin
 
@@ -19,7 +22,28 @@ def get(path, *paths):
     def img(path, *paths):
         return ResImg(os.path.join('img', path, *paths))
 
+    @staticmethod
+    def remote_img(url):
+        return RemoteResImg(url)
+
+
+class RemoteResObj:
+    def __init__(self, url):
+        self.__path = url
+
+    @property
+    def url(self):
+        return self.__path
+
+
+class RemoteResImg(RemoteResObj):
+    @property
+    def cqcode(self) -> MessageSegment:
+        return MessageSegment.image(self.url)
+
+    async def open(self) -> Image.Image:
+        async with httpx.AsyncClient() as client:
+            r = await client.get(self.url)
+        return Image.open(BytesIO(r.content))
+
 
 class ResObj:
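A minimal sketch of the new remote-image helper added to hoshino/res.py, as wb_to_message uses it; the URL is a placeholder, and open() must be awaited inside a running event loop:

from hoshino.res import R

img = R.remote_img('https://wx1.sinaimg.cn/large/xxxx.jpg')  # placeholder URL
cq = img.cqcode                 # MessageSegment.image(...), embedded into the push text
# pil_image = await img.open()  # downloads the picture and returns a PIL Image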
diff --git a/requirements.txt b/requirements.txt
index 9bfe4eaa9..c2eb23792 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ Pillow>=6.2.1
 TwitterAPI>=2.5.10
 matplotlib>=3.2.0
 numpy>=1.18.0
-beautifulsoup4>=4.9.0
+httpx>=0.12.1
+beautifulsoup4>=4.9.0
\ No newline at end of file
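For manual testing, a rough sketch of driving the spider outside the bot; this assumes the hoshino package and its config are importable and m.weibo.cn is reachable, and the user_id is the one from the sample config:

import asyncio
from hoshino.modules.weibo.weibo import WeiboSpider

# __init__ validates the config, fetches user info and seeds the de-duplication buffer
spider = WeiboSpider({'user_id': '6603867494', 'filter': True})
print(spider.get_username())

loop = asyncio.get_event_loop()
new_weibos = loop.run_until_complete(spider.get_latest_weibos())
print(f'{len(new_weibos)} new weibos since the last poll')
for wb in spider.get_last_5_weibos():
    print(wb['created_at'], wb['text'][:40])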