Add feature: AI summarize article

cdhigh · Nov 17, 2024 · 36eb080 · 36eb080
1 parent e908894
commit 36eb080
Show file tree

Hide file tree

Showing 23 changed files with 1,032 additions and 226 deletions.
diff --git a/application/back_end/db_models.py b/application/back_end/db_models.py
@@ -5,12 +5,14 @@
 #Author: cdhigh <https://github.com/cdhigh>
 import os, random, datetime
 from operator import attrgetter
-from ..utils import PasswordManager, ke_encrypt, ke_decrypt, utcnow
+from ..utils import PasswordManager, ke_encrypt, ke_decrypt, utcnow, compare_version
 
+#Choose the model backend from the DATABASE_URL scheme and record the chosen category in DB_CATEGORY
 if os.getenv('DATABASE_URL', '').startswith(("datastore", "mongodb", "redis", "pickle")):
     from .db_models_nosql import *
+    #DB_CATEGORY is read by the schema create/upgrade functions in this module
     DB_CATEGORY = 'nosql'
 else:
     from .db_models_sql import *
     DB_CATEGORY = 'sql'
 
 class KeUser(MyBaseModel): # kindleEar User
     name = CharField(unique=True)
@@ -188,6 +190,7 @@ class Recipe(MyBaseModel):
     language = CharField(default='')
     translator = JSONField(default=JSONField.dict_default) #用于自定义RSS的备份，实际使用的是BookedRecipe
     tts = JSONField(default=JSONField.dict_default) #用于自定义RSS的备份，实际使用的是BookedRecipe
+    summarizer = JSONField(default=JSONField.dict_default) #用于自定义RSS的备份，实际使用的是BookedRecipe
     custom = JSONField(default=JSONField.dict_default) #留着扩展，避免后续一些小特性还需要升级数据表结构
 
     #在程序内其他地方使用的id，在数据库内则使用 self.id
@@ -221,6 +224,7 @@ class BookedRecipe(MyBaseModel):
     time = DateTimeField(default=utcnow) #源被订阅的时间，用于排序
     translator = JSONField(default=JSONField.dict_default)
     tts = JSONField(default=JSONField.dict_default)
+    summarizer = JSONField(default=JSONField.dict_default)
     custom = JSONField(default=JSONField.dict_default) #留着扩展，避免后续一些小特性还需要升级数据表结构
 
     @property
@@ -319,12 +323,25 @@ def set_value(cls, name, value):
 
 #创建数据库表格，一个数据库只需要创建一次
 def create_database_tables():
-    dbInstance.create_tables([KeUser, UserBlob, Recipe, BookedRecipe, DeliverLog, WhiteList,
-        SharedRss, SharedRssCategory, LastDelivered, InBox, AppInfo], safe=True)
-    if not AppInfo.get_value(AppInfo.dbSchemaVersion):
+    #Does nothing unless the backend category is 'sql'
+    if DB_CATEGORY != 'sql':
+        return
+
+    try:
+        connect_database()
+        #A missing AppInfo table is treated as "database not created yet"
+        if not AppInfo.table_exists():
+            default_log.warning("Database not found. Creating new database...")
+            dbInstance.create_tables([KeUser, UserBlob, Recipe, BookedRecipe, DeliverLog, WhiteList,
+                SharedRss, SharedRssCategory, LastDelivered, InBox, AppInfo], safe=True)
+            AppInfo.set_value(AppInfo.dbSchemaVersion, appVer)
+            default_log.warning("Created database tables successfully.")
+        else:
+            #The tables already exist: apply any pending schema upgrade instead
+            check_upgrade_database()
+    #NOTE(review): this handler also runs when an OperationalError escapes from any call inside
+    #the try block above, not only from connect_database() -- confirm that is intended
+    except OperationalError:
+        default_log.warning("Database not initialized or connection error. Creating new database...")
+        dbInstance.create_tables([KeUser, UserBlob, Recipe, BookedRecipe, DeliverLog, WhiteList,
+            SharedRss, SharedRssCategory, LastDelivered, InBox, AppInfo], safe=True)
         AppInfo.set_value(AppInfo.dbSchemaVersion, appVer)
-
-    return 'Created database tables successfully'
+        #The function no longer returns a status string; the outcome is only logged
+        default_log.warning("Created database tables successfully.")
 
 #删除所有表格的所有数据，相当于恢复出厂设置
 def delete_database_all_data():
@@ -334,3 +351,20 @@ def delete_database_all_data():
             model.delete().execute()
         except:
             pass
+
+#Upgrade the database schema in place; only used for SQL databases
+def check_upgrade_database():
+    if DB_CATEGORY != 'sql':
+        return
+
+    dbSchemaVersion = AppInfo.get_value(AppInfo.dbSchemaVersion, appVer)
+    #v3.2 added a 'summarizer' column to two tables
+    #NOTE(review): this relies on compare_version() returning a positive value when the stored
+    #schema version is older than '3.2' -- confirm against its definition in utils
+    if compare_version(dbSchemaVersion, '3.2') > 0:
+        default_log.warning(f"Upgrading database to version {appVer}...")
+        upgraded = True
+        #Alter each table on its own, so a failure on one table (for example the column already
+        #exists after a partially completed upgrade) does not prevent the other from being altered
+        for table in ('recipe', 'bookedrecipe'):
+            try:
+                dbInstance.execute_sql(f"ALTER TABLE {table} ADD COLUMN summarizer TEXT DEFAULT '{{}}';")
+            except OperationalError as e:
+                upgraded = False
+                default_log.warning(f"Column already exists or another issue occurred: {e}")
+        #Record the new schema version only when every table was altered successfully
+        if upgraded:
+            AppInfo.set_value(AppInfo.dbSchemaVersion, appVer)
+
diff --git a/application/lib/calibre/web/feeds/news.py b/application/lib/calibre/web/feeds/news.py
@@ -441,6 +441,7 @@ class BasicNewsRecipe(Recipe):
     # set by worker.py
     translator = {}
     tts = {}
+    summarizer = {}
     delivery_reason = 'cron'
 
     # See the built-in recipes for examples of these settings.
@@ -1130,12 +1131,16 @@ def _postprocess_html(self, soup, first_fetch, job_info):
         #for x in ans.find_all('mark'):
         #    x.name = 'strong'
 
-        #If tts need, tts propery is set by WorkerImpl
+        #If TTS is needed, the 'tts' property is set by WorkerImpl
         tts_enable = self.tts.get('enable')
         if tts_enable:
             self.audiofy_html(soup, title, job_info)
 
-        #If translation need, translator propery is set by WorkerImpl
+        #If AI summarization is needed, the 'summarizer' property is set by WorkerImpl
+        if self.summarizer.get('enable'):
+            self.summarize_html(soup, title)
+
+        #If translation is needed, the 'translator' property is set by WorkerImpl
         if self.translator.get('enable') and (tts_enable != 'audio_only'):
             self.translate_html(soup, title)
 
@@ -1534,9 +1539,9 @@ def build_index(self):
                 self.jobs.append(req)
 
         self.jobs_done = 0
-        trans_enable = self.translator.get('enable') or self.tts.get('enable')
-        #如果翻译使能，则不能使用多线程，否则容易触发流量告警导致IP被封锁
-        if (self.simultaneous_downloads > 1) and not trans_enable:
+        ai_enable = self.translator.get('enable') or self.tts.get('enable') or self.summarizer.get('enable')
+        #如果翻译/AI使能，则不能使用多线程，否则容易触发流量告警导致IP被封锁
+        if (self.simultaneous_downloads > 1) and not ai_enable:
             tp = ThreadPool(self.simultaneous_downloads)
             for req in self.jobs:
                 tp.putRequest(req, block=True, timeout=0)
@@ -2060,8 +2065,8 @@ def internal_postprocess_book(self, oeb, opts, log):
     #调用在线翻译服务平台，翻译html
     def translate_html(self, soup, title):
+        #Translate the article html by passing the soup to HtmlTranslator.translate_soup()
         from ebook_translator import HtmlTranslator
-        translator = HtmlTranslator(self.translator, self.simultaneous_downloads)
         self.log.info(f'Translating html [{title}]')
+        translator = HtmlTranslator(self.translator, self.simultaneous_downloads)
         translator.translate_soup(soup)
 
     #翻译Feed的title，toc时用到
@@ -2111,6 +2116,12 @@ def audiofy_html(self, soup, title, job_info):
         else:
             self.log.warning(f'Failed to audiofy "{title}": {ret["error"]}')
 
+    #Ask the AI service to write a summary for the article html
+    #soup: parsed article, handed to HtmlSummarizer.summarize_soup()
+    #title: article title, only used for the log message
+    def summarize_html(self, soup, title):
+        from ebook_summarizer import HtmlSummarizer
+        self.log.info(f'Summarizing html [{title}]')
+        summarizer = HtmlSummarizer(self.summarizer)
+        summarizer.summarize_soup(soup)
+
 class CustomIndexRecipe(BasicNewsRecipe):
 
     def custom_index(self):

diff --git a/application/lib/ebook_summarizer/__init__.py b/application/lib/ebook_summarizer/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+from .html_summarizer import get_summarizer_engines, HtmlSummarizer
diff --git a/application/lib/ebook_summarizer/html_summarizer.py b/application/lib/ebook_summarizer/html_summarizer.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+#用AI对文章进行摘要
+#Author: cdhigh <https://github.com/cdhigh>
+import re, time
+import simple_ai_provider
+
+#Return the table of AI engines known to simple_ai_provider (its _PROV_AI_LIST object itself, not a copy)
+def get_summarizer_engines():
+    engines = simple_ai_provider._PROV_AI_LIST
+    return engines
+
+#Generates an AI-written summary of an article and inserts it into the article's html.
+#params: dict of settings; the keys read by this class are 'engine', 'api_key', 'model',
+#'api_host', 'summary_size', 'summary_lang' and 'summary_style'.
+class HtmlSummarizer:
+    def __init__(self, params: dict):
+        self.params = params
+        engineName = self.params.get('engine')
+        #Fall back to 'gemini' when the configured engine is missing or unknown
+        if engineName not in simple_ai_provider._PROV_AI_LIST:
+            engineName = 'gemini'
+        self.engineName = engineName
+        self.engineProperty = simple_ai_provider._PROV_AI_LIST.get(engineName, {})
+        self.aiAgent = self.create_engine(self.engineName, params)
+
+    #Create the AI provider wrapper instance
+    def create_engine(self, engineName, params):
+        return simple_ai_provider.SimpleAiProvider(engineName, params.get('api_key', ''),
+            model=params.get('model', ''), api_host=params.get('api_host', ''))
+
+    #Summarize a piece of text. Keep the input short: only the first chunk is sent to the AI.
+    #Returns {'error': '', 'summary': '<summary text>'}.
+    #NOTE: exceptions raised by the AI provider are not caught here and propagate to the caller,
+    #so 'error' is always an empty string.
+    def summarize_text(self, text):
+        #Token counts do not map cleanly onto text length across languages (roughly 4 bytes per
+        #token for English, 1-2 for Chinese), so be conservative: treat one token as one byte,
+        #then subtract the cost of the prompt
+        chunkSize = max(self.engineProperty.get('context_size', 4096) - 200, 2000)
+        summarySize = self.params.get('summary_size', 200)
+        lang = self.params.get('summary_lang', '')
+        target = f"in {lang}" if lang else "in the same language as the original text"
+        summaryTips = (f"Summarize the following text {target}. The summary should accurately represent the content "
+            f"and be no more than {summarySize} words:\n\n")
+
+        #Drop anything that looks like an html tag, then truncate to a single chunk
+        text = re.sub(r'<[^>]+>', '', text)[:chunkSize]
+        summary = self.aiAgent.chat(f"{summaryTips}{text}")
+        return {'error': '', 'summary': summary}
+
+    #Summarize a long html article with the "refine" approach: the text is split into chunks and
+    #the summary is updated with each chunk in turn; the result is inserted into the soup.
+    #soup: BeautifulSoup instance
+    #chunkSize: size of each text chunk; overrides the engine default when given
+    #maxIterations: maximum number of chunks to process, to bound the execution time
+    #Returns None. The soup is left untouched when there is no <body>, the body has no text,
+    #the AI request fails, or no summary is produced.
+    def summarize_soup(self, soup, chunkSize=None, maxIterations=5):
+        body = soup.find('body')
+        if not body:
+            return
+        #Nothing to summarize: skip the AI request for empty or whitespace-only articles
+        text = body.get_text().strip()
+        if not text:
+            return
+
+        #Same conservative one-token-per-byte estimate as in summarize_text(), minus roughly
+        #500 bytes for the prompt
+        if not chunkSize:
+            chunkSize = self.engineProperty.get('context_size', 4096) - 500
+        chunkSize = max(chunkSize, 2000)
+
+        #Fixed-size chunking is crude (splitting on paragraphs might be better), but the AI copes
+        #well and the chunks are only used to build a summary, so the simple scheme is acceptable
+        chunks = [text[i:i + chunkSize] for i in range(0, len(text), chunkSize)]
+        summarySize = self.params.get('summary_size', 200)
+        interval = self.engineProperty.get('request_interval', 0)
+        summary = None
+
+        lang = self.params.get('summary_lang', '')
+        langTips = f"the language: {lang}" if lang else "the same language as the article/preset summary"
+        summaryTips = (f"Please refine or update the summary based on the following text block, ensuring the summary is in {langTips}, "
+            "and make it more accurately reflect the article content:\n\n")
+        errMsg = ''
+        for i, chunk in enumerate(chunks[:maxIterations]):
+            prompt = (
+                f"The current summary is:\n{summary}\n\n{summaryTips}"
+                f"Text block {i + 1}:\n{chunk}\n\n"
+                f"Please generate an updated summary of no more than {summarySize} words."
+            )
+            try:
+                summary = self.aiAgent.chat(prompt)
+            except Exception as e:
+                errMsg = str(e)
+                break
+            #Pause between requests when the engine declares a request interval
+            if interval > 0:
+                time.sleep(interval)
+
+        if errMsg:
+            #NOTE(review): default_log is not imported in this module; presumably it is made
+            #available globally by the application -- confirm
+            default_log.info(f'Error in summary_soup: {errMsg}')
+            return
+        #No chunk was processed (e.g. maxIterations=0) or the AI returned nothing: insert nothing
+        if not summary:
+            return
+
+        #Insert the summary right after the article title
+        summaryTag = soup.new_tag('p', attrs={'class': 'ai_generated_summary'})
+        style = self.params.get('summary_style', '')
+        if style:
+            summaryTag['style'] = style
+        b = soup.new_tag('b')
+        b.string = 'AI-Generated Summary: '
+        summaryTag.append(b)
+        summaryTag.append(summary)
+
+        hTag = body.find(['h1','h2']) #type:ignore
+        #An H1/H2 preceded by a sibling holding 100 or more characters of text is taken to be
+        #in the middle of the article rather than being its title
+        if hTag and all(len(tag.get_text(strip=True)) < 100 for tag in hTag.previous_siblings): #type:ignore
+            hTag.insert_after(summaryTag)
+        else:
+            body.insert(0, summaryTag)
diff --git a/application/lib/ebook_translator/html_translator.py b/application/lib/ebook_translator/html_translator.py
@@ -2,7 +2,7 @@
 # -*- coding:utf-8 -*-
 # 调用在线翻译服务，翻译html文件，移植了calibre的 Ebook Translator 插件的在线翻译接口实现
 import re, time, copy
-from bs4 import BeautifulSoup, NavigableString
+from bs4 import BeautifulSoup, NavigableString, Tag
 from ebook_translator.engines import *
 from application.utils import loc_exc_pos
 
@@ -123,6 +123,10 @@ def _tag_has_only_text(tag):
         #position: 翻译后的文本显示的位置
         def _extract(tag, position):
             for child in tag.find_all(recursive=False):
+                #跳过AI自动生成的摘要
+                if isinstance(child, Tag) and 'ai_generated_summary' in child.get('class', []):
+                    continue
+
                 if _contains_text(child) and not _tag_is_filtered(child):
                     text = str(child).strip() if position == 'replace' else child.get_text().strip()
                     if text and _tag_has_only_text(child) or len(text) < maxLen: