Skip to content

Commit

Permalink
Add feature: AI summarize article
Browse files Browse the repository at this point in the history
  • Loading branch information
cdhigh committed Nov 17, 2024
1 parent e908894 commit 36eb080
Show file tree
Hide file tree
Showing 23 changed files with 1,032 additions and 226 deletions.
46 changes: 40 additions & 6 deletions application/back_end/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
#Author: cdhigh <https://github.com/cdhigh>
import os, random, datetime
from operator import attrgetter
from ..utils import PasswordManager, ke_encrypt, ke_decrypt, utcnow
from ..utils import PasswordManager, ke_encrypt, ke_decrypt, utcnow, compare_version

if os.getenv('DATABASE_URL', '').startswith(("datastore", "mongodb", "redis", "pickle")):
from .db_models_nosql import *
DB_CATEGORY = 'nosql'
else:
from .db_models_sql import *
DB_CATEGORY = 'sql'

class KeUser(MyBaseModel): # kindleEar User
name = CharField(unique=True)
Expand Down Expand Up @@ -188,6 +190,7 @@ class Recipe(MyBaseModel):
language = CharField(default='')
translator = JSONField(default=JSONField.dict_default) #用于自定义RSS的备份,实际使用的是BookedRecipe
tts = JSONField(default=JSONField.dict_default) #用于自定义RSS的备份,实际使用的是BookedRecipe
summarizer = JSONField(default=JSONField.dict_default) #用于自定义RSS的备份,实际使用的是BookedRecipe
custom = JSONField(default=JSONField.dict_default) #留着扩展,避免后续一些小特性还需要升级数据表结构

#在程序内其他地方使用的id,在数据库内则使用 self.id
Expand Down Expand Up @@ -221,6 +224,7 @@ class BookedRecipe(MyBaseModel):
time = DateTimeField(default=utcnow) #源被订阅的时间,用于排序
translator = JSONField(default=JSONField.dict_default)
tts = JSONField(default=JSONField.dict_default)
summarizer = JSONField(default=JSONField.dict_default)
custom = JSONField(default=JSONField.dict_default) #留着扩展,避免后续一些小特性还需要升级数据表结构

@property
Expand Down Expand Up @@ -319,12 +323,25 @@ def set_value(cls, name, value):

#创建数据库表格,一个数据库只需要创建一次
def create_database_tables():
dbInstance.create_tables([KeUser, UserBlob, Recipe, BookedRecipe, DeliverLog, WhiteList,
SharedRss, SharedRssCategory, LastDelivered, InBox, AppInfo], safe=True)
if not AppInfo.get_value(AppInfo.dbSchemaVersion):
if DB_CATEGORY != 'sql':
return

try:
connect_database()
if not AppInfo.table_exists():
default_log.warning("Database not found. Creating new database...")
dbInstance.create_tables([KeUser, UserBlob, Recipe, BookedRecipe, DeliverLog, WhiteList,
SharedRss, SharedRssCategory, LastDelivered, InBox, AppInfo], safe=True)
AppInfo.set_value(AppInfo.dbSchemaVersion, appVer)
default_log.warning("Created database tables successfully.")
else:
check_upgrade_database()
except OperationalError:
default_log.warning("Database not initialized or connection error. Creating new database...")
dbInstance.create_tables([KeUser, UserBlob, Recipe, BookedRecipe, DeliverLog, WhiteList,
SharedRss, SharedRssCategory, LastDelivered, InBox, AppInfo], safe=True)
AppInfo.set_value(AppInfo.dbSchemaVersion, appVer)

return 'Created database tables successfully'
default_log.warning("Created database tables successfully.")

#删除所有表格的所有数据,相当于恢复出厂设置
def delete_database_all_data():
Expand All @@ -334,3 +351,20 @@ def delete_database_all_data():
model.delete().execute()
except:
pass

#升级数据库,仅用于SQL数据库
def check_upgrade_database():
    """Apply pending schema migrations; only meaningful for SQL back ends.

    Does nothing when ``DB_CATEGORY`` is not ``'sql'``. Otherwise the stored
    schema version is read from ``AppInfo`` (falling back to ``appVer``) and,
    when ``compare_version(stored, '3.2') > 0``, the ``summarizer`` column
    introduced in v3.2 is added to the ``recipe`` and ``bookedrecipe`` tables
    and the stored schema version is set to ``appVer``.

    An ``OperationalError`` raised by any of these statements is logged as a
    warning and swallowed; in that case the schema version is left untouched.

    NOTE(review): whether ``> 0`` means "stored version is older than 3.2"
    depends on the argument order/sign convention of ``compare_version`` --
    confirm against its definition in ``utils``.
    """
    if DB_CATEGORY != 'sql':
        return

    stored_version = AppInfo.get_value(AppInfo.dbSchemaVersion, appVer)
    needs_summarizer_column = compare_version(stored_version, '3.2') > 0
    if not needs_summarizer_column:
        return

    #v3.2 added a 'summarizer' column to two tables
    migration_statements = (
        "ALTER TABLE recipe ADD COLUMN summarizer TEXT DEFAULT '{}';",
        "ALTER TABLE bookedrecipe ADD COLUMN summarizer TEXT DEFAULT '{}';",
    )
    default_log.warning(f"Upgrading database to version {appVer}...")
    try:
        for statement in migration_statements:
            dbInstance.execute_sql(statement)
        AppInfo.set_value(AppInfo.dbSchemaVersion, appVer)
    except OperationalError as e:
        default_log.warning(f"Column already exists or another issue occurred: {e}")

23 changes: 17 additions & 6 deletions application/lib/calibre/web/feeds/news.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,7 @@ class BasicNewsRecipe(Recipe):
# set by worker.py
translator = {}
tts = {}
summarizer = {}
delivery_reason = 'cron'

# See the built-in recipes for examples of these settings.
Expand Down Expand Up @@ -1130,12 +1131,16 @@ def _postprocess_html(self, soup, first_fetch, job_info):
#for x in ans.find_all('mark'):
# x.name = 'strong'

#If tts need, tts propery is set by WorkerImpl
#If TTS is needed, the 'tts' property is set by WorkerImpl
tts_enable = self.tts.get('enable')
if tts_enable:
self.audiofy_html(soup, title, job_info)

#If translation need, translator propery is set by WorkerImpl
#If AI summarization is needed, the 'summarizer' property is set by WorkerImpl
if self.summarizer.get('enable'):
self.summarize_html(soup, title)

#If translation is needed, the 'translator' property is set by WorkerImpl
if self.translator.get('enable') and (tts_enable != 'audio_only'):
self.translate_html(soup, title)

Expand Down Expand Up @@ -1534,9 +1539,9 @@ def build_index(self):
self.jobs.append(req)

self.jobs_done = 0
trans_enable = self.translator.get('enable') or self.tts.get('enable')
#如果翻译使能,则不能使用多线程,否则容易触发流量告警导致IP被封锁
if (self.simultaneous_downloads > 1) and not trans_enable:
ai_enable = self.translator.get('enable') or self.tts.get('enable') or self.summarizer.get('enable')
#如果翻译/AI使能,则不能使用多线程,否则容易触发流量告警导致IP被封锁
if (self.simultaneous_downloads > 1) and not ai_enable:
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
Expand Down Expand Up @@ -2060,8 +2065,8 @@ def internal_postprocess_book(self, oeb, opts, log):
#调用在线翻译服务平台,翻译html
def translate_html(self, soup, title):
from ebook_translator import HtmlTranslator
translator = HtmlTranslator(self.translator, self.simultaneous_downloads)
self.log.info(f'Translating html [{title}]')
translator = HtmlTranslator(self.translator, self.simultaneous_downloads)
translator.translate_soup(soup)

#翻译Feed的title,toc时用到
Expand Down Expand Up @@ -2111,6 +2116,12 @@ def audiofy_html(self, soup, title, job_info):
else:
self.log.warning(f'Failed to audiofy "{title}": {ret["error"]}')

#调用AI服务给html写一个摘要
def summarize_html(self, soup, title):
    """Ask the AI service for a summary of the article and insert it into ``soup``.

    soup: parsed HTML of the article; it is modified in place by the summarizer.
    title: article title, used only for the log message.
    """
    #Local import: the summarizer package is only loaded when this method runs
    from ebook_summarizer import HtmlSummarizer
    self.log.info(f'Summarizing html [{title}]')
    summarizer = HtmlSummarizer(self.summarizer)
    summarizer.summarize_soup(soup)

class CustomIndexRecipe(BasicNewsRecipe):

def custom_index(self):
Expand Down
3 changes: 3 additions & 0 deletions application/lib/ebook_summarizer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from .html_summarizer import get_summarizer_engines, HtmlSummarizer
117 changes: 117 additions & 0 deletions application/lib/ebook_summarizer/html_summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#用AI对文章进行摘要
#Author: cdhigh <https://github.com/cdhigh>
import re, time
import simple_ai_provider

def get_summarizer_engines():
    """Return the provider table ``simple_ai_provider._PROV_AI_LIST``.

    The same table is used by ``HtmlSummarizer`` to validate the configured
    engine name and to look up per-engine properties.
    """
    engines = simple_ai_provider._PROV_AI_LIST
    return engines

class HtmlSummarizer:
    """Generate an AI-written summary of an article and insert it into its HTML.

    ``params`` is the summarizer settings dict. Keys read by this class:
    ``engine``, ``api_key``, ``model``, ``api_host``, ``summary_size``,
    ``summary_lang`` and ``summary_style``.
    """

    def __init__(self, params: dict):
        self.params = params
        engineName = self.params.get('engine')
        #A missing or unknown engine name falls back to 'gemini'
        if engineName not in simple_ai_provider._PROV_AI_LIST:
            engineName = 'gemini'
        self.engineName = engineName
        self.engineProperty = simple_ai_provider._PROV_AI_LIST.get(engineName, {})
        self.aiAgent = self.create_engine(self.engineName, params)

    def create_engine(self, engineName, params):
        """Create the AI provider wrapper for ``engineName`` using the credentials in ``params``."""
        return simple_ai_provider.SimpleAiProvider(engineName, params.get('api_key', ''),
            model=params.get('model', ''), api_host=params.get('api_host', ''))

    def _chunk_size(self, promptOverhead, chunkSize=None):
        """Return how many characters of article text to send per request.

        Tokens do not map cleanly onto text length across languages (roughly
        four bytes per token for English, one or two for Chinese), so this is
        deliberately conservative: one token is treated as one character and
        ``promptOverhead`` is subtracted for the instructions. The result is
        never smaller than 2000.
        """
        if not chunkSize:
            chunkSize = self.engineProperty.get('context_size', 4096) - promptOverhead
        return max(chunkSize, 2000)

    def summarize_text(self, text):
        """Summarize one piece of text with a single request.

        HTML tags are stripped and the text is truncated to one chunk before
        it is sent. Returns ``{'error': str, 'summary': str}``.

        NOTE: exceptions raised by the provider are not caught here and
        propagate to the caller, so ``'error'`` is currently always ``''``.
        """
        chunkSize = self._chunk_size(200)
        summarySize = self.params.get('summary_size', 200)
        errMsg = ''
        langClause = self.params.get('summary_lang', '') or 'the same language as the original text'
        summaryTips = (f"Summarize the following text in {langClause}. The summary should accurately represent the content "
            f"and be no more than {summarySize} words:\n\n")

        text = re.sub(r'<[^>]+>', '', text)[:chunkSize]
        summary = self.aiAgent.chat(f"{summaryTips}{text}")
        return {'error': errMsg, 'summary': summary}

    def summarize_soup(self, soup, chunkSize=None, maxIterations=5):
        """Summarize a long article with the "refine" strategy and insert the result.

        The body text is split into fixed-size chunks; each request hands the
        model the summary so far plus the next chunk and asks for an updated
        summary. The final summary is inserted as
        ``<p class="ai_generated_summary">`` right after the title heading, or
        at the top of ``<body>`` when no suitable heading is found.

        soup: BeautifulSoup instance, modified in place.
        chunkSize: characters of text per request; overrides the engine default
            (values below 2000 are raised to 2000).
        maxIterations: maximum number of chunks to process, to bound run time.

        Returns None. Nothing is inserted when there is no ``<body>``, the body
        has no text, a request fails (the error is logged), or no summary was
        produced.
        """
        body = soup.find('body')
        if not body:
            return
        text = body.get_text()
        #Nothing to summarize: skip the AI round-trip rather than crash or insert a bogus summary
        if not text or not text.strip():
            return

        chunkSize = self._chunk_size(500, chunkSize)

        #Plain fixed-width chunking is crude (splitting on paragraphs might be nicer), but the
        #model copes well and the output is only a summary, so the simple approach is good enough
        chunks = [text[i:i + chunkSize] for i in range(0, len(text), chunkSize)]
        summarySize = self.params.get('summary_size', 200)
        interval = self.engineProperty.get('request_interval', 0)
        summary = None

        lang = self.params.get('summary_lang', '')
        langClause = f"the language: {lang}" if lang else "the same language as the article/preset summary"
        summaryTips = ("Please refine or update the summary based on the following text block, "
            f"ensuring the summary is in {langClause}, and make it more accurately reflect the article content:\n\n")

        errMsg = ''
        for i, chunk in enumerate(chunks[:maxIterations]):
            prompt = (
                f"The current summary is:\n{summary}\n\n{summaryTips}"
                f"Text block {i + 1}:\n{chunk}\n\n"
                f"Please generate an updated summary of no more than {summarySize} words."
            )
            try:
                summary = self.aiAgent.chat(prompt)
            except Exception as e:
                errMsg = str(e)
                break
            #Honor the engine's minimum delay between requests
            if interval > 0:
                time.sleep(interval)

        if errMsg:
            default_log.info(f'Error in summary_soup: {errMsg}')
            return

        #No chunk was processed (e.g. maxIterations <= 0) or the provider returned nothing:
        #appending None/'' would raise or leave a bare "AI-Generated Summary:" label
        if not summary:
            return

        #Insert the summary right after the article title
        summaryTag = soup.new_tag('p', attrs={'class': 'ai_generated_summary'})
        style = self.params.get('summary_style', '')
        if style:
            summaryTag['style'] = style
        b = soup.new_tag('b')
        b.string = 'AI-Generated Summary: '
        summaryTag.append(b)
        summaryTag.append(summary)

        hTag = body.find(['h1','h2']) #type:ignore
        #A heading preceded by a sibling holding 100+ characters of text sits in the middle
        #of the article, so it is not treated as the title
        if hTag and all(len(tag.get_text(strip=True)) < 100 for tag in hTag.previous_siblings): #type:ignore
            hTag.insert_after(summaryTag)
        else:
            body.insert(0, summaryTag)
6 changes: 5 additions & 1 deletion application/lib/ebook_translator/html_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -*- coding:utf-8 -*-
# 调用在线翻译服务,翻译html文件,移植了calibre的 Ebook Translator 插件的在线翻译接口实现
import re, time, copy
from bs4 import BeautifulSoup, NavigableString
from bs4 import BeautifulSoup, NavigableString, Tag
from ebook_translator.engines import *
from application.utils import loc_exc_pos

Expand Down Expand Up @@ -123,6 +123,10 @@ def _tag_has_only_text(tag):
#position: 翻译后的文本显示的位置
def _extract(tag, position):
for child in tag.find_all(recursive=False):
#跳过AI自动生成的摘要
if isinstance(child, Tag) and 'ai_generated_summary' in child.get('class', []):
continue

if _contains_text(child) and not _tag_is_filtered(child):
text = str(child).strip() if position == 'replace' else child.get_text().strip()
if text and _tag_has_only_text(child) or len(text) < maxLen:
Expand Down
Loading

0 comments on commit 36eb080

Please sign in to comment.