Skip to content

Commit

Permalink
add pymp3cat to work in gae
Browse files Browse the repository at this point in the history
  • Loading branch information
cdhigh committed Apr 23, 2024
1 parent 43dd3fe commit 9b4d37a
Show file tree
Hide file tree
Showing 9 changed files with 460 additions and 41 deletions.
23 changes: 16 additions & 7 deletions application/lib/calibre/web/feeds/news.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,7 +764,7 @@ def index_to_soup(self, url_or_raw, raw=False, as_tree=False, save_raw=None):
return parse(_raw)
return BeautifulSoup(_raw, 'lxml')

#提取正文
#使用自动算法提取正文
def extract_readable_article(self, html, url):
try:
doc = readability.Document(html, positive_keywords=self.auto_cleanup_keep, url=url)
Expand All @@ -780,7 +780,7 @@ def extract_readable_article(self, html, url):
body_tag = soup.find('body')

#如果readability解析失败,则启用备用算法(不够好,但有全天候适应能力)
if not body_tag or len(body_tag.contents) == 0:
if not body_tag or len(body_tag.get_text(strip=True)) < 100:
soup = simple_extract(html)
body_tag = soup.find('body')
if not body_tag or len(body_tag.contents) == 0: #再次失败
Expand Down Expand Up @@ -2034,8 +2034,8 @@ def audiofy_html(self, soup, title, job_info):
if not self.tts.get('audio_dir'):
system_temp_dir = os.environ.get('KE_TEMP_DIR')
self.tts['audio_dir'] = PersistentTemporaryDirectory(prefix='tts_', dir=system_temp_dir)
if not self.tts.get('audios'):
self.tts['audios'] = []
if not self.tts.get('audio_files'):
self.tts['audio_files'] = []
audio_dir = self.tts['audio_dir']
ext = ret['mime'].split('/')[-1]
ext = {'mpeg': 'mp3'}.get(ext, ext)
Expand All @@ -2045,7 +2045,7 @@ def audiofy_html(self, soup, title, job_info):
try:
with open(filename, 'wb') as f:
f.write(audio)
self.tts['audios'].append(filename)
self.tts['audio_files'].append(filename)
except Exception as e:
self.log.warning(f'Failed to write "{filename}": {e}')
else:
Expand Down Expand Up @@ -2221,8 +2221,17 @@ def preprocess_raw_html(self, raw_html, url):
for rules in self.content_extract_rules:
newBody.extend(self.get_tags_from_rules(soup, rules))

oldBody.replace_with(newBody)
return str(soup)
#提取失败,尝试自动提取
if len(newBody.get_text(strip=True)) < 100:
self.log.warning(f'Failed to extract content using content_extract_rules, try readability algorithm: {url}')
try:
raw_html = self.extract_readable_article(raw_html, url)
except:
self.log.warning(f'Failed to auto cleanup URL: {url}')
return raw_html
else:
oldBody.replace_with(newBody)
return str(soup)

#根据一个规则列表,从soup中获取符合条件的tag列表
#rules: 字符串列表或字典列表
Expand Down
Loading

0 comments on commit 9b4d37a

Please sign in to comment.