Skip to content

Commit

Permalink
improve dict feature
Browse files Browse the repository at this point in the history
  • Loading branch information
cdhigh committed Jun 17, 2024
1 parent 5790ede commit be3086c
Show file tree
Hide file tree
Showing 7 changed files with 307 additions and 45 deletions.
2 changes: 1 addition & 1 deletion application/lib/calibre/web/feeds/news.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,7 +1070,7 @@ def _postprocess_html(self, soup, first_fetch, job_info):
# for x in soup.find_all(attrs={attr: True}):
# del x[attr]

for bad_tag in list(soup.find_all(['base', 'iframe', 'canvas', 'embed',
for bad_tag in list(soup.find_all(['base', 'iframe', 'canvas', 'embed', 'source',
'command', 'datalist', 'video', 'audio', 'noscript', 'link', 'meta', 'button'])):
bad_tag.extract()

Expand Down
61 changes: 50 additions & 11 deletions application/lib/dictionary/mdict/mdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#mdict offline dictionary support
import os, re, zlib, json
import os
from bs4 import BeautifulSoup
from application.utils import xml_escape
from .readmdict import MDX
try:
import marisa_trie
Expand Down Expand Up @@ -79,6 +80,8 @@ def __init__(self, fname, encoding="", substyle=False, passcode=None):
return

#重建索引
#为什么不使用单独的后台任务自动重建索引?是因为运行时间还不是最重要的约束,而是服务器内存
#如果是大词典,内存可能要爆,怎么运行都不行,如果是小词典,则时间可以接受
default_log.info(f"Building trie for {dictName}")
#为了能制作大词典,mdx中这些数据都是64bit的,但是为了节省空间,这里只使用32bit保存(>LLLLLL)
self.trie = marisa_trie.RecordTrie(self.TRIE_FMT, self.mdx.get_index()) #type:ignore
Expand All @@ -95,6 +98,10 @@ def get(self, word):
if not self.trie:
return ''
word = word.lower().strip()
#和mdict官方应用一样,输入:about返回词典基本信息
if word == ':about':
return self.dictHtmlInfo()

indexes = self.trie[word] if word in self.trie else None
ret = self.get_content_by_Index(indexes)
if ret.startswith('@@@LINK='):
Expand All @@ -119,18 +126,37 @@ def post_process(self, content):

soup = BeautifulSoup(content, 'html.parser') #html.parser不会自动添加html/body

#删除图像
for tag in soup.find_all('img'):
#浏览器不支持 entry:// 协议,会直接拦截导致无法跳转,
#预先将其转换为 https://kindleear/entry/ 前缀,然后在js里面判断这个前缀
for tag in soup.find_all('a', href=True):
href = tag['href']
if href.startswith('entry://'):
tag['href'] = f'https://kindleear/entry/{href[8:]}'

#kindle对html支持很差,有一些词典又使用到这些标签
for tag in soup.find_all(['article', 'aside', 'header', 'footer', 'nav', 'main',
'figcaption', 'figure', 'section', 'time']):
tag.name = 'div'

#删除多媒体资源和脚本
for tag in list(soup.find_all(['img', 'script', 'base', 'iframe', 'canvas', 'embed', 'source',
'command', 'datalist', 'video', 'audio', 'noscript', 'meta', 'button'])):
tag.extract()

self.adjust_css(soup)
#self.inline_css(soup) #碰到稍微复杂一些的CSS文件性能就比较低下,暂时屏蔽对CSS文件的支持
self.remove_empty_tags(soup)
self.inline_css(soup)
#self.remove_empty_tags(soup)

body = soup.body
if body:
body.name = 'div'
tag = soup.head
if tag:
tag.extract()

#mdict质量良莠不齐,有些词典在html/body外写释义
#所以不能直接提取body内容,直接修改为div简单粗暴也有效
for tag in (soup.html, soup.body):
if tag:
tag.name = 'div'

return str(soup)

#调整一些CSS
Expand All @@ -149,8 +175,9 @@ def adjust_css(self, soup):
#将外部单独css文件的样式内联到html标签中
def inline_css(self, soup):
link = soup.find('link', attrs={'rel': 'stylesheet', 'href': True})
if not link:
return
if link:
link.extract()
return #碰到稍微复杂一些的CSS文件性能就比较低下,暂时屏蔽对CSS文件的支持

link.extract()
css = ''
Expand Down Expand Up @@ -211,3 +238,15 @@ def remove_empty_tags(self, soup, preserve_tags=None):
self.remove_empty_tags(tag, preserve_tags)
for tag in empty_tags:
tag.decompose()

#返回当前词典的基本信息,html格式
def dictHtmlInfo(self):
    """Return basic information about the current dictionary as an HTML fragment.

    The MDX header is copied first, so the pops below act on the copy.
    Output order: title, description, every remaining header field, and
    finally the stylesheet (passed through ``xml_escape``, with real newlines
    shown as a literal backslash-n).
    """
    info = self.mdx.header.copy()
    title = info.pop('Title', '')
    description = info.pop('Description', '')
    # Popped before the generic loop so StyleSheet is only emitted once, at the end.
    css = xml_escape(info.pop('StyleSheet', '').replace('\n', '\\n'))

    parts = [
        '<strong>{}</strong><hr/>'.format(title),
        '<b>Description:</b><br/>{}<br/><hr/>'.format(description),
    ]
    parts.extend('<b>{}:</b>&nbsp;&nbsp;{}<br/>'.format(name, value)
                 for name, value in info.items())
    parts.append('<b>StyleSheet:</b>{}<br/>'.format(css))
    return ''.join(parts)
100 changes: 100 additions & 0 deletions application/lib/dictionary/mdict/readmdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -823,6 +823,106 @@ def get_content_by_Index(self, indexes) -> str:
txt = b'<hr/>'.join(ret).decode(self.encoding)
return self._substitute_stylesheet(txt) if self.stylesheet else txt

def compare_keys(self, key1, key2):
    """Three-way compare two keys in the dictionary's sort order.

    Sorting rules (from the original notes):
    - ``KeyCaseSensitive`` in the header says whether ordering is case
      sensitive; when it is ``No`` keys are lower-cased before comparing.
    - ``StripKey`` only applies to MDX: ``Yes`` means only letters take part
      in the ordering (spaces and symbols are removed), ``No`` means letters,
      spaces and symbols all take part.
    - MDX encodings: utf-8, utf-16, gb18030 (incl. gbk/gb2312), BIG5,
      ISO8859-1. MDD keys are utf-16le.
    - UTF-16 dictionaries are decoded as utf-16le but ordered by their
      utf-16be bytes; BIG-5 and non-MDX files are ordered by utf-8 bytes;
      everything else by the bytes of its own encoding.

    Both keys are first normalised by ``self.process_str_keys`` (bytes are
    decoded there, so case folding happens on ``str``).

    @param key1: the key user input
    @param key2: the key from the file
    @return: -1 if key1 sorts before key2, 0 if equal, 1 if after
    """
    key1 = self.process_str_keys(key1)
    key2 = self.process_str_keys(key2)

    # Pick the byte encoding whose ordering matches the file's key order.
    if self.__class__.__name__ != 'MDX':
        sort_encoding = 'utf-8'
    elif self.encoding == 'UTF-16':
        sort_encoding = 'utf-16be'
    elif self.encoding == 'BIG-5':
        sort_encoding = 'utf-8'
    else:
        sort_encoding = self.encoding

    # errors='ignore': characters the target encoding cannot represent are
    # dropped instead of raising.
    t_key1 = key1.encode(sort_encoding, errors='ignore')
    t_key2 = key2.encode(sort_encoding, errors='ignore')
    return (t_key1 > t_key2) - (t_key1 < t_key2)

def lower_str_keys(self, key):
    """Lower-case *key* unless the dictionary is case sensitive.

    The key is returned untouched only when the header field
    ``KeyCaseSensitive`` is exactly ``'Yes'``.
    """
    if self.header.get('KeyCaseSensitive') == 'Yes':
        return key
    return key.lower()

def strip_key(self):
    """Set ``self._strip_key`` from the header's ``StripKey`` field.

    Flag values: 0 = False, 1 = True, 2 = not specified (field missing or
    neither ``'Yes'`` nor ``'No'``). For MDD the flag is always 0.
    """
    declared = self.header.get('StripKey')
    if self.__class__.__name__ == 'MDD':
        # MDD overrides whatever the header declares.
        flag = 0
    elif declared == 'Yes':
        flag = 1
    elif declared == 'No':
        flag = 0
    else:
        flag = 2
    self._strip_key = flag

def process_str_keys(self, key):
    """Normalise a key to the ``str`` form used when comparing keys.

    ``bytes`` input is decoded: for MDX leniently (``errors='ignore'``), using
    utf-16le when ``self.encoding`` is ``'UTF-16'``; for any other class
    strictly with ``self.encoding``. When ``self._strip_key == 1`` spaces,
    tabs and the listed punctuation are removed. Case handling is delegated
    to ``lower_str_keys``.
    """
    if isinstance(key, bytes):
        if self.__class__.__name__ != 'MDX':
            key = key.decode(self.encoding)
        elif self.encoding == 'UTF-16':
            key = key.decode('utf-16le', errors='ignore')
        else:
            # Original note: ISO8859-1 dictionaries containing Chinese text
            # raise a latin-1 UnicodeDecodeError, hence errors='ignore'.
            key = key.decode(self.encoding, errors='ignore')

    if self._strip_key == 1:
        key = re.sub(r'[ _=,.;:!?@%&#~`()\[\]<>{}/\\\$\+\-\*\^\'"\t|]', '', key)

    # Original note: strip() must not be applied here.
    return self.lower_str_keys(key)

if __name__ == "__main__":
import sys
import os
Expand Down
63 changes: 54 additions & 9 deletions application/static/reader.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

var g_iframeScrollHeight = 500; //在 iframeLoadEvent 里更新
//var g_iframeClientHeight = 500;
var g_currentArticle = {};
var g_currentArticle = {}; //{title:,src:,}
var g_dictMode = false;
const g_trTextContainerHeight = 350; //350px在reader.css定义tr-text-container和tr-result-text

Expand Down Expand Up @@ -519,6 +519,25 @@ function scrollToNode(container, node) {
container.scrollTop = pos;
}

//Highlight the article currently being read in the navigation list
function highlightCurrentArticle() {
  var current = g_currentArticle;
  if (isEmpty(current)) {
    return; //nothing is open yet, leave the list untouched
  }

  //Bold the nav entry whose data-src matches the open article, reset all others
  var titles = document.getElementById('nav-content').querySelectorAll('.nav-title');
  for (var idx = 0; idx < titles.length; idx++) {
    var node = titles[idx];
    var isCurrent = node.getAttribute('data-src') == current.src;
    node.style.fontWeight = isCurrent ? 'bold' : 'normal';
  }
}

//删除一本或多本书
function navDeleteBooks(event) {
hidePopMenu();
Expand Down Expand Up @@ -660,7 +679,22 @@ function toggleDictMode() {
}

//关闭查词窗口
function closeDictDialog() {
function closeDictDialog(event) {
//处理词典内词条跳转
var target = event ? event.target || event.srcElement : null;
if (target && (target.tagName == 'A')) {
event.stopPropagation();
event.preventDefault();
var href = target.getAttribute('href') || '';
if (href.startsWith('https://kindleear/entry/')) {
var word = href.substring(24);
if (word) {
translateWord(word);
return;
}
}
}

g_dictMode = false;
document.getElementById('tr-result').style.display = 'none';
document.getElementById('corner-dict-hint').style.display = 'none';
Expand Down Expand Up @@ -864,6 +898,7 @@ function openArticle(article) {
}
hideNavbar();
closeDictDialog();
highlightCurrentArticle();
}

//打开上一篇文章
Expand Down Expand Up @@ -930,48 +965,56 @@ function iframeLoadEvent(evt) {
adjustIFrameStyle(iframe);
var doc = iframe.contentDocument || iframe.contentWindow.document;
doc.addEventListener('click', function(event) {
//处理链接的点击事件
var target = event.target || event.srcElement;
if (target && (target.tagName == 'A')) {
event.stopPropagation();
event.preventDefault();
var href = target.getAttribute('href');
if (href && g_allowLinks) {
//window.open(href, '_blank');
window.location.href = href; //kindle不支持window.open()
return;
}
}

//判断是否查词典
var selection = doc.getSelection();
var text = selection.toString();
var dictDialog = document.getElementById('tr-result');
if (g_dictMode) {
text = text || getWordAtClick(event, iframe);
if (text) {
translateWord(text, selection);
}
g_dictMode = false;
document.getElementById('corner-dict-hint').style.display = 'none';
} else if (dictDialog && dictDialog.style.display != 'none') { //关闭查词窗口
closeDictDialog();
} else if (!text) { //没有选择文本才翻页
clickEvent(event);
}
});

//只有PC有键盘快捷键
doc.addEventListener('keydown', documentKeyDownEvent);
}

//每次iframe加载完成后调整其样式和容器高度
function adjustIFrameStyle(iframe) {
iframe = iframe || document.getElementById('iframe');
var doc = iframe.contentDocument || iframe.contentWindow.document;
var doc = iframe.contentWindow.document || iframe.contentDocument;
var body = doc.body;
iframe.style.display = "block";
iframe.style.height = 'auto';
body.style.textAlign = 'justify';
body.style.wordWrap = 'break-word';
body.style.hyphens = 'auto';
body.style.marginRight = '10px';
body.style.margin = '10px 20px 10px 20px';
body.style.paddingBottom = '20px';
body.style.fontSize = g_fontSize.toFixed(1) + 'em';
body.style.cursor = 'pointer';
body.style.webkitTapHighlightColor = 'transparent';
body.style.webkitTouchCallout = 'none';
iframe.style.display = "block";

var images = doc.querySelectorAll('img');
for (var i = 0; i < images.length; i++) {
Expand All @@ -980,9 +1023,11 @@ function adjustIFrameStyle(iframe) {
}

var vh = getViewportHeight();
g_iframeScrollHeight = Math.max(doc.documentElement.scrollHeight || body.scrollHeight, vh);
//g_iframeClientHeight = Math.max(doc.documentElement.clientHeight || body.clientHeight, vh);
iframe.style.height = g_iframeScrollHeight + 'px';
var html = doc.documentElement;
var height = Math.max(body.scrollHeight, body.clientHeight, body.offsetHeight,
html.scrollHeight, html.clientHeight, html.offsetHeight, vh) + 40;
iframe.style.height = height + 'px';
g_iframeScrollHeight = height;
}

//使用键盘快捷键翻页
Expand Down
12 changes: 6 additions & 6 deletions application/templates/reader.html
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
</svg>
</div>
<!-- 查词典的结果显示窗口 -->
<div class="tr-result" id="tr-result" onclick="closeDictDialog()">
<div class="tr-result" id="tr-result" onclick="closeDictDialog(event)">
<div class="tr-dict-name" id="tr-dict-name">
<select id="tr-dict-name-sel" onclick="javascript:event.stopPropagation()" onchange="changeDictToTranslate(event)">
</select>
Expand All @@ -34,7 +34,7 @@
<path d="M 4.375 7.1875 L 10 12.8125 L 15.625 7.1875" stroke="black" stroke-width="1.875" stroke-linecap="round" stroke-linejoin="round" fill="none"/>
</svg>
</div>
<div class="tr-close-icon" id="tr-close-icon" onclick="closeDictDialog()">X</div>
<div class="tr-close-icon" id="tr-close-icon" onclick="closeDictDialog(event)">X</div>
<div class="tr-word" id="tr-word"></div>
<div class="tr-result-text">
<div class="tr-text-container" id="tr-text-container">
Expand Down Expand Up @@ -171,10 +171,10 @@
{% autoescape off -%}
<script type="text/javascript">
var g_books = {{oebBooks|safe}}; //[{date:, books: [{title:, articles:[{title:, src:}],},...]}, ]
var g_allowLinks = {{params.get('allowLinks', 0)}};
var g_topleftDict = {{params.get('topleftDict', 1)}};
var g_inkMode = {{params.get('inkMode', 1)}};
var g_fontSize = {{params.get('fontSize', 1.0)}};
var g_allowLinks = {{params.get('allowLinks', 0) | int}};
var g_topleftDict = {{params.get('topleftDict', 1) | int}};
var g_inkMode = {{params.get('inkMode', 1) | int}};
var g_fontSize = {{params.get('fontSize', 1.0) | float}};
var g_shareKey = "{{shareKey}}";

var i18n = {
Expand Down
Loading

0 comments on commit be3086c

Please sign in to comment.