improve dict feature

cdhigh · Jun 17, 2024 · be3086c · be3086c
1 parent 5790ede
commit be3086c
Show file tree

Hide file tree

Showing 7 changed files with 307 additions and 45 deletions.
diff --git a/application/lib/calibre/web/feeds/news.py b/application/lib/calibre/web/feeds/news.py
@@ -1070,7 +1070,7 @@ def _postprocess_html(self, soup, first_fetch, job_info):
         #    for x in soup.find_all(attrs={attr: True}):
         #        del x[attr]
 
-        for bad_tag in list(soup.find_all(['base', 'iframe', 'canvas', 'embed', 
+        for bad_tag in list(soup.find_all(['base', 'iframe', 'canvas', 'embed', 'source',
             'command', 'datalist', 'video', 'audio', 'noscript', 'link', 'meta', 'button'])):
             bad_tag.extract()
 

diff --git a/application/lib/dictionary/mdict/mdict.py b/application/lib/dictionary/mdict/mdict.py
@@ -4,8 +4,9 @@
 #!/usr/bin/env python3
 # -*- coding:utf-8 -*-
 #stardict离线词典支持
-import os, re, zlib, json
+import os
 from bs4 import BeautifulSoup
+from application.utils import xml_escape
 from .readmdict import MDX
 try:
     import marisa_trie
@@ -79,6 +80,8 @@ def __init__(self, fname, encoding="", substyle=False, passcode=None):
             return
 
         #重建索引
+        #为什么不使用单独的后台任务自动重建索引？是因为运行时间还不是最重要的约束，而是服务器内存
+        #如果是大词典，内存可能要爆，怎么运行都不行，如果是小词典，则时间可以接受
         default_log.info(f"Building trie for {dictName}")
         #为了能制作大词典，mdx中这些数据都是64bit的，但是为了节省空间，这里只使用32bit保存(>LLLLLL)
         self.trie = marisa_trie.RecordTrie(self.TRIE_FMT, self.mdx.get_index()) #type:ignore
@@ -95,6 +98,10 @@ def get(self, word):
         if not self.trie:
             return ''
         word = word.lower().strip()
+        #和mdict官方应用一样，输入:about返回词典基本信息
+        if word == ':about':
+            return self.dictHtmlInfo()
+
         indexes = self.trie[word] if word in self.trie else None
         ret = self.get_content_by_Index(indexes)
         if ret.startswith('@@@LINK='):
@@ -119,18 +126,37 @@ def post_process(self, content):
 
         soup = BeautifulSoup(content, 'html.parser') #html.parser不会自动添加html/body
 
-        #删除图像
-        for tag in soup.find_all('img'):
+        #浏览器不支持 entry:// 协议，会直接拦截导致无法跳转，
+        #预先将其转换为 https://kindleear/entry/ 前缀，然后在js里面判断这个前缀
+        for tag in soup.find_all('a', href=True):
+            href = tag['href']
+            if href.startswith('entry://'):
+                tag['href'] = f'https://kindleear/entry/{href[8:]}'
+
+        #kindle对html支持很差，有一些词典又使用到这些标签
+        for tag in soup.find_all(['article', 'aside', 'header', 'footer', 'nav', 'main',
+            'figcaption', 'figure', 'section', 'time']):
+            tag.name = 'div'
+
+        #删除多媒体资源和脚本
+        for tag in list(soup.find_all(['img', 'script', 'base', 'iframe', 'canvas', 'embed', 'source',
+            'command', 'datalist', 'video', 'audio', 'noscript', 'meta', 'button'])):
             tag.extract()
-
+        
         self.adjust_css(soup)
-        #self.inline_css(soup) #碰到稍微复杂一些的CSS文件性能就比较低下，暂时屏蔽对CSS文件的支持
-        self.remove_empty_tags(soup)
+        self.inline_css(soup)
+        #self.remove_empty_tags(soup)
 
-        body = soup.body
-        if body:
-            body.name = 'div'
+        tag = soup.head
+        if tag:
+            tag.extract()
 
+        #mdict质量良莠不齐，有些词典在html/body外写释义
+        #所以不能直接提取body内容，直接修改为div简单粗暴也有效
+        for tag in (soup.html, soup.body):
+            if tag:
+                tag.name = 'div'
+
         return str(soup)
 
     #调整一些CSS
@@ -149,8 +175,9 @@ def adjust_css(self, soup):
     #将外部单独css文件的样式内联到html标签中
     def inline_css(self, soup):
         link = soup.find('link', attrs={'rel': 'stylesheet', 'href': True})
-        if not link:
-            return
+        if link:
+            link.extract()
+        return #碰到稍微复杂一些的CSS文件性能就比较低下，暂时屏蔽对CSS文件的支持
 
         link.extract()
         css = ''
@@ -211,3 +238,15 @@ def remove_empty_tags(self, soup, preserve_tags=None):
                 self.remove_empty_tags(tag, preserve_tags)
         for tag in empty_tags:
             tag.decompose()
+
+    #Return basic information about the current dictionary, formatted as HTML
+    def dictHtmlInfo(self):
+        """Render the MDX header fields as a small HTML fragment.
+
+        Title and Description come first and are inserted as-is (not escaped).
+        Every remaining header field follows as a "name: value" line. The
+        StyleSheet field is emitted last, XML-escaped, with each newline shown
+        as a literal backslash-n.
+
+        Returns:
+            str: the concatenated HTML fragment.
+        """
+        fields = self.mdx.header.copy() #pop from a copy so self.mdx.header keeps all its keys
+        title = fields.pop('Title', '')
+        description = fields.pop('Description', '')
+        #StyleSheet is removed before the generic loop so it is not listed twice
+        cssText = xml_escape(fields.pop('StyleSheet', '').replace('\n', '\\n'))
+
+        parts = [
+            '<strong>{}</strong><hr/>'.format(title),
+            '<b>Description:</b><br/>{}<br/><hr/>'.format(description),
+        ]
+        parts.extend('<b>{}:</b>&nbsp;&nbsp;{}<br/>'.format(name, value)
+            for name, value in fields.items())
+        parts.append('<b>StyleSheet:</b>{}<br/>'.format(cssText))
+        return ''.join(parts)
diff --git a/application/lib/dictionary/mdict/readmdict.py b/application/lib/dictionary/mdict/readmdict.py
@@ -823,6 +823,106 @@ def get_content_by_Index(self, indexes) -> str:
         txt = b'<hr/>'.join(ret).decode(self.encoding)
         return self._substitute_stylesheet(txt) if self.stylesheet else txt
 
+    def compare_keys(self, key1, key2):
+        """Three-way compare two keys by the byte order of their encoded form.
+
+        Both keys are first normalised with process_str_keys() (decoding,
+        optional symbol stripping, optional lower-casing), then encoded to
+        bytes and compared. Characters the codec cannot encode are dropped
+        (errors='ignore').
+
+        Codec used for the comparison:
+          * class named 'MDX', self.encoding == 'UTF-16' -> 'utf-16be'
+          * class named 'MDX', self.encoding == 'BIG-5'  -> 'utf-8'
+          * class named 'MDX', any other encoding        -> self.encoding
+          * any other class                              -> 'utf-8'
+
+        Notes carried over from the original author (not verifiable here):
+          * Header KeyCaseSensitive == 'No' means keys are compared lower-cased.
+          * Header StripKey only applies to mdx files: with 'Yes' only letters
+            take part in sorting (spaces and symbols are removed), with 'No'
+            letters, spaces and symbols all take part.
+          * MDX files may be utf-8, utf-16, gb18030 (covering gbk/gb2312),
+            BIG5 or ISO8859-1. MDD keys are utf-16le; plain 'utf-16' would
+            prepend a BOM (FF FE).
+          * utf-16 keys are encoded/decoded as utf-16le but sorted as utf-16be.
+
+        @param key1: the key user input (str or bytes)
+        @param key2: the key from the file (str or bytes)
+        @return: -1 if key1 sorts before key2, 0 if equal, 1 if after
+        """
+        # Keys stored in mdx/mdd are bytes while the query key is str.
+        # process_str_keys() decodes leniently: the original author notes that
+        # e.g. the last entry of "Dictionary of Engineering",
+        # b'\xc5ngstr\xf6m compensation pyrheliometer', fails a strict decode.
+        key1 = self.process_str_keys(key1)
+        key2 = self.process_str_keys(key2)
+
+        # Choose the codec once instead of repeating the comparison per branch.
+        if self.__class__.__name__ != 'MDX':
+            codec = 'utf-8'
+        elif self.encoding == 'UTF-16':
+            codec = 'utf-16be'
+        elif self.encoding == 'BIG-5':
+            codec = 'utf-8'
+        else:
+            codec = self.encoding
+
+        t_key1 = key1.encode(codec, errors='ignore')
+        t_key2 = key2.encode(codec, errors='ignore')
+        if t_key1 < t_key2:
+            return -1
+        if t_key1 > t_key2:
+            return 1
+        return 0
+
+    def lower_str_keys(self, key):
+        """Return key unchanged if header 'KeyCaseSensitive' is 'Yes', otherwise key.lower()."""
+        if self.header.get('KeyCaseSensitive') == 'Yes':
+            return key
+        return key.lower()
+
+    def strip_key(self):
+        """Derive self._strip_key from the header's 'StripKey' field.
+
+        Resulting value: 1 when the field is 'Yes', 0 when it is 'No', and 2
+        when it is missing or holds anything else. An instance whose class is
+        named 'MDD' always ends up with 0.
+        """
+        setting = self.header.get('StripKey')
+        if self.__class__.__name__ == 'MDD':
+            mode = 0 # MDD overrides whatever the header says
+        elif setting == 'Yes':
+            mode = 1
+        elif setting == 'No':
+            mode = 0
+        else:
+            mode = 2 # field absent or unrecognised
+        self._strip_key = mode
+
+    def process_str_keys(self, key):
+        """Normalise a key before comparison.
+
+        bytes keys are decoded first: leniently (errors='ignore') when the
+        class is named 'MDX', using 'utf-16le' if self.encoding is 'UTF-16'
+        and self.encoding otherwise; strictly with self.encoding for any
+        other class. When self._strip_key == 1, spaces, tabs and the listed
+        punctuation/symbol characters are removed. The result is finally
+        passed through lower_str_keys().
+        """
+        if isinstance(key, bytes):
+            if self.__class__.__name__ != 'MDX':
+                key = key.decode(self.encoding)
+            elif self.encoding == 'UTF-16':
+                key = key.decode('utf-16le', errors='ignore')
+            else:
+                # Original author's note: Chinese text in an ISO8859-1
+                # dictionary raised a latin-1 UnicodeDecodeError, hence 'ignore'.
+                key = key.decode(self.encoding, errors='ignore')
+
+        if self._strip_key == 1:
+            key = re.sub(r'[ _=,.;:!?@%&#~`()\[\]<>{}/\\\$\+\-\*\^\'"\t|]', '', key)
+
+        # Original author's note: the key must not be strip()ped here.
+        return self.lower_str_keys(key)
+
 if __name__ == "__main__":
     import sys
     import os

diff --git a/application/static/reader.js b/application/static/reader.js
@@ -4,7 +4,7 @@
 
 var g_iframeScrollHeight = 500; //在 iframeLoadEvent 里更新
 //var g_iframeClientHeight = 500;
-var g_currentArticle = {};
+var g_currentArticle = {}; //{title:,src:,}
 var g_dictMode = false;
 const g_trTextContainerHeight = 350; //350px在reader.css定义tr-text-container和tr-result-text
 
@@ -519,6 +519,25 @@ function scrollToNode(container, node) {
   container.scrollTop = pos;
 }
 
+//Highlight the currently open entry in the navigation list:
+//the '.nav-title' element whose data-src equals g_currentArticle.src is
+//rendered bold, every other '.nav-title' is reset to normal weight.
+function highlightCurrentArticle() {
+  if (isEmpty(g_currentArticle)) { //nothing is open, leave the list untouched
+    return;
+  }
+
+  var currentSrc = g_currentArticle.src;
+  var titles = document.getElementById('nav-content').querySelectorAll('.nav-title');
+  for (var idx = 0; idx < titles.length; idx++) {
+    var isCurrent = titles[idx].getAttribute('data-src') == currentSrc;
+    titles[idx].style.fontWeight = isCurrent ? 'bold' : 'normal';
+  }
+}
+
 //删除一本或多本书
 function navDeleteBooks(event) {
   hidePopMenu();
@@ -660,7 +679,22 @@ function toggleDictMode() {
 }
 
 //关闭查词窗口
-function closeDictDialog() {
+function closeDictDialog(event) {
+  //处理词典内词条跳转
+  var target = event ? event.target || event.srcElement : null;
+  if (target && (target.tagName == 'A')) {
+    event.stopPropagation();
+    event.preventDefault();
+    var href = target.getAttribute('href') || '';
+    if (href.startsWith('https://kindleear/entry/')) {
+      var word = href.substring(24);
+      if (word) {
+        translateWord(word);
+        return;
+      }
+    }
+  }
+
   g_dictMode = false;
   document.getElementById('tr-result').style.display = 'none';
   document.getElementById('corner-dict-hint').style.display = 'none';
@@ -864,6 +898,7 @@ function openArticle(article) {
   }
   hideNavbar();
   closeDictDialog();
+  highlightCurrentArticle();
 }
 
 //打开上一篇文章
@@ -930,48 +965,56 @@ function iframeLoadEvent(evt) {
   adjustIFrameStyle(iframe);
   var doc = iframe.contentDocument || iframe.contentWindow.document;
   doc.addEventListener('click', function(event) {
+    //处理链接的点击事件
     var target = event.target || event.srcElement;
     if (target && (target.tagName == 'A')) {
       event.stopPropagation();
       event.preventDefault();
       var href = target.getAttribute('href');
       if (href && g_allowLinks) {
-        //window.open(href, '_blank');
         window.location.href = href; //kindle不支持window.open()
         return;
       }
     }
+
+    //判断是否查词典
     var selection = doc.getSelection();
     var text = selection.toString();
+    var dictDialog = document.getElementById('tr-result');
     if (g_dictMode) {
       text = text || getWordAtClick(event, iframe);
       if (text) {
         translateWord(text, selection);
       }
       g_dictMode = false;
       document.getElementById('corner-dict-hint').style.display = 'none';
+    } else if (dictDialog && dictDialog.style.display != 'none') { //关闭查词窗口
+      closeDictDialog();
     } else if (!text) { //没有选择文本才翻页
       clickEvent(event);
     }
   });
+
+  //只有PC有键盘快捷键
   doc.addEventListener('keydown', documentKeyDownEvent);
 }
 
 //每次iframe加载完成后调整其样式和容器高度
 function adjustIFrameStyle(iframe) {
   iframe = iframe || document.getElementById('iframe');
-  var doc = iframe.contentDocument || iframe.contentWindow.document;
+  var doc = iframe.contentWindow.document || iframe.contentDocument;
   var body = doc.body;
+  iframe.style.display = "block";
   iframe.style.height = 'auto';
   body.style.textAlign = 'justify';
   body.style.wordWrap = 'break-word';
   body.style.hyphens = 'auto';
-  body.style.marginRight = '10px';
+  body.style.margin = '10px 20px 10px 20px';
+  body.style.paddingBottom = '20px';
   body.style.fontSize = g_fontSize.toFixed(1) + 'em';
   body.style.cursor = 'pointer';
   body.style.webkitTapHighlightColor = 'transparent';
   body.style.webkitTouchCallout = 'none';
-  iframe.style.display = "block";
 
   var images = doc.querySelectorAll('img');
   for (var i = 0; i < images.length; i++) {
@@ -980,9 +1023,11 @@ function adjustIFrameStyle(iframe) {
   }
 
   var vh = getViewportHeight();
-  g_iframeScrollHeight = Math.max(doc.documentElement.scrollHeight || body.scrollHeight, vh);
-  //g_iframeClientHeight = Math.max(doc.documentElement.clientHeight || body.clientHeight, vh);
-  iframe.style.height = g_iframeScrollHeight + 'px';
+  var html = doc.documentElement;
+  var height = Math.max(body.scrollHeight, body.clientHeight, body.offsetHeight,
+        html.scrollHeight, html.clientHeight, html.offsetHeight, vh) + 40;
+  iframe.style.height = height + 'px';
+  g_iframeScrollHeight = height;
 }
 
 //使用键盘快捷键翻页

diff --git a/application/templates/reader.html b/application/templates/reader.html
@@ -19,7 +19,7 @@
       </svg>
     </div>
     <!-- 查词典的结果显示窗口 -->
-    <div class="tr-result" id="tr-result" onclick="closeDictDialog()">
+    <div class="tr-result" id="tr-result" onclick="closeDictDialog(event)">
       <div class="tr-dict-name" id="tr-dict-name">
         <select id="tr-dict-name-sel" onclick="javascript:event.stopPropagation()" onchange="changeDictToTranslate(event)">
         </select>
@@ -34,7 +34,7 @@
           <path d="M 4.375 7.1875 L 10 12.8125 L 15.625 7.1875" stroke="black" stroke-width="1.875" stroke-linecap="round" stroke-linejoin="round" fill="none"/>
         </svg>
       </div>
-      <div class="tr-close-icon" id="tr-close-icon" onclick="closeDictDialog()">X</div>
+      <div class="tr-close-icon" id="tr-close-icon" onclick="closeDictDialog(event)">X</div>
       <div class="tr-word" id="tr-word"></div>
       <div class="tr-result-text">
         <div class="tr-text-container" id="tr-text-container">
@@ -171,10 +171,10 @@
   {% autoescape off -%}
   <script type="text/javascript">
     var g_books = {{oebBooks|safe}}; //[{date:, books: [{title:, articles:[{title:, src:}],},...]}, ]
-    var g_allowLinks = {{params.get('allowLinks', 0)}};
-    var g_topleftDict = {{params.get('topleftDict', 1)}};
-    var g_inkMode = {{params.get('inkMode', 1)}};
-    var g_fontSize = {{params.get('fontSize', 1.0)}};
+    var g_allowLinks = {{params.get('allowLinks', 0) | int}};
+    var g_topleftDict = {{params.get('topleftDict', 1) | int}};
+    var g_inkMode = {{params.get('inkMode', 1) | int}};
+    var g_fontSize = {{params.get('fontSize', 1.0) | float}};
     var g_shareKey = "{{shareKey}}";
 
     var i18n = {