Commit 7edaea8

improve dict feature
cdhigh committed Jun 17, 2024
1 parent be3086c commit 7edaea8
Showing 8 changed files with 40 additions and 41 deletions.
24 changes: 12 additions & 12 deletions application/lib/dictionary/mdict/lzo.py
@@ -4,19 +4,19 @@
 class FlexBuffer:
     def __init__(self):
         self.blockSize = None
-        self.c = None
-        self.l = None
-        self.buf = None
+        self.c = 0
+        self.len = 0
+        self.buf = b''
 
     def require(self, n):
-        r = self.c - self.l + n
-        if r > 0:
-            self.l = self.l + self.blockSize * math.ceil(r / self.blockSize)
-            # tmp = bytearray(self.l)
+        r = self.c + n - self.len
+        if r > 0: #buffer is too small, need to grow it
+            self.len += self.blockSize * math.ceil(r / self.blockSize)
+            # tmp = bytearray(self.len)
             # for i in len(self.buf):
             #     tmp[i] = self.buf[i]
             # self.buf = tmp
-            self.buf = self.buf + bytearray(self.l - len(self.buf))
+            self.buf = self.buf + bytearray(self.len - len(self.buf))
         self.c = self.c + n
         return self.buf
 
@@ -27,9 +27,9 @@ def alloc(self, initSize, blockSize):
         sz = 4096
         self.blockSize = self.roundUp(sz)
         self.c = 0
-        self.l = self.roundUp(initSize) | 0
-        self.l += self.blockSize - (self.l % self.blockSize)
-        self.buf = bytearray(self.l)
+        self.len = self.roundUp(initSize) | 0
+        self.len += self.blockSize - (self.len % self.blockSize)
+        self.buf = bytearray(self.len)
         return self.buf
 
     def roundUp(self, n):
@@ -41,7 +41,7 @@ def roundUp(self, n):
 
     def reset(self):
         self.c = 0
-        self.l = len(self.buf)
+        self.len = len(self.buf)
 
     def pack(self, size):
         return self.buf[0:size]
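
Note on the FlexBuffer changes above: renaming the ambiguous `self.l` to `self.len` and giving the fields concrete initial values makes the growth arithmetic in `require` easier to follow: the shortfall is `c + n - len`, and the buffer grows by whole `blockSize` multiples. A standalone sketch of that arithmetic (illustrative values, not code from the repository):

```python
import math

blockSize = 4096
c, length = 0, 4096          # write cursor and current buffer capacity
buf = bytearray(length)

def require(n):
    """Grow buf in whole blocks so n more bytes fit, then advance the cursor."""
    global c, length, buf
    shortfall = c + n - length                      # bytes missing for this write
    if shortfall > 0:                               # buffer too small, grow it
        length += blockSize * math.ceil(shortfall / blockSize)
        buf = buf + bytearray(length - len(buf))    # zero-padded extension
    c += n
    return buf

require(5000)   # shortfall is 904, so one extra 4096-byte block is appended
assert len(buf) == 8192 and c == 5000
```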
4 changes: 2 additions & 2 deletions application/lib/dictionary/mdict/mdict.py
@@ -58,7 +58,7 @@ def definition(self, word, language=''):
 
 #Mdx cached by a dictionary trie
 class IndexedMdx:
-    TRIE_FMT = '>LLLLLL'
+    TRIE_FMT = '>LLLLL'
 
     #fname: full path of the mdx file
     def __init__(self, fname, encoding="", substyle=False, passcode=None):
@@ -83,7 +83,7 @@ def __init__(self, fname, encoding="", substyle=False, passcode=None):
         #Why not rebuild the index automatically in a separate background task? Because running time is not yet the most critical constraint; server memory is
         #With a big dictionary, memory may blow up and it cannot run at all; with a small one, the time cost is acceptable
         default_log.info(f"Building trie for {dictName}")
-        #To allow building big dictionaries these values are 64-bit in the mdx, but to save space only 32-bit values are stored here (>LLLLLL)
+        #To allow building big dictionaries these values are 64-bit in the mdx, but to save space only 32-bit values are stored here
         self.trie = marisa_trie.RecordTrie(self.TRIE_FMT, self.mdx.get_index()) #type:ignore
         self.trie.save(trieName)
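
The trie payload shrinks from six to five unsigned 32-bit fields because `get_index` (see the readmdict.py changes below) now stores block-relative start/end indexes instead of absolute positions plus a block offset. A hedged sketch of the resulting record trie; the keys and numbers are invented:

```python
import marisa_trie
from struct import calcsize

# Five 32-bit fields per key: filePos, compSize, decompSize, startIdx, endIdx.
TRIE_FMT = '>LLLLL'
assert calcsize(TRIE_FMT) == 20      # 4 bytes saved per entry vs. '>LLLLLL'

records = [
    ('apple', (1024, 512, 2048, 0, 120)),
    ('apply', (1024, 512, 2048, 120, 260)),
]
trie = marisa_trie.RecordTrie(TRIE_FMT, records)
trie.save('sample.trie')             # hypothetical file name
print(trie['apple'])                 # [(1024, 512, 2048, 0, 120)]
```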
43 changes: 21 additions & 22 deletions application/lib/dictionary/mdict/readmdict.py
@@ -183,36 +183,35 @@ def _decode_key_block_info(self, key_block_info_compressed):
     def _decode_key_block(self, key_block_compressed, key_block_info_list):
         key_list = []
         i = 0
+        key_block = b''
         for compressed_size, decompressed_size in key_block_info_list:
             start = i
             end = i + compressed_size
             # 4 bytes : compression type
             key_block_type = key_block_compressed[start : start + 4]
             # 4 bytes : adler checksum of decompressed key block
-            adler32 = unpack(
-                NumberFmt.be_uint, key_block_compressed[start + 4 : start + 8]
-            )[0]
+            #adler32 = unpack(NumberFmt.be_uint, key_block_compressed[start + 4 : start + 8])[0]
             if key_block_type == b"\x00\x00\x00\x00":
                 key_block = key_block_compressed[start + 8 : end]
             elif key_block_type == b"\x01\x00\x00\x00":
-                key_block = lzo.decompress(key_block_compressed[start + 8 : end],
+                header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size)
+                key_block = lzo.decompress(header + key_block_compressed[start + 8 : end],
                     initSize=decompressed_size, blockSize=1308672)
             elif key_block_type == b"\x02\x00\x00\x00":
                 key_block = zlib.decompress(key_block_compressed[start + 8 : end])
             # extract one single key block into a key list
-            key_list += self._split_key_block(key_block)
+            key_list.extend(self._split_key_block(key_block))
             # notice that adler32 returns signed value
-            assert adler32 == zlib.adler32(key_block) & 0xFFFFFFFF
-
+            #assert adler32 == zlib.adler32(key_block) & 0xFFFFFFFF
             i += compressed_size
         return key_list

     def _split_key_block(self, key_block):
-        key_list = []
+        #key_list = []
         key_start_index = 0
         key_end_index = 0
         while key_start_index < len(key_block):
-            temp = key_block[key_start_index : key_start_index + self._number_width]
+            #temp = key_block[key_start_index : key_start_index + self._number_width]
             # the corresponding record's offset in record block
             key_id = unpack(
                 self._number_format,
@@ -238,8 +237,8 @@ def _split_key_block(self, key_block):
                 .strip()
             )
             key_start_index = key_end_index + width
-            key_list += [(key_id, key_text)]
-        return key_list
+            yield (key_id, key_text)
+        return #key_list
 
     #Read the file header and build a python dict
     def _read_header(self):
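
Turning `_split_key_block` into a generator lets `key_list.extend(...)` in `_decode_key_block` consume (key_id, key_text) pairs lazily instead of materializing a second full list per block. A toy version under simplified assumptions (fixed 4-byte big-endian ids and NUL-terminated keys; the real code derives widths from `self._number_format`/`self._number_width` and the dictionary encoding):

```python
from struct import unpack

def split_key_block(key_block):
    #toy generator: 4-byte big-endian record offset, then a NUL-terminated key
    pos = 0
    while pos < len(key_block):
        key_id = unpack('>L', key_block[pos:pos + 4])[0]
        end = key_block.index(b'\x00', pos + 4)   # key text ends at the NUL
        yield key_id, key_block[pos + 4:end]
        pos = end + 1

pairs = list(split_key_block(b'\x00\x00\x00\x00word\x00' + b'\x00\x00\x00\x08text\x00'))
assert pairs == [(0, b'word'), (8, b'text')]
```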
@@ -469,7 +468,7 @@ def _decode_record_block(self):
             record_block = record_block_compressed[8:]
         elif record_block_type == b"\x01\x00\x00\x00":
             header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size)
-            record_block = lzo.decompress(record_block_compressed[start + 8 : end],
+            record_block = lzo.decompress(header + record_block_compressed[start + 8 : end],
                 initSize=decompressed_size, blockSize=1308672)
         elif record_block_type == b"\x02\x00\x00\x00":
             record_block = zlib.decompress(record_block_compressed[8:])
@@ -551,9 +550,9 @@ def get_index(self, check_block=True):
             record_block = record_block_compressed[8:]
         elif record_block_type == b"\x01\x00\x00\x00":
             _type = 1
-            #header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size)
+            header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size)
             if check_block:
-                record_block = lzo.decompress(record_block_compressed[start + 8 : end],
+                record_block = lzo.decompress(header + record_block_compressed[start + 8 : end],
                     initSize=decompressed_size, blockSize=1308672)
         elif record_block_type == b"\x02\x00\x00\x00":
             _type = 2
@@ -677,7 +676,7 @@ def _decode_record_block(self):
             record_block = record_block_compressed[8:]
         elif record_block_type == b"\x01\x00\x00\x00":
             header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size)
-            record_block = lzo.decompress(record_block_compressed[8:],
+            record_block = lzo.decompress(header + record_block_compressed[8:],
                 initSize=decompressed_size, blockSize=1308672)
         elif record_block_type == b"\x02\x00\x00\x00":
             # decompress
@@ -784,8 +783,8 @@ def get_index(self, check_block=True):
                     break
 
             record_end = keyList[i + 1][0] if i < keyListLen - 1 else (decompressed_size + offset)
-            index_tuple = (current_pos, compressed_size, decompressed_size, record_start,
-                record_end, offset)
+            index_tuple = (current_pos, compressed_size, decompressed_size, record_start - offset,
+                record_end - offset)
             yield (key_text.decode('utf-8'), index_tuple)
             i += 1
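
Subtracting `offset` here is what lets the trie tuple drop from six fields to five: `record_start` and `record_end` are absolute positions in the decompressed stream and `offset` is where the current record block begins, so the stored differences can slice the decompressed block directly at lookup time. A worked example with invented numbers:

```python
offset = 70000                        # absolute start of this record block
record_start, record_end = 70100, 70250

# What get_index now stores: block-relative indexes instead of absolutes plus offset.
startIdx, endIdx = record_start - offset, record_end - offset   # 100, 250

data = bytes(80000)                   # stand-in for the decompressed record block
record = data[startIdx:endIdx]
assert len(record) == record_end - record_start                 # 150 bytes
```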

@@ -802,21 +801,21 @@ def get_content_by_Index(self, indexes) -> str:
         ret = []
         f = open(self._fname, 'rb')
         for index in indexes:
-            #These 6 values are the layout stored in the trie, each saved as 32-bit
-            filePos, compSize, decompSize, startPos, endPos, offset = index
+            #These values are the layout stored in the trie, 32-bit each
+            filePos, compSize, decompSize, startIdx, endIdx = index
             f.seek(filePos)
             compressed = f.read(compSize)
             type_ = compressed[:4] #32bit-type, 32bit-adler, data
             if type_ == b"\x00\x00\x00\x00":
                 data = compressed[8:]
             elif type_ == b"\x01\x00\x00\x00":
-                #header = b"\xf0" + pack(">I", decompSize)
-                data = lzo.decompress(compressed[8:], initSize=decompSize, blockSize=1308672)
+                header = b"\xf0" + pack(">I", decompSize)
+                data = lzo.decompress(header + compressed[8:], initSize=decompSize, blockSize=1308672)
             elif type_ == b"\x02\x00\x00\x00":
                 data = zlib.decompress(compressed[8:])
             else:
                 continue
-            record = data[startPos - offset : endPos - offset]
+            record = data[startIdx : endIdx]
             ret.append(record) #.strip(b"\x00"))
 
         f.close()
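
The recurring `header +` edits are the substantive fix in this commit: an MDX file stores the raw LZO1X stream, while the bundled pure-Python decompressor expects a python-lzo-style prefix (a `0xf0` marker byte followed by the big-endian 32-bit decompressed size), so the prefix is synthesized before every call; `NumberFmt.be_uint` is `'>I'`. A sketch of the shared pattern, with the import path assumed from this repository's layout:

```python
from struct import pack
from application.lib.dictionary.mdict import lzo   # bundled pure-Python LZO

def decompress_mdx_block(payload, decompressed_size):
    #payload: block content after the 4-byte type tag and the 4-byte adler32
    header = b"\xf0" + pack(">I", decompressed_size)
    return lzo.decompress(header + payload,
                          initSize=decompressed_size, blockSize=1308672)
```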
2 changes: 1 addition & 1 deletion application/view/reader.py
@@ -239,7 +239,7 @@ def ReaderDictPost(user: KeUser, userDir: str):
         #import traceback
         #traceback.print_exc()
         definition = f'Error:<br/>{e}'
-    print(json.dumps(definition)) #TODO
+    #print(json.dumps(definition)) #TODO
     return {'status': 'ok', 'word': word, 'definition': definition,
         'dictname': str(inst), 'others': others}
2 changes: 1 addition & 1 deletion docs/Chinese/deployment.md
@@ -275,7 +275,7 @@ sudo usermod -aG ubuntu www-data #or add nginx www-data to my group ubuntu
 cd kindleear
 virtualenv --python=python3 venv #create virtual environ
 vim ./config.py #start to modify some config items
-python3 ./tools/update_req.py #update requirements.txt
+python3 ./tools/update_req.py docker #update requirements.txt
 
 source ./venv/bin/activate #activate virtual environ
 pip install -r requirements.txt #install dependencies
2 changes: 1 addition & 1 deletion docs/Chinese/reader.md
@@ -48,7 +48,7 @@ KindleEar supports email push and online reading, with a built-in reader optimized for e-ink
 1. KindleEar supports the online dictionaries [dict.org](https://dict.org/), [dict.cc](https://www.dict.cc/), [dict.cn](http://dict.cn/), [Merriam-Webster](https://www.merriam-webster.com/) and [Oxford](https://www.oxfordlearnersdictionaries.com/). These dictionaries require no installation and work out of the box.
 2. Online dictionaries are convenient, but they can be unstable at times for network reasons, so for reliable use an offline dictionary is the better choice. For this, KindleEar also supports dictionaries in mdict/stardict format: download the dictionary you want and unzip it into the `data/dict` directory (subdirectories may be used to organize different dictionaries).
 3. The first lookup in an offline dictionary is slower because an index file (suffix trie) has to be created; after that it is fast.
-   If you use a large dictionary, building the index consumes a lot of memory; if your server has little memory, index creation may fail. In that case, look up a word with the same dictionary on your local machine first, and once the trie file has been generated locally, copy it to the corresponding directory on the server.
+   If you use a large dictionary (say, several hundred MB or more), building the index consumes a lot of memory; if your server has little memory, index creation may fail. In that case, look up a word with the same dictionary on your local machine first, and once the trie file has been generated locally, copy it to the corresponding directory on the server.
 4. Morphology rules for American English are supported by default, so inflected forms such as tense, voice and plurals can be looked up. If you need morphology support for other languages, download the corresponding hunspell format files (.dic/.aff) and copy them to `data/dict/morphology` (create this directory yourself); note, do not put them in a subdirectory. KindleEar automatically applies the morphology rules that match the book's language.
    As for where to download Hunspell/MySpell morphology files, search sites such as github/sourceforge; below are a few direct links.
    [LibreOffice](https://github.com/LibreOffice/dictionaries)
2 changes: 1 addition & 1 deletion docs/English/deployment.md
@@ -283,7 +283,7 @@ chmod -R 775 ~ #nginx user www-data read static resource
 cd kindleear
 virtualenv --python=python3 venv #create virtual environ
 vim ./config.py #start to modify some config items
-python3 ./tools/update_req.py #update requirements.txt
+python3 ./tools/update_req.py docker #update requirements.txt
 
 source ./venv/bin/activate #activate virtual environ
 pip install -r requirements.txt #install dependencies
2 changes: 1 addition & 1 deletion docs/English/reader.md
@@ -54,7 +54,7 @@ The extracted word is sent to your deployed KindleEar site for translation, and
 2. KindleEar also supports offline dictionaries in the stardict format. After downloading the corresponding dictionary, unzip it into the `data/dict` directory. You can organize different dictionaries into subdirectories. Then, restart the KindleEar service to refresh the dictionary list.
 
 3. The first time you look up a word in the offline dictionary, it may be slow because it needs to create an index file (suffix: trie). After that, it will be much faster.
-   If you are using a large dictionary, the indexing process will consume a significant amount of memory. If the server has limited memory, the indexing might fail. You can first use the dictionary on your local machine to look up a word and generate the "trie" file, then copy it to the corresponding directory on the server.
+   If you are using a large dictionary (for example, above several hundred megabytes), the indexing process will consume a significant amount of memory. If the server has limited memory, the indexing might fail. You can first use the dictionary on your local machine to look up a word and generate the "trie" file, then copy it to the corresponding directory on the server.
 
 4. By default, American English morphology queries are supported (tense, voice, plural etc.).
    If you need to support morphology rules for other languages, please download the corresponding Hunspell format files (.dic/.aff), and then copy them to `data/dict/morphology` (create it if it does not exist). Be careful not to store them in a subdirectory.
