From 7edaea84ed348092d4b923592cbaa69445dbf22c Mon Sep 17 00:00:00 2001 From: cdhigh Date: Mon, 17 Jun 2024 13:28:51 -0300 Subject: [PATCH] improve dict feature --- application/lib/dictionary/mdict/lzo.py | 24 +++++------ application/lib/dictionary/mdict/mdict.py | 4 +- application/lib/dictionary/mdict/readmdict.py | 43 +++++++++---------- application/view/reader.py | 2 +- docs/Chinese/deployment.md | 2 +- docs/Chinese/reader.md | 2 +- docs/English/deployment.md | 2 +- docs/English/reader.md | 2 +- 8 files changed, 40 insertions(+), 41 deletions(-) diff --git a/application/lib/dictionary/mdict/lzo.py b/application/lib/dictionary/mdict/lzo.py index 8006053f..4d147375 100644 --- a/application/lib/dictionary/mdict/lzo.py +++ b/application/lib/dictionary/mdict/lzo.py @@ -4,19 +4,19 @@ class FlexBuffer: def __init__(self): self.blockSize = None - self.c = None - self.l = None - self.buf = None + self.c = 0 + self.len = 0 + self.buf = b'' def require(self, n): - r = self.c - self.l + n - if r > 0: - self.l = self.l + self.blockSize * math.ceil(r / self.blockSize) - # tmp = bytearray(self.l) + r = self.c + n - self.len + if r > 0: #缓冲区不够了,需要添加 + self.len += self.blockSize * math.ceil(r / self.blockSize) + # tmp = bytearray(self.len) # for i in len(self.buf): # tmp[i] = self.buf[i] # self.buf = tmp - self.buf = self.buf + bytearray(self.l - len(self.buf)) + self.buf = self.buf + bytearray(self.len - len(self.buf)) self.c = self.c + n return self.buf @@ -27,9 +27,9 @@ def alloc(self, initSize, blockSize): sz = 4096 self.blockSize = self.roundUp(sz) self.c = 0 - self.l = self.roundUp(initSize) | 0 - self.l += self.blockSize - (self.l % self.blockSize) - self.buf = bytearray(self.l) + self.len = self.roundUp(initSize) | 0 + self.len += self.blockSize - (self.len % self.blockSize) + self.buf = bytearray(self.len) return self.buf def roundUp(self, n): @@ -41,7 +41,7 @@ def roundUp(self, n): def reset(self): self.c = 0 - self.l = len(self.buf) + self.len = len(self.buf) def pack(self, size): return self.buf[0:size] diff --git a/application/lib/dictionary/mdict/mdict.py b/application/lib/dictionary/mdict/mdict.py index 1d8af080..9396364f 100644 --- a/application/lib/dictionary/mdict/mdict.py +++ b/application/lib/dictionary/mdict/mdict.py @@ -58,7 +58,7 @@ def definition(self, word, language=''): #经过词典树缓存的Mdx class IndexedMdx: - TRIE_FMT = '>LLLLLL' + TRIE_FMT = '>LLLLL' #fname: mdx文件全路径名 def __init__(self, fname, encoding="", substyle=False, passcode=None): @@ -83,7 +83,7 @@ def __init__(self, fname, encoding="", substyle=False, passcode=None): #为什么不使用单独的后台任务自动重建索引?是因为运行时间还不是最重要的约束,而是服务器内存 #如果是大词典,内存可能要爆,怎么运行都不行,如果是小词典,则时间可以接受 default_log.info(f"Building trie for {dictName}") - #为了能制作大词典,mdx中这些数据都是64bit的,但是为了节省空间,这里只使用32bit保存(>LLLLLL) + #为了能制作大词典,mdx中这些数据都是64bit的,但是为了节省空间,这里只使用32bit保存 self.trie = marisa_trie.RecordTrie(self.TRIE_FMT, self.mdx.get_index()) #type:ignore self.trie.save(trieName) diff --git a/application/lib/dictionary/mdict/readmdict.py b/application/lib/dictionary/mdict/readmdict.py index a20edc13..c56cc9a6 100644 --- a/application/lib/dictionary/mdict/readmdict.py +++ b/application/lib/dictionary/mdict/readmdict.py @@ -183,36 +183,35 @@ def _decode_key_block_info(self, key_block_info_compressed): def _decode_key_block(self, key_block_compressed, key_block_info_list): key_list = [] i = 0 + key_block = b'' for compressed_size, decompressed_size in key_block_info_list: start = i end = i + compressed_size # 4 bytes : compression type key_block_type = key_block_compressed[start : start + 4] # 4 bytes : adler checksum of decompressed key block - adler32 = unpack( - NumberFmt.be_uint, key_block_compressed[start + 4 : start + 8] - )[0] + #adler32 = unpack(NumberFmt.be_uint, key_block_compressed[start + 4 : start + 8])[0] if key_block_type == b"\x00\x00\x00\x00": key_block = key_block_compressed[start + 8 : end] elif key_block_type == b"\x01\x00\x00\x00": header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size) - key_block = lzo.decompress(key_block_compressed[start + 8 : end], + key_block = lzo.decompress(header + key_block_compressed[start + 8 : end], initSize=decompressed_size, blockSize=1308672) elif key_block_type == b"\x02\x00\x00\x00": key_block = zlib.decompress(key_block_compressed[start + 8 : end]) # extract one single key block into a key list - key_list += self._split_key_block(key_block) + key_list.extend(self._split_key_block(key_block)) # notice that adler32 returns signed value - assert adler32 == zlib.adler32(key_block) & 0xFFFFFFFF - + #assert adler32 == zlib.adler32(key_block) & 0xFFFFFFFF i += compressed_size return key_list def _split_key_block(self, key_block): - key_list = [] + #key_list = [] key_start_index = 0 + key_end_index = 0 while key_start_index < len(key_block): - temp = key_block[key_start_index : key_start_index + self._number_width] + #temp = key_block[key_start_index : key_start_index + self._number_width] # the corresponding record's offset in record block key_id = unpack( self._number_format, @@ -238,8 +237,8 @@ def _split_key_block(self, key_block): .strip() ) key_start_index = key_end_index + width - key_list += [(key_id, key_text)] - return key_list + yield (key_id, key_text) + return #key_list #读取文件头,生成一个python字典 def _read_header(self): @@ -469,7 +468,7 @@ def _decode_record_block(self): record_block = record_block_compressed[8:] elif record_block_type == b"\x01\x00\x00\x00": header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size) - record_block = lzo.decompress(record_block_compressed[start + 8 : end], + record_block = lzo.decompress(header + record_block_compressed[start + 8 : end], initSize=decompressed_size, blockSize=1308672) elif record_block_type == b"\x02\x00\x00\x00": record_block = zlib.decompress(record_block_compressed[8:]) @@ -551,9 +550,9 @@ def get_index(self, check_block=True): record_block = record_block_compressed[8:] elif record_block_type == b"\x01\x00\x00\x00": _type = 1 - #header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size) + header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size) if check_block: - record_block = lzo.decompress(record_block_compressed[start + 8 : end], + record_block = lzo.decompress(header + record_block_compressed[start + 8 : end], initSize=decompressed_size, blockSize=1308672) elif record_block_type == b"\x02\x00\x00\x00": _type = 2 @@ -677,7 +676,7 @@ def _decode_record_block(self): record_block = record_block_compressed[8:] elif record_block_type == b"\x01\x00\x00\x00": header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size) - record_block = lzo.decompress(record_block_compressed[8:], + record_block = lzo.decompress(header + record_block_compressed[8:], initSize=decompressed_size, blockSize=1308672) elif record_block_type == b"\x02\x00\x00\x00": # decompress @@ -784,8 +783,8 @@ def get_index(self, check_block=True): break record_end = keyList[i + 1][0] if i < keyListLen - 1 else (decompressed_size + offset) - index_tuple = (current_pos, compressed_size, decompressed_size, record_start, - record_end, offset) + index_tuple = (current_pos, compressed_size, decompressed_size, record_start - offset, + record_end - offset) yield (key_text.decode('utf-8'), index_tuple) i += 1 @@ -802,21 +801,21 @@ def get_content_by_Index(self, indexes) -> str: ret = [] f = open(self._fname, 'rb') for index in indexes: - #这6个变量是保存到trie的数据格式,都是32位保存 - filePos, compSize, decompSize, startPos, endPos, offset = index + #这些变量是保存到trie的数据格式,32位 + filePos, compSize, decompSize, startIdx, endIdx = index f.seek(filePos) compressed = f.read(compSize) type_ = compressed[:4] #32bit-type, 32bit-adler, data if type_ == b"\x00\x00\x00\x00": data = compressed[8:] elif type_ == b"\x01\x00\x00\x00": - #header = b"\xf0" + pack(">I", decompSize) - data = lzo.decompress(compressed[8:], initSize=decompSize, blockSize=1308672) + header = b"\xf0" + pack(">I", decompSize) + data = lzo.decompress(header + compressed[8:], initSize=decompSize, blockSize=1308672) elif type_ == b"\x02\x00\x00\x00": data = zlib.decompress(compressed[8:]) else: continue - record = data[startPos - offset : endPos - offset] + record = data[startIdx : endIdx] ret.append(record) #.strip(b"\x00")) f.close() diff --git a/application/view/reader.py b/application/view/reader.py index f8f0f362..5e421a2a 100644 --- a/application/view/reader.py +++ b/application/view/reader.py @@ -239,7 +239,7 @@ def ReaderDictPost(user: KeUser, userDir: str): #import traceback #traceback.print_exc() definition = f'Error:
{e}' - print(json.dumps(definition)) #TODO + #print(json.dumps(definition)) #TODO return {'status': 'ok', 'word': word, 'definition': definition, 'dictname': str(inst), 'others': others} diff --git a/docs/Chinese/deployment.md b/docs/Chinese/deployment.md index 736e5fbb..13068bd6 100644 --- a/docs/Chinese/deployment.md +++ b/docs/Chinese/deployment.md @@ -275,7 +275,7 @@ sudo usermod -aG ubuntu www-data #or add nginx www-data to my group ubuntu cd kindleear virtualenv --python=python3 venv #create virtual environ vim ./config.py #start to modify some config items -python3 ./tools/update_req.py #update requirements.txt +python3 ./tools/update_req.py docker #update requirements.txt source ./venv/bin/activate #activate virtual environ pip install -r requirements.txt #install dependencies diff --git a/docs/Chinese/reader.md b/docs/Chinese/reader.md index 51075376..33b704ab 100644 --- a/docs/Chinese/reader.md +++ b/docs/Chinese/reader.md @@ -48,7 +48,7 @@ KindleEar支持邮件推送和在线阅读,内置一个为电子墨水屏进 1. KindleEar支持在线词典 [dict.org](https://dict.org/), [dict.cc](https://www.dict.cc/), [dict.cn](http://dict.cn/), [韦氏词典](https://www.merriam-webster.com/),[牛津词典](https://www.oxfordlearnersdictionaries.com/), 这几个词典不需要安装,开箱即用。 2. 在线词典很方便,但是避免有时候因为网络原因不是太稳定,所以如果要稳定使用,最好还是使用离线词典,为此,KindleEar同时支持 mdict/stardict 格式词典,下载对应的词典后,解压到 `data/dict` 目录(可以使用子目录整理不同的词典)。 3. 离线词典第一次查词会比较慢,因为要创建索引文件(后缀为trie),之后就很快了。 -如果要使用大型词典,在生成索引的过程中会消耗比较多的内存,如你的服务器内存比较小,可能会创建索引失败,你可以在你的本地机器先使用对应词典查一次单词,待本地生成trie文件后,拷贝到服务器对应目录即可。 +如果要使用大型词典(比如几百兆以上),在生成索引的过程中会消耗比较多的内存,如你的服务器内存比较小,可能会创建索引失败,你可以在你的本地机器先使用对应词典查一次单词,待本地生成trie文件后,拷贝到服务器对应目录即可。 4. 已经默认支持美式英语的构词法规则,可以查询单词时态语态复数等变形,如果需要支持其他语种的构词法,请下载对应的hunspell格式的文件(.dic/.aff),然后拷贝到 `data/dict/morphology` (请直接创建此目录) ,注意不要存放到子目录下,KindleEar会自动使用和书本语言相匹配的构词法规则。 至于到哪里下载Hunspell/MySpell构词法文件,可以到github/sourceforge等网站上搜索,下面是几个直链。 [LibreOffice](https://github.com/LibreOffice/dictionaries) diff --git a/docs/English/deployment.md b/docs/English/deployment.md index ab95d552..e003b18f 100644 --- a/docs/English/deployment.md +++ b/docs/English/deployment.md @@ -283,7 +283,7 @@ chmod -R 775 ~ #nginx user www-data read static resource cd kindleear virtualenv --python=python3 venv #create virtual environ vim ./config.py #start to modify some config items -python3 ./tools/update_req.py #update requirements.txt +python3 ./tools/update_req.py docker #update requirements.txt source ./venv/bin/activate #activate virtual environ pip install -r requirements.txt #install dependencies diff --git a/docs/English/reader.md b/docs/English/reader.md index 56b7c9ee..ba21c5fa 100644 --- a/docs/English/reader.md +++ b/docs/English/reader.md @@ -54,7 +54,7 @@ The extracted word is sent to your deployed KindleEar site for translation, and 2. KindleEar also supports offline dictionaries in the stardict format. After downloading the corresponding dictionary, unzip it into the `data/dict` directory. You can organize different dictionaries into subdirectories. Then, restart the KindleEar service to refresh the dictionary list. 3. The first time you look up a word in the offline dictionary, it may be slow because it needs to create an index file (suffix: trie), After that, it will be much faster. -If you are using a large dictionary, the indexing process will consume a significant amount of memory. If the server has limited memory, the indexing might fail. You can first use the dictionary on your local machine to look up a word and generate the "trie" file, then copy it to the corresponding directory on the server. +If you are using a large dictionary (for example, above several hundred megabytes), the indexing process will consume a significant amount of memory. If the server has limited memory, the indexing might fail. You can first use the dictionary on your local machine to look up a word and generate the "trie" file, then copy it to the corresponding directory on the server. 4. By default, American English morphology queries are supported (tense, voice, plural etc.). If you need to support morphology rules for other languages, please download the corresponding Hunspell format files (.dic/.aff), and then copy them to `data/dict/morphology` (create it if not exists). Be careful not to store them in a subdirectory.