Commit 7edaea8

improve dict feature
cdhigh committed Jun 17, 2024
1 parent be3086c commit 7edaea8
Showing 8 changed files with 40 additions and 41 deletions.
24 changes: 12 additions & 12 deletions application/lib/dictionary/mdict/lzo.py
@@ -4,19 +4,19 @@
 class FlexBuffer:
     def __init__(self):
         self.blockSize = None
-        self.c = None
-        self.l = None
-        self.buf = None
+        self.c = 0
+        self.len = 0
+        self.buf = b''
 
     def require(self, n):
-        r = self.c - self.l + n
-        if r > 0:
-            self.l = self.l + self.blockSize * math.ceil(r / self.blockSize)
-            # tmp = bytearray(self.l)
+        r = self.c + n - self.len
+        if r > 0: #buffer is too small, need to grow it
+            self.len += self.blockSize * math.ceil(r / self.blockSize)
+            # tmp = bytearray(self.len)
             # for i in len(self.buf):
             #     tmp[i] = self.buf[i]
             # self.buf = tmp
-            self.buf = self.buf + bytearray(self.l - len(self.buf))
+            self.buf = self.buf + bytearray(self.len - len(self.buf))
         self.c = self.c + n
         return self.buf
 
@@ -27,9 +27,9 @@ def alloc(self, initSize, blockSize):
         sz = 4096
         self.blockSize = self.roundUp(sz)
         self.c = 0
-        self.l = self.roundUp(initSize) | 0
-        self.l += self.blockSize - (self.l % self.blockSize)
-        self.buf = bytearray(self.l)
+        self.len = self.roundUp(initSize) | 0
+        self.len += self.blockSize - (self.len % self.blockSize)
+        self.buf = bytearray(self.len)
         return self.buf
 
     def roundUp(self, n):
@@ -41,7 +41,7 @@ def roundUp(self, n):
 
     def reset(self):
         self.c = 0
-        self.l = len(self.buf)
+        self.len = len(self.buf)
 
     def pack(self, size):
         return self.buf[0:size]
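
Note on the FlexBuffer changes above: renaming the ambiguous `self.l` to `self.len` and giving the fields concrete initial values makes the growth arithmetic in `require` easier to follow: the shortfall is `c + n - len`, and the buffer grows by whole `blockSize` multiples. A standalone sketch of that arithmetic (illustrative values, not code from the repository):

```python
import math

blockSize = 4096
c, length = 0, 4096          # write cursor and current buffer capacity
buf = bytearray(length)

def require(n):
    """Grow buf in whole blocks so n more bytes fit, then advance the cursor."""
    global c, length, buf
    shortfall = c + n - length                      # bytes missing for this write
    if shortfall > 0:                               # buffer too small, grow it
        length += blockSize * math.ceil(shortfall / blockSize)
        buf = buf + bytearray(length - len(buf))    # zero-padded extension
    c += n
    return buf

require(5000)   # shortfall is 904, so one extra 4096-byte block is appended
assert len(buf) == 8192 and c == 5000
```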
4 changes: 2 additions & 2 deletions application/lib/dictionary/mdict/mdict.py
@@ -58,7 +58,7 @@ def definition(self, word, language=''):
 
 #Mdx cached by a dictionary trie
 class IndexedMdx:
-    TRIE_FMT = '>LLLLLL'
+    TRIE_FMT = '>LLLLL'
 
     #fname: full path of the mdx file
     def __init__(self, fname, encoding="", substyle=False, passcode=None):
@@ -83,7 +83,7 @@ def __init__(self, fname, encoding="", substyle=False, passcode=None):
         #Why not rebuild the index automatically in a separate background task? Because running time is not yet the most critical constraint; server memory is
         #With a big dictionary, memory may blow up and it cannot run at all; with a small one, the time cost is acceptable
         default_log.info(f"Building trie for {dictName}")
-        #To allow building big dictionaries these values are 64-bit in the mdx, but to save space only 32-bit values are stored here (>LLLLLL)
+        #To allow building big dictionaries these values are 64-bit in the mdx, but to save space only 32-bit values are stored here
         self.trie = marisa_trie.RecordTrie(self.TRIE_FMT, self.mdx.get_index()) #type:ignore
         self.trie.save(trieName)
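
The trie payload shrinks from six to five unsigned 32-bit fields because `get_index` (see the readmdict.py changes below) now stores block-relative start/end indexes instead of absolute positions plus a block offset. A hedged sketch of the resulting record trie; the keys and numbers are invented:

```python
import marisa_trie
from struct import calcsize

# Five 32-bit fields per key: filePos, compSize, decompSize, startIdx, endIdx.
TRIE_FMT = '>LLLLL'
assert calcsize(TRIE_FMT) == 20      # 4 bytes saved per entry vs. '>LLLLLL'

records = [
    ('apple', (1024, 512, 2048, 0, 120)),
    ('apply', (1024, 512, 2048, 120, 260)),
]
trie = marisa_trie.RecordTrie(TRIE_FMT, records)
trie.save('sample.trie')             # hypothetical file name
print(trie['apple'])                 # [(1024, 512, 2048, 0, 120)]
```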
43 changes: 21 additions & 22 deletions application/lib/dictionary/mdict/readmdict.py
@@ -183,36 +183,35 @@ def _decode_key_block_info(self, key_block_info_compressed):
     def _decode_key_block(self, key_block_compressed, key_block_info_list):
         key_list = []
         i = 0
+        key_block = b''
         for compressed_size, decompressed_size in key_block_info_list:
             start = i
             end = i + compressed_size
             # 4 bytes : compression type
             key_block_type = key_block_compressed[start : start + 4]
             # 4 bytes : adler checksum of decompressed key block
-            adler32 = unpack(
-                NumberFmt.be_uint, key_block_compressed[start + 4 : start + 8]
-            )[0]
+            #adler32 = unpack(NumberFmt.be_uint, key_block_compressed[start + 4 : start + 8])[0]
             if key_block_type == b"\x00\x00\x00\x00":
                 key_block = key_block_compressed[start + 8 : end]
             elif key_block_type == b"\x01\x00\x00\x00":
-                key_block = lzo.decompress(key_block_compressed[start + 8 : end],
+                header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size)
+                key_block = lzo.decompress(header + key_block_compressed[start + 8 : end],
                     initSize=decompressed_size, blockSize=1308672)
             elif key_block_type == b"\x02\x00\x00\x00":
                 key_block = zlib.decompress(key_block_compressed[start + 8 : end])
             # extract one single key block into a key list
-            key_list += self._split_key_block(key_block)
+            key_list.extend(self._split_key_block(key_block))
             # notice that adler32 returns signed value
-            assert adler32 == zlib.adler32(key_block) & 0xFFFFFFFF
-
+            #assert adler32 == zlib.adler32(key_block) & 0xFFFFFFFF
             i += compressed_size
         return key_list

     def _split_key_block(self, key_block):
-        key_list = []
+        #key_list = []
         key_start_index = 0
         key_end_index = 0
         while key_start_index < len(key_block):
-            temp = key_block[key_start_index : key_start_index + self._number_width]
+            #temp = key_block[key_start_index : key_start_index + self._number_width]
             # the corresponding record's offset in record block
             key_id = unpack(
                 self._number_format,
@@ -238,8 +237,8 @@ def _split_key_block(self, key_block):
                 .strip()
             )
             key_start_index = key_end_index + width
-            key_list += [(key_id, key_text)]
-        return key_list
+            yield (key_id, key_text)
+        return #key_list
 
     #Read the file header and build a python dict
     def _read_header(self):
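
Turning `_split_key_block` into a generator lets `key_list.extend(...)` in `_decode_key_block` consume (key_id, key_text) pairs lazily instead of materializing a second full list per block. A toy version under simplified assumptions (fixed 4-byte big-endian ids and NUL-terminated keys; the real code derives widths from `self._number_format`/`self._number_width` and the dictionary encoding):

```python
from struct import unpack

def split_key_block(key_block):
    #toy generator: 4-byte big-endian record offset, then a NUL-terminated key
    pos = 0
    while pos < len(key_block):
        key_id = unpack('>L', key_block[pos:pos + 4])[0]
        end = key_block.index(b'\x00', pos + 4)   # key text ends at the NUL
        yield key_id, key_block[pos + 4:end]
        pos = end + 1

pairs = list(split_key_block(b'\x00\x00\x00\x00word\x00' + b'\x00\x00\x00\x08text\x00'))
assert pairs == [(0, b'word'), (8, b'text')]
```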
@@ -469,7 +468,7 @@ def _decode_record_block(self):
             record_block = record_block_compressed[8:]
         elif record_block_type == b"\x01\x00\x00\x00":
             header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size)
-            record_block = lzo.decompress(record_block_compressed[start + 8 : end],
+            record_block = lzo.decompress(header + record_block_compressed[start + 8 : end],
                 initSize=decompressed_size, blockSize=1308672)
         elif record_block_type == b"\x02\x00\x00\x00":
             record_block = zlib.decompress(record_block_compressed[8:])
@@ -551,9 +550,9 @@ def get_index(self, check_block=True):
             record_block = record_block_compressed[8:]
         elif record_block_type == b"\x01\x00\x00\x00":
             _type = 1
-            #header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size)
+            header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size)
             if check_block:
-                record_block = lzo.decompress(record_block_compressed[start + 8 : end],
+                record_block = lzo.decompress(header + record_block_compressed[start + 8 : end],
                     initSize=decompressed_size, blockSize=1308672)
         elif record_block_type == b"\x02\x00\x00\x00":
             _type = 2
@@ -677,7 +676,7 @@ def _decode_record_block(self):
             record_block = record_block_compressed[8:]
         elif record_block_type == b"\x01\x00\x00\x00":
             header = b"\xf0" + pack(NumberFmt.be_uint, decompressed_size)
-            record_block = lzo.decompress(record_block_compressed[8:],
+            record_block = lzo.decompress(header + record_block_compressed[8:],
                 initSize=decompressed_size, blockSize=1308672)
         elif record_block_type == b"\x02\x00\x00\x00":
             # decompress
@@ -784,8 +783,8 @@ def get_index(self, check_block=True):
                     break
 
             record_end = keyList[i + 1][0] if i < keyListLen - 1 else (decompressed_size + offset)
-            index_tuple = (current_pos, compressed_size, decompressed_size, record_start,
-                record_end, offset)
+            index_tuple = (current_pos, compressed_size, decompressed_size, record_start - offset,
+                record_end - offset)
             yield (key_text.decode('utf-8'), index_tuple)
             i += 1
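
Subtracting `offset` here is what lets the trie tuple drop from six fields to five: `record_start` and `record_end` are absolute positions in the decompressed stream and `offset` is where the current record block begins, so the stored differences can slice the decompressed block directly at lookup time. A worked example with invented numbers:

```python
offset = 70000                        # absolute start of this record block
record_start, record_end = 70100, 70250

# What get_index now stores: block-relative indexes instead of absolutes plus offset.
startIdx, endIdx = record_start - offset, record_end - offset   # 100, 250

data = bytes(80000)                   # stand-in for the decompressed record block
record = data[startIdx:endIdx]
assert len(record) == record_end - record_start                 # 150 bytes
```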

@@ -802,21 +801,21 @@ def get_content_by_Index(self, indexes) -> str:
         ret = []
         f = open(self._fname, 'rb')
         for index in indexes:
-            #These 6 values are the layout stored in the trie, each saved as 32-bit
-            filePos, compSize, decompSize, startPos, endPos, offset = index
+            #These values are the layout stored in the trie, 32-bit each
+            filePos, compSize, decompSize, startIdx, endIdx = index
             f.seek(filePos)
             compressed = f.read(compSize)
             type_ = compressed[:4] #32bit-type, 32bit-adler, data
             if type_ == b"\x00\x00\x00\x00":
                 data = compressed[8:]
             elif type_ == b"\x01\x00\x00\x00":
-                #header = b"\xf0" + pack(">I", decompSize)
-                data = lzo.decompress(compressed[8:], initSize=decompSize, blockSize=1308672)
+                header = b"\xf0" + pack(">I", decompSize)
+                data = lzo.decompress(header + compressed[8:], initSize=decompSize, blockSize=1308672)
             elif type_ == b"\x02\x00\x00\x00":
                 data = zlib.decompress(compressed[8:])
             else:
                 continue
-            record = data[startPos - offset : endPos - offset]
+            record = data[startIdx : endIdx]
             ret.append(record) #.strip(b"\x00"))
 
         f.close()
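
The recurring `header +` edits are the substantive fix in this commit: an MDX file stores the raw LZO1X stream, while the bundled pure-Python decompressor expects a python-lzo-style prefix (a `0xf0` marker byte followed by the big-endian 32-bit decompressed size), so the prefix is synthesized before every call; `NumberFmt.be_uint` is `'>I'`. A sketch of the shared pattern, with the import path assumed from this repository's layout:

```python
from struct import pack
from application.lib.dictionary.mdict import lzo   # bundled pure-Python LZO

def decompress_mdx_block(payload, decompressed_size):
    #payload: block content after the 4-byte type tag and the 4-byte adler32
    header = b"\xf0" + pack(">I", decompressed_size)
    return lzo.decompress(header + payload,
                          initSize=decompressed_size, blockSize=1308672)
```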
2 changes: 1 addition & 1 deletion application/view/reader.py
@@ -239,7 +239,7 @@ def ReaderDictPost(user: KeUser, userDir: str):
         #import traceback
         #traceback.print_exc()
         definition = f'Error:<br/>{e}'
-    print(json.dumps(definition)) #TODO
+    #print(json.dumps(definition)) #TODO
     return {'status': 'ok', 'word': word, 'definition': definition,
         'dictname': str(inst), 'others': others}
2 changes: 1 addition & 1 deletion docs/Chinese/deployment.md
@@ -275,7 +275,7 @@ sudo usermod -aG ubuntu www-data #or add nginx www-data to my group ubuntu
 cd kindleear
 virtualenv --python=python3 venv #create virtual environ
 vim ./config.py #start to modify some config items
-python3 ./tools/update_req.py #update requirements.txt
+python3 ./tools/update_req.py docker #update requirements.txt
 
 source ./venv/bin/activate #activate virtual environ
 pip install -r requirements.txt #install dependencies
2 changes: 1 addition & 1 deletion docs/Chinese/reader.md
@@ -48,7 +48,7 @@ KindleEar supports email push and online reading, with a built-in reader optimized for e-ink
 1. KindleEar supports the online dictionaries [dict.org](https://dict.org/), [dict.cc](https://www.dict.cc/), [dict.cn](http://dict.cn/), [Merriam-Webster](https://www.merriam-webster.com/) and [Oxford](https://www.oxfordlearnersdictionaries.com/). These dictionaries require no installation and work out of the box.
 2. Online dictionaries are convenient, but they can be unstable at times for network reasons, so for reliable use an offline dictionary is the better choice. For this, KindleEar also supports dictionaries in mdict/stardict format: download the dictionary you want and unzip it into the `data/dict` directory (subdirectories may be used to organize different dictionaries).
 3. The first lookup in an offline dictionary is slower because an index file (suffix trie) has to be created; after that it is fast.
-   If you use a large dictionary, building the index consumes a lot of memory; if your server has little memory, index creation may fail. In that case, look up a word with the same dictionary on your local machine first, and once the trie file has been generated locally, copy it to the corresponding directory on the server.
+   If you use a large dictionary (say, several hundred MB or more), building the index consumes a lot of memory; if your server has little memory, index creation may fail. In that case, look up a word with the same dictionary on your local machine first, and once the trie file has been generated locally, copy it to the corresponding directory on the server.
 4. Morphology rules for American English are supported by default, so inflected forms such as tense, voice and plurals can be looked up. If you need morphology support for other languages, download the corresponding hunspell format files (.dic/.aff) and copy them to `data/dict/morphology` (create this directory yourself); note, do not put them in a subdirectory. KindleEar automatically applies the morphology rules that match the book's language.
    As for where to download Hunspell/MySpell morphology files, search sites such as github/sourceforge; below are a few direct links.
    [LibreOffice](https://github.com/LibreOffice/dictionaries)
2 changes: 1 addition & 1 deletion docs/English/deployment.md
@@ -283,7 +283,7 @@ chmod -R 775 ~ #nginx user www-data read static resource
 cd kindleear
 virtualenv --python=python3 venv #create virtual environ
 vim ./config.py #start to modify some config items
-python3 ./tools/update_req.py #update requirements.txt
+python3 ./tools/update_req.py docker #update requirements.txt
 
 source ./venv/bin/activate #activate virtual environ
 pip install -r requirements.txt #install dependencies
2 changes: 1 addition & 1 deletion docs/English/reader.md
@@ -54,7 +54,7 @@ The extracted word is sent to your deployed KindleEar site for translation, and
 2. KindleEar also supports offline dictionaries in the stardict format. After downloading the corresponding dictionary, unzip it into the `data/dict` directory. You can organize different dictionaries into subdirectories. Then, restart the KindleEar service to refresh the dictionary list.
 
 3. The first time you look up a word in the offline dictionary, it may be slow because it needs to create an index file (suffix: trie). After that, it will be much faster.
-   If you are using a large dictionary, the indexing process will consume a significant amount of memory. If the server has limited memory, the indexing might fail. You can first use the dictionary on your local machine to look up a word and generate the "trie" file, then copy it to the corresponding directory on the server.
+   If you are using a large dictionary (for example, above several hundred megabytes), the indexing process will consume a significant amount of memory. If the server has limited memory, the indexing might fail. You can first use the dictionary on your local machine to look up a word and generate the "trie" file, then copy it to the corresponding directory on the server.
 
 4. By default, American English morphology queries are supported (tense, voice, plural etc.).
    If you need to support morphology rules for other languages, please download the corresponding Hunspell format files (.dic/.aff), and then copy them to `data/dict/morphology` (create it if it does not exist). Be careful not to store them in a subdirectory.
