forked from Id405/sitelen-pona-ucsur-guide
-
Notifications
You must be signed in to change notification settings - Fork 5
/
gen_script.py
439 lines (342 loc) · 15.4 KB
/
gen_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
from ruamel.yaml.scalarstring import SingleQuotedScalarString as sq
from ruamel.yaml.comments import CommentedSeq
from urllib.request import urlopen, Request
import ruamel.yaml
import argparse
import datetime
import uuid
import json
import io
import os
# Short-alias table: maps each full word to the list of abbreviated triggers
# that `to_short` hands back when short aliases are requested (`-s`).
# Most words have a single alias; a few (e.g. "nanpa") also carry a
# one-character alias, and some short words simply map to themselves.
long_to_short = {"kijetesantakalu":["kij"],"nanpa":["nnp","#"],"lili":["lil","v"],"sina":["sna","b"],"wile":["wil","w"],"ijo":["ijo","0"],"wan":["wan","1"],"kin":["kin","!"],"kokosila":["kks"],"misikeke":["msk"],"mi":["mi","p"],"pi":["pi","L"],"tu":["tu","2"],"kepeken":["kpk"],"sitelen":["stl"],"monsuta":["mon"],"kalama":["klm"],"kulupu":["klp"],"pakala":["pkl"],"palisa":["pls"],"pimeja":["pmj"],"sijelo":["sjl"],"sinpin":["snp"],"soweli":["swl"],"namako":["nmk"],"kipisi":["kps"],"majuna":["maj"],"jasima":["jas"],"lanpan":["lan"],"akesi":["aks"],"alasa":["als"],"apeja":["ape"],"kiwen":["kwn"],"linja":["lnj"],"lukin":["lkn"],"monsi":["mns"],"nasin":["nsn"],"pilin":["pln"],"tenpo":["tnp"],"utala":["utl"],"tonsi":["tns"],"epiku":["epk"],"anpa":["anp"],"ante":["ant"],"awen":["awn"],"esun":["esn"],"insa":["ins"],"jaki":["jak"],"jelo":["jel"],"kala":["kal"],"kama":["kam"],"kasi":["kas"],"kili":["kil"],"kule":["kul"],"kute":["kut"],"lape":["lap"],"laso":["las"],"lawa":["law"],"lete":["let"],"lipu":["lip"],"loje":["loj"],"luka":["luk"],"lupa":["lup"],"mama":["mam"],"mani":["man"],"meli":["mel"],"mije":["mij"],"moku":["mok"],"moli":["mol"],"musi":["mus"],"mute":["mut"],"nasa":["nas"],"nena":["nen"],"nimi":["nim"],"noka":["nok"],"olin":["oln"],"open":["opn"],"pake":["pak"],"pali":["pal"],"pana":["pna"],"pini":["pin"],"pipi":["pip"],"poka":["pka"],"poki":["pki"],"pona":["pon"],"powe":["pow"],"sama":["sam"],"seli":["sli"],"selo":["slo"],"seme":["sem"],"sewi":["sew"],"sike":["sik"],"sona":["son"],"suli":["sul"],"suno":["sun"],"supa":["sup"],"suwi":["suw"],"taso":["tas"],"tawa":["taw"],"telo":["tel"],"toki":["tok"],"tomo":["tom"],"unpa":["unp"],"walo":["wal"],"waso":["was"],"wawa":["waw"],"weka":["wek"],"leko":["lek"],"soko":["sok"],"meso":["mes"],"ala":["ala"],"ale":["ale"],"anu":["anu"],"ike":["ike"],"ilo":["ilo"],"jan":["jan"],"ken":["ken"],"kon":["kon"],"len":["len"],"lon":["lon"],"mun":["mun"],"ona":["ona"],"pan":["pan"],"sin":["sin"],"tan":["tan"],"uta":["uta"],"oko":["oko"]
,"en":["en"],"jo":["jo"],"ko":["ko"],"la":["la"],"li":["li"],"ma":["ma"],"mu":["mu"],"ni":["ni"],"pu":["pu"],"te":["te"],"to":["to"],"ku":["ku"],"a":["a"],"e":["e"],"o":["o"],"n":["n"],"ali":["ali"]}
# Banner placed near the top of every generated file; names this script.
gen_header = "-*- Generated by {} -*-".format(os.path.basename(__file__))


def sort_key(x):
    """Return ``len(x)``; the generators sort their lines with it, longest first."""
    return len(x)
def parse_args():
    """Parse the command line and return the resulting argparse namespace.

    The positional ``outputtype`` selects the target format; the optional
    flags pick the output file, the AHK toggle variant, custom end
    characters, and short aliases.
    """
    parser = argparse.ArgumentParser(
        description="Python script to create input plugins from linku dictionary",
    )
    parser.add_argument("outputtype", choices=["macos", "ahk", "ibus", "espanso"])
    parser.add_argument(
        "-f", "--file",
        help="What file to output to. Uses stdout by default",
    )
    parser.add_argument(
        "-t", "--ahk-toggle",
        action="store_true",
        help="Whether or not to output the toggle variant of the AHK script",
    )
    parser.add_argument(
        "-e", "--end-chars",
        help="What end/word-separator characters to use. Only supported for AHK and Espanso. Espanso will only use the first character of the string.",
    )
    parser.add_argument(
        "-s", "--short",
        action="store_true",
        help="Whether or not to use short aliases for the words",
    )
    return parser.parse_args()
def get_words():
    """Download the Linku word list and return it as decoded JSON.

    Returns:
        The parsed JSON body of ``https://api.linku.la/v1/words``.

    Raises:
        SystemExit: with a diagnostic message (exit status 1) when the
            server answers with a status other than 200.
    """
    url = "https://api.linku.la/v1/words"
    # the request is sent with an explicit User-Agent header
    request = Request(url, headers={'User-Agent': 'Mozilla'})
    with urlopen(request) as r:
        if r.status != 200:
            # `raise SystemExit(msg)` prints msg to stderr and exits with
            # status 1; unlike the bare `exit()` helper it does not rely on
            # the `site` module and does not fail silently
            raise SystemExit(f"error: {url} responded with HTTP status {r.status}")
        return json.loads(r.read().decode('utf-8'))
def get_ucsur(words) -> list:
    """Collect ``(word, glyph)`` pairs for every entry that has a UCSUR code.

    Each value of *words* is indexed with ``'representations'``; entries
    whose representations contain no ``'ucsur'`` key are skipped. The first
    two characters of the ``'ucsur'`` string are dropped and the remainder is
    parsed as a hexadecimal code point, which is converted to a character.
    """
    return [
        (name, chr(int(words[name]['representations']['ucsur'][2:], 16)))
        for name in words
        if 'ucsur' in words[name]['representations']
    ]
def to_short(word: str) -> list[str]:
    """Return the short aliases registered for *word*.

    A word that has no entry in ``long_to_short`` falls back to the word
    itself, so a dictionary entry without an alias no longer aborts
    generation with ``KeyError``. A new list is returned on every call so
    callers cannot mutate the shared table.
    """
    return list(long_to_short.get(word, [word]))
# TODO reduce code duplication (only if this was nim...)
def espanso(words, short: bool = False, end_chars: str | None = "") -> str:
    """Render *words* as an Espanso match file and return it as YAML text.

    Args:
        words: sequence of ``(word, glyph)`` pairs.
        short: use the aliases from ``to_short`` instead of the full word.
        end_chars: only the first character is used; it is appended to every
            trigger. ``None`` or an empty string selects a single space.

    Returns:
        A ``# <gen_header>`` comment line, a blank line, then the dumped YAML.
    """
    # `None` and "" are both falsy, so one test covers the two "unset" cases
    if not end_chars:
        end_chars = " "
    end_char = end_chars[0]

    # NOTE: an earlier variant emitted espanso's `word` / `word_separators`
    # options; the end character is now baked into each trigger instead.
    def single(trigger: str, replace: str) -> dict:
        # force into single-quoted strings for consistency
        return {"trigger": sq(trigger + end_char), "replace": sq(replace)}

    def multi(triggers: list[str], replace: str) -> dict:
        return {"triggers": [sq(t + end_char) for t in triggers], "replace": sq(replace)}

    yaml = ruamel.yaml.YAML(typ=['rt', 'string'])
    yaml.allow_unicode = True
    yaml.encoding = 'utf-8'
    yaml.default_flow_style = False
    yaml.block_seq_indent = 2
    yaml.indent = 4

    entries = [
        single("zz", " "), single(" ", " "), single("-", ""), single("^", ""), single("*", ""),
        single(":", ""), single(".", ""), single("[", ""), single("]", ""), single("<", "「"),
        single(">", "」"), single("(", ""), single(")", ""), single("{", ""), single("}", "")
    ]
    # remember where each section starts; used as the "key" for the comments
    punctuation_len = len(entries)

    for pair in words:
        keys = to_short(pair[0]) if short else [pair[0]]
        if len(keys) == 1:
            entries.append(single(keys[0], pair[1]))
        else:
            entries.append(multi(keys, pair[1]))
    words_len = len(entries)

    # contractions; extend this list if more are needed
    entries += [
        multi(["msa", "misonala", "misonaala"], "")
    ]

    # a `CommentedSeq` is required to attach comments to individual items
    seq = CommentedSeq(entries)
    for index in range(1, len(seq)):
        seq.yaml_set_comment_before_after_key(index, before='\n')
    # section headings are set last so they replace the plain blank-line
    # comment on the first item of each section
    seq.yaml_set_comment_before_after_key(0, before="punctuation etc", indent=2)
    seq.yaml_set_comment_before_after_key(punctuation_len, before="nimi ale", indent=2)
    seq.yaml_set_comment_before_after_key(words_len, before="contractions", indent=2)

    # dump into an in-memory byte stream, then decode to text
    buf = io.BytesIO()
    yaml.dump({"matches": seq}, buf)
    return f"# {gen_header}\n\n" + buf.getvalue().decode()
def ibus_table(words, short: bool = False) -> str:
    """Build the text of an ibus-table source file for *words*.

    Args:
        words: sequence of ``(word, glyph)`` pairs.
        short: use the aliases from ``to_short`` instead of the full word.

    Returns:
        The table definition header (with a freshly generated UUID and
        today's date as the serial number), one entry per trigger, and a
        fixed list of punctuation entries, terminated by ``END_TABLE``.
    """
    def to_ibus_fmt(words):
        # one "keys<TAB>glyph<TAB>1" line per trigger, longest line first
        result = []
        for i in words:
            keys = to_short(i[0]) if short else [i[0]]
            result += [f"{key}\t{i[1]}\t1" for key in keys]
        result.sort(key=sort_key, reverse=True)
        return "\n".join(result)
    # a new random id is generated on every run
    table_id = str(uuid.uuid4())
    # local date as YYYYMMDD, used as the table's serial number
    now = datetime.datetime.now().strftime("%Y%m%d")
    # NOTE(review): VALID_INPUT_CHARS below lists only lowercase letters and
    # punctuation, while some short aliases use other characters (e.g. "#",
    # "0", "L") — confirm how ibus-table treats those entries with `short`.
    table = f"""### {gen_header}
### File header must not be modified
### This file must be encoded into UTF-8.
### This table under LGPL
### comments start with ### not single #
### Derive from the format of SCIM Table, so you can modify the table from
### scim-tables' table
SCIM_Generic_Table_Phrase_Library_TEXT
VERSION_1_0
### Begin Table definition.
BEGIN_DEFINITION
### License
LICENSE = LGPL
### An unique id to distinguish this table among others.
### Use uuidgen to generate this kind of id.
UUID = {table_id}
### A unique number indicates the version of this file.
### For example the last modified date of this file.
### This number must be less than 2^32.
### Just make your table version-able
SERIAL_NUMBER = {now}
### ICON can be any format as long as your pygtk can recognized
### the most widely ones are "png" and "svg", letter one is recommended
ICON = ibus-table.svg
### The symbol to be displayed in IM switchers
SYMBOL =
### The default name of this table, this is needed
NAME = sitelen pona 4.0
### The local names of this table, this is optional
### NAME.zh_CN = 形码
### NAME.zh_HK = 形碼
### NAME.zh_TW = 形碼
### Description
DESCRIPTION = sitelen pona input method for IBus, ported from nasin-nanpa's ahk script.
### Supported languages of this table
### sigle "zh_CN" just be recognized as zh_CN,
### but "zh_CN, zh_HK" or more zh_XX will be recognized as zh;
### and "en_US, zh_CN" will be just ignored.
LANGUAGES = en_US
### The author of this table
AUTHOR = jan Komi
### Prompt string to be displayed in the status area, CN will be replaced by
### the gettext tools in runtime as 中.
STATUS_PROMPT = toki
### Valid input chars.
VALID_INPUT_CHARS = ()*+-.:<>[]_aeijklmnopstuwz{{}}~
### Layout
LAYOUT = us
### The max number of input keys for every phrase or character.
MAX_KEY_LENGTH = 20
### Use auto_commit mode as default
AUTO_COMMIT = TRUE
### Automatically selects the first phrase when typing
AUTO_SELECT = FALSE
### Use full width punctuation by default
DEF_FULL_WIDTH_PUNCT = FALSE
### Not use full width letter by default
DEF_FULL_WIDTH_LETTER = FALSE
### Whether user are allow to define phrase, default is true
### You have to define the word construction rules below.
### For input methods which do not input phrases, set this to False
USER_CAN_DEFINE_PHRASE = FALSE
### Whether support PinYin Mode, default is true.
### this feature is just for Chinese, set it to False if your IM is not
### Chinese.
PINYIN_MODE = FALSE
### If true then the phrases' frequencies will be adjusted dynamically
### according your using frequency.
DYNAMIC_ADJUST = TRUE
### Some characters whose frequencies should be fix all the time, e.g.
### some punctuations
### NO_CHECK_CHARS =
### Rules for constructing user defined phrase
### "ce" stands for "ci equal", a Chinese English :), means "phrase length
### equal to", thus ce2 -> phrase length equal to 2; and "ca" means "phrase
### length equal or above", so ca4 -> phrase length equal or above 4.
### p21 -> the 1st key of 2nd character in the phrase, and so on.
### Each rule separate via ";".
### Example below is a complete rule-set,
### becuase [2,2] ∩ [3,3] ∩ [4,+∞] = [2,+∞], which is the range of length
### of phrase. This have to be satisfied if you need ibus-table to build up
### your own inputed phrase via your daily using.
### RULES = ce2:p11+p12+p21+p22;ce3:p11+p21+p22+p31;ca4:p11+p21+p31+p41
### The key strokes to page up the lookup table.
### PAGE_UP_KEYS = Page_Up,KP_Page_Up,minus,comma
### The key strokes to page down.
### PAGE_DOWN_KEYS = Page_Down,KP_Page_Down,equal,period
### The key strokes to select candidiate phrases.
### Usually "1,2,3,4,5,6,7,8,9" but if this conflicts with
### characters one wants to use for input one can also
### use something like “F1,F2,F3,F4,F5,F6,F7,F8,F9”
SELECT_KEYS = 1,2,3,4,5,6,7,8,9
### The default orientation of the candidate list
### TRUE means the candidate list is vertical, FALSE means it is horizontal
ORIENTATION=TRUE
END_DEFINITION
### Begin Table data.
### Format of every line whose formated in "input_keys\\tphrase\\tfreq\\n" is an
### entry.
### From left to right, the 1st column are the input key combination that you
### entered via keyboard; the 2nd column are presented character or phrase of
### the key combination you want; the 3rd column are frequency of the character
### or phrase.
BEGIN_TABLE
"""
    # generated word entries go first, directly after BEGIN_TABLE
    table += to_ibus_fmt(words)
    # fixed punctuation / control entries; the leading newline separates
    # them from the last generated entry
    table += """
[ 1
] 1
* 1
+ 1
- 1
( 1
) 1
_ 1
{ 1
} 1
. 1
: 1
ss 1
zz 1
< 「 1
> 」 1
~~ ︁ 1
~ ︀ 1
END_TABLE"""
    return table
def ahk_script(words, short: bool = False, toggle: bool = False, end_chars: str | None = "") -> str:
    """Build an AutoHotkey v2 hotstring script for *words*.

    Args:
        words: sequence of ``(word, glyph)`` pairs.
        short: use the aliases from ``to_short`` instead of the full word.
        toggle: emit the variant that binds Alt+Space to ``Suspend``.
        end_chars: value interpolated into ``Hotstring("EndChars", ...)``.
            ``None`` or an empty string selects a default: a space for the
            toggle variant, otherwise two backticks.

    Returns:
        The script text: header, one hotstring per trigger, then the fixed
        punctuation / control hotstrings.
    """
    def hotstring_lines(pairs):
        # one `::trigger::{U+XXXX} ; glyph` line per trigger, longest line first
        lines = []
        for pair in pairs:
            glyph = pair[1]
            codepoint_hex = f"{ord(glyph):X}"
            aliases = to_short(pair[0]) if short else [pair[0]]
            for alias in aliases:
                lines.append(f"::{alias}::{{U+{codepoint_hex}}} ; {glyph}")
        return "\n".join(sorted(lines, key=sort_key, reverse=True))

    # `None` and "" are both falsy, so one test covers the two "unset" cases
    if not end_chars:
        end_chars = " " if toggle else "``"

    if toggle:
        # `: ^42` centres the banner in a 42-character field, matching the
        # width of the line above it between the `; ` and ` ;` borders
        header = f"""
; ---- USE ALT + SPACE TO TOGGLE SCRIPT ---- ;
; {gen_header: ^42} ;
#Requires AutoHotkey v2.0+
#SingleInstance force
#Hotstring O Z
Hotstring("EndChars", "{end_chars}")
#SuspendExempt
!Space::Suspend
#SuspendExempt False
"""
    else:
        header = f"""
; {gen_header}
#Requires AutoHotkey v2.0+
#SingleInstance force
#Hotstring O
Hotstring("EndChars", "{end_chars}")
"""

    # fixed punctuation / control hotstrings appended after the word list
    footer = """
::[::{U+F1990} ; cartouche start
::]::{U+F1991} ; cartouche end
::=::{U+200D} ; default zero width joiner
::+::{U+F1996} ; scaling joiner
::-::{U+F1995} ; stacking joiner
::(::{U+F1997} ; start left-combining (normal) long glyph
::)::{U+F1998} ; end left-combining (normal) long glyph
::_::{U+F1999} ; container extender
::{::{U+F199A} ; start right-combining (reversed) long glyph
::}::{U+F199B} ; end right-combining (reversed) long glyph
::.::{U+F199C} ; sitelen pona full stop
::`:::{U+F199D} ; sitelen pona colon
::`s`s::{U+3000} ; logograph fullwidth space
::zz::{U+3000} ; logograph fullwidth space
::<::{U+300C} ; 「 start quote
::>::{U+300D} ; 」 end quote
::~~::{U+FE01} ; variation selector 2
::~::{U+FE00} ; variation selector 1
"""
    # the header literal starts with a newline for readability; strip it
    return header.lstrip() + hotstring_lines(words) + footer
# TODO update input plugin (e.g. what else uses `pi_`??)
def mac_os_plugin(words, short: bool = False) -> str:
    """Build the text of a macOS input-source plug-in table for *words*.

    Args:
        words: sequence of ``(word, glyph)`` pairs.
        short: use the aliases from ``to_short`` instead of the full word.

    Returns:
        The plug-in header and fixed punctuation entries, followed by one
        ``trigger glyph`` line per trigger (longest line first) and the
        closing ``ENDCHARACTER`` line.
    """
    def to_mac_fmt(words):
        # one "trigger glyph" line per trigger, longest line first
        result = []
        for i in words:
            keys = to_short(i[0]) if short else [i[0]]
            result += [f"{key} {i[1]}" for key in keys]
        result.sort(key=sort_key, reverse=True)
        return "\n".join(result)

    table = f"""# {gen_header}
METHOD: TABLE
ENCODE: Unicode
PROMPT: sitelen pona (UCSUR)
DELIMITER: ,
VERSION: 3.2
MAXINPUTCODE: 15
VALIDINPUTKEY: aeioujklmnpstw[_].:/+{{}}
BEGINCHARACTER
[
]
_
pi_
___
-
^
+
(
)
__
{{
}}
.
:
te 「
to 」
< 「
> 」"""
    entries = to_mac_fmt(words)
    if entries:
        # the literal above ends without a newline, so the first generated
        # entry must be started on its own line; previously it was glued
        # onto the `>` entry, corrupting both
        table += "\n" + entries
    return table + "\nENDCHARACTER"
if __name__ == "__main__":
    # Script entry point: fetch the dictionary, render the requested format,
    # then write it to the chosen file or print it to stdout.
    args = parse_args()
    words = get_words()
    glyphs = get_ucsur(words)
    use_short = bool(args.short)
    encoding = "utf-8"

    if args.outputtype == "macos":
        text = mac_os_plugin(glyphs, short=use_short)
        # Apple documentation says to use utf-16
        # https://support.apple.com/guide/mac-help/create-and-use-your-own-input-source-on-mac-mchlp2866/mac
        encoding = "utf-16"
    elif args.outputtype == "ahk":
        text = ahk_script(glyphs, short=use_short, toggle=bool(args.ahk_toggle), end_chars=args.end_chars)
    elif args.outputtype == "ibus":
        text = ibus_table(glyphs, short=use_short)
    elif args.outputtype == "espanso":
        text = espanso(glyphs, short=use_short, end_chars=args.end_chars)

    if not args.file:
        print(text)
    else:
        with open(args.file, 'w', encoding=encoding) as f:
            f.write(text)