forked from dbuenzli/uucp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ucharinfo.ml
672 lines (606 loc) · 24.7 KB
/
ucharinfo.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
(*---------------------------------------------------------------------------
Copyright (c) 2017 The uucp programmers. All rights reserved.
SPDX-License-Identifier: ISC
---------------------------------------------------------------------------*)
let log_err s =
Printf.printf "%s: %s\n%!" (Filename.basename Sys.executable_name) s
(* Char and String and char utilities *)
let is_white = function ' ' | '\t' .. '\r' -> true | _ -> false
let is_digit = function '0' .. '9' -> true | _ -> false
let is_hex_digit = function
| '0' .. '9' | 'A' .. 'F' | 'a' .. 'f' -> true
| _ -> false
let hex_digit_value c = (* assert (is_hex_digit c) *)
let c = Char.code c in
if c <= 0x39 then c - 48 else
if c <= 0x46 then c - 55 else
c - 87
let strf = Format.asprintf
let str_starts s pre =
let plen = String.length pre in
let slen = String.length s in
if plen > slen then false else
try for i = 0 to plen - 1 do if pre.[i] <> s.[i] then raise Exit; done; true
with Exit -> false
let str_is_ascii s =
let max = String.length s - 1 in
let rec loop s i =
if i > max then true else
if Char.code s.[i] > 0x7F then false else loop s (i + 1)
in
loop s 0
let includes ~affix s =
let len_a = String.length affix in
let len_s = String.length s in
if len_a > len_s then false else
let max_idx_a = len_a - 1 in
let max_idx_s = len_s - len_a in
let rec loop i k =
if i > max_idx_s then false else
if k > max_idx_a then true else
if k > 0 then
if String.unsafe_get affix k = String.unsafe_get s (i + k)
then loop i (k + 1) else loop (i + 1) 0
else
if String.unsafe_get affix 0 = String.unsafe_get s i
then loop i 1 else loop (i + 1) 0
in
loop 0 0
(* Data conversion to strings and UTF-X *)
let esc_non_ascii s =
let b = Buffer.create 255 in
let add_byte = function
| c when Char.code c > 0x7F ->
Buffer.add_string b (strf "\\x%02X" (Char.code c))
| c -> Buffer.add_char b c
in
String.iter add_byte s; Buffer.contents b
let uchars_to_utf_bytes utf uchars =
let b = Buffer.create 255 in
let add_utf = match utf with
| `UTF_8 -> Buffer.add_utf_8_uchar
| `UTF_16BE -> Buffer.add_utf_16be_uchar
| `UTF_16LE -> Buffer.add_utf_16le_uchar
in
List.iter (add_utf b) uchars; Buffer.contents b
let uchars_to_utf_bytes_esc utf uchars =
let utf = uchars_to_utf_bytes utf uchars in
let b = Buffer.create 255 in
let add_byte c = Buffer.add_string b (strf "\\x%02X" (Char.code c)) in
String.iter add_byte utf; Buffer.contents b
let uchars_to_cps uchars =
let to_str u = strf "U+%04X" (Uchar.to_int u) in
String.concat " " (List.map to_str uchars)
let uchars_to_named_cps uchars =
let to_str u = strf "%s (U+%04X)" (Uucp.Name.name u) (Uchar.to_int u) in
String.concat " " (List.map to_str uchars)
(* Character specification *)
let str_of_spec_fmt = function
| `UTF_8 -> "UTF-8"
| `UTF_16BE -> "UTF-16BE"
| `UTF_16LE -> "UTF-16LE"
| `Uchar_esc -> "a Unicode character escape"
| `Bytes_esc -> "a byte sequence escape"
| `Guess -> "a character specification"
| `By_name -> "a name substring"
let uchar_of_utf utf s =
let get_utf = match utf with
| `UTF_8 -> String.get_utf_8_uchar
| `UTF_16BE -> String.get_utf_16be_uchar
| `UTF_16LE -> String.get_utf_16le_uchar
in
let dec = get_utf s 0 in
if Uchar.utf_decode_is_valid dec &&
Uchar.utf_decode_length dec = String.length s
then Some (Uchar.utf_decode_uchar dec)
else None
let try_uchar_of_utfs s =
let rec try_decs s = function
| [] -> None
| enc :: encs ->
match uchar_of_utf enc s with
| None -> try_decs s encs
| Some _ as u -> u
in
try_decs s [`UTF_8; `UTF_16BE; `UTF_16LE]
let parse_decimal_esc s = (* prefix has been removed from [s] *)
let max = String.length s - 1 in
let rec loop s i acc = match i = max with
| true ->
if s.[max] = ';' && Uchar.is_valid acc then Some (Uchar.of_int acc) else
None
| false ->
if not (is_digit s.[i]) then None else
loop s (i + 1) (acc * 10 + (Char.code s.[i] - 48))
in
loop s 0 0
let parse_hex_esc s ~finish = (* prefix has been removed from [s] *)
let max = String.length s - 1 in
let parse_hex s start =
let rec loop s i acc =
if i > max then acc, max else
if not (is_hex_digit s.[i]) then acc, i - 1 else
loop s (i + 1) (acc * 16 + hex_digit_value s.[i])
in
loop s start 0
in
let hex, last = parse_hex s 0 in
let to_uchar hex =
if Uchar.is_valid hex then Some (Uchar.of_int hex) else None
in
match finish with
| `Empty -> if last = max then to_uchar hex else None
| `Char c -> if last = max - 1 && s.[max] = c then to_uchar hex else None
| `Maybe_surrogate ->
if last = max then to_uchar hex else
match 0xD800 <= hex && hex <= 0xDBFF && last <= max - 3 &&
s.[last + 1] = '\\' && s.[last + 2] = 'u'
with
| false -> None
| true ->
let lo, last' = parse_hex s (last + 3) in
match last' = max && 0xDC00 <= lo && lo <= 0xDFFF with
| false -> None
| true ->
Some (Uchar.of_int @@
(((hex land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000)
let uchar_of_uchar_esc s =
let prefixes =
[ "U+"; "u+"; "U"; "u"; "\\u{"; "\\U{"; "\\u"; "\\U"; "&#x"; "&#" ]
in
match try Some (List.find (str_starts s) prefixes) with Not_found -> None with
| None -> None
| Some pre ->
let pre_len = String.length pre in
let s = String.sub s pre_len (String.length s - pre_len) in
match pre with
| "&#" -> parse_decimal_esc s
| "&#x" -> parse_hex_esc s ~finish:(`Char ';')
| "\\u" -> parse_hex_esc s ~finish:`Maybe_surrogate
| "\\u{" | "\\U{" -> parse_hex_esc s ~finish:(`Char '}')
| _ -> parse_hex_esc s ~finish:`Empty
let uchar_of_utf_8_bytes_esc s =
let b = Buffer.create 255 in
let max = String.length s - 1 in
let parse_hex_byte s i =
match i + 1 <= max && is_hex_digit s.[i] && is_hex_digit s.[i+1] with
| false -> None
| true -> Some (16 * (hex_digit_value s.[i]) + (hex_digit_value s.[i+1]))
in
let rec loop s i =
if i > max then uchar_of_utf `UTF_8 (Buffer.contents b) else
if is_white s.[i] then loop s (i + 1) else
if i + 1 > max then None else
let i = match s.[i] with
| ('0' | '\\') when s.[i+1] = 'x' -> i + 2
| 'x' -> i + 1
| _ -> i
in
match parse_hex_byte s i with
| None -> None
| Some byte -> Buffer.add_char b (Char.chr byte); loop s (i + 2)
in
loop s 0
let search_names spec = (* Quadratics :-) *)
let names u = Uucp.Name.name u :: (List.map snd (Uucp.Name.name_alias u)) in
let match' names spec = List.exists (includes ~affix:spec) names in
let rec loop spec acc u =
let names = names u in
let acc = if match' names spec then u :: acc else acc in
if u = Uchar.max then List.rev acc else loop spec acc (Uchar.succ u)
in
match loop (String.uppercase_ascii spec) [] Uchar.min with
| [] -> None
| us -> Some us
let singleton = Option.map (fun u -> [u])
let guess_spec s = match str_is_ascii s with
| false -> singleton (try_uchar_of_utfs s)
| true when String.length s = 1 -> Some ([Uchar.of_int (Char.code s.[0])])
| true ->
match uchar_of_uchar_esc s with
| Some _ as u -> singleton u
| None ->
match uchar_of_utf_8_bytes_esc s with
| Some _ as u -> singleton u
| None -> search_names s
let parse_spec spec_fmt s = match spec_fmt with
| `UTF_8 -> singleton (uchar_of_utf `UTF_8 s)
| `UTF_16BE -> singleton (uchar_of_utf `UTF_16BE s)
| `UTF_16LE -> singleton (uchar_of_utf `UTF_16LE s)
| `Uchar_esc -> singleton (uchar_of_uchar_esc s)
| `Bytes_esc -> singleton (uchar_of_utf_8_bytes_esc s)
| `Guess -> guess_spec s
| `By_name -> search_names s
let parse_specs spec_fmt specs =
let rec loop acc = function
| [] ->
let us = List.concat acc in
let module Uset = Set.Make (Uchar) in
if us = []
then Error (strf "No character matched the specifications.")
else Ok (Uset.elements (Uset.of_list us))
| s :: ss ->
match parse_spec spec_fmt s with
| Some us -> loop (us :: acc) ss
| None ->
match spec_fmt with
| `Guess | `By_name -> loop acc ss
| spec_fmt ->
let fmt = str_of_spec_fmt spec_fmt and sesc = esc_non_ascii s in
Error (strf "Could not parse %s from '%s'\n%!" fmt sesc)
in
loop [] specs
(* Version *)
let unicode_version () = Printf.printf "%s\n%!" Uucp.unicode_version
(* Character information *)
type out_fmt =
{ ascii : bool;
no_labels : bool;
raw_bytes : bool; }
let str (pp, key) (out_fmt : out_fmt) u = strf "%a" pp (key u)
let str_bool key (out_fmt : out_fmt) u = strf "%b" (key u)
let str_int key (out_fmt : out_fmt) u = strf "%d" (key u)
let str_uchar (out_fmt : out_fmt) u = uchars_to_cps [u]
let str_str key (out_fmt : out_fmt) u = key u
let str_utf utf out_fmt u = match out_fmt.raw_bytes with
| true -> uchars_to_utf_bytes utf [u]
| false -> uchars_to_utf_bytes_esc utf [u]
let str_cps out_fmt uchars =
let cps = uchars_to_cps uchars in
match out_fmt.ascii with
| true -> cps
| false ->
let utf_8 = uchars_to_utf_bytes `UTF_8 uchars in
strf "%s (%s)" utf_8 cps
let str_case_map map (out_fmt : out_fmt) u =
let uchars = match map u with `Self -> [u] | `Uchars uchars -> uchars in
str_cps out_fmt uchars
let str_decomposition_mapping (out_fmt : out_fmt) u =
let uchars = match Uunf.decomp u with
| [||] -> [u]
| a ->
let to_uchar i u = match i = 0 with
| true -> Uunf.d_uchar u
| false -> Uchar.of_int u
in
Array.(to_list @@ mapi to_uchar a)
in
str_cps out_fmt uchars
let str_name_alias key (out_fmt : out_fmt) u =
let str_alias (t, n) = strf "%s (%a)" n Uucp.Name.pp_alias_tag t in
String.concat ", " (List.map str_alias (key u))
let str_script_extensions key (out_fmt : out_fmt) u =
String.concat ", " (List.map (strf "%a" Uucp.Script.pp) (key u))
let all_keys = [
`P "Age", str Uucp.Age.(pp, age);
`P "Alphabetic", str_bool Uucp.Alpha.is_alphabetic;
`P "Block", str Uucp.Block.(pp, block);
(* Break *)
`P "Line_Break", str Uucp.Break.(pp_line, line);
`P "Grapheme_Cluster_Break",
str Uucp.Break.(pp_grapheme_cluster, grapheme_cluster);
`P "Word_Break", str Uucp.Break.(pp_word, word);
`P "Sentence_Break", str Uucp.Break.(pp_sentence, sentence);
`P "Indic_Conjunct_Break",
str Uucp.Break.(pp_indic_conjunct_break, indic_conjunct_break);
`P "East_Asian_Width", str Uucp.Break.(pp_east_asian_width, east_asian_width);
(* Case *)
`P "Lowercase", str_bool Uucp.Case.is_lower;
`P "Uppercase", str_bool Uucp.Case.is_upper;
`P "Cased", str_bool Uucp.Case.is_cased;
`P "Case_ignorable", str_bool Uucp.Case.is_case_ignorable;
`P "Lowercase_Mapping", str_case_map Uucp.Case.Map.to_lower;
`P "Uppercase_Mapping", str_case_map Uucp.Case.Map.to_upper;
`P "Titlecase_Mapping", str_case_map Uucp.Case.Map.to_title;
`P "Case_Folding", str_case_map Uucp.Case.Fold.fold;
`P "NFKC_Casefold", str_case_map Uucp.Case.Nfkc_fold.fold;
`P "NFKC_Simple_Casefold", str_case_map Uucp.Case.Nfkc_simple_fold.fold;
(* CJK *)
`P "Ideographic", str_bool Uucp.Cjk.is_ideographic;
`P "IDS_Unary_Operator", str_bool Uucp.Cjk.is_ids_unary_operator;
`P "IDS_Binary_Operator", str_bool Uucp.Cjk.is_ids_binary_operator;
`P "IDS_Trinary_Operator", str_bool Uucp.Cjk.is_ids_trinary_operator;
`P "Radical", str_bool Uucp.Cjk.is_radical;
`P "Unified_Ideograph", str_bool Uucp.Cjk.is_unified_ideograph;
(* Emoji *)
`P "Emoji", str_bool Uucp.Emoji.is_emoji;
`P "Emoji_Presentation", str_bool Uucp.Emoji.is_emoji_presentation;
`P "Emoji_Modifier", str_bool Uucp.Emoji.is_emoji_modifier;
`P "Emoji_Modifier_Base", str_bool Uucp.Emoji.is_emoji_modifier_base;
`P "Emoji_Component", str_bool Uucp.Emoji.is_emoji_component;
`P "Extended_Pictographic", str_bool Uucp.Emoji.is_extended_pictographic;
(* Func *)
`P "Dash", str_bool Uucp.Func.is_dash;
`P "Diacritic", str_bool Uucp.Func.is_diacritic;
`P "Extender", str_bool Uucp.Func.is_extender;
`P "Grapheme_Base", str_bool Uucp.Func.is_grapheme_base;
`P "Grapheme_Extend", str_bool Uucp.Func.is_grapheme_extend;
`P "Math", str_bool Uucp.Func.is_math;
`P "Quotation_Mark", str_bool Uucp.Func.is_quotation_mark;
`P "Soft_Dotted", str_bool Uucp.Func.is_soft_dotted;
`P "Terminal_Punctuation", str_bool Uucp.Func.is_terminal_punctuation;
`P "Regional_Indicator", str_bool Uucp.Func.is_regional_indicator;
`P "Join_Control", str_bool Uucp.Func.is_join_control;
(* Gc *)
`P "General_Category", str Uucp.Gc.(pp, general_category);
(* Gen *)
`P "Default_Ignorable_Code_Point", str_bool Uucp.Gen.is_default_ignorable;
`P "Deprecated", str_bool Uucp.Gen.is_deprecated;
`P "Logical_Order_Exception", str_bool Uucp.Gen.is_logical_order_exception;
`P "Noncharacter_Code_Point", str_bool Uucp.Gen.is_non_character;
`P "Variation_Selector", str_bool Uucp.Gen.is_variation_selector;
(* Hangul *)
`P "Hangul_Syllable_Type", str Uucp.Hangul.(pp_syllable_type, syllable_type);
(* Id *)
`P "ID_Start", str_bool Uucp.Id.is_id_start;
`P "ID_Continue", str_bool Uucp.Id.is_id_continue;
`P "XID_Start", str_bool Uucp.Id.is_xid_start;
`P "XID_Continue", str_bool Uucp.Id.is_xid_continue;
`P "ID_Compat_Math_Start", str_bool Uucp.Id.is_id_compat_math_start;
`P "ID_Compat_Math_Continue", str_bool Uucp.Id.is_id_compat_math_continue;
`P "Pattern_Syntax", str_bool Uucp.Id.is_pattern_syntax;
`P "Pattern_White_Space", str_bool Uucp.Id.is_pattern_white_space;
(* Name *)
`P "Name", str_str Uucp.Name.name;
`P "Name_alias", str_name_alias Uucp.Name.name_alias;
(* Num *)
`P "ASCII_Hex_Digit", str_bool Uucp.Num.is_ascii_hex_digit;
`P "Hex_Digit", str_bool Uucp.Num.is_hex_digit;
`P "Numeric_Type", str Uucp.Num.(pp_numeric_type, numeric_type);
`P "Numeric_Value", str Uucp.Num.(pp_numeric_value, numeric_value);
(* Script *)
`P "Script", str Uucp.Script.(pp, script);
`P "Script_extensions", str_script_extensions Uucp.Script.script_extensions;
(* White *)
`P "White_Space", str_bool Uucp.White.is_white_space;
(* Uunf *)
`P "Canonical_Combining_Class", str_int Uunf.ccc;
`P "Decomposition_Mapping", str_decomposition_mapping;
(* Non UCD keys *)
`N "Uchar", str_uchar;
`N "UTF-8", str_utf `UTF_8;
`N "UTF-16BE", str_utf `UTF_16BE;
`N "UTF-16LE", str_utf `UTF_16LE;
`H "Tty_width_hint", str_int Uucp.Break.tty_width_hint; ]
let compare_key (k0, _) (k1, _) = match k0, k1 with
| (`P k0 | `N k0 | `H k0), (`P k1 | `N k1 | `H k1) -> compare k0 k1
let eq_key k (k', _) = match k' with
| `P k' | `N k' | `H k' ->
String.(compare (lowercase_ascii k) (lowercase_ascii k') = 0)
let exist_key k = List.exists (eq_key k) all_keys
let find_key k = List.find (eq_key k) all_keys
let key_id (k, _) = match k with `P k | `N k | `H k -> k
let list_keys () =
let keys = List.sort compare_key all_keys in
let pr_key_name (n, _) = match n with
| `P k -> Printf.printf "%s (UCD)\n" k
| `N k -> Printf.printf "%s\n" k
| `H k -> Printf.printf "%s (Uucp library heuristic)\n" k
in
List.iter pr_key_name keys
let get_keys keys = try Ok (List.map find_key keys) with
| Not_found ->
let _, not = List.partition exist_key keys in
match not with
| [p] -> Error (strf "Unknown key: %s" p)
| ps -> Error (strf "Unknown keys: %s" (String.concat ", " ps))
(* Key sets *)
let default_keys =
[ "Name"; "Uchar"; "Age"; "Block"; "Script"; "Script_extensions";
"General_Category"; "Decomposition_Mapping";
"UTF-8"; "UTF-16BE"; "UTF-16LE"]
let case_keys =
[ "Lowercase"; "Uppercase"; "Cased"; "Case_ignorable"; "Lowercase_Mapping";
"Uppercase_Mapping"; "Titlecase_Mapping"; "Case_Folding"; "NFKC_Casefold" ]
let cjk_keys =
[ "Ideographic"; "IDS_Binary_Operator"; "IDS_Trinary_Operator"; "Radical";
"Unified_Ideograph" ]
let emoji_keys =
[ "Emoji"; "Emoji_Presentation"; "Emoji_Modifier"; "Emoji_Modifier_Base";
"Emoji_Component"; "Extended_Pictographic" ]
let id_keys =
[ "ID_Start"; "ID_Continue"; "XID_Start"; "XID_Continue"; "Pattern_Syntax";
"Pattern_White_Space" ]
let num_keys =
[ "ASCII_Hex_Digit"; "Hex_Digit"; "Numeric_Type"; "Numeric_Value" ]
(* Query keys *)
let output_key out_fmt uchar (k, str) = match out_fmt.no_labels with
| true -> Printf.printf "%s\n" (str out_fmt uchar)
| false ->
let l = match k with `P k | `N k | `H k -> k in
Printf.printf "%s: %s\n" l (str out_fmt uchar)
let query_keys keys out_fmt uchar = List.iter (output_key out_fmt uchar) keys
(* Cmd *)
let ucharinfo cmd keys spec_fmt out_fmt specs = match cmd with
| `Unicode_version -> unicode_version (); 0
| `List_keys -> list_keys (); 0
| `Query ->
match get_keys keys with
| Error e -> log_err e; 2
| Ok keys ->
let uchars = parse_specs spec_fmt specs in
match uchars with
| Error e -> log_err e; 1
| Ok uchars ->
let rec loop = function
| [] -> assert false
| [u] -> query_keys keys out_fmt u; 0
| u :: us -> query_keys keys out_fmt u; Printf.printf "\n"; loop us
in
loop uchars
(* Cmdline interface *)
open Cmdliner
let cmd =
let list_keys =
let doc = "List available keys." in
`List_keys, Arg.info ["l"; "key-list"] ~doc
in
let unicode_version =
let doc = "Output supported Unicode version." in
`Unicode_version, Arg.info ["unicode-version"] ~doc
in
Arg.(value & vflag `Query [unicode_version; list_keys])
let spec_fmt =
let spec_fmt =
[ "UTF-8", `UTF_8; "UTF-16BE", `UTF_16BE; "UTF-16LE", `UTF_16LE;
"uchar-esc", `Uchar_esc; "bytes-esc", `Bytes_esc; "name", `By_name;
"guess", `Guess ]
in
let doc =
strf "The character specification format. $(docv) must be one of %s.
See CHARACTER SPECIFICATION for details."
(Arg.doc_alts_enum spec_fmt)
in
let spec_format = Arg.enum spec_fmt in
Arg.(value & opt spec_format `Guess &
info ["s"; "spec-format"] ~doc ~docv:"FMT")
let out_fmt =
let no_labels =
let doc = "Output key values without labels." in
Arg.(value & flag & info ["n"; "no-labels"] ~doc)
in
let raw_bytes =
let doc = "Output byte sequences unescaped." in
Arg.(value & flag & info ["b"; "raw-bytes"] ~doc)
in
let ascii =
let doc = "Output information using only the US-ASCII charset." in
Arg.(value & flag & info ["a"; "ascii" ] ~doc) in
let out_fmt ascii no_labels raw_bytes = { ascii; no_labels; raw_bytes } in
Term.(const out_fmt $ ascii $ no_labels $ raw_bytes)
let keys =
let all =
let doc = "Output value of all keys in alphabetic order." in
Arg.(value & flag & info ["all"] ~doc)
in
let default =
let doc = "Output default information." in
Arg.(value & flag & info ["d"; "default"] ~doc)
in
let case =
let doc = "Output values of Unicode case keys." in
Arg.(value & flag & info ["case"] ~doc)
in
let cjk =
let doc = "Output values of Unicode CJK keys." in
Arg.(value & flag & info ["cjk"] ~doc)
in
let emoji =
let doc = "Output values of Unicode emoji keys." in
Arg.(value & flag & info ["emoji"] ~doc)
in
let id =
let doc = "Output values of Unicode identifier keys." in
Arg.(value & flag & info ["id"] ~doc)
in
let num =
let doc = "Output values of Unicode numeric keys." in
Arg.(value & flag & info ["num"] ~doc)
in
let keys =
let doc = "Output the value of $(docv). Use $(b,-l) to list available keys."
in
Arg.(value & opt_all string [] & info ["k"; "key"] ~doc ~docv:"KEY")
in
let choose all default case cjk emoji id num keys = match all with
| true -> List.sort compare (List.map key_id all_keys)
| false ->
let keys = if num then num_keys @ keys else keys in
let keys = if id then id_keys @ keys else keys in
let keys = if cjk then cjk_keys @ keys else keys in
let keys = if emoji then emoji_keys @ keys else keys in
let keys = if case then case_keys @ keys else keys in
let keys = if default then default_keys @ keys else keys in
if keys = [] then default_keys else keys
in
Term.(const choose $ all $ default $ case $ cjk $ emoji $ id $ num $ keys)
let uspecs =
let doc = "The character specification. See CHARACTER SPECIFICATION \
and option $(b,--spec-format) for details. Repeatable. \
Information about each character is output in scalar value \
order separated by blank lines."
in
Arg.(value & pos_all string [] & info [] ~doc ~docv:"UCHAR")
let doc = "Query Unicode character information"
let exits =
Cmd.Exit.info 1 ~doc:"if the character specification is invalid." ::
Cmd.Exit.info 2 ~doc:"if a key doesn't exist." ::
Cmd.Exit.defaults
let man = [
`S Manpage.s_description;
`P "$(mname) outputs information about Unicode characters specified
on the command line.";
`Pre " $(mname) $(b,U+1F42B)\n\
\ $(mname) $(b,bactrian)\n\
\ $(mname) $(b,f09F90ab) # UTF-8 byte sequence";
`P "The information to output is selected by specifying keys, use
the $(b,-l) option to list them. By default the tool outputs a
selection of keys for a character. To output all its keys in
alphabetic order use the $(b,--all) option. To output specific
keys use the repeatable $(b,-k) option. Examples:";
`Pre " $(mname) $(b,--all f09F90ab) # All keys\n\
\ $(mname) $(b,-k utf-8 U+1F42B) # Escaped UTF-8 for U+1F42B\n\
\ $(mname) $(b,-k utf-8 -n -b U+1F42B) # UTF-8 for U+1F42B\n\
\ $(mname) $(b,-k name U+1F42B) # Name of U+1F42B\n\
\ $(mname) $(b,-k name '\\\\uD83D\\\\uDC2B') # Idem\n\
\ $(mname) $(b,-k name -k age U+1f42B) # Name and age of U+1F42B\n\
\ $(mname) $(b,-l) # List keys";
`S Manpage.s_arguments;
`S Manpage.s_options;
`S Manpage.s_common_options;
`S "CHARACTER SPECIFICATION";
`P "The character specification must be either a substring of its
Unicode name or represent an \
Unicode scalar value, that is a code point in the range U+0000..U+D7FF or
U+E000..U+10FFFF. $(mname) errors on the textually meaningless range
U+D800..U+DFFF of surrogate code points.";
`P "You need to make sure the character specification is passed in
a single command line argument by adding appropriate shell quotes if
needed. Then, unless the $(b,--spec-format) option is used the following
heuristic is applied, in order, to guess the input format:";
`I ("1. UTF decode.",
"If the specification has a non US-ASCII byte, the byte sequence
is decoded as UTF-8, UTF-16BE or UTF-16LE (in that order)
to a single Unicode character, taking the first decode that succeeds.");
`I ("2. US-ASCII character.",
"If the specification is made of a single US-ASCII character, the
corresponding Unicode character is used.");
`I ("3. Unicode escapes.", "If all the bytes are US-ASCII and have
one of the following Unicode escape form, the unescaped Unicode
character is used: U+H u+H uH UH \\\\uH \\\\uH\\\\uH \\\\UH \\\\u{H}
\\\\U{H} &#D; &#xH; with H and D respectively denoting non empty
sequence of caseless hexadecimal and decimal digits.");
`I ("4. UTF-8 byte sequence escapes.", "If all the bytes are US-ASCII and
match an escaped hexadecimal byte sequence of the form:
((0|\\\\)?x)?hh[ ]*)+ with h denoting a caseless hexadecimal digit,
the unescaped byte sequence is decoded from UTF-8 to a single Unicode
character.");
`I ("5. Unicode character name.", "Select any character whose
Unicode name or name aliases has the specification as a substring. \
Matching is caseless.");
`P "If none of this succeeds the tool errors.";
`P "For example all the following specifications, appropriately quoted
for your shell, are acceptable specifications for the Unicode character
U+1F42B:";
`Pre " $(b,bactrian)\n\
\ $(b,U+1F42B) $(b,u+1F42B) $(b,U1F42B) $(b,u1F42B)\n\
\ $(b,\\\\u1F42B) $(b,\\\\u{1F42B}) $(b,\\\\U{1F42B})\n\
\ $(b,\\\\uD83D\\\\uDC2B) \
# Surrogate escape (e.g. JSON/Java[Script])\n\
\ $(b,🐫) # HTML/XML hexadecimal character reference\n\
\ $(b,🐫) # HTML/XML decimal character reference\n\
\ $(b,f0 9f 90 ab) # UTF-8 escaped byte sequence\n\
\ $(b,F09F90AB) # Idem\n\
\ $(b,0xF0 0x9F 0x90 0xAB) # Idem\n\
\ $(b,\\\\xF0\\\\x9f\\\\x90\\\\xAB) # Idem";
`S Manpage.s_bugs;
`P "This program is distributed with the Uucp OCaml library.
See $(i,https://erratique.ch/software/uucp) for contact information.";
`S "REFERENCES";
`P "UAX #44 Unicode Character Database.
$(i,http://www.unicode.org/reports/tr44/)"; ]
let ucharinfo =
Cmd.v (Cmd.info "ucharinfo" ~version:"%%VERSION%%" ~doc ~exits ~man)
Term.(const ucharinfo $ cmd $ keys $ spec_fmt $ out_fmt $ uspecs)
let main () = exit (Cmd.eval' ucharinfo)
let () = if !Sys.interactive then () else main ()