forked from xmirror/m17n-db
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathLINEBREAK.awk
133 lines (121 loc) · 3.7 KB
/
LINEBREAK.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# LINEBREAK.awk -- awk script to produce a compact linebreak property map
# Copyright (C) 2005
# National Institute of Advanced Industrial Science and Technology (AIST)
# Registration Number H15PRO112
# This file is part of the m17n database; a sub-part of the m17n
# library.
# The m17n library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public License
# as published by the Free Software Foundation; either version 2.1 of
# the License, or (at your option) any later version.
# The m17n library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public
# License along with the m17n library; if not, write to the Free
# Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.
# Register the line breaking class NAME under the integer CODE in the
# global table to_lbc, and echo the "CODE:NAME" pair into the header
# comment of the generated map.
# A code that is a multiple of 10 starts a fresh "# " comment line;
# any other code is appended to the current line after a space.
function setLBC(name, code) {
    to_lbc[name] = code;
    printf((code % 10 == 0) ? "\n# %2d:%s" : " %2d:%s", code, name);
}
# Set up the field separator and the run-tracking state, then print the
# header comment listing every "code:class" pair followed by the default
# mapping line.
BEGIN {
    # Fields are separated by a single ';' or a single space.
    FS = "[; ]";

    # State of the run of single code points currently being collected:
    # its first and last code point, and its class code (-1 = no run).
    from = "";
    to = "";
    prev_lbc = -1;

    # Line breaking classes in code order (the first gets code 0).
    # The codes must be the same as "enum LineBreakingClass" of
    # m17n-lib/src/linebreak.c, so never reorder these lists.
    #
    #   OP open                     CL close
    #   QU quotation                GL glue
    #   NS no-start                 EX exclamation/interrogation
    #   SY syntax (slash)           IS infix (numeric) separator
    #   PR prefix                   PO postfix
    #   NU numeric                  AL alphabetic
    #   ID ideograph (atomic)       IN inseparable
    #   HY hyphen                   BA break after
    #   BB break before             B2 break both
    #   ZW ZW space                 CM combining mark
    #   WJ word joiner
    pair_table_classes = "OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ";

    # For UAX#14 7.6 Korean Syllable Block Pair Table.
    #   H2 Hangul 2 Jamo syllable   H3 Hangul 3 Jamo syllable
    #   JL Jamo leading consonant   JV Jamo vowel
    #   JT Jamo trailing consonant
    korean_classes = "H2 H3 JL JV JT";

    # Not handled in the pair table.
    #   SA south (east) asian       SP space
    #   PS paragraph and line separators
    #   BK hard break (newline)     CR carriage return
    #   LF line feed                NL next line
    #   CB contingent break opportunity
    #   SG surrogate                AI ambiguous
    #   XX unknown
    other_classes = "SA SP PS BK CR LF NL CB SG AI XX";

    count = split(pair_table_classes " " korean_classes " " other_classes, classes, " ");

    printf "# Code:LineBreakingClass";

    # Assign a unique integer code to each class.
    i = 0;
    for (k = 1; k < count; k++)
        setLBC(classes[k], i++);
    # The last class ("XX") is registered without advancing i, so i is
    # still the code of "XX" when the default line is printed below.
    setLBC(classes[count], i);

    # The default is "XX".
    printf "\n0x0000-0x3FFFFF %d\n", i;
}
# Lines of the form "CODE;CLASS...": a single code point.
# Consecutive such lines with the same class are collected into one run
# (from..to); the run is printed when a line with a different class
# arrives.
# NOTE(review): runs are merged on class equality alone; the code does
# not check that the code points are numerically adjacent, so a gap
# between two listed code points is absorbed into the range -- confirm
# this is intended.
/^[0-9A-Za-z]*;/ {
    if ($2 in to_lbc)
        lbc = to_lbc[$2];
    else
    {
        # A class name that was never registered would otherwise yield
        # an empty value that "%d" prints as 0, i.e. the code of "OP".
        # Fall back to the "unknown" class and report it once per name.
        if (!($2 in lbc_warned))
        {
            lbc_warned[$2] = 1;
            printf "LINEBREAK.awk: unknown line breaking class \"%s\"; using XX\n", $2 | "cat 1>&2";
        }
        lbc = to_lbc["XX"];
    }

    if (prev_lbc != lbc)
    {
        # Class changed: flush the pending run, if any, and start a new
        # one at this code point.
        if (prev_lbc != -1)
        {
            if (from == to)
                printf "%s %d\n", from, prev_lbc;
            else
                printf "%s-%s %d\n", from, to, prev_lbc;
        }
        from = "0x" $1;
        to = "0x" $1;
        prev_lbc = lbc;
    }
    else
        # Same class as the pending run: extend it to this code point.
        to = "0x" $1;
    next;
}
# Lines of the form "FIRST..LAST;CLASS...": an explicit code point range.
# Any pending run of single code points is flushed first, then the range
# is printed immediately as "0xFIRST-0xLAST CODE" and the run state is
# reset; ranges are never merged with neighbouring entries.
/^[0-9A-Za-z]*\.\.[0-9A-Za-z]*;/ {
    if ($2 in to_lbc)
        lbc = to_lbc[$2];
    else
    {
        # A class name that was never registered would otherwise yield
        # an empty value that "%d" prints as 0, i.e. the code of "OP".
        # Fall back to the "unknown" class and report it once per name.
        if (!($2 in lbc_warned))
        {
            lbc_warned[$2] = 1;
            printf "LINEBREAK.awk: unknown line breaking class \"%s\"; using XX\n", $2 | "cat 1>&2";
        }
        lbc = to_lbc["XX"];
    }

    # Flush the run collected by the single code point rule.
    if (prev_lbc != -1)
    {
        if (from == to)
            printf "%s %d\n", from, prev_lbc;
        else
            printf "%s-%s %d\n", from, to, prev_lbc;
    }

    # Turn "FIRST..LAST" into "FIRST-0xLAST" on a copy of the first
    # field, leaving the input record itself untouched.
    range = $1;
    sub(/\.\./, "-0x", range);
    printf "0x%s %d\n", range, lbc;

    from = "";
    to = "";
    prev_lbc = -1;
    next;
}
# Flush the last pending run of single code points, if the input did not
# end with a range line (which would already have reset prev_lbc to -1).
END {
    if (prev_lbc != -1)
    {
        # "from" and "to" already carry their "0x" prefix (it is added
        # when they are assigned), so they are printed as-is; adding the
        # prefix again here would emit "0x0x...".
        if (from == to)
            printf "%s %d\n", from, prev_lbc;
        else
            printf "%s-%s %d\n", from, to, prev_lbc;
    }
}