// ws_utf8.cpp -- forked from pig4210/xlib
#include "ws_utf8.h"
using namespace std;
static const size_t gk_utf8_max_byte = 6; // Maximum number of bytes a single UTF-8 sequence may occupy.
#pragma warning(push)
#pragma warning(disable:4244) // warning C4244: '=': conversion from 'const unsigned long' to 'unsigned char', possible loss of data
// Encodes a single code point as a UTF-8 byte sequence using the original
// 1..6-byte scheme (values up to 31 bits).
//
// utf8    : destination for the encoded bytes. It must have room for the
//           bytes produced (at most gk_utf8_max_byte). When nullptr, the bytes
//           are written to a local scratch buffer and only the length is
//           reported.
// unicode : value to encode.
// returns : number of bytes produced (1..6), or 0 when unicode >= 0x80000000.
size_t unicode_byte2utf8_byte(p_utf8 utf8,
                              const unsigned long unicode)
{
  unsigned char scratch[gk_utf8_max_byte];
  if(utf8 == nullptr) utf8 = scratch;

  // 7 bits: 0xxxxxxx
  if(unicode < 0x00000080)
  {
    utf8[0] = static_cast<unsigned char>(unicode);
    return 1;
  }

  // Every longer form ends with continuation bytes of the shape 10xxxxxx,
  // each carrying 6 payload bits. They are computed from the low end upwards.
  const unsigned char a = static_cast<unsigned char>(0x80 | (unicode & 0x3F));

  // 11 bits: 110xxxxx 10xxxxxx
  if(unicode < 0x00000800)
  {
    utf8[0] = static_cast<unsigned char>(0xC0 | (unicode >> 6));
    utf8[1] = a;
    return 2;
  }
  const unsigned char b = static_cast<unsigned char>(0x80 | ((unicode >> 6) & 0x3F));

  // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
  if(unicode < 0x00010000)
  {
    utf8[0] = static_cast<unsigned char>(0xE0 | (unicode >> 12));
    utf8[1] = b;
    utf8[2] = a;
    return 3;
  }
  const unsigned char c = static_cast<unsigned char>(0x80 | ((unicode >> 12) & 0x3F));

  // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  if(unicode < 0x00200000)
  {
    utf8[0] = static_cast<unsigned char>(0xF0 | (unicode >> 18));
    utf8[1] = c;
    utf8[2] = b;
    utf8[3] = a;
    return 4;
  }
  const unsigned char d = static_cast<unsigned char>(0x80 | ((unicode >> 18) & 0x3F));

  // 26 bits: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  if(unicode < 0x04000000)
  {
    utf8[0] = static_cast<unsigned char>(0xF8 | (unicode >> 24));
    utf8[1] = d;
    utf8[2] = c;
    utf8[3] = b;
    utf8[4] = a;
    return 5;
  }
  const unsigned char e = static_cast<unsigned char>(0x80 | ((unicode >> 24) & 0x3F));

  // 31 bits: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  // The lead byte carries bit 30 of the value. The previous code masked with
  // 0x04000000 before shifting by 30, which always produced 0 and dropped
  // that bit.
  if(unicode < 0x80000000)
  {
    utf8[0] = static_cast<unsigned char>(0xFC | (unicode >> 30));
    utf8[1] = e;
    utf8[2] = d;
    utf8[3] = c;
    utf8[4] = b;
    utf8[5] = a;
    return 6;
  }

  // Does not fit in 31 bits: not encodable.
  return 0;
}
#pragma warning(pop)
// Decodes one UTF-8 sequence (1..6 bytes, original 31-bit scheme) starting at
// utf8[0].
//
// unicode : receives the decoded value; may be nullptr when only the sequence
//           length is of interest.
// utf8    : first byte of the sequence. The function reads as many bytes as
//           the lead byte announces, but stops at the first byte that is not
//           a continuation byte, so it never reads beyond a terminating NUL.
// returns : number of bytes consumed (1..6), or 0 when utf8 is nullptr, the
//           lead byte is invalid (0x80..0xBF, 0xFE, 0xFF) or a continuation
//           byte is not of the form 10xxxxxx.
//
// Overlong forms and surrogate values are not rejected.
size_t utf8_byte2unicode_byte(unsigned long* unicode,
                              const p_utf8 utf8)
{
  if(utf8 == nullptr) return 0;
  unsigned long discard;
  if(unicode == nullptr) unicode = &discard;

  const unsigned char lead = utf8[0];

  // Single byte: 0xxxxxxx
  if(lead < 0x80)
  {
    *unicode = lead;
    return 1;
  }

  // Work out the sequence length and the payload bits held by the lead byte.
  size_t len = 0;
  unsigned long value = 0;
  if(lead < 0xC0)      { return 0; }                       // stray continuation byte
  else if(lead < 0xE0) { len = 2; value = lead & 0x1F; }   // 110xxxxx
  else if(lead < 0xF0) { len = 3; value = lead & 0x0F; }   // 1110xxxx
  else if(lead < 0xF8) { len = 4; value = lead & 0x07; }   // 11110xxx
  else if(lead < 0xFC) { len = 5; value = lead & 0x03; }   // 111110xx
  else if(lead < 0xFE) { len = 6; value = lead & 0x01; }   // 1111110x
  else                 { return 0; }                       // 0xFE / 0xFF never appear in UTF-8

  for(size_t i = 1; i < len; ++i)
  {
    const unsigned char next = utf8[i];
    // A continuation byte must be exactly 10xxxxxx. Checking both top bits
    // (instead of only rejecting >= 0xC0) also rejects ASCII and NUL, which
    // stops a truncated sequence from being read past the end of the string.
    if((next & 0xC0) != 0x80) return 0;
    value = (value << 6) | (next & 0x3F);
  }

  *unicode = value;
  return len;
}
// Converts a wide-character string to UTF-8 into a caller-supplied buffer and
// appends a NUL terminator.
//
// utf8     : destination buffer.
// max_utf8 : capacity of utf8 in bytes.
// ws       : source characters.
// ws_len   : number of characters to convert. A value that is negative when
//            viewed as intptr_t (e.g. (size_t)-1) means "ws is NUL-terminated,
//            measure it".
// returns  : number of bytes written including the terminator, or 0 when an
//            argument is null, max_utf8 is not positive, a character cannot
//            be encoded, or the result (with terminator) does not fit.
//            On failure the buffer may hold a partial result.
size_t ws2utf8(p_utf8 utf8,
               const size_t max_utf8,
               const wchar_t* ws,
               const size_t ws_len)
{
  if(utf8 == nullptr || ws == nullptr || static_cast<intptr_t>(max_utf8) < 1)
    return 0;

  size_t wlen = ws_len;
  if(static_cast<intptr_t>(ws_len) < 0)
  {
    for(wlen = 0; ws[wlen] != L'\0'; ++wlen);
  }

  size_t lp = 0;
  unsigned char tu[gk_utf8_max_byte];
  // Each character is encoded into a scratch buffer first so that the space
  // check can be made before anything is written to the destination.
  for(size_t i = 0; i < wlen; ++i)
  {
    const size_t k = unicode_byte2utf8_byte(tu, ws[i]);
    if(k == 0) return 0;
    // ">=" rather than ">" keeps one byte free for the terminator.
    if(lp + k >= max_utf8) return 0;
    memcpy(&utf8[lp], tu, k);
    lp += k;
  }

  // Append the terminator ourselves: with an explicit ws_len the source need
  // not be NUL-terminated. The loop above has already reserved this byte, so
  // an exactly-sized buffer is accepted (the previous "lp + 1 >= max_utf8"
  // test rejected it).
  if(lp >= max_utf8) return 0;
  utf8[lp] = 0;
  ++lp;
  return lp;
}
// Converts a wide string to a UTF-8 string object.
// Returns the encoded bytes (no explicit terminator appended), or an empty
// result as soon as one character cannot be encoded.
xutf8 ws2utf8(const wstring& ws)
{
  xutf8 encoded;
  unsigned char bytes[gk_utf8_max_byte];
  for(size_t pos = 0; pos != ws.size(); ++pos)
  {
    const size_t produced = unicode_byte2utf8_byte(bytes, ws[pos]);
    if(produced == 0)
    {
      // Unencodable character: discard everything converted so far.
      return xutf8();
    }
    encoded.append(bytes, produced);
  }
  return encoded;
}
// Converts UTF-8 bytes to wide characters in a caller-supplied buffer and
// appends a NUL terminator.
//
// ws       : destination buffer.
// max_ws   : capacity of ws in wchar_t elements.
// utf8     : source bytes.
// utf8_len : number of source bytes. A value that is negative when viewed as
//            intptr_t (e.g. (size_t)-1) means "utf8 is NUL-terminated,
//            measure it".
// returns  : number of wchar_t written including the terminator, or 0 when
//            an argument is null, max_ws is not positive, a sequence is
//            invalid, or the result (with terminator) does not fit.
//
// Each decoded value is narrowed to wchar_t with a plain cast.
// A trailing sequence that announces more bytes than utf8_len provides ends
// the conversion; the characters decoded before it are kept.
// NOTE(review): utf8_byte2unicode_byte is not told how many bytes remain, so
// for such a truncated tail it may inspect bytes beyond utf8_len.
size_t utf82ws(wchar_t* ws,
               const size_t max_ws,
               const p_utf8 utf8,
               const size_t utf8_len)
{
  if(ws == nullptr || utf8 == nullptr || static_cast<intptr_t>(max_ws) < 1)
    return 0;

  size_t ulen = utf8_len;
  if(static_cast<intptr_t>(utf8_len) < 0)
  {
    for(ulen = 0; utf8[ulen] != '\0'; ++ulen);
  }

  size_t lp = 0;
  unsigned long ch;
  for(size_t i = 0; i < ulen;)
  {
    const size_t k = utf8_byte2unicode_byte(&ch, &utf8[i]);
    if(k == 0) return 0;
    i += k;
    if(i > ulen) break;
    // Check capacity BEFORE storing. The previous order stored first and
    // could write one element past the end of ws.
    if(lp >= max_ws) return 0;
    ws[lp] = (wchar_t)ch;
    ++lp;
  }

  if(lp >= max_ws) return 0;
  ws[lp] = L'\0';
  ++lp;
  return lp;
}
// Converts a UTF-8 string object to a wide string.
// Returns an empty string as soon as one sequence fails to decode. A trailing
// sequence that announces more bytes than the string holds ends the
// conversion, and the characters decoded before it are returned.
// Each decoded value is narrowed to wchar_t with a plain cast.
wstring utf82ws(const xutf8& utf8)
{
  const p_utf8 bytes = (const p_utf8)utf8.c_str();
  const size_t total = utf8.size();

  wstring decoded;
  size_t pos = 0;
  while(pos < total)
  {
    unsigned long code_point;
    const size_t used = utf8_byte2unicode_byte(&code_point, &bytes[pos]);
    if(used == 0)
    {
      // Invalid sequence: discard everything converted so far.
      return wstring();
    }
    pos += used;
    if(pos > total)
    {
      // The sequence runs past the end of the input; drop it.
      break;
    }
    decoded.push_back((wchar_t)code_point);
  }
  return decoded;
}