/*
 * WC_UTF8_MAP: byte-class table used by the UTF-8 routines in this file,
 * indexed by the raw byte value (0x00-0xff).  Entries as laid out below:
 *   0x00-0x1f, 0x7f : 8
 *   0x20-0x7e       : 1
 *   0x80-0xbf       : 0  (the decoders treat a nonzero entry as "not a
 *                         continuation byte" while inside a sequence)
 *   0xc0-0xdf : 2   0xe0-0xef : 3   0xf0-0xf7 : 4
 *   0xf8-0xfb : 5   0xfc-0xfd : 6   0xfe-0xff : 7
 * Values 2-6 presumably equal the total length of the sequence started by
 * that lead byte; the case labels that consume them are not visible in
 * this listing -- TODO confirm.
 * NOTE(review): the closing "};" of this initializer is not present in the
 * listing.
 */
9 wc_uint8 WC_UTF8_MAP[ 0x100 ] = {
/* 0x00-0x1f */
10 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
11 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
/* 0x20-0x7f (last entry, 0x7f, is 8) */
12 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
13 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8,
/* 0x80-0xbf */
19 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
20 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
21 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xc0-0xdf */
23 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
24 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* 0xe0-0xef */
25 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* 0xf0-0xf7, 0xf8-0xfb, 0xfc-0xfd, 0xfe-0xff */
26 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7,
/*
 * File-scope scratch buffer: the push helpers below pass it to
 * wc_ucs_to_utf8() and then append it to the output with Strcat_charp().
 * Seven bytes fit the longest (6-byte) form written by wc_ucs_to_utf8()
 * plus, presumably, a terminating NUL -- the terminator store is not
 * visible in this listing.  Being static, it is shared by every caller in
 * this file.
 */
29 static wc_uchar utf8_buf[7];
/*
 * wc_ucs_to_utf8: encode the code point `ucs` as UTF-8 bytes into `utf8`.
 *
 * ucs  - code point to encode.
 * utf8 - destination; the visible branches store up to utf8[5], so the
 *        caller must provide room for at least that many bytes (callers in
 *        this file pass the 7-byte utf8_buf).
 *
 * The encoded length is chosen by comparing `ucs` against the thresholds
 * WC_C_UTF8_L2 .. WC_C_UTF8_L6 and WC_C_UCS4_END, which are defined
 * elsewhere.  Every continuation byte carries six payload bits tagged with
 * 0x80; the lead byte carries the remaining high bits tagged with a
 * length marker.
 *
 * NOTE(review): the return type, the body of the first branch, and
 * whatever follows the stores in each branch (terminator / return value /
 * handling of ucs > WC_C_UCS4_END) are not visible in this listing.
 * NOTE(review): the 5- and 6-byte forms below come from the original
 * UTF-8 definition; RFC 3629 restricts UTF-8 to at most 4 bytes.
 */
32 wc_ucs_to_utf8(wc_uint32 ucs, wc_uchar *utf8)
/* Below WC_C_UTF8_L2: body not visible in this listing. */
34 if (ucs < WC_C_UTF8_L2) {
/* 2-byte form: lead marker 0xc0. */
38 } else if (ucs < WC_C_UTF8_L3) {
39 utf8[0] = (ucs >> 6) | 0xc0;
40 utf8[1] = (ucs & 0x3f) | 0x80;
/* 3-byte form: lead marker 0xe0. */
43 } else if (ucs < WC_C_UTF8_L4) {
44 utf8[0] = (ucs >> 12) | 0xe0;
45 utf8[1] = ((ucs >> 6) & 0x3f) | 0x80;
46 utf8[2] = (ucs & 0x3f) | 0x80;
/* 4-byte form: lead marker 0xf0. */
49 } else if (ucs < WC_C_UTF8_L5) {
50 utf8[0] = (ucs >> 18) | 0xf0;
51 utf8[1] = ((ucs >> 12) & 0x3f) | 0x80;
52 utf8[2] = ((ucs >> 6) & 0x3f) | 0x80;
53 utf8[3] = (ucs & 0x3f) | 0x80;
/* 5-byte form: lead marker 0xf8. */
56 } else if (ucs < WC_C_UTF8_L6) {
57 utf8[0] = (ucs >> 24) | 0xf8;
58 utf8[1] = ((ucs >> 18) & 0x3f) | 0x80;
59 utf8[2] = ((ucs >> 12) & 0x3f) | 0x80;
60 utf8[3] = ((ucs >> 6) & 0x3f) | 0x80;
61 utf8[4] = (ucs & 0x3f) | 0x80;
/* 6-byte form: lead marker 0xfc; upper bound is inclusive here. */
64 } else if (ucs <= WC_C_UCS4_END) {
65 utf8[0] = (ucs >> 30) | 0xfc;
66 utf8[1] = ((ucs >> 24) & 0x3f) | 0x80;
67 utf8[2] = ((ucs >> 18) & 0x3f) | 0x80;
68 utf8[3] = ((ucs >> 12) & 0x3f) | 0x80;
69 utf8[4] = ((ucs >> 6) & 0x3f) | 0x80;
70 utf8[5] = (ucs & 0x3f) | 0x80;
/*
 * wc_utf8_to_ucs: decode the single UTF-8 sequence starting at utf8[0].
 *
 * utf8 - pointer to the lead byte.  There is no length parameter: the
 *        branches below read utf8[1] .. utf8[5] as needed, so the caller
 *        must guarantee that the whole sequence is readable.
 *
 * The branch is selected by WC_UTF8_MAP[utf8[0]].  Each branch assembles
 * the code point from the masked payload bits and then compares it with a
 * WC_C_UTF8_Ln threshold -- presumably to reject overlong encodings; the
 * statement guarded by each `if`, and the case labels themselves, are not
 * visible in this listing.  The final statement returns WC_C_UCS4_ERROR.
 *
 * NOTE(review): continuation bytes are only masked with 0x3f here; their
 * top two bits are not checked in this function.  wc_conv_from_utf8()
 * tests WC_UTF8_MAP[*p] for each following byte before calling this.
 */
80 wc_utf8_to_ucs(wc_uchar *utf8)
84 switch (WC_UTF8_MAP[utf8[0]]) {
/* Single byte: the value is the byte itself. */
86 ucs = (wc_uint32) utf8[0];
87 if (ucs >= WC_C_UTF8_L2)
/* Two bytes: 5 payload bits from the lead byte, 6 from the next. */
91 ucs = ((wc_uint32)(utf8[0] & 0x1f) << 6)
92 | (wc_uint32)(utf8[1] & 0x3f);
93 if (ucs < WC_C_UTF8_L2)
/* Three bytes: 4 payload bits from the lead byte. */
97 ucs = ((wc_uint32)(utf8[0] & 0x0f) << 12)
98 | ((wc_uint32)(utf8[1] & 0x3f) << 6)
99 | (wc_uint32)(utf8[2] & 0x3f);
100 if (ucs < WC_C_UTF8_L3)
/* Four bytes: 3 payload bits from the lead byte. */
104 ucs = ((wc_uint32)(utf8[0] & 0x07) << 18)
105 | ((wc_uint32)(utf8[1] & 0x3f) << 12)
106 | ((wc_uint32)(utf8[2] & 0x3f) << 6)
107 | (wc_uint32)(utf8[3] & 0x3f);
108 if (ucs < WC_C_UTF8_L4)
/* Five bytes: 2 payload bits from the lead byte. */
112 ucs = ((wc_uint32)(utf8[0] & 0x03) << 24)
113 | ((wc_uint32)(utf8[1] & 0x3f) << 18)
114 | ((wc_uint32)(utf8[2] & 0x3f) << 12)
115 | ((wc_uint32)(utf8[3] & 0x3f) << 6)
116 | (wc_uint32)(utf8[4] & 0x3f);
117 if (ucs < WC_C_UTF8_L5)
/* Six bytes: 1 payload bit from the lead byte. */
121 ucs = ((wc_uint32)(utf8[0] & 0x01) << 30)
122 | ((wc_uint32)(utf8[1] & 0x3f) << 24)
123 | ((wc_uint32)(utf8[2] & 0x3f) << 18)
124 | ((wc_uint32)(utf8[3] & 0x3f) << 12)
125 | ((wc_uint32)(utf8[4] & 0x3f) << 6)
126 | (wc_uint32)(utf8[5] & 0x3f);
127 if (ucs < WC_C_UTF8_L6)
/* Reached when no branch produced a value. */
133 return WC_C_UCS4_ERROR;
/*
 * wc_conv_from_utf8: convert the UTF-8 text in `is` into the library's
 * internal form, appending to a new Str `os` through wtf_push_ucs() /
 * wtf_push_unknown() / Strcat_char().
 *
 * is  - input string; bytes is->ptr[0 .. is->length) are scanned.
 * ces - not referenced in the lines visible here.
 *
 * The main loop is a two-state machine (WC_UTF8_NOSTATE / WC_UTF8_NEXT)
 * driven by WC_UTF8_MAP.  Declarations of p, q, os, next, ucs and st, the
 * case labels, and the return statements are not visible in this listing.
 */
137 wc_conv_from_utf8(Str is, wc_ces ces)
140 wc_uchar *sp = (wc_uchar *)is->ptr;
141 wc_uchar *ep = sp + is->length;
144 int state = WC_UTF8_NOSTATE;
/* Skip the leading run of bytes below 0x80. */
149 for (p = sp; p < ep && *p < 0x80; p++)
/* Output is pre-sized to 4/3 of the input length. */
153 os = Strnew_size(is->length * 4 / 3);
/* Copy the skipped prefix [sp, p) unchanged. */
155 Strcat_charp_n(os, is->ptr, (int)(p - sp));
159 for (; p < ep; p++) {
161 case WC_UTF8_NOSTATE:
/* Classify the byte; the class-specific labels are not visible here. */
162 next = WC_UTF8_MAP[*p];
/* One class pushes the byte as a code point ... */
165 wtf_push_ucs(os, (wc_uint32)*p, &st);
/* ... one appends the raw byte ... */
168 Strcat_char(os, (char)*p);
/* ... one reports the single byte as unknown ... */
172 wtf_push_unknown(os, p, 1);
/* ... and one switches to the multi-byte state. */
177 state = WC_UTF8_NEXT;
/*
 * Inside a sequence: a nonzero map entry means this is not a
 * continuation byte, so the bytes from q through p are pushed as unknown.
 * NOTE(review): the length p - q + 1 includes the byte at p itself, so an
 * otherwise valid byte that interrupts a sequence is reported as unknown
 * rather than re-examined, unless code not visible here rewinds p --
 * confirm this is intended.
 */
182 if (WC_UTF8_MAP[*p]) {
183 wtf_push_unknown(os, q, p - q + 1);
184 state = WC_UTF8_NOSTATE;
/* Sequence finished: decode it starting at q. */
189 state = WC_UTF8_NOSTATE;
190 ucs = wc_utf8_to_ucs(q);
/* Decode errors and UCS-2 surrogate values are pushed as unknown. */
191 if (ucs == WC_C_UCS4_ERROR ||
192 (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END))
193 wtf_push_unknown(os, q, p - q + 1);
/* WC_C_UCS2_BOM is neither pushed as unknown nor as a code point. */
194 else if (ucs != WC_C_UCS2_BOM)
195 wtf_push_ucs(os, ucs, &st);
/*
 * Pushes the p - q bytes starting at q as unknown; this appears to handle
 * an unfinished sequence once the loop has ended (the enclosing condition
 * is not visible).
 */
201 wtf_push_unknown(os, q, p - q);
/*
 * wc_push_tag_to_utf8: append language-tag characters for tag number
 * `ntag` to `os`, UTF-8 encoded through the shared utf8_buf.
 *
 * Callers in this file store the return value in st->ntag.  The
 * conditions and loop around the statements below, and the return
 * statement, are not visible in this listing.
 */
208 wc_push_tag_to_utf8(Str os, int ntag)
/* Look up the tag data for ntag; p is dereferenced below. */
213 p = wc_ucs_get_tag(ntag);
/* Emit WC_C_LANGUAGE_TAG. */
218 wc_ucs_to_utf8(WC_C_LANGUAGE_TAG, utf8_buf);
219 Strcat_charp(os, (char *)utf8_buf);
/* Emit WC_C_LANGUAGE_TAG0 combined with *p (presumably once per tag character). */
221 wc_ucs_to_utf8(WC_C_LANGUAGE_TAG0 | *p, utf8_buf);
222 Strcat_charp(os, (char *)utf8_buf);
/* Emit WC_C_CANCEL_TAG. */
225 wc_ucs_to_utf8(WC_C_CANCEL_TAG, utf8_buf);
226 Strcat_charp(os, (char *)utf8_buf);
/*
 * wc_push_to_utf8: append the character `cc` to `os` as UTF-8.
 *
 * os - output string.
 * cc - character with its coded character set in cc.ccs and value in
 *      cc.code.
 * st - conversion status; st->ntag tracks the current language tag.
 *
 * Dispatches on WC_CCS_SET(cc.ccs).  Several case labels and the
 * conditions guarding the tag resets are not visible in this listing.
 */
232 wc_push_to_utf8(Str os, wc_wchar_t cc, wc_status *st)
235 switch (WC_CCS_SET(cc.ccs)) {
/* US-ASCII: reset the tag, then append the low 7 bits as one byte. */
236 case WC_CCS_US_ASCII:
238 st->ntag = wc_push_tag_to_utf8(os, 0);
239 Strcat_char(os, (char)(cc.code & 0x7f));
/* (label not visible) reset the tag, then encode cc.code directly. */
244 st->ntag = wc_push_tag_to_utf8(os, 0);
245 wc_ucs_to_utf8(cc.code, utf8_buf);
246 Strcat_charp(os, (char *)utf8_buf);
/*
 * (label not visible) cc.code carries a tag: when language tags are
 * enabled and the tag differs from st->ntag, push the new tag first, then
 * encode the code point extracted by wc_ucs_tag_to_ucs().
 */
249 if (WcOption.use_language_tag && wc_ucs_tag_to_tag(cc.code) != st->ntag)
250 st->ntag = wc_push_tag_to_utf8(os, wc_ucs_tag_to_tag(cc.code));
251 wc_ucs_to_utf8(wc_ucs_tag_to_ucs(cc.code), utf8_buf);
252 Strcat_charp(os, (char *)utf8_buf);
/* ISO 8859-1: the code is encoded with bit 0x80 set. */
254 case WC_CCS_ISO_8859_1:
256 st->ntag = wc_push_tag_to_utf8(os, 0);
257 wc_ucs_to_utf8((cc.code | 0x80), utf8_buf);
258 Strcat_charp(os, (char *)utf8_buf);
/* Unknown wide character: emit WC_REPLACE_W unless replacement is off. */
260 case WC_CCS_UNKNOWN_W:
261 if (!WcOption.no_replace) {
263 st->ntag = wc_push_tag_to_utf8(os, 0);
264 Strcat_charp(os, WC_REPLACE_W);
/* (label not visible) same handling with WC_REPLACE. */
268 if (!WcOption.no_replace) {
270 st->ntag = wc_push_tag_to_utf8(os, 0);
271 Strcat_charp(os, WC_REPLACE);
/*
 * (label not visible) when UCS conversion is enabled and wc_any_to_ucs()
 * succeeds, cc is rewritten as a WC_CCS_UCS2 character.
 */
275 if (WcOption.ucs_conv &&
276 (cc.code = wc_any_to_ucs(cc)) != WC_C_UCS4_ERROR)
277 cc.ccs = WC_CCS_UCS2;
/*
 * Marks cc as unknown (wide or not); presumably the else-branch of the
 * test above, with the rewritten cc dispatched again -- TODO confirm.
 */
279 cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN;
/*
 * wc_push_to_utf8_end: finish output to `os` by pushing tag 0 through
 * wc_push_tag_to_utf8() and storing the result in st->ntag.  Any condition
 * guarding the call is not visible in this listing.
 */
286 wc_push_to_utf8_end(Str os, wc_status *st)
289 st->ntag = wc_push_tag_to_utf8(os, 0);
/*
 * wc_char_conv_from_utf8: byte-at-a-time variant of the UTF-8 decoder.
 *
 * c  - next input byte.
 * st - conversion status; st->state holds the decoder state.
 *
 * The declaration of `os`, the case labels, the buffering of continuation
 * bytes and the return statements are not visible in this listing.
 *
 * NOTE(review): buf, nbuf and next are function-level statics, so the
 * partially collected sequence is shared by every caller, whereas
 * st->state is per-status.  Confirm that callers never interleave two
 * streams.
 */
294 wc_char_conv_from_utf8(wc_uchar c, wc_status *st)
297 static wc_uchar buf[6];
298 static size_t nbuf, next;
/* A state of -1 is replaced by WC_UTF8_NOSTATE before processing. */
301 if (st->state == -1) {
302 st->state = WC_UTF8_NOSTATE;
310 case WC_UTF8_NOSTATE:
/* Classify the byte with WC_UTF8_MAP and remember the class in next. */
311 switch (next = WC_UTF8_MAP[c]) {
/* One class pushes the byte as a code point ... */
313 wtf_push_ucs(os, (wc_uint32)c, st);
/* ... one appends the raw byte ... */
316 Strcat_char(os, (char)c);
/* ... and one switches to the multi-byte state. */
324 st->state = WC_UTF8_NEXT;
/* Decode the sequence collected in buf. */
334 ucs = wc_utf8_to_ucs(buf);
/*
 * Decode errors and UCS-2 surrogate values are singled out; the statement
 * guarded by this test is not visible here.
 */
335 if (ucs == WC_C_UCS4_ERROR ||
336 (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END))
/* WC_C_UCS2_BOM is not pushed to the output. */
338 if (ucs != WC_C_UCS2_BOM)
339 wtf_push_ucs(os, ucs, st);