lib: charset: upper/lower case conversion
[platform/kernel/u-boot.git] / lib / charset.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  *  charset conversion utils
4  *
5  *  Copyright (c) 2017 Rob Clark
6  */
7
8 #include <charset.h>
9 #include <capitalization.h>
10 #include <malloc.h>
11
12 static struct capitalization_table capitalization_table[] =
13 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
14         UNICODE_CAPITALIZATION_TABLE;
15 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
16         CP1250_CAPITALIZATION_TABLE;
17 #else
18         CP437_CAPITALIZATION_TABLE;
19 #endif
20
21 s32 utf8_get(const char **src)
22 {
23         s32 code = 0;
24         unsigned char c;
25
26         if (!src || !*src)
27                 return -1;
28         if (!**src)
29                 return 0;
30         c = **src;
31         if (c >= 0x80) {
32                 ++*src;
33                 if (!**src)
34                         return -1;
35                 /*
36                  * We do not expect a continuation byte (0x80 - 0xbf).
37                  * 0x80 is coded as 0xc2 0x80, so we cannot have less then 0xc2
38                  * here.
39                  * The highest code point is 0x10ffff which is coded as
40                  * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4.
41                  */
42                 if (c < 0xc2 || code > 0xf4)
43                         return -1;
44                 if (c >= 0xe0) {
45                         if (c >= 0xf0) {
46                                 /* 0xf0 - 0xf4 */
47                                 c &= 0x07;
48                                 code = c << 18;
49                                 c = **src;
50                                 ++*src;
51                                 if (!**src)
52                                         return -1;
53                                 if (c < 0x80 || c > 0xbf)
54                                         return -1;
55                                 c &= 0x3f;
56                         } else {
57                                 /* 0xe0 - 0xef */
58                                 c &= 0x0f;
59                         }
60                         code += c << 12;
61                         if ((code >= 0xD800 && code <= 0xDFFF) ||
62                             code >= 0x110000)
63                                 return -1;
64                         c = **src;
65                         ++*src;
66                         if (!**src)
67                                 return -1;
68                         if (c < 0x80 || c > 0xbf)
69                                 return -1;
70                 }
71                 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
72                 c &= 0x3f;
73                 code += c << 6;
74                 c = **src;
75                 if (c < 0x80 || c > 0xbf)
76                         return -1;
77                 c &= 0x3f;
78         }
79         code += c;
80         ++*src;
81         return code;
82 }
83
84 int utf8_put(s32 code, char **dst)
85 {
86         if (!dst || !*dst)
87                 return -1;
88         if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
89                 return -1;
90         if (code <= 0x007F) {
91                 **dst = code;
92         } else {
93                 if (code <= 0x07FF) {
94                         **dst = code >> 6 | 0xC0;
95                 } else {
96                         if (code < 0x10000) {
97                                 **dst = code >> 12 | 0xE0;
98                         } else {
99                                 **dst = code >> 18 | 0xF0;
100                                 ++*dst;
101                                 **dst = (code >> 12 & 0x3F) | 0x80;
102                         }
103                         ++*dst;
104                         **dst = (code >> 6 & 0x3F) | 0x80;
105                 }
106                 ++*dst;
107                 **dst = (code & 0x3F) | 0x80;
108         }
109         ++*dst;
110         return 0;
111 }
112
113 size_t utf8_utf16_strnlen(const char *src, size_t count)
114 {
115         size_t len = 0;
116
117         for (; *src && count; --count)  {
118                 s32 code = utf8_get(&src);
119
120                 if (!code)
121                         break;
122                 if (code < 0) {
123                         /* Reserve space for a replacement character */
124                         len += 1;
125                 } else if (code < 0x10000) {
126                         len += 1;
127                 } else {
128                         len += 2;
129                 }
130         }
131         return len;
132 }
133
134 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
135 {
136         if (!src || !dst || !*dst)
137                 return -1;
138
139         for (; count && *src; --count) {
140                 s32 code = utf8_get(&src);
141
142                 if (code < 0)
143                         code = '?';
144                 utf16_put(code, dst);
145         }
146         **dst = 0;
147         return 0;
148 }
149
150 s32 utf16_get(const u16 **src)
151 {
152         s32 code, code2;
153
154         if (!src || !*src)
155                 return -1;
156         if (!**src)
157                 return 0;
158         code = **src;
159         ++*src;
160         if (code >= 0xDC00 && code <= 0xDFFF)
161                 return -1;
162         if (code >= 0xD800 && code <= 0xDBFF) {
163                 if (!**src)
164                         return -1;
165                 code &= 0x3ff;
166                 code <<= 10;
167                 code += 0x10000;
168                 code2 = **src;
169                 ++*src;
170                 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
171                         return -1;
172                 code2 &= 0x3ff;
173                 code += code2;
174         }
175         return code;
176 }
177
178 int utf16_put(s32 code, u16 **dst)
179 {
180         if (!dst || !*dst)
181                 return -1;
182         if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
183                 return -1;
184         if (code < 0x10000) {
185                 **dst = code;
186         } else {
187                 code -= 0x10000;
188                 **dst = code >> 10 | 0xD800;
189                 ++*dst;
190                 **dst = (code & 0x3ff) | 0xDC00;
191         }
192         ++*dst;
193         return 0;
194 }
195
196 size_t utf16_strnlen(const u16 *src, size_t count)
197 {
198         size_t len = 0;
199
200         for (; *src && count; --count)  {
201                 s32 code = utf16_get(&src);
202
203                 if (!code)
204                         break;
205                 /*
206                  * In case of an illegal sequence still reserve space for a
207                  * replacement character.
208                  */
209                 ++len;
210         }
211         return len;
212 }
213
214 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
215 {
216         size_t len = 0;
217
218         for (; *src && count; --count)  {
219                 s32 code = utf16_get(&src);
220
221                 if (!code)
222                         break;
223                 if (code < 0)
224                         /* Reserve space for a replacement character */
225                         len += 1;
226                 else if (code < 0x80)
227                         len += 1;
228                 else if (code < 0x800)
229                         len += 2;
230                 else if (code < 0x10000)
231                         len += 3;
232                 else
233                         len += 4;
234         }
235         return len;
236 }
237
238 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
239 {
240         if (!src || !dst || !*dst)
241                 return -1;
242
243         for (; count && *src; --count) {
244                 s32 code = utf16_get(&src);
245
246                 if (code < 0)
247                         code = '?';
248                 utf8_put(code, dst);
249         }
250         **dst = 0;
251         return 0;
252 }
253
254 s32 utf_to_lower(const s32 code)
255 {
256         struct capitalization_table *pos = capitalization_table;
257         s32 ret = code;
258
259         if (code <= 0x7f) {
260                 if (code >= 'A' && code <= 'Z')
261                         ret += 0x20;
262                 return ret;
263         }
264         for (; pos->upper; ++pos) {
265                 if (pos->upper == code) {
266                         ret = pos->lower;
267                         break;
268                 }
269         }
270         return ret;
271 }
272
273 s32 utf_to_upper(const s32 code)
274 {
275         struct capitalization_table *pos = capitalization_table;
276         s32 ret = code;
277
278         if (code <= 0x7f) {
279                 if (code >= 'a' && code <= 'z')
280                         ret -= 0x20;
281                 return ret;
282         }
283         for (; pos->lower; ++pos) {
284                 if (pos->lower == code) {
285                         ret = pos->upper;
286                         break;
287                 }
288         }
289         return ret;
290 }
291
292 size_t u16_strlen(const u16 *in)
293 {
294         size_t i;
295         for (i = 0; in[i]; i++);
296         return i;
297 }
298
299 size_t u16_strnlen(const u16 *in, size_t count)
300 {
301         size_t i;
302         for (i = 0; count-- && in[i]; i++);
303         return i;
304 }
305
/**
 * utf16_to_utf8() - convert a buffer of UTF-16 code units to UTF-8
 *
 * @dest:   output buffer; the function writes at most 3 bytes per input
 *          unit and does not append a terminator
 * @src:    UTF-16 input
 * @size:   number of u16 units to convert; zero units inside this range are
 *          converted like any other unit
 * Return:  pointer just past the last byte written to @dest
 *
 * A low surrogate without a preceding high surrogate, and a high surrogate
 * that is not followed by a low surrogate, are each replaced by '?'. A high
 * surrogate in the very last unit produces no output.
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
        uint32_t code_high = 0;

        while (size--) {
                uint32_t code = *src++;

                if (code_high) {
                        if (code >= 0xDC00 && code <= 0xDFFF) {
                                /* Surrogate pair.  */
                                code = ((code_high - 0xD800) << 10) +
                                       (code - 0xDC00) + 0x10000;

                                *dest++ = (code >> 18) | 0xF0;
                                *dest++ = ((code >> 12) & 0x3F) | 0x80;
                                *dest++ = ((code >> 6) & 0x3F) | 0x80;
                                *dest++ = (code & 0x3F) | 0x80;
                        } else {
                                /* Error: unpaired high surrogate.  */
                                *dest++ = '?';
                                /*
                                 * The current unit may be valid. Don't eat
                                 * it: step back and give the unit back to
                                 * the remaining count, otherwise the last
                                 * unit of the input would be dropped.
                                 */
                                src--;
                                size++;
                        }

                        code_high = 0;
                } else if (code <= 0x007F) {
                        *dest++ = code;
                } else if (code <= 0x07FF) {
                        *dest++ = (code >> 6) | 0xC0;
                        *dest++ = (code & 0x3F) | 0x80;
                } else if (code >= 0xD800 && code <= 0xDBFF) {
                        /* High surrogate: wait for the low surrogate.  */
                        code_high = code;
                } else if (code >= 0xDC00 && code <= 0xDFFF) {
                        /* Error: low surrogate without a high one.  */
                        *dest++ = '?';
                } else {
                        /* Rest of the BMP; a 16-bit unit cannot exceed it. */
                        *dest++ = (code >> 12) | 0xE0;
                        *dest++ = ((code >> 6) & 0x3F) | 0x80;
                        *dest++ = (code & 0x3F) | 0x80;
                }
        }

        return dest;
}