1 /* EINA - EFL data type library
2 * Copyright (C) 2010 Tom Hacohen,
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library;
17 * if not, see <http://www.gnu.org/licenses/>.
24 #include "eina_config.h"
25 #include "eina_private.h"
28 /* undefs EINA_ARG_NONULL() so NULL checks are not compiled out! */
29 #include "eina_safety_checks.h"
30 #include "eina_unicode.h"
32 /* FIXME: check if sizeof(wchar_t) == sizeof(Eina_Unicode) if so,
33 * probably better to use the standard functions */
/* Shared empty Eina_Unicode string.
 * _EINA_UNICODE_EMPTY_STRING is a one-element array holding only the 0
 * terminator; EINA_UNICODE_EMPTY_STRING is the exported (EAPI) pointer to
 * that array. */
35 /* Maybe I'm too tired, but this is the only thing that actually worked. */
36 const Eina_Unicode _EINA_UNICODE_EMPTY_STRING[1] = {0};
37 EAPI const Eina_Unicode *EINA_UNICODE_EMPTY_STRING = _EINA_UNICODE_EMPTY_STRING;
/* Compare two Eina_Unicode strings unit by unit.
 *
 * a, b: the strings to compare. Each one is guarded by
 *       EINA_SAFETY_ON_NULL_RETURN_VAL with -1 as the value handed back.
 *
 * NOTE(review): the statements that compute the final result are not
 * visible in this view; only the scanning loop is documented here. */
39 eina_unicode_strcmp(const Eina_Unicode *a, const Eina_Unicode *b)
41 EINA_SAFETY_ON_NULL_RETURN_VAL(a, -1);
42 EINA_SAFETY_ON_NULL_RETURN_VAL(b, -1);
/* Advance both pointers in lock-step while a has not ended and the
 * current units are equal; stops at the first difference or at a's end. */
44 for (; *a && *a == *b; a++, b++)
/* Copy the Eina_Unicode string source into dest.
 *
 * dest:   destination buffer (guarded; NULL is the value handed back when
 *         it is NULL).
 * source: string to copy (guarded the same way).
 *
 * NOTE(review): the copy loop and the return statement are not visible in
 * this view; presumably ret is what gets returned - confirm. */
55 eina_unicode_strcpy(Eina_Unicode *dest, const Eina_Unicode *source)
/* Remember the start of the destination before anything advances it. */
57 Eina_Unicode *ret = dest;
59 EINA_SAFETY_ON_NULL_RETURN_VAL(dest, NULL);
60 EINA_SAFETY_ON_NULL_RETURN_VAL(source, NULL);
/* Bounded copy of the Eina_Unicode string source into dest.
 *
 * dest:   destination buffer (guarded; NULL is handed back when NULL).
 * source: string to copy (guarded the same way).
 * n:      budget of units; it is counted down by the loop below.
 *
 * NOTE(review): the loop body and whatever happens to the remaining budget
 * after source ends are not visible in this view. */
69 eina_unicode_strncpy(Eina_Unicode *dest, const Eina_Unicode *source, size_t n)
/* Remember the start of the destination before anything advances it. */
71 Eina_Unicode *ret = dest;
73 EINA_SAFETY_ON_NULL_RETURN_VAL(dest, NULL);
74 EINA_SAFETY_ON_NULL_RETURN_VAL(source, NULL);
/* Runs while budget remains and source has not reached its terminator;
 * one unit of budget is consumed per iteration. */
76 for ( ; n && *source ; n--)
/* Measure an Eina_Unicode string.
 *
 * ustr: the string to measure (guarded; 0 is handed back when NULL).
 *
 * NOTE(review): the return statement is not visible in this view;
 * presumably the result is the distance between end and ustr - confirm. */
84 eina_unicode_strlen(const Eina_Unicode *ustr)
86 const Eina_Unicode *end;
88 EINA_SAFETY_ON_NULL_RETURN_VAL(ustr, 0);
/* Walk end forward from the start until it reaches the 0 terminator. */
90 for (end = ustr; *end; end++)
96 eina_unicode_strnlen(const Eina_Unicode *ustr, int n)
98 const Eina_Unicode *end;
99 const Eina_Unicode *last = ustr + n; /* technically not portable ;-) */
101 EINA_SAFETY_ON_NULL_RETURN_VAL(ustr, 0);
103 for (end = ustr; end < last && *end; end++)
112 eina_unicode_strndup(const Eina_Unicode *text, size_t n)
116 EINA_SAFETY_ON_NULL_RETURN_VAL(text, NULL);
118 ustr = malloc((n + 1) * sizeof(Eina_Unicode));
119 memcpy(ustr, text, n * sizeof(Eina_Unicode));
/* Duplicate a whole Eina_Unicode string.
 *
 * text: the string to duplicate (guarded; NULL is handed back when NULL).
 *
 * Returns whatever eina_unicode_strndup() returns for the full length. */
125 eina_unicode_strdup(const Eina_Unicode *text)
129 EINA_SAFETY_ON_NULL_RETURN_VAL(text, NULL);
/* Measure once, then let the bounded duplicate do the allocation/copy. */
131 len = eina_unicode_strlen(text);
132 return eina_unicode_strndup(text, len);
136 eina_unicode_strstr(const Eina_Unicode *haystack, const Eina_Unicode *needle)
138 const Eina_Unicode *i, *j;
140 EINA_SAFETY_ON_NULL_RETURN_VAL(haystack, NULL);
141 EINA_SAFETY_ON_NULL_RETURN_VAL(needle, NULL);
143 for (i = haystack; *i; i++)
145 haystack = i; /* set this location as the base position */
146 for (j = needle; *j && *i && *j == *i; j++, i++)
149 if (!*j) /*if we got to the end of j this means we got a full match */
151 return (Eina_Unicode *)haystack; /* return the new base position */
159 eina_unicode_escape(const Eina_Unicode *str)
161 Eina_Unicode *s2, *d;
162 const Eina_Unicode *s;
164 EINA_SAFETY_ON_NULL_RETURN_VAL(str, NULL);
166 s2 = malloc((eina_unicode_strlen(str) * 2) + 1);
170 for (s = str, d = s2; *s != 0; s++, d++)
172 if ((*s == ' ') || (*s == '\\') || (*s == '\''))
/* Largest number of bytes a single code point may occupy in the UTF-8
 * form handled by this file (sequences of up to 6 bytes). */
#define EINA_UNICODE_UTF8_BYTES_PER_CHAR 6
/* The replacement range that will be used for bad utf8 chars. */
#define ERROR_REPLACEMENT_BASE  0xDC80
#define ERROR_REPLACEMENT_END   0xDCFF
/* Byte values rejected outright by the decoder. The argument is
 * parenthesised so that expressions such as "a | b" are tested as a whole.
 * NOTE: x is evaluated more than once - do not pass an expression with
 * side effects. */
#define IS_INVALID_BYTE(x)      (((x) == 192) || ((x) == 193) || ((x) >= 245))
/* True for bytes of the form 10xxxxxx. */
#define IS_CONTINUATION_BYTE(x) (((x) & 0xC0) == 0x80)
/* Decode one UTF-8 sequence from buf and return its code point.
 *
 * buf:    the UTF-8 bytes to read (guarded; 0 is handed back when NULL).
 * iindex: byte offset to decode at (guarded; 0 is handed back when NULL).
 *         NOTE(review): the lines that load ind from *iindex and store it
 *         back are not visible in this view; callers in this file rely on
 *         the offset moving forward - confirm.
 *
 * Returns 0 when the byte read first is the NUL terminator. Sequences of
 * 2 to 6 bytes are accepted; each continuation byte must be non-zero and
 * of the form 10xxxxxx, and encodings that are longer than necessary are
 * rejected. Every rejection jumps to the error path, which returns a value
 * built from ERROR_REPLACEMENT_BASE (see the comment above the final
 * return). */
194 eina_unicode_utf8_get_next(const char *buf, int *iindex)
200 EINA_SAFETY_ON_NULL_RETURN_VAL(buf, 0);
201 EINA_SAFETY_ON_NULL_RETURN_VAL(iindex, 0);
203 /* if this char is the null terminator, exit */
204 if ((d = buf[ind++]) == 0) return 0;
207 { // 1 byte (7bit) - 0xxxxxxx
211 if ((d & 0xe0) == 0xc0)
212 { // 2 byte (11bit) - 110xxxxx 10xxxxxx
/* The IS_INVALID_BYTE values (192, 193, >= 245) all have their two top
 * bits set, so they would also fail the continuation test that follows. */
214 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
215 !IS_CONTINUATION_BYTE(d)) goto error;
/* Overlong form: the value would have fitted in a single byte. */
217 if (r <= 0x7F) goto error;
221 if ((d & 0xf0) == 0xe0)
222 { // 3 byte (16bit) - 1110xxxx 10xxxxxx 10xxxxxx
/* Lead byte contributes its low 4 bits as bits 12..15. */
223 r = (d & 0x0f) << 12;
224 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
225 !IS_CONTINUATION_BYTE(d)) goto error;
226 r |= (d & 0x3f) << 6;
227 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
228 !IS_CONTINUATION_BYTE(d)) goto error;
/* Overlong form: the value would have fitted in two bytes. */
230 if (r <= 0x7FF) goto error;
234 if ((d & 0xf8) == 0xf0)
235 { // 4 byte (21bit) - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
/* Lead byte contributes its low 3 bits as bits 18..20. */
236 r = (d & 0x07) << 18;
237 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
238 !IS_CONTINUATION_BYTE(d)) goto error;
239 r |= (d & 0x3f) << 12;
240 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
241 !IS_CONTINUATION_BYTE(d)) goto error;
242 r |= (d & 0x3f) << 6;
243 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
244 !IS_CONTINUATION_BYTE(d)) goto error;
/* Overlong form: the value would have fitted in three bytes. */
246 if (r <= 0xFFFF) goto error;
250 if ((d & 0xfc) == 0xf8)
251 { // 5 byte (26bit) - 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
/* Lead byte contributes its low 2 bits as bits 24..25. */
252 r = (d & 0x03) << 24;
253 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
254 !IS_CONTINUATION_BYTE(d)) goto error;
255 r |= (d & 0x3f) << 18;
256 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
257 !IS_CONTINUATION_BYTE(d)) goto error;
258 r |= (d & 0x3f) << 12;
259 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
260 !IS_CONTINUATION_BYTE(d)) goto error;
261 r |= (d & 0x3f) << 6;
262 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
263 !IS_CONTINUATION_BYTE(d)) goto error;
/* Overlong form: the value would have fitted in four bytes. */
265 if (r <= 0x1FFFFF) goto error;
269 if ((d & 0xfe) == 0xfc)
270 { // 6 byte (31bit) - 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
/* Lead byte contributes its lowest bit as bit 30. */
271 r = (d & 0x01) << 30;
272 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
273 !IS_CONTINUATION_BYTE(d)) goto error;
274 r |= (d & 0x3f) << 24;
275 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
276 !IS_CONTINUATION_BYTE(d)) goto error;
277 r |= (d & 0x3f) << 18;
278 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
279 !IS_CONTINUATION_BYTE(d)) goto error;
280 r |= (d & 0x3f) << 12;
281 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
282 !IS_CONTINUATION_BYTE(d)) goto error;
283 r |= (d & 0x3f) << 6;
284 if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
285 !IS_CONTINUATION_BYTE(d)) goto error;
/* Overlong form: the value would have fitted in five bytes. */
287 if (r <= 0x3FFFFFF) goto error;
292 /* Gets here where there was an error and we want to replace the char
293 * we just use the invalid unicode codepoints 8 lower bits represent
294 * the original char */
/* NOTE(review): the statements that choose d and adjust the index on the
 * error path are not visible in this view. */
298 return ERROR_REPLACEMENT_BASE | d;
/* Read the code point at the given offset and step the offset back to the
 * start of the preceding UTF-8 sequence.
 *
 * buf:    the UTF-8 bytes to read (guarded; 0 is handed back when NULL).
 * iindex: byte offset into buf (guarded; 0 is handed back when NULL).
 *
 * NOTE(review): the lines that load ind from *iindex, write it back and
 * return r are not visible in this view. */
302 eina_unicode_utf8_get_prev(const char *buf, int *iindex)
306 EINA_SAFETY_ON_NULL_RETURN_VAL(buf, 0);
307 EINA_SAFETY_ON_NULL_RETURN_VAL(iindex, 0);
310 /* First obtain the codepoint at iindex */
311 r = eina_unicode_utf8_get_next(buf, &ind);
313 /* although when ind == 0 there's no previous char, we still want to get
314 * the current char */
318 /* Next advance iindex to previous codepoint */
/* Skip backwards over continuation bytes (10xxxxxx) until a lead byte is
 * found or the start of the buffer is reached. */
321 while ((ind > 0) && ((buf[ind] & 0xc0) == 0x80))
/* Count the characters of a UTF-8 string.
 *
 * buf: the UTF-8 bytes to measure (guarded; 0 is handed back when NULL).
 *
 * NOTE(review): the counter and the return statement are not visible in
 * this view. */
329 eina_unicode_utf8_get_len(const char *buf)
331 /* returns the number of utf8 characters (not bytes) in the string */
334 EINA_SAFETY_ON_NULL_RETURN_VAL(buf, 0);
/* Decode one sequence per iteration; eina_unicode_utf8_get_next() returns
 * 0 at the NUL terminator, which ends the loop. */
336 while (eina_unicode_utf8_get_next(buf, &i))
/* Convert a UTF-8 string into a newly allocated Eina_Unicode string.
 *
 * utf:  the UTF-8 input (guarded; NULL is handed back when NULL).
 * _len: NOTE(review): its use is not visible in this view; presumably it
 *       receives the number of decoded characters - confirm.
 *
 * Returns NULL when the allocation fails. */
343 eina_unicode_utf8_to_unicode(const char *utf, int *_len)
345 /* FIXME: Should optimize! */
348 Eina_Unicode *buf, *uind;
350 EINA_SAFETY_ON_NULL_RETURN_VAL(utf, NULL);
/* First pass: count the characters so the output can be sized exactly. */
352 len = eina_unicode_utf8_get_len(utf);
/* Zero-filled buffer of len + 1 units, so the slot after the last decoded
 * character already holds the terminator. The two calloc arguments are in
 * (size, count) order; the resulting byte count is the same. */
355 buf = (Eina_Unicode *) calloc(sizeof(Eina_Unicode), (len + 1));
356 if (!buf) return buf;
/* Second pass: decode the input again, storing one code point per slot. */
358 for (i = 0, ind = 0, uind = buf ; i < len ; i++, uind++)
360 *uind = eina_unicode_utf8_get_next(utf, &ind);
/* Encode an Eina_Unicode string as UTF-8 into a newly allocated buffer.
 *
 * uni:  the 0-terminated input (guarded; NULL is handed back when NULL).
 * _len: NOTE(review): its use is not visible in this view; presumably it
 *       receives the number of bytes written - confirm.
 *
 * Each code point is written as 1 to 6 bytes depending on its magnitude.
 * Code points inside the ERROR_REPLACEMENT_BASE..ERROR_REPLACEMENT_END
 * range are written as one raw byte taken from their low 8 bits. */
367 eina_unicode_unicode_to_utf8(const Eina_Unicode *uni, int *_len)
370 const Eina_Unicode *uind;
374 EINA_SAFETY_ON_NULL_RETURN_VAL(uni, NULL);
376 ulen = eina_unicode_strlen(uni);
/* Worst-case sizing: EINA_UNICODE_UTF8_BYTES_PER_CHAR bytes for every
 * input unit plus one extra unit.
 * NOTE(review): no NULL check on the calloc result is visible before the
 * loop below writes through buf - confirm one exists. */
377 buf = (char *) calloc(ulen + 1, EINA_UNICODE_UTF8_BYTES_PER_CHAR);
380 for (uind = uni, ind = buf ; *uind ; uind++)
382 if (*uind <= 0x7F) /* 1 byte char */
387 else if (*uind <= 0x7FF) /* 2 byte char */
/* 110xxxxx lead byte, then one 10xxxxxx byte with the low 6 bits. */
389 *ind++ = 0xC0 | (unsigned char) (*uind >> 6);
390 *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
393 else if (*uind <= 0xFFFF) /* 3 byte char */
395 /* If it's a special replacement codepoint */
396 if (*uind >= ERROR_REPLACEMENT_BASE &&
397 *uind <= ERROR_REPLACEMENT_END)
/* Emit the single byte stored in the low 8 bits of the code point. */
399 *ind++ = *uind & 0xFF;
/* 1110xxxx lead byte followed by two 10xxxxxx bytes. */
404 *ind++ = 0xE0 | (unsigned char) (*uind >> 12);
405 *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
406 *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
410 else if (*uind <= 0x1FFFFF) /* 4 byte char */
/* 11110xxx lead byte followed by three 10xxxxxx bytes. */
412 *ind++ = 0xF0 | (unsigned char) ((*uind >> 18) & 0x07);
413 *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
414 *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
415 *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
418 else if (*uind <= 0x3FFFFFF) /* 5 byte char */
/* 111110xx lead byte followed by four 10xxxxxx bytes. */
420 *ind++ = 0xF8 | (unsigned char) ((*uind >> 24) & 0x03);
421 *ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
422 *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
423 *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
424 *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
427 else if (*uind <= 0x7FFFFFFF) /* 6 byte char */
/* 1111110x lead byte followed by five 10xxxxxx bytes. */
429 *ind++ = 0xFC | (unsigned char) ((*uind >> 30) & 0x01);
430 *ind++ = 0x80 | (unsigned char) ((*uind >> 24) & 0x3F);
431 *ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
432 *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
433 *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
434 *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
/* Shrink the worst-case buffer to len + 1 bytes.
 * NOTE(review): the realloc result overwrites buf directly; if realloc
 * fails the original buffer is no longer reachable (leak) and buf is
 * NULL for whatever follows - keep the old pointer in a temporary. */
442 buf = realloc(buf, len + 1);