src/lib/eina_unicode.c

   1 /* EINA - EFL data type library
   2  * Copyright (C) 2010 Tom Hacohen,
   3  *              Brett Nash
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2.1 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library;
  17  * if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include "eina_config.h"
  25 #include "eina_private.h"
  26 #include <string.h>
  27
  28 /* undefs EINA_ARG_NONULL() so NULL checks are not compiled out! */
  29 #include "eina_safety_checks.h"
  30 #include "eina_unicode.h"
  31
  32 /* FIXME: check if sizeof(wchar_t) == sizeof(Eina_Unicode) if so,
  33  * probably better to use the standard functions */
  34
  35 /* Maybe I'm too tired, but this is the only thing that actually worked. */
  36 const Eina_Unicode _EINA_UNICODE_EMPTY_STRING[1] = {0};
  37 EAPI const Eina_Unicode *EINA_UNICODE_EMPTY_STRING = _EINA_UNICODE_EMPTY_STRING;
  38 EAPI int
  39 eina_unicode_strcmp(const Eina_Unicode *a, const Eina_Unicode *b)
  40 {
  41    EINA_SAFETY_ON_NULL_RETURN_VAL(a, -1);
  42    EINA_SAFETY_ON_NULL_RETURN_VAL(b, -1);
  43
  44    for (; *a && *a == *b; a++, b++)
  45       ;
  46    if (*a == *b)
  47       return 0;
  48    else if (*a < *b)
  49       return -1;
  50    else
  51       return 1;
  52 }
  53
  54 EAPI Eina_Unicode *
  55 eina_unicode_strcpy(Eina_Unicode *dest, const Eina_Unicode *source)
  56 {
  57    Eina_Unicode *ret = dest;
  58
  59    EINA_SAFETY_ON_NULL_RETURN_VAL(dest, NULL);
  60    EINA_SAFETY_ON_NULL_RETURN_VAL(source, NULL);
  61
  62    while (*source)
  63       *dest++ = *source++;
  64    *dest = 0;
  65    return ret;
  66 }
  67
  68 EAPI Eina_Unicode *
  69 eina_unicode_strncpy(Eina_Unicode *dest, const Eina_Unicode *source, size_t n)
  70 {
  71    Eina_Unicode *ret = dest;
  72
  73    EINA_SAFETY_ON_NULL_RETURN_VAL(dest, NULL);
  74    EINA_SAFETY_ON_NULL_RETURN_VAL(source, NULL);
  75
  76    for ( ; n && *source ; n--)
  77       *dest++ = *source++;
  78    for (; n; n--)
  79       *dest++ = 0;
  80    return ret;
  81 }
  82
  83 EAPI size_t
  84 eina_unicode_strlen(const Eina_Unicode *ustr)
  85 {
  86    const Eina_Unicode *end;
  87
  88    EINA_SAFETY_ON_NULL_RETURN_VAL(ustr, 0);
  89
  90    for (end = ustr; *end; end++)
  91       ;
  92    return end - ustr;
  93 }
  94
  95 EAPI size_t
  96 eina_unicode_strnlen(const Eina_Unicode *ustr, int n)
  97 {
  98    const Eina_Unicode *end;
  99    const Eina_Unicode *last = ustr + n; /* technically not portable ;-) */
 100
 101    EINA_SAFETY_ON_NULL_RETURN_VAL(ustr, 0);
 102
 103    for (end = ustr; end < last && *end; end++)
 104       ;
 105    return end - ustr;
 106 }
 107
 108
 109
 110
 111 EAPI Eina_Unicode *
 112 eina_unicode_strndup(const Eina_Unicode *text, size_t n)
 113 {
 114    Eina_Unicode *ustr;
 115
 116    EINA_SAFETY_ON_NULL_RETURN_VAL(text, NULL);
 117
 118    ustr = malloc((n + 1) * sizeof(Eina_Unicode));
 119    memcpy(ustr, text, n * sizeof(Eina_Unicode));
 120    ustr[n] = 0;
 121    return ustr;
 122 }
 123
 124 EAPI Eina_Unicode *
 125 eina_unicode_strdup(const Eina_Unicode *text)
 126 {
 127    size_t len;
 128
 129    EINA_SAFETY_ON_NULL_RETURN_VAL(text, NULL);
 130
 131    len = eina_unicode_strlen(text);
 132    return eina_unicode_strndup(text, len);
 133 }
 134
 135 EAPI Eina_Unicode *
 136 eina_unicode_strstr(const Eina_Unicode *haystack, const Eina_Unicode *needle)
 137 {
 138    const Eina_Unicode *i, *j;
 139
 140    EINA_SAFETY_ON_NULL_RETURN_VAL(haystack, NULL);
 141    EINA_SAFETY_ON_NULL_RETURN_VAL(needle, NULL);
 142
 143    for (i = haystack; *i; i++)
 144      {
 145         haystack = i; /* set this location as the base position */
 146         for (j = needle; *j && *i && *j == *i; j++, i++)
 147            ;
 148
 149         if (!*j) /*if we got to the end of j this means we got a full match */
 150           {
 151              return (Eina_Unicode *)haystack; /* return the new base position */
 152           }
 153      }
 154
 155    return NULL;
 156 }
 157
 158 EAPI Eina_Unicode *
 159 eina_unicode_escape(const Eina_Unicode *str)
 160 {
 161    Eina_Unicode *s2, *d;
 162    const Eina_Unicode *s;
 163
 164    EINA_SAFETY_ON_NULL_RETURN_VAL(str, NULL);
 165
 166    s2 = malloc((eina_unicode_strlen(str) * 2) + 1);
 167    if (!s2)
 168       return NULL;
 169
 170    for (s = str, d = s2; *s != 0; s++, d++)
 171      {
 172         if ((*s == ' ') || (*s == '\\') || (*s == '\''))
 173           {
 174              *d = '\\';
 175              d++;
 176           }
 177
 178         *d = *s;
 179      }
 180    *d = 0;
 181    return s2;
 182 }
 183
 184 /* UTF-8 Handling */
 185
 186 #define EINA_UNICODE_UTF8_BYTES_PER_CHAR 6
 187 /* The replacement range that will be used for bad utf8 chars. */
 188 #define ERROR_REPLACEMENT_BASE  0xDC80
 189 #define ERROR_REPLACEMENT_END   0xDCFF
 190 #define IS_INVALID_BYTE(x)      ((x == 192) || (x == 193) || (x >= 245))
 191 #define IS_CONTINUATION_BYTE(x) ((x & 0xC0) == 0x80)
 192
 193 EAPI Eina_Unicode
 194 eina_unicode_utf8_get_next(const char *buf, int *iindex)
 195 {
 196    int ind = *iindex;
 197    Eina_Unicode r;
 198    unsigned char d;
 199
 200    EINA_SAFETY_ON_NULL_RETURN_VAL(buf, 0);
 201    EINA_SAFETY_ON_NULL_RETURN_VAL(iindex, 0);
 202
 203    /* if this char is the null terminator, exit */
 204    if ((d = buf[ind++]) == 0) return 0;
 205
 206    if ((d & 0x80) == 0)
 207      { // 1 byte (7bit) - 0xxxxxxx
 208         *iindex = ind;
 209         return d;
 210      }
 211    if ((d & 0xe0) == 0xc0)
 212      { // 2 byte (11bit) - 110xxxxx 10xxxxxx
 213         r  = (d & 0x1f) << 6;
 214         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 215             !IS_CONTINUATION_BYTE(d)) goto error;
 216         r |= (d & 0x3f);
 217         if (r <= 0x7F) goto error;
 218         *iindex = ind;
 219         return r;
 220      }
 221    if ((d & 0xf0) == 0xe0)
 222      { // 3 byte (16bit) - 1110xxxx 10xxxxxx 10xxxxxx
 223         r  = (d & 0x0f) << 12;
 224         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 225             !IS_CONTINUATION_BYTE(d)) goto error;
 226         r |= (d & 0x3f) << 6;
 227         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 228             !IS_CONTINUATION_BYTE(d)) goto error;
 229         r |= (d & 0x3f);
 230         if (r <= 0x7FF) goto error;
 231         *iindex = ind;
 232         return r;
 233      }
 234    if ((d & 0xf8) == 0xf0)
 235      { // 4 byte (21bit) - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 236         r  = (d & 0x07) << 18;
 237         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 238             !IS_CONTINUATION_BYTE(d)) goto error;
 239         r |= (d & 0x3f) << 12;
 240         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 241             !IS_CONTINUATION_BYTE(d)) goto error;
 242         r |= (d & 0x3f) << 6;
 243         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 244             !IS_CONTINUATION_BYTE(d)) goto error;
 245         r |= (d & 0x3f);
 246         if (r <= 0xFFFF) goto error;
 247         *iindex = ind;
 248         return r;
 249      }
 250    if ((d & 0xfc) == 0xf8)
 251      { // 5 byte (26bit) - 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 252         r  = (d & 0x03) << 24;
 253         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 254             !IS_CONTINUATION_BYTE(d)) goto error;
 255         r |= (d & 0x3f) << 18;
 256         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 257             !IS_CONTINUATION_BYTE(d)) goto error;
 258         r |= (d & 0x3f) << 12;
 259         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 260             !IS_CONTINUATION_BYTE(d)) goto error;
 261         r |= (d & 0x3f) << 6;
 262         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 263             !IS_CONTINUATION_BYTE(d)) goto error;
 264         r |= (d & 0x3f);
 265         if (r <= 0x1FFFFF) goto error;
 266         *iindex = ind;
 267         return r;
 268      }
 269    if ((d & 0xfe) == 0xfc)
 270      { // 6 byte (31bit) - 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 271         r  = (d & 0x01) << 30;
 272         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 273             !IS_CONTINUATION_BYTE(d)) goto error;
 274         r |= (d & 0x3f) << 24;
 275         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 276             !IS_CONTINUATION_BYTE(d)) goto error;
 277         r |= (d & 0x3f) << 18;
 278         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 279             !IS_CONTINUATION_BYTE(d)) goto error;
 280         r |= (d & 0x3f) << 12;
 281         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 282             !IS_CONTINUATION_BYTE(d)) goto error;
 283         r |= (d & 0x3f) << 6;
 284         if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
 285             !IS_CONTINUATION_BYTE(d)) goto error;
 286         r |= (d & 0x3f);
 287         if (r <= 0x3FFFFFF) goto error;
 288         *iindex = ind;
 289         return r;
 290      }
 291
 292 /* Gets here where there was an error and we want to replace the char
 293  * we just use the invalid unicode codepoints 8 lower bits represent
 294  * the original char */
 295 error:
 296    d = buf[*iindex];
 297    (*iindex)++;
 298    return ERROR_REPLACEMENT_BASE | d;
 299 }
 300
 301 EAPI Eina_Unicode
 302 eina_unicode_utf8_get_prev(const char *buf, int *iindex)
 303 {
 304    int r, ind;
 305
 306    EINA_SAFETY_ON_NULL_RETURN_VAL(buf, 0);
 307    EINA_SAFETY_ON_NULL_RETURN_VAL(iindex, 0);
 308
 309    ind = *iindex;
 310    /* First obtain the codepoint at iindex */
 311    r = eina_unicode_utf8_get_next(buf, &ind);
 312
 313    /* although when ind == 0 there's no previous char, we still want to get
 314     * the current char */
 315    if (*iindex <= 0)
 316      return r;
 317
 318    /* Next advance iindex to previous codepoint */
 319    ind = *iindex;
 320    ind--;
 321    while ((ind > 0) && ((buf[ind] & 0xc0) == 0x80))
 322      ind--;
 323
 324    *iindex = ind;
 325    return r;
 326 }
 327
 328 EAPI int
 329 eina_unicode_utf8_get_len(const char *buf)
 330 {
 331    /* returns the number of utf8 characters (not bytes) in the string */
 332    int i = 0, len = 0;
 333
 334    EINA_SAFETY_ON_NULL_RETURN_VAL(buf, 0);
 335
 336    while (eina_unicode_utf8_get_next(buf, &i))
 337         len++;
 338
 339    return len;
 340 }
 341
 342 EAPI Eina_Unicode *
 343 eina_unicode_utf8_to_unicode(const char *utf, int *_len)
 344 {
 345    /* FIXME: Should optimize! */
 346    int len, i;
 347    int ind;
 348    Eina_Unicode *buf, *uind;
 349
 350    EINA_SAFETY_ON_NULL_RETURN_VAL(utf, NULL);
 351
 352    len = eina_unicode_utf8_get_len(utf);
 353    if (_len)
 354       *_len = len;
 355    buf = (Eina_Unicode *) calloc(sizeof(Eina_Unicode), (len + 1));
 356    if (!buf) return buf;
 357
 358    for (i = 0, ind = 0, uind = buf ; i < len ; i++, uind++)
 359      {
 360         *uind = eina_unicode_utf8_get_next(utf, &ind);
 361      }
 362
 363    return buf;
 364 }
 365
 366 EAPI char *
 367 eina_unicode_unicode_to_utf8(const Eina_Unicode *uni, int *_len)
 368 {
 369    char *buf;
 370    const Eina_Unicode *uind;
 371    char *ind;
 372    int ulen, len;
 373
 374    EINA_SAFETY_ON_NULL_RETURN_VAL(uni, NULL);
 375
 376    ulen = eina_unicode_strlen(uni);
 377    buf = (char *) calloc(ulen + 1, EINA_UNICODE_UTF8_BYTES_PER_CHAR);
 378
 379    len = 0;
 380    for (uind = uni, ind = buf ; *uind ; uind++)
 381      {
 382         if (*uind <= 0x7F) /* 1 byte char */
 383           {
 384              *ind++ = *uind;
 385              len += 1;
 386           }
 387         else if (*uind <= 0x7FF) /* 2 byte char */
 388           {
 389              *ind++ = 0xC0 | (unsigned char) (*uind >> 6);
 390              *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
 391              len += 2;
 392           }
 393         else if (*uind <= 0xFFFF) /* 3 byte char */
 394           {
 395              /* If it's a special replacement codepoint */
 396              if (*uind >= ERROR_REPLACEMENT_BASE &&
 397                  *uind <= ERROR_REPLACEMENT_END)
 398                {
 399                   *ind++ = *uind & 0xFF;
 400                   len += 1;
 401                }
 402              else
 403                {
 404                   *ind++ = 0xE0 | (unsigned char) (*uind >> 12);
 405                   *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
 406                   *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
 407                   len += 3;
 408                }
 409           }
 410         else if (*uind <= 0x1FFFFF) /* 4 byte char */
 411           {
 412              *ind++ = 0xF0 | (unsigned char) ((*uind >> 18) & 0x07);
 413              *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
 414              *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
 415              *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
 416              len += 4;
 417           }
 418         else if (*uind <= 0x3FFFFFF) /* 5 byte char */
 419           {
 420              *ind++ = 0xF8 | (unsigned char) ((*uind >> 24) & 0x03);
 421              *ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
 422              *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
 423              *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
 424              *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
 425              len += 5;
 426           }
 427         else if (*uind <= 0x7FFFFFFF) /* 6 byte char */
 428           {
 429              *ind++ = 0xFC | (unsigned char) ((*uind >> 30) & 0x01);
 430              *ind++ = 0x80 | (unsigned char) ((*uind >> 24) & 0x3F);
 431              *ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
 432              *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
 433              *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
 434              *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
 435              len += 6;
 436           }
 437         else /* error */
 438           {
 439              /* Do something */
 440           }
 441      }
 442    buf = realloc(buf, len + 1);
 443    buf[len] = '\0';
 444    if (_len)
 445       *_len = len;
 446    return buf;
 447 }
 448
 449
 450