glib/gutf8.c

   1 /* gutf8.c - Operations on UTF-8 strings.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include <config.h>
  23
  24 #include <stdlib.h>
  25 #ifdef HAVE_CODESET
  26 #include <langinfo.h>
  27 #endif
  28 #include <string.h>
  29
  30 #include "glib.h"
  31
  32 #ifdef G_OS_WIN32
  33 #include <windows.h>
  34 #endif
  35
  36 #define UTF8_COMPUTE(Char, Mask, Len)                                         \
  37   if (Char < 128)                                                             \
  38     {                                                                         \
  39       Len = 1;                                                                \
  40       Mask = 0x7f;                                                            \
  41     }                                                                         \
  42   else if ((Char & 0xe0) == 0xc0)                                             \
  43     {                                                                         \
  44       Len = 2;                                                                \
  45       Mask = 0x1f;                                                            \
  46     }                                                                         \
  47   else if ((Char & 0xf0) == 0xe0)                                             \
  48     {                                                                         \
  49       Len = 3;                                                                \
  50       Mask = 0x0f;                                                            \
  51     }                                                                         \
  52   else if ((Char & 0xf8) == 0xf0)                                             \
  53     {                                                                         \
  54       Len = 4;                                                                \
  55       Mask = 0x07;                                                            \
  56     }                                                                         \
  57   else if ((Char & 0xfc) == 0xf8)                                             \
  58     {                                                                         \
  59       Len = 5;                                                                \
  60       Mask = 0x03;                                                            \
  61     }                                                                         \
  62   else if ((Char & 0xfe) == 0xfc)                                             \
  63     {                                                                         \
  64       Len = 6;                                                                \
  65       Mask = 0x01;                                                            \
  66     }                                                                         \
  67   else                                                                        \
  68     Len = -1;
  69
  70 #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  71   (Result) = (Chars)[0] & (Mask);                                             \
  72   for ((Count) = 1; (Count) < (Len); ++(Count))                               \
  73     {                                                                         \
  74       if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
  75         {                                                                     \
  76           (Result) = -1;                                                      \
  77           break;                                                              \
  78         }                                                                     \
  79       (Result) <<= 6;                                                         \
  80       (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
  81     }
  82 gchar g_utf8_skip[256] = {
  83   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  84   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  85   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  86   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  87   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  88   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  89   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  90   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
  91 };
  92
  93 /**
  94  * g_utf8_find_prev_char:
  95  * @str: pointer to the beginning of a UTF-8 string
  96  * @p: pointer to some position within @str
  97  *
  98  * Given a position @p with a UTF-8 encoded string @str, find the start
  99  * of the previous UTF-8 character starting before @p. Returns %NULL if no
 100  * UTF-8 characters are present in @p before @str.
 101  *
 102  * @p does not have to be at the beginning of a UTF-8 chracter. No check
 103  * is made to see if the character found is actually valid other than
 104  * it starts with an appropriate byte.
 105  *
 106  * Return value: a pointer to the found character or %NULL.
 107  **/
 108 gchar *
 109 g_utf8_find_prev_char (const char *str,
 110                        const char *p)
 111 {
 112   for (--p; p > str; --p)
 113     {
 114       if ((*p & 0xc0) != 0x80)
 115         return (gchar *)p;
 116     }
 117   return NULL;
 118 }
 119
 120 /**
 121  * g_utf8_find_next_char:
 122  * @p: a pointer to a position within a UTF-8 encoded string
 123  * @end: a pointer to the end of the string, or %NULL to indicate
 124  *        that the string is NULL terminated, in which case
 125  *        the returned value will be
 126  *
 127  * Find the start of the next utf-8 character in the string after @p
 128  *
 129  * @p does not have to be at the beginning of a UTF-8 chracter. No check
 130  * is made to see if the character found is actually valid other than
 131  * it starts with an appropriate byte.
 132  *
 133  * Return value: a pointer to the found character or %NULL
 134  **/
 135 gchar *
 136 g_utf8_find_next_char (const gchar *p,
 137                        const gchar *end)
 138 {
 139   if (*p)
 140     {
 141       if (end)
 142         for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
 143           ;
 144       else
 145         for (++p; (*p & 0xc0) == 0x80; ++p)
 146           ;
 147     }
 148   return (p == end) ? NULL : (gchar *)p;
 149 }
 150
 151 /**
 152  * g_utf8_prev_char:
 153  * @p: a pointer to a position within a UTF-8 encoded string
 154  *
 155  * Find the previous UTF-8 character in the string before @p
 156  *
 157  * @p does not have to be at the beginning of a UTF-8 character. No check
 158  * is made to see if the character found is actually valid other than
 159  * it starts with an appropriate byte. If @p might be the first
 160  * character of the string, you must use g_utf8_find_prev_char instead.
 161  *
 162  * Return value: a pointer to the found character.
 163  **/
 164 gchar *
 165 g_utf8_prev_char (const gchar *p)
 166 {
 167   while (TRUE)
 168     {
 169       p--;
 170       if ((*p & 0xc0) != 0x80)
 171         return (gchar *)p;
 172     }
 173 }
 174
 175 /**
 176  * g_utf8_strlen:
 177  * @p: pointer to the start of a UTF-8 string.
 178  * @max: the maximum number of bytes to examine. If @max
 179  *       is less than 0, then the string is assumed to be
 180  *       nul-terminated.
 181  *
 182  * Return value: the length of the string in characters
 183  */
 184 gint
 185 g_utf8_strlen (const gchar *p, gint max)
 186 {
 187   int len = 0;
 188   const gchar *start = p;
 189   /* special case for the empty string */
 190   if (!*p)
 191     return 0;
 192   /* Note that the test here and the test in the loop differ subtly.
 193      In the loop we want to see if we've passed the maximum limit --
 194      for instance if the buffer ends mid-character.  Here at the top
 195      of the loop we want to see if we've just reached the last byte.  */
 196   while (max < 0 || p - start < max)
 197     {
 198       p = g_utf8_next_char (p);
 199       ++len;
 200       if (! *p || (max > 0 && p - start > max))
 201         break;
 202     }
 203   return len;
 204 }
 205
 206 /**
 207  * g_utf8_get_char:
 208  * @p: a pointer to unicode character encoded as UTF-8
 209  *
 210  * Convert a sequence of bytes encoded as UTF-8 to a unicode character.
 211  *
 212  * Return value: the resulting character or (gunichar)-1 if @p does
 213  *               not point to a valid UTF-8 encoded unicode character
 214  **/
 215 gunichar
 216 g_utf8_get_char (const gchar *p)
 217 {
 218   int i, mask = 0, len;
 219   gunichar result;
 220   unsigned char c = (unsigned char) *p;
 221
 222   UTF8_COMPUTE (c, mask, len);
 223   if (len == -1)
 224     return (gunichar)-1;
 225   UTF8_GET (result, p, i, mask, len);
 226
 227   return result;
 228 }
 229
 230 /**
 231  * g_utf8_offset_to_pointer:
 232  * @str: a UTF-8 encoded string
 233  * @offset: a character offset within the string.
 234  *
 235  * Converts from an integer character offset to a pointer to a position
 236  * within the string.
 237  *
 238  * Return value: the resulting pointer
 239  **/
 240 gchar *
 241 g_utf8_offset_to_pointer  (const gchar *str,
 242                            gint         offset)
 243 {
 244   const gchar *s = str;
 245   while (offset--)
 246     s = g_utf8_next_char (s);
 247
 248   return (gchar *)s;
 249 }
 250
 251 /**
 252  * g_utf8_pointer_to_offset:
 253  * @str: a UTF-8 encoded string
 254  * @pos: a pointer to a position within @str
 255  *
 256  * Converts from a pointer to position within a string to a integer
 257  * character offset
 258  *
 259  * Return value: the resulting character offset
 260  **/
 261 gint
 262 g_utf8_pointer_to_offset (const gchar *str,
 263                           const gchar *pos)
 264 {
 265   const gchar *s = str;
 266   gint offset = 0;
 267
 268   while (s < pos)
 269     {
 270       s = g_utf8_next_char (s);
 271       offset++;
 272     }
 273
 274   return offset;
 275 }
 276
 277
 278 gchar *
 279 g_utf8_strncpy (gchar *dest, const gchar *src, size_t n)
 280 {
 281   const gchar *s = src;
 282   while (n && *s)
 283     {
 284       s = g_utf8_next_char(s);
 285       n--;
 286     }
 287   strncpy(dest, src, s - src);
 288   dest[s - src] = 0;
 289   return dest;
 290 }
 291
 292 static gboolean
 293 g_utf8_get_charset_internal (char **a)
 294 {
 295   char *charset = getenv("CHARSET");
 296
 297   if (charset && a && ! *a)
 298     *a = charset;
 299
 300   if (charset && strstr (charset, "UTF-8"))
 301       return TRUE;
 302
 303 #ifdef HAVE_CODESET
 304   charset = nl_langinfo(CODESET);
 305   if (charset)
 306     {
 307       if (a && ! *a)
 308         *a = charset;
 309       if (strcmp (charset, "UTF-8") == 0)
 310         return TRUE;
 311     }
 312 #endif
 313
 314 #if 0 /* #ifdef _NL_CTYPE_CODESET_NAME */
 315   charset = nl_langinfo (_NL_CTYPE_CODESET_NAME);
 316   if (charset)
 317     {
 318       if (a && ! *a)
 319         *a = charset;
 320       if (strcmp (charset, "UTF-8") == 0)
 321         return TRUE;
 322     }
 323 #endif
 324
 325 #ifdef G_OS_WIN32
 326   if (a && ! *a)
 327     {
 328       static char codepage[10];
 329
 330       sprintf (codepage, "CP%d", GetACP ());
 331       *a = codepage;
 332       /* What about codepage 1200? Is that UTF-8? */
 333       return FALSE;
 334     }
 335 #else
 336   if (a && ! *a)
 337     *a = "US-ASCII";
 338 #endif
 339
 340   /* Assume this for compatibility at present.  */
 341   return FALSE;
 342 }
 343
 344 static int utf8_locale_cache = -1;
 345 static char *utf8_charset_cache = NULL;
 346
 347 gboolean
 348 g_get_charset (char **charset)
 349 {
 350   if (utf8_locale_cache != -1)
 351     {
 352       if (charset)
 353         *charset = utf8_charset_cache;
 354       return utf8_locale_cache;
 355     }
 356   utf8_locale_cache = g_utf8_get_charset_internal (&utf8_charset_cache);
 357   if (charset)
 358     *charset = utf8_charset_cache;
 359   return utf8_locale_cache;
 360 }
 361
 362 /* unicode_strchr */
 363
 364 /**
 365  * g_unichar_to_utf8:
 366  * @c: a ISO10646 character code
 367  * @outbuf: output buffer, must have at least 6 bytes of space.
 368  *       If %NULL, the length will be computed and returned
 369  *       and nothing will be written to @out.
 370  *
 371  * Convert a single character to utf8
 372  *
 373  * Return value: number of bytes written
 374  **/
 375 int
 376 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
 377 {
 378   size_t len = 0;
 379   int first;
 380   int i;
 381
 382   if (c < 0x80)
 383     {
 384       first = 0;
 385       len = 1;
 386     }
 387   else if (c < 0x800)
 388     {
 389       first = 0xc0;
 390       len = 2;
 391     }
 392   else if (c < 0x10000)
 393     {
 394       first = 0xe0;
 395       len = 3;
 396     }
 397    else if (c < 0x200000)
 398     {
 399       first = 0xf0;
 400       len = 4;
 401     }
 402   else if (c < 0x4000000)
 403     {
 404       first = 0xf8;
 405       len = 5;
 406     }
 407   else
 408     {
 409       first = 0xfc;
 410       len = 6;
 411     }
 412
 413   if (outbuf)
 414     {
 415       for (i = len - 1; i > 0; --i)
 416         {
 417           outbuf[i] = (c & 0x3f) | 0x80;
 418           c >>= 6;
 419         }
 420       outbuf[0] = c | first;
 421     }
 422
 423   return len;
 424 }
 425
 426 /**
 427  * g_utf8_strchr:
 428  * @p: a nul-terminated utf-8 string
 429  * @c: a iso-10646 character/
 430  *
 431  * Find the leftmost occurence of the given iso-10646 character
 432  * in a UTF-8 string.
 433  *
 434  * Return value: NULL if the string does not contain the character, otherwise, a
 435  *               a pointer to the start of the leftmost of the character in the string.
 436  **/
 437 gchar *
 438 g_utf8_strchr (const char *p, gunichar c)
 439 {
 440   gchar ch[10];
 441
 442   gint len = g_unichar_to_utf8 (c, ch);
 443   ch[len] = '\0';
 444
 445   return strstr(p, ch);
 446 }
 447
 448 #if 0
 449 /**
 450  * g_utf8_strrchr:
 451  * @p: a nul-terminated utf-8 string
 452  * @c: a iso-10646 character/
 453  *
 454  * Find the rightmost occurence of the given iso-10646 character
 455  * in a UTF-8 string.
 456  *
 457  * Return value: NULL if the string does not contain the character, otherwise, a
 458  *               a pointer to the start of the rightmost of the character in the string.
 459  **/
 460
 461 /* This is ifdefed out atm as there is no strrstr function in libc.
 462  */
 463 gchar *
 464 unicode_strrchr (const char *p, gunichar c)
 465 {
 466   gchar ch[10];
 467
 468   len = g_unichar_to_utf8 (c, ch);
 469   ch[len] = '\0';
 470
 471   return strrstr(p, ch);
 472 }
 473 #endif
 474
 475
 476 /**
 477  * g_utf8_to_ucs4:
 478  * @str: a UTF-8 encoded strnig
 479  * @len: the length of @
 480  *
 481  * Convert a string from UTF-8 to a 32-bit fixed width
 482  * representation as UCS-4.
 483  *
 484  * Return value: a pointer to a newly allocated UCS-4 string.
 485  *               This value must be freed with g_free()
 486  **/
 487 gunichar *
 488 g_utf8_to_ucs4 (const char *str, int len)
 489 {
 490   gunichar *result;
 491   gint n_chars, i;
 492   const gchar *p;
 493
 494   n_chars = g_utf8_strlen (str, len);
 495   result = g_new (gunichar, n_chars);
 496
 497   p = str;
 498   for (i=0; i < n_chars; i++)
 499     {
 500       result[i] = g_utf8_get_char (p);
 501       p = g_utf8_next_char (p);
 502     }
 503
 504   return result;
 505 }
 506
 507 /**
 508  * g_ucs4_to_utf8:
 509  * @str: a UCS-4 encoded string
 510  * @len: the length of @
 511  *
 512  * Convert a string from a 32-bit fixed width representation as UCS-4.
 513  * to UTF-8.
 514  *
 515  * Return value: a pointer to a newly allocated UTF-8 string.
 516  *               This value must be freed with g_free()
 517  **/
 518 gchar *
 519 g_ucs4_to_utf8 (const gunichar *str, int len)
 520 {
 521   gint result_length;
 522   gchar *result, *p;
 523   gint i;
 524
 525   result_length = 0;
 526   for (i = 0; i < len ; i++)
 527     result_length += g_unichar_to_utf8 (str[i], NULL);
 528
 529   result_length++;
 530
 531   result = g_malloc (result_length + 1);
 532   p = result;
 533
 534   for (i = 0; i < len ; i++)
 535     p += g_unichar_to_utf8 (str[i], p);
 536
 537   *p = '\0';
 538
 539   return result;
 540 }
 541
 542 /**
 543  * g_utf8_validate:
 544  * @str: a pointer to character data
 545  * @max_len: max bytes to validate, or -1 to go until nul
 546  * @end: return location for end of valid data
 547  *
 548  * Validates UTF-8 encoded text. @str is the text to validate;
 549  * if @str is nul-terminated, then @max_len can be -1, otherwise
 550  * @max_len should be the number of bytes to validate.
 551  * If @end is non-NULL, then the end of the valid range
 552  * will be stored there (i.e. the address of the first invalid byte
 553  * if some bytes were invalid, or the end of the text being validated
 554  * otherwise).
 555  *
 556  * Returns TRUE if all of @str was valid. Many GLib and GTK+
 557  * routines <emphasis>require</emphasis> valid UTF8 as input;
 558  * so data read from a file or the network should be checked
 559  * with g_utf8_validate() before doing anything else with it.
 560  *
 561  * Return value: TRUE if the text was valid UTF-8.
 562  **/
 563 gboolean
 564 g_utf8_validate (const gchar  *str,
 565                  gint          max_len,
 566                  const gchar **end)
 567 {
 568
 569   const gchar *p;
 570
 571   if (end)
 572     *end = str;
 573
 574   p = str;
 575
 576   while ((max_len < 0 || (p - str) < max_len) && *p)
 577     {
 578       int i, mask = 0, len;
 579       gunichar result;
 580       unsigned char c = (unsigned char) *p;
 581
 582       UTF8_COMPUTE (c, mask, len);
 583
 584       if (len == -1)
 585         break;
 586
 587       /* check that the expected number of bytes exists in str */
 588       if (max_len >= 0 &&
 589           ((max_len - (p - str)) < len))
 590         break;
 591
 592       UTF8_GET (result, p, i, mask, len);
 593
 594       if (result == (gunichar)-1)
 595         break;
 596
 597       p += len;
 598     }
 599
 600   if (end)
 601     *end = p;
 602
 603   /* See that we covered the entire length if a length was
 604    * passed in, or that we ended on a nul if not
 605    */
 606   if (max_len >= 0 &&
 607       p != (str + max_len))
 608     return FALSE;
 609   else if (max_len < 0 &&
 610            *p != '\0')
 611     return FALSE;
 612   else
 613     return TRUE;
 614 }
 615
 616