glib/guniprop.c

   1 /* guniprop.c - Unicode character properties.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "glib.h"
  23 #include "gunichartables.h"
  24
  25 #include <config.h>
  26
  27 #include <stddef.h>
  28 #include <string.h>
  29 #include <locale.h>
  30
  31 #define ATTTABLE(Page, Char) \
  32   ((attr_table[Page] == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[attr_table[Page]][Char]))
  33
  34 /* We cheat a bit and cast type values to (char *).  We detect these
  35    using the &0xff trick.  */
  36 #define TTYPE(Page, Char) \
  37   ((type_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
  38    ? (type_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
  39    : (type_data[type_table[Page]][Char]))
  40
  41
  42 #define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
  43
  44 #define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER       \
  45                        || (Type) == G_UNICODE_LETTER_NUMBER     \
  46                        || (Type) == G_UNICODE_OTHER_NUMBER)
  47
  48 #define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER     \
  49                        || (Type) == G_UNICODE_UPPERCASE_LETTER  \
  50                        || (Type) == G_UNICODE_TITLECASE_LETTER  \
  51                        || (Type) == G_UNICODE_MODIFIER_LETTER   \
  52                        || (Type) == G_UNICODE_OTHER_LETTER)
  53
  54 #define ISMARK(Type) ((Type) == G_UNICODE_NON_SPACING_MARK ||   \
  55                       (Type) == G_UNICODE_COMBINING_MARK ||     \
  56                       (Type) == G_UNICODE_ENCLOSING_MARK)
  57
  58
  59 /**
  60  * g_unichar_isalnum:
  61  * @c: a Unicode character
  62  *
  63  * Determines whether a character is alphanumeric.
  64  * Given some UTF-8 text, obtain a character value
  65  * with g_utf8_get_char().
  66  *
  67  * Return value: %TRUE if @c is an alphanumeric character
  68  **/
  69 gboolean
  70 g_unichar_isalnum (gunichar c)
  71 {
  72   int t = TYPE (c);
  73   return ISDIGIT (t) || ISALPHA (t);
  74 }
  75
  76 /**
  77  * g_unichar_isalpha:
  78  * @c: a Unicode character
  79  *
  80  * Determines whether a character is alphabetic (i.e. a letter).
  81  * Given some UTF-8 text, obtain a character value with
  82  * g_utf8_get_char().
  83  *
  84  * Return value: %TRUE if @c is an alphabetic character
  85  **/
  86 gboolean
  87 g_unichar_isalpha (gunichar c)
  88 {
  89   int t = TYPE (c);
  90   return ISALPHA (t);
  91 }
  92
  93
  94 /**
  95  * g_unichar_iscntrl:
  96  * @c: a Unicode character
  97  *
  98  * Determines whether a character is a control character.
  99  * Given some UTF-8 text, obtain a character value with
 100  * g_utf8_get_char().
 101  *
 102  * Return value: %TRUE if @c is a control character
 103  **/
 104 gboolean
 105 g_unichar_iscntrl (gunichar c)
 106 {
 107   return TYPE (c) == G_UNICODE_CONTROL;
 108 }
 109
 110 /**
 111  * g_unichar_isdigit:
 112  * @c: a Unicode character
 113  *
 114  * Determines whether a character is numeric (i.e. a digit).  This
 115  * covers ASCII 0-9 and also digits in other languages/scripts.  Given
 116  * some UTF-8 text, obtain a character value with g_utf8_get_char().
 117  *
 118  * Return value: %TRUE if @c is a digit
 119  **/
 120 gboolean
 121 g_unichar_isdigit (gunichar c)
 122 {
 123   return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
 124 }
 125
 126
 127 /**
 128  * g_unichar_isgraph:
 129  * @c: a Unicode character
 130  *
 131  * Determines whether a character is printable and not a space
 132  * (returns %FALSE for control characters, format characters, and
 133  * spaces). g_unichar_isprint() is similar, but returns %TRUE for
 134  * spaces. Given some UTF-8 text, obtain a character value with
 135  * g_utf8_get_char().
 136  *
 137  * Return value: %TRUE if @c is printable unless it's a space
 138  **/
 139 gboolean
 140 g_unichar_isgraph (gunichar c)
 141 {
 142   int t = TYPE (c);
 143   return (t != G_UNICODE_CONTROL
 144           && t != G_UNICODE_FORMAT
 145           && t != G_UNICODE_UNASSIGNED
 146           && t != G_UNICODE_PRIVATE_USE
 147           && t != G_UNICODE_SURROGATE
 148           && t != G_UNICODE_SPACE_SEPARATOR);
 149 }
 150
 151 /**
 152  * g_unichar_islower:
 153  * @c: a Unicode character
 154  *
 155  * Determines whether a character is a lowercase letter.
 156  * Given some UTF-8 text, obtain a character value with
 157  * g_utf8_get_char().
 158  *
 159  * Return value: %TRUE if @c is a lowercase letter
 160  **/
 161 gboolean
 162 g_unichar_islower (gunichar c)
 163 {
 164   return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
 165 }
 166
 167
 168 /**
 169  * g_unichar_isprint:
 170  * @c: a Unicode character
 171  *
 172  * Determines whether a character is printable.
 173  * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
 174  * Given some UTF-8 text, obtain a character value with
 175  * g_utf8_get_char().
 176  *
 177  * Return value: %TRUE if @c is printable
 178  **/
 179 gboolean
 180 g_unichar_isprint (gunichar c)
 181 {
 182   int t = TYPE (c);
 183   return (t != G_UNICODE_CONTROL
 184           && t != G_UNICODE_FORMAT
 185           && t != G_UNICODE_UNASSIGNED
 186           && t != G_UNICODE_PRIVATE_USE
 187           && t != G_UNICODE_SURROGATE);
 188 }
 189
 190 /**
 191  * g_unichar_ispunct:
 192  * @c: a Unicode character
 193  *
 194  * Determines whether a character is punctuation or a symbol.
 195  * Given some UTF-8 text, obtain a character value with
 196  * g_utf8_get_char().
 197  *
 198  * Return value: %TRUE if @c is a punctuation or symbol character
 199  **/
 200 gboolean
 201 g_unichar_ispunct (gunichar c)
 202 {
 203   int t = TYPE (c);
 204   return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
 205           || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
 206           || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
 207           || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL
 208           || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL
 209           || t == G_UNICODE_OTHER_SYMBOL);
 210 }
 211
 212 /**
 213  * g_unichar_isspace:
 214  * @c: a Unicode character
 215  *
 216  * Determines whether a character is a space, tab, or line separator
 217  * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
 218  * character value with g_utf8_get_char().
 219  *
 220  * (Note: don't use this to do word breaking; you have to use
 221  * Pango or equivalent to get word breaking right, the algorithm
 222  * is fairly complex.)
 223  *
 224  * Return value: %TRUE if @c is a punctuation character
 225  **/
 226 gboolean
 227 g_unichar_isspace (gunichar c)
 228 {
 229   switch (c)
 230     {
 231       /* special-case these since Unicode thinks they are not spaces */
 232     case '\t':
 233     case '\n':
 234     case '\r':
 235     case '\f':
 236       return TRUE;
 237       break;
 238
 239     default:
 240       {
 241         int t = TYPE (c);
 242         return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
 243                 || t == G_UNICODE_PARAGRAPH_SEPARATOR);
 244       }
 245       break;
 246     }
 247 }
 248
 249 /**
 250  * g_unichar_isupper:
 251  * @c: a Unicode character
 252  *
 253  * Determines if a character is uppercase.
 254  *
 255  * Return value: %TRUE if @c is an uppercase character
 256  **/
 257 gboolean
 258 g_unichar_isupper (gunichar c)
 259 {
 260   return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
 261 }
 262
 263 /**
 264  * g_unichar_istitle:
 265  * @c: a Unicode character
 266  *
 267  * Determines if a character is titlecase. Some characters in
 268  * Unicode which are composites, such as the DZ digraph
 269  * have three case variants instead of just two. The titlecase
 270  * form is used at the beginning of a word where only the
 271  * first letter is capitalized. The titlecase form of the DZ
 272  * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
 273  *
 274  * Return value: %TRUE if the character is titlecase
 275  **/
 276 gboolean
 277 g_unichar_istitle (gunichar c)
 278 {
 279   unsigned int i;
 280   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 281     if (title_table[i][0] == c)
 282       return 1;
 283   return 0;
 284 }
 285
 286 /**
 287  * g_unichar_isxdigit:
 288  * @c: a Unicode character.
 289  *
 290  * Determines if a character is a hexidecimal digit.
 291  *
 292  * Return value: %TRUE if the character is a hexadecimal digit
 293  **/
 294 gboolean
 295 g_unichar_isxdigit (gunichar c)
 296 {
 297   int t = TYPE (c);
 298   return ((c >= 'a' && c <= 'f')
 299           || (c >= 'A' && c <= 'F')
 300           || ISDIGIT (t));
 301 }
 302
 303 /**
 304  * g_unichar_isdefined:
 305  * @c: a Unicode character
 306  *
 307  * Determines if a given character is assigned in the Unicode
 308  * standard.
 309  *
 310  * Return value: %TRUE if the character has an assigned value
 311  **/
 312 gboolean
 313 g_unichar_isdefined (gunichar c)
 314 {
 315   int t = TYPE (c);
 316   return t != G_UNICODE_UNASSIGNED;
 317 }
 318
 319 /**
 320  * g_unichar_iswide:
 321  * @c: a Unicode character
 322  *
 323  * Determines if a character is typically rendered in a double-width
 324  * cell.
 325  *
 326  * Return value: %TRUE if the character is wide
 327  **/
 328 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
 329 gboolean
 330 g_unichar_iswide (gunichar c)
 331 {
 332   if (c < 0x1100)
 333     return 0;
 334
 335   return ((c >= 0x1100 && c <= 0x115f)     /* Hangul Jamo */
 336           || (c >= 0x2e80 && c <= 0xa4cf && (c & ~0x0011) != 0x300a &&
 337               c != 0x303f)                 /* CJK ... Yi */
 338           || (c >= 0xac00 && c <= 0xd7a3)  /* Hangul Syllables */
 339           || (c >= 0xf900 && c <= 0xfaff)  /* CJK Compatibility Ideographs */
 340           || (c >= 0xfe30 && c <= 0xfe6f)  /* CJK Compatibility Forms */
 341           || (c >= 0xff00 && c <= 0xff5f)  /* Fullwidth Forms */
 342           || (c >= 0xffe0 && c <= 0xffe6));
 343 }
 344
 345 /**
 346  * g_unichar_toupper:
 347  * @c: a Unicode character
 348  *
 349  * Converts a character to uppercase.
 350  *
 351  * Return value: the result of converting @c to uppercase.
 352  *               If @c is not an lowercase or titlecase character,
 353  *               @c is returned unchanged.
 354  **/
 355 gunichar
 356 g_unichar_toupper (gunichar c)
 357 {
 358   int t = TYPE (c);
 359   if (t == G_UNICODE_LOWERCASE_LETTER)
 360     {
 361       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 362       if (val >= 0xd800 && val < 0xdc00)
 363         {
 364           const guchar *p = special_case_table[val - 0xd800];
 365           return p[0] * 256 + p[1];
 366         }
 367       else
 368         return val;
 369     }
 370   else if (t == G_UNICODE_TITLECASE_LETTER)
 371     {
 372       unsigned int i;
 373       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 374         {
 375           if (title_table[i][0] == c)
 376             return title_table[i][1];
 377         }
 378     }
 379   return c;
 380 }
 381
 382 /**
 383  * g_unichar_tolower:
 384  * @c: a Unicode character.
 385  *
 386  * Converts a character to lower case.
 387  *
 388  * Return value: the result of converting @c to lower case.
 389  *               If @c is not an upperlower or titlecase character,
 390  *               @c is returned unchanged.
 391  **/
 392 gunichar
 393 g_unichar_tolower (gunichar c)
 394 {
 395   int t = TYPE (c);
 396   if (t == G_UNICODE_UPPERCASE_LETTER)
 397     {
 398       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 399       if (val >= 0xd800 && val < 0xdc00)
 400         {
 401           const guchar *p = special_case_table[val - 0xd800];
 402           return p[0] * 256 + p[1];
 403         }
 404       else
 405         return val;
 406     }
 407   else if (t == G_UNICODE_TITLECASE_LETTER)
 408     {
 409       unsigned int i;
 410       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 411         {
 412           if (title_table[i][0] == c)
 413             return title_table[i][2];
 414         }
 415     }
 416   return c;
 417 }
 418
 419 /**
 420  * g_unichar_totitle:
 421  * @c: a Unicode character
 422  *
 423  * Converts a character to the titlecase.
 424  *
 425  * Return value: the result of converting @c to titlecase.
 426  *               If @c is not an uppercase or lowercase character,
 427  *               @c is returned unchanged.
 428  **/
 429 gunichar
 430 g_unichar_totitle (gunichar c)
 431 {
 432   unsigned int i;
 433   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 434     {
 435       if (title_table[i][0] == c || title_table[i][1] == c
 436           || title_table[i][2] == c)
 437         return title_table[i][0];
 438     }
 439   return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
 440           ? ATTTABLE (c >> 8, c & 0xff)
 441           : c);
 442 }
 443
 444 /**
 445  * g_unichar_digit_value:
 446  * @c: a Unicode character
 447  *
 448  * Determines the numeric value of a character as a decimal
 449  * digit.
 450  *
 451  * Return value: If @c is a decimal digit (according to
 452  * g_unichar_isdigit()), its numeric value. Otherwise, -1.
 453  **/
 454 int
 455 g_unichar_digit_value (gunichar c)
 456 {
 457   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 458     return ATTTABLE (c >> 8, c & 0xff);
 459   return -1;
 460 }
 461
 462 /**
 463  * g_unichar_xdigit_value:
 464  * @c: a Unicode character
 465  *
 466  * Determines the numeric value of a character as a hexidecimal
 467  * digit.
 468  *
 469  * Return value: If @c is a hex digit (according to
 470  * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
 471  **/
 472 int
 473 g_unichar_xdigit_value (gunichar c)
 474 {
 475   if (c >= 'A' && c <= 'F')
 476     return c - 'A' + 10;
 477   if (c >= 'a' && c <= 'f')
 478     return c - 'a' + 10;
 479   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 480     return ATTTABLE (c >> 8, c & 0xff);
 481   return -1;
 482 }
 483
 484 /**
 485  * g_unichar_type:
 486  * @c: a Unicode character
 487  *
 488  * Classifies a Unicode character by type.
 489  *
 490  * Return value: the type of the character.
 491  **/
 492 GUnicodeType
 493 g_unichar_type (gunichar c)
 494 {
 495   return TYPE (c);
 496 }
 497
 498 /*
 499  * Case mapping functions
 500  */
 501
 502 typedef enum {
 503   LOCALE_NORMAL,
 504   LOCALE_TURKIC,
 505   LOCALE_LITHUANIAN
 506 } LocaleType;
 507
 508 static LocaleType
 509 get_locale_type (void)
 510 {
 511   const char *locale = setlocale (LC_CTYPE, NULL);
 512
 513   switch (locale[0])
 514     {
 515    case 'a':
 516       if (locale[1] == 'z')
 517         return LOCALE_TURKIC;
 518       break;
 519     case 'l':
 520       if (locale[1] == 't')
 521         return LOCALE_LITHUANIAN;
 522       break;
 523     case 't':
 524       if (locale[1] == 'r')
 525         return LOCALE_TURKIC;
 526       break;
 527     }
 528
 529   return LOCALE_NORMAL;
 530 }
 531
 532 static int
 533 output_marks (const char **p_inout,
 534               char        *out_buffer,
 535               int          len,
 536               gboolean     remove_dot)
 537 {
 538   const char *p = *p_inout;
 539
 540   while (*p)
 541     {
 542       gunichar c = g_utf8_get_char (p);
 543       int t = TYPE(c);
 544
 545       if (ISMARK(t))
 546         {
 547           if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
 548             len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
 549           p = g_utf8_next_char (p);
 550         }
 551       else
 552         break;
 553     }
 554
 555   *p_inout = p;
 556   return len;
 557 }
 558
 559 static gsize
 560 output_special_case (gchar *out_buffer,
 561                      gsize  len,
 562                      int    index,
 563                      int    type,
 564                      int    which)
 565 {
 566   const guchar *p = special_case_table[index];
 567
 568   if (type != G_UNICODE_TITLECASE_LETTER)
 569     p += 2; /* +2 to skip over "best single match" */
 570
 571   if (which == 1)
 572     {
 573       while (p[0] && p[1])
 574         p += 2;
 575       p += 2;
 576     }
 577
 578   while (TRUE)
 579     {
 580       gunichar ch = p[0] * 256 + p[1];
 581       if (!ch)
 582         break;
 583
 584       len += g_unichar_to_utf8 (ch, out_buffer ? out_buffer + len : NULL);
 585       p += 2;
 586     }
 587
 588   return len;
 589 }
 590
 591 static gsize
 592 real_toupper (const gchar *str,
 593               gssize       max_len,
 594               gchar       *out_buffer,
 595               LocaleType   locale_type)
 596 {
 597   const gchar *p = str;
 598   const char *last = NULL;
 599   gsize len = 0;
 600   gboolean last_was_i = FALSE;
 601
 602   while ((max_len < 0 || p < str + max_len) && *p)
 603     {
 604       gunichar c = g_utf8_get_char (p);
 605       int t = TYPE (c);
 606       gunichar val;
 607
 608       last = p;
 609       p = g_utf8_next_char (p);
 610
 611       if (locale_type == LOCALE_LITHUANIAN)
 612         {
 613           if (c == 'i')
 614             last_was_i = TRUE;
 615           else
 616             {
 617               if (last_was_i)
 618                 {
 619                   /* Nasty, need to remove any dot above. Though
 620                    * I think only E WITH DOT ABOVE occurs in practice
 621                    * which could simplify this considerably.
 622                    */
 623                   gsize decomp_len, i;
 624                   gunichar *decomp;
 625
 626                   decomp = g_unicode_canonical_decomposition (c, &decomp_len);
 627                   for (i=0; i < decomp_len; i++)
 628                     {
 629                       if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
 630                         len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
 631                     }
 632                   g_free (decomp);
 633
 634                   len = output_marks (&p, out_buffer, len, TRUE);
 635
 636                   continue;
 637                 }
 638
 639               if (!ISMARK(t))
 640                 last_was_i = FALSE;
 641             }
 642         }
 643
 644       if (locale_type == LOCALE_TURKIC && c == 'i')
 645         {
 646           /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
 647           len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL);
 648         }
 649       else if (c == 0x0345)     /* COMBINING GREEK YPOGEGRAMMENI */
 650         {
 651           /* Nasty, need to move it after other combining marks .. this would go away if
 652            * we normalized first.
 653            */
 654           len = output_marks (&p, out_buffer, len, FALSE);
 655
 656           /* And output as GREEK CAPITAL LETTER IOTA */
 657           len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL);
 658         }
 659       else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 660         {
 661           val = ATTTABLE (c >> 8, c & 0xff);
 662
 663           if (val >= 0xd800 && val < 0xdc00)
 664             {
 665               len += output_special_case (out_buffer, len, val - 0xd800, t,
 666                                           t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
 667             }
 668           else
 669             {
 670               if (t == G_UNICODE_TITLECASE_LETTER)
 671                 {
 672                   unsigned int i;
 673                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 674                     {
 675                       if (title_table[i][0] == c)
 676                         val = title_table[i][1];
 677                     }
 678                 }
 679
 680               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 681             }
 682         }
 683       else
 684         {
 685           gsize char_len = g_utf8_skip[*(guchar *)last];
 686
 687           if (out_buffer)
 688             memcpy (out_buffer + len, last, char_len);
 689
 690           len += char_len;
 691         }
 692
 693     }
 694
 695   return len;
 696 }
 697
 698 /**
 699  * g_utf8_strup:
 700  * @str: a UTF-8 encoded string
 701  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 702  *
 703  * Converts all Unicode characters in the string that have a case
 704  * to uppercase. The exact manner that this is done depends
 705  * on the current locale, and may result in the number of
 706  * characters in the string increasing. (For instance, the
 707  * German ess-zet will be changed to SS.)
 708  *
 709  * Return value: a newly allocated string, with all characters
 710  *    converted to uppercase.
 711  **/
 712 gchar *
 713 g_utf8_strup (const gchar *str,
 714               gssize       len)
 715 {
 716   gsize result_len;
 717   LocaleType locale_type;
 718   gchar *result;
 719
 720   g_return_val_if_fail (str != NULL, NULL);
 721
 722   locale_type = get_locale_type ();
 723
 724   /*
 725    * We use a two pass approach to keep memory management simple
 726    */
 727   result_len = real_toupper (str, len, NULL, locale_type);
 728   result = g_malloc (result_len + 1);
 729   real_toupper (str, len, result, locale_type);
 730   result[result_len] = '\0';
 731
 732   return result;
 733 }
 734
 735 static gsize
 736 real_tolower (const gchar *str,
 737               gssize       max_len,
 738               gchar       *out_buffer,
 739               LocaleType   locale_type)
 740 {
 741   const gchar *p = str;
 742   const char *last = NULL;
 743   gsize len = 0;
 744
 745   while ((max_len < 0 || p < str + max_len) && *p)
 746     {
 747       gunichar c = g_utf8_get_char (p);
 748       int t = TYPE (c);
 749       gunichar val;
 750
 751       last = p;
 752       p = g_utf8_next_char (p);
 753
 754       if (locale_type == LOCALE_TURKIC && c == 'I')
 755         {
 756           /* I => LATIN SMALL LETTER DOTLESS I */
 757           len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL);
 758         }
 759       else if (c == 0x03A3)     /* GREEK CAPITAL LETTER SIGMA */
 760         {
 761           gunichar next_c = g_utf8_get_char (p);
 762           int next_t = TYPE(next_c);
 763
 764           /* SIGMA mapps differently depending on whether it is
 765            * final or not. The following simplified test would
 766            * fail in the case of combining marks following the
 767            * sigma, but I don't think that occurs in real text.
 768            * The test here matches that in ICU.
 769            */
 770           if (ISALPHA(next_t)) /* Lu,Ll,Lt,Lm,Lo */
 771             val = 0x3c3;        /* GREEK SMALL SIGMA */
 772           else
 773             val = 0x3c2;        /* GREEK SMALL FINAL SIGMA */
 774
 775           len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 776         }
 777       else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 778         {
 779           val = ATTTABLE (c >> 8, c & 0xff);
 780
 781           if (val >= 0xd800 && val < 0xdc00)
 782             {
 783               len += output_special_case (out_buffer, len, val - 0xd800, t, 0);
 784             }
 785           else
 786             {
 787               if (t == G_UNICODE_TITLECASE_LETTER)
 788                 {
 789                   unsigned int i;
 790                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 791                     {
 792                       if (title_table[i][0] == c)
 793                         val = title_table[i][2];
 794                     }
 795                 }
 796
 797               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 798             }
 799         }
 800       else
 801         {
 802           gsize char_len = g_utf8_skip[*(guchar *)last];
 803
 804           if (out_buffer)
 805             memcpy (out_buffer + len, last, char_len);
 806
 807           len += char_len;
 808         }
 809
 810     }
 811
 812   return len;
 813 }
 814
 815 /**
 816  * g_utf8_strdown:
 817  * @str: a UTF-8 encoded string
 818  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 819  *
 820  * Converts all Unicode characters in the string that have a case
 821  * to lowercase. The exact manner that this is done depends
 822  * on the current locale, and may result in the number of
 823  * characters in the string changing.
 824  *
 825  * Return value: a newly allocated string, with all characters
 826  *    converted to lowercase.
 827  **/
 828 gchar *
 829 g_utf8_strdown (const gchar *str,
 830                 gssize       len)
 831 {
 832   gsize result_len;
 833   LocaleType locale_type;
 834   gchar *result;
 835
 836   g_return_val_if_fail (str != NULL, NULL);
 837
 838   locale_type = get_locale_type ();
 839
 840   /*
 841    * We use a two pass approach to keep memory management simple
 842    */
 843   result_len = real_tolower (str, len, NULL, locale_type);
 844   result = g_malloc (result_len + 1);
 845   real_tolower (str, len, result, locale_type);
 846   result[result_len] = '\0';
 847
 848   return result;
 849 }
 850
 851 /**
 852  * g_utf8_casefold:
 853  * @str: a UTF-8 encoded string
 854  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 855  *
 856  * Converts a string into a form that is independent of case. The
 857  * result will not correspond to any particular case, but can be
 858  * compared for equality or ordered with the results of calling
 859  * g_utf8_casefold() on other strings.
 860  *
 861  * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
 862  * only an approximation to the correct linguistic case insensitive
 863  * ordering, though it is a fairly good one. Getting this exactly
 864  * right would require a more sophisticated collation function that
 865  * takes case sensitivity into account. GLib does not currently
 866  * provide such a function.
 867  *
 868  * Return value: a newly allocated string, that is a
 869  *   case independent form of @str.
 870  **/
 871 gchar *
 872 g_utf8_casefold (const gchar *str,
 873                  gssize       len)
 874 {
 875   GString *result = g_string_new (NULL);
 876   const char *p;
 877
 878   p = str;
 879   while ((len < 0 || p < str + len) && *p)
 880     {
 881       gunichar ch = g_utf8_get_char (p);
 882
 883       int start = 0;
 884       int end = G_N_ELEMENTS (casefold_table);
 885
 886       if (ch >= casefold_table[start].ch &&
 887           ch <= casefold_table[end - 1].ch)
 888         {
 889           while (TRUE)
 890             {
 891               int half = (start + end) / 2;
 892               if (ch == casefold_table[half].ch)
 893                 {
 894                   g_string_append (result, casefold_table[half].data);
 895                   goto next;
 896                 }
 897               else if (half == start)
 898                 break;
 899               else if (ch > casefold_table[half].ch)
 900                 start = half;
 901               else
 902                 end = half;
 903             }
 904         }
 905
 906       g_string_append_unichar (result, g_unichar_tolower (ch));
 907
 908     next:
 909       p = g_utf8_next_char (p);
 910     }
 911
 912   return g_string_free (result, FALSE);
 913 }