glib/guniprop.c

   1 /* guniprop.c - Unicode character properties.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include <stddef.h>
  25 #include <string.h>
  26 #include <locale.h>
  27
  28 #include "glib.h"
  29 #include "gunichartables.h"
  30
  31
  32 #define ATTTABLE(Page, Char) \
  33   ((attr_table[Page] == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[attr_table[Page]][Char]))
  34
  35 /* We cheat a bit and cast type values to (char *).  We detect these
  36    using the &0xff trick.  */
  37 #define TTYPE(Page, Char) \
  38   ((type_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
  39    ? (type_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
  40    : (type_data[type_table[Page]][Char]))
  41
  42
  43 #define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
  44
  45 #define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER       \
  46                        || (Type) == G_UNICODE_LETTER_NUMBER     \
  47                        || (Type) == G_UNICODE_OTHER_NUMBER)
  48
  49 #define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER     \
  50                        || (Type) == G_UNICODE_UPPERCASE_LETTER  \
  51                        || (Type) == G_UNICODE_TITLECASE_LETTER  \
  52                        || (Type) == G_UNICODE_MODIFIER_LETTER   \
  53                        || (Type) == G_UNICODE_OTHER_LETTER)
  54
  55 #define ISMARK(Type) ((Type) == G_UNICODE_NON_SPACING_MARK ||   \
  56                       (Type) == G_UNICODE_COMBINING_MARK ||     \
  57                       (Type) == G_UNICODE_ENCLOSING_MARK)
  58
  59
  60 /**
  61  * g_unichar_isalnum:
  62  * @c: a Unicode character
  63  *
  64  * Determines whether a character is alphanumeric.
  65  * Given some UTF-8 text, obtain a character value
  66  * with g_utf8_get_char().
  67  *
  68  * Return value: %TRUE if @c is an alphanumeric character
  69  **/
  70 gboolean
  71 g_unichar_isalnum (gunichar c)
  72 {
  73   int t = TYPE (c);
  74   return ISDIGIT (t) || ISALPHA (t);
  75 }
  76
  77 /**
  78  * g_unichar_isalpha:
  79  * @c: a Unicode character
  80  *
  81  * Determines whether a character is alphabetic (i.e. a letter).
  82  * Given some UTF-8 text, obtain a character value with
  83  * g_utf8_get_char().
  84  *
  85  * Return value: %TRUE if @c is an alphabetic character
  86  **/
  87 gboolean
  88 g_unichar_isalpha (gunichar c)
  89 {
  90   int t = TYPE (c);
  91   return ISALPHA (t);
  92 }
  93
  94
  95 /**
  96  * g_unichar_iscntrl:
  97  * @c: a Unicode character
  98  *
  99  * Determines whether a character is a control character.
 100  * Given some UTF-8 text, obtain a character value with
 101  * g_utf8_get_char().
 102  *
 103  * Return value: %TRUE if @c is a control character
 104  **/
 105 gboolean
 106 g_unichar_iscntrl (gunichar c)
 107 {
 108   return TYPE (c) == G_UNICODE_CONTROL;
 109 }
 110
 111 /**
 112  * g_unichar_isdigit:
 113  * @c: a Unicode character
 114  *
 115  * Determines whether a character is numeric (i.e. a digit).  This
 116  * covers ASCII 0-9 and also digits in other languages/scripts.  Given
 117  * some UTF-8 text, obtain a character value with g_utf8_get_char().
 118  *
 119  * Return value: %TRUE if @c is a digit
 120  **/
 121 gboolean
 122 g_unichar_isdigit (gunichar c)
 123 {
 124   return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
 125 }
 126
 127
 128 /**
 129  * g_unichar_isgraph:
 130  * @c: a Unicode character
 131  *
 132  * Determines whether a character is printable and not a space
 133  * (returns %FALSE for control characters, format characters, and
 134  * spaces). g_unichar_isprint() is similar, but returns %TRUE for
 135  * spaces. Given some UTF-8 text, obtain a character value with
 136  * g_utf8_get_char().
 137  *
 138  * Return value: %TRUE if @c is printable unless it's a space
 139  **/
 140 gboolean
 141 g_unichar_isgraph (gunichar c)
 142 {
 143   int t = TYPE (c);
 144   return (t != G_UNICODE_CONTROL
 145           && t != G_UNICODE_FORMAT
 146           && t != G_UNICODE_UNASSIGNED
 147           && t != G_UNICODE_PRIVATE_USE
 148           && t != G_UNICODE_SURROGATE
 149           && t != G_UNICODE_SPACE_SEPARATOR);
 150 }
 151
 152 /**
 153  * g_unichar_islower:
 154  * @c: a Unicode character
 155  *
 156  * Determines whether a character is a lowercase letter.
 157  * Given some UTF-8 text, obtain a character value with
 158  * g_utf8_get_char().
 159  *
 160  * Return value: %TRUE if @c is a lowercase letter
 161  **/
 162 gboolean
 163 g_unichar_islower (gunichar c)
 164 {
 165   return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
 166 }
 167
 168
 169 /**
 170  * g_unichar_isprint:
 171  * @c: a Unicode character
 172  *
 173  * Determines whether a character is printable.
 174  * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
 175  * Given some UTF-8 text, obtain a character value with
 176  * g_utf8_get_char().
 177  *
 178  * Return value: %TRUE if @c is printable
 179  **/
 180 gboolean
 181 g_unichar_isprint (gunichar c)
 182 {
 183   int t = TYPE (c);
 184   return (t != G_UNICODE_CONTROL
 185           && t != G_UNICODE_FORMAT
 186           && t != G_UNICODE_UNASSIGNED
 187           && t != G_UNICODE_PRIVATE_USE
 188           && t != G_UNICODE_SURROGATE);
 189 }
 190
 191 /**
 192  * g_unichar_ispunct:
 193  * @c: a Unicode character
 194  *
 195  * Determines whether a character is punctuation or a symbol.
 196  * Given some UTF-8 text, obtain a character value with
 197  * g_utf8_get_char().
 198  *
 199  * Return value: %TRUE if @c is a punctuation or symbol character
 200  **/
 201 gboolean
 202 g_unichar_ispunct (gunichar c)
 203 {
 204   int t = TYPE (c);
 205   return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
 206           || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
 207           || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
 208           || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL
 209           || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL
 210           || t == G_UNICODE_OTHER_SYMBOL);
 211 }
 212
 213 /**
 214  * g_unichar_isspace:
 215  * @c: a Unicode character
 216  *
 217  * Determines whether a character is a space, tab, or line separator
 218  * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
 219  * character value with g_utf8_get_char().
 220  *
 221  * (Note: don't use this to do word breaking; you have to use
 222  * Pango or equivalent to get word breaking right, the algorithm
 223  * is fairly complex.)
 224  *
 225  * Return value: %TRUE if @c is a punctuation character
 226  **/
 227 gboolean
 228 g_unichar_isspace (gunichar c)
 229 {
 230   switch (c)
 231     {
 232       /* special-case these since Unicode thinks they are not spaces */
 233     case '\t':
 234     case '\n':
 235     case '\r':
 236     case '\f':
 237       return TRUE;
 238       break;
 239
 240     default:
 241       {
 242         int t = TYPE (c);
 243         return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
 244                 || t == G_UNICODE_PARAGRAPH_SEPARATOR);
 245       }
 246       break;
 247     }
 248 }
 249
 250 /**
 251  * g_unichar_isupper:
 252  * @c: a Unicode character
 253  *
 254  * Determines if a character is uppercase.
 255  *
 256  * Return value: %TRUE if @c is an uppercase character
 257  **/
 258 gboolean
 259 g_unichar_isupper (gunichar c)
 260 {
 261   return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
 262 }
 263
 264 /**
 265  * g_unichar_istitle:
 266  * @c: a Unicode character
 267  *
 268  * Determines if a character is titlecase. Some characters in
 269  * Unicode which are composites, such as the DZ digraph
 270  * have three case variants instead of just two. The titlecase
 271  * form is used at the beginning of a word where only the
 272  * first letter is capitalized. The titlecase form of the DZ
 273  * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
 274  *
 275  * Return value: %TRUE if the character is titlecase
 276  **/
 277 gboolean
 278 g_unichar_istitle (gunichar c)
 279 {
 280   unsigned int i;
 281   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 282     if (title_table[i][0] == c)
 283       return 1;
 284   return 0;
 285 }
 286
 287 /**
 288  * g_unichar_isxdigit:
 289  * @c: a Unicode character.
 290  *
 291  * Determines if a character is a hexidecimal digit.
 292  *
 293  * Return value: %TRUE if the character is a hexadecimal digit
 294  **/
 295 gboolean
 296 g_unichar_isxdigit (gunichar c)
 297 {
 298   int t = TYPE (c);
 299   return ((c >= 'a' && c <= 'f')
 300           || (c >= 'A' && c <= 'F')
 301           || ISDIGIT (t));
 302 }
 303
 304 /**
 305  * g_unichar_isdefined:
 306  * @c: a Unicode character
 307  *
 308  * Determines if a given character is assigned in the Unicode
 309  * standard.
 310  *
 311  * Return value: %TRUE if the character has an assigned value
 312  **/
 313 gboolean
 314 g_unichar_isdefined (gunichar c)
 315 {
 316   int t = TYPE (c);
 317   return t != G_UNICODE_UNASSIGNED;
 318 }
 319
 320 /**
 321  * g_unichar_iswide:
 322  * @c: a Unicode character
 323  *
 324  * Determines if a character is typically rendered in a double-width
 325  * cell.
 326  *
 327  * Return value: %TRUE if the character is wide
 328  **/
 329 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
 330 gboolean
 331 g_unichar_iswide (gunichar c)
 332 {
 333   if (c < 0x1100)
 334     return 0;
 335
 336   return ((c >= 0x1100 && c <= 0x115f)     /* Hangul Jamo */
 337           || (c >= 0x2e80 && c <= 0xa4cf && (c & ~0x0011) != 0x300a &&
 338               c != 0x303f)                 /* CJK ... Yi */
 339           || (c >= 0xac00 && c <= 0xd7a3)  /* Hangul Syllables */
 340           || (c >= 0xf900 && c <= 0xfaff)  /* CJK Compatibility Ideographs */
 341           || (c >= 0xfe30 && c <= 0xfe6f)  /* CJK Compatibility Forms */
 342           || (c >= 0xff00 && c <= 0xff5f)  /* Fullwidth Forms */
 343           || (c >= 0xffe0 && c <= 0xffe6));
 344 }
 345
 346 /**
 347  * g_unichar_toupper:
 348  * @c: a Unicode character
 349  *
 350  * Converts a character to uppercase.
 351  *
 352  * Return value: the result of converting @c to uppercase.
 353  *               If @c is not an lowercase or titlecase character,
 354  *               or has no upper case equivalent @c is returned unchanged.
 355  **/
 356 gunichar
 357 g_unichar_toupper (gunichar c)
 358 {
 359   int t = TYPE (c);
 360   if (t == G_UNICODE_LOWERCASE_LETTER)
 361     {
 362       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 363       if (val >= 0xd800 && val < 0xdc00)
 364         {
 365           const guchar *p = special_case_table[val - 0xd800];
 366           return p[0] * 256 + p[1];
 367         }
 368       else
 369         return val ? val : c;
 370     }
 371   else if (t == G_UNICODE_TITLECASE_LETTER)
 372     {
 373       unsigned int i;
 374       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 375         {
 376           if (title_table[i][0] == c)
 377             return title_table[i][1];
 378         }
 379     }
 380   return c;
 381 }
 382
 383 /**
 384  * g_unichar_tolower:
 385  * @c: a Unicode character.
 386  *
 387  * Converts a character to lower case.
 388  *
 389  * Return value: the result of converting @c to lower case.
 390  *               If @c is not an upperlower or titlecase character,
 391  *               or has no lowercase equivalent @c is returned unchanged.
 392  **/
 393 gunichar
 394 g_unichar_tolower (gunichar c)
 395 {
 396   int t = TYPE (c);
 397   if (t == G_UNICODE_UPPERCASE_LETTER)
 398     {
 399       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 400       if (val >= 0xd800 && val < 0xdc00)
 401         {
 402           const guchar *p = special_case_table[val - 0xd800];
 403           return p[0] * 256 + p[1];
 404         }
 405       else
 406         return val ? val : c;
 407     }
 408   else if (t == G_UNICODE_TITLECASE_LETTER)
 409     {
 410       unsigned int i;
 411       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 412         {
 413           if (title_table[i][0] == c)
 414             return title_table[i][2];
 415         }
 416     }
 417   return c;
 418 }
 419
 420 /**
 421  * g_unichar_totitle:
 422  * @c: a Unicode character
 423  *
 424  * Converts a character to the titlecase.
 425  *
 426  * Return value: the result of converting @c to titlecase.
 427  *               If @c is not an uppercase or lowercase character,
 428  *               @c is returned unchanged.
 429  **/
 430 gunichar
 431 g_unichar_totitle (gunichar c)
 432 {
 433   unsigned int i;
 434   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 435     {
 436       if (title_table[i][0] == c || title_table[i][1] == c
 437           || title_table[i][2] == c)
 438         return title_table[i][0];
 439     }
 440   return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
 441           ? ATTTABLE (c >> 8, c & 0xff)
 442           : c);
 443 }
 444
 445 /**
 446  * g_unichar_digit_value:
 447  * @c: a Unicode character
 448  *
 449  * Determines the numeric value of a character as a decimal
 450  * digit.
 451  *
 452  * Return value: If @c is a decimal digit (according to
 453  * g_unichar_isdigit()), its numeric value. Otherwise, -1.
 454  **/
 455 int
 456 g_unichar_digit_value (gunichar c)
 457 {
 458   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 459     return ATTTABLE (c >> 8, c & 0xff);
 460   return -1;
 461 }
 462
 463 /**
 464  * g_unichar_xdigit_value:
 465  * @c: a Unicode character
 466  *
 467  * Determines the numeric value of a character as a hexidecimal
 468  * digit.
 469  *
 470  * Return value: If @c is a hex digit (according to
 471  * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
 472  **/
 473 int
 474 g_unichar_xdigit_value (gunichar c)
 475 {
 476   if (c >= 'A' && c <= 'F')
 477     return c - 'A' + 10;
 478   if (c >= 'a' && c <= 'f')
 479     return c - 'a' + 10;
 480   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 481     return ATTTABLE (c >> 8, c & 0xff);
 482   return -1;
 483 }
 484
 485 /**
 486  * g_unichar_type:
 487  * @c: a Unicode character
 488  *
 489  * Classifies a Unicode character by type.
 490  *
 491  * Return value: the type of the character.
 492  **/
 493 GUnicodeType
 494 g_unichar_type (gunichar c)
 495 {
 496   return TYPE (c);
 497 }
 498
 499 /*
 500  * Case mapping functions
 501  */
 502
 503 typedef enum {
 504   LOCALE_NORMAL,
 505   LOCALE_TURKIC,
 506   LOCALE_LITHUANIAN
 507 } LocaleType;
 508
 509 static LocaleType
 510 get_locale_type (void)
 511 {
 512   const char *locale = setlocale (LC_CTYPE, NULL);
 513
 514   switch (locale[0])
 515     {
 516    case 'a':
 517       if (locale[1] == 'z')
 518         return LOCALE_TURKIC;
 519       break;
 520     case 'l':
 521       if (locale[1] == 't')
 522         return LOCALE_LITHUANIAN;
 523       break;
 524     case 't':
 525       if (locale[1] == 'r')
 526         return LOCALE_TURKIC;
 527       break;
 528     }
 529
 530   return LOCALE_NORMAL;
 531 }
 532
 533 static int
 534 output_marks (const char **p_inout,
 535               char        *out_buffer,
 536               int          len,
 537               gboolean     remove_dot)
 538 {
 539   const char *p = *p_inout;
 540
 541   while (*p)
 542     {
 543       gunichar c = g_utf8_get_char (p);
 544       int t = TYPE(c);
 545
 546       if (ISMARK(t))
 547         {
 548           if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
 549             len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
 550           p = g_utf8_next_char (p);
 551         }
 552       else
 553         break;
 554     }
 555
 556   *p_inout = p;
 557   return len;
 558 }
 559
 560 static gsize
 561 output_special_case (gchar *out_buffer,
 562                      gsize  len,
 563                      int    index,
 564                      int    type,
 565                      int    which)
 566 {
 567   const guchar *p = special_case_table[index];
 568
 569   if (type != G_UNICODE_TITLECASE_LETTER)
 570     p += 2; /* +2 to skip over "best single match" */
 571
 572   if (which == 1)
 573     {
 574       while (p[0] && p[1])
 575         p += 2;
 576       p += 2;
 577     }
 578
 579   while (TRUE)
 580     {
 581       gunichar ch = p[0] * 256 + p[1];
 582       if (!ch)
 583         break;
 584
 585       len += g_unichar_to_utf8 (ch, out_buffer ? out_buffer + len : NULL);
 586       p += 2;
 587     }
 588
 589   return len;
 590 }
 591
 592 static gsize
 593 real_toupper (const gchar *str,
 594               gssize       max_len,
 595               gchar       *out_buffer,
 596               LocaleType   locale_type)
 597 {
 598   const gchar *p = str;
 599   const char *last = NULL;
 600   gsize len = 0;
 601   gboolean last_was_i = FALSE;
 602
 603   while ((max_len < 0 || p < str + max_len) && *p)
 604     {
 605       gunichar c = g_utf8_get_char (p);
 606       int t = TYPE (c);
 607       gunichar val;
 608
 609       last = p;
 610       p = g_utf8_next_char (p);
 611
 612       if (locale_type == LOCALE_LITHUANIAN)
 613         {
 614           if (c == 'i')
 615             last_was_i = TRUE;
 616           else
 617             {
 618               if (last_was_i)
 619                 {
 620                   /* Nasty, need to remove any dot above. Though
 621                    * I think only E WITH DOT ABOVE occurs in practice
 622                    * which could simplify this considerably.
 623                    */
 624                   gsize decomp_len, i;
 625                   gunichar *decomp;
 626
 627                   decomp = g_unicode_canonical_decomposition (c, &decomp_len);
 628                   for (i=0; i < decomp_len; i++)
 629                     {
 630                       if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
 631                         len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
 632                     }
 633                   g_free (decomp);
 634
 635                   len = output_marks (&p, out_buffer, len, TRUE);
 636
 637                   continue;
 638                 }
 639
 640               if (!ISMARK(t))
 641                 last_was_i = FALSE;
 642             }
 643         }
 644
 645       if (locale_type == LOCALE_TURKIC && c == 'i')
 646         {
 647           /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
 648           len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL);
 649         }
 650       else if (c == 0x0345)     /* COMBINING GREEK YPOGEGRAMMENI */
 651         {
 652           /* Nasty, need to move it after other combining marks .. this would go away if
 653            * we normalized first.
 654            */
 655           len = output_marks (&p, out_buffer, len, FALSE);
 656
 657           /* And output as GREEK CAPITAL LETTER IOTA */
 658           len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL);
 659         }
 660       else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 661         {
 662           val = ATTTABLE (c >> 8, c & 0xff);
 663
 664           if (val >= 0xd800 && val < 0xdc00)
 665             {
 666               len += output_special_case (out_buffer, len, val - 0xd800, t,
 667                                           t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
 668             }
 669           else
 670             {
 671               if (t == G_UNICODE_TITLECASE_LETTER)
 672                 {
 673                   unsigned int i;
 674                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 675                     {
 676                       if (title_table[i][0] == c)
 677                         val = title_table[i][1];
 678                     }
 679                 }
 680
 681               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 682             }
 683         }
 684       else
 685         {
 686           gsize char_len = g_utf8_skip[*(guchar *)last];
 687
 688           if (out_buffer)
 689             memcpy (out_buffer + len, last, char_len);
 690
 691           len += char_len;
 692         }
 693
 694     }
 695
 696   return len;
 697 }
 698
 699 /**
 700  * g_utf8_strup:
 701  * @str: a UTF-8 encoded string
 702  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 703  *
 704  * Converts all Unicode characters in the string that have a case
 705  * to uppercase. The exact manner that this is done depends
 706  * on the current locale, and may result in the number of
 707  * characters in the string increasing. (For instance, the
 708  * German ess-zet will be changed to SS.)
 709  *
 710  * Return value: a newly allocated string, with all characters
 711  *    converted to uppercase.
 712  **/
 713 gchar *
 714 g_utf8_strup (const gchar *str,
 715               gssize       len)
 716 {
 717   gsize result_len;
 718   LocaleType locale_type;
 719   gchar *result;
 720
 721   g_return_val_if_fail (str != NULL, NULL);
 722
 723   locale_type = get_locale_type ();
 724
 725   /*
 726    * We use a two pass approach to keep memory management simple
 727    */
 728   result_len = real_toupper (str, len, NULL, locale_type);
 729   result = g_malloc (result_len + 1);
 730   real_toupper (str, len, result, locale_type);
 731   result[result_len] = '\0';
 732
 733   return result;
 734 }
 735
 736 static gsize
 737 real_tolower (const gchar *str,
 738               gssize       max_len,
 739               gchar       *out_buffer,
 740               LocaleType   locale_type)
 741 {
 742   const gchar *p = str;
 743   const char *last = NULL;
 744   gsize len = 0;
 745
 746   while ((max_len < 0 || p < str + max_len) && *p)
 747     {
 748       gunichar c = g_utf8_get_char (p);
 749       int t = TYPE (c);
 750       gunichar val;
 751
 752       last = p;
 753       p = g_utf8_next_char (p);
 754
 755       if (locale_type == LOCALE_TURKIC && c == 'I')
 756         {
 757           /* I => LATIN SMALL LETTER DOTLESS I */
 758           len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL);
 759         }
 760       else if (c == 0x03A3)     /* GREEK CAPITAL LETTER SIGMA */
 761         {
 762           if ((max_len < 0 || p < str + max_len) && *p)
 763             {
 764               gunichar next_c = g_utf8_get_char (p);
 765               int next_type = TYPE(next_c);
 766
 767               /* SIGMA mapps differently depending on whether it is
 768                * final or not. The following simplified test would
 769                * fail in the case of combining marks following the
 770                * sigma, but I don't think that occurs in real text.
 771                * The test here matches that in ICU.
 772                */
 773               if (ISALPHA(next_type)) /* Lu,Ll,Lt,Lm,Lo */
 774                 val = 0x3c3;    /* GREEK SMALL SIGMA */
 775               else
 776                 val = 0x3c2;    /* GREEK SMALL FINAL SIGMA */
 777             }
 778           else
 779             val = 0x3c2;        /* GREEK SMALL FINAL SIGMA */
 780
 781           len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 782         }
 783       else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 784         {
 785           val = ATTTABLE (c >> 8, c & 0xff);
 786
 787           if (val >= 0xd800 && val < 0xdc00)
 788             {
 789               len += output_special_case (out_buffer, len, val - 0xd800, t, 0);
 790             }
 791           else
 792             {
 793               if (t == G_UNICODE_TITLECASE_LETTER)
 794                 {
 795                   unsigned int i;
 796                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 797                     {
 798                       if (title_table[i][0] == c)
 799                         val = title_table[i][2];
 800                     }
 801                 }
 802
 803               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 804             }
 805         }
 806       else
 807         {
 808           gsize char_len = g_utf8_skip[*(guchar *)last];
 809
 810           if (out_buffer)
 811             memcpy (out_buffer + len, last, char_len);
 812
 813           len += char_len;
 814         }
 815
 816     }
 817
 818   return len;
 819 }
 820
 821 /**
 822  * g_utf8_strdown:
 823  * @str: a UTF-8 encoded string
 824  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 825  *
 826  * Converts all Unicode characters in the string that have a case
 827  * to lowercase. The exact manner that this is done depends
 828  * on the current locale, and may result in the number of
 829  * characters in the string changing.
 830  *
 831  * Return value: a newly allocated string, with all characters
 832  *    converted to lowercase.
 833  **/
 834 gchar *
 835 g_utf8_strdown (const gchar *str,
 836                 gssize       len)
 837 {
 838   gsize result_len;
 839   LocaleType locale_type;
 840   gchar *result;
 841
 842   g_return_val_if_fail (str != NULL, NULL);
 843
 844   locale_type = get_locale_type ();
 845
 846   /*
 847    * We use a two pass approach to keep memory management simple
 848    */
 849   result_len = real_tolower (str, len, NULL, locale_type);
 850   result = g_malloc (result_len + 1);
 851   real_tolower (str, len, result, locale_type);
 852   result[result_len] = '\0';
 853
 854   return result;
 855 }
 856
 857 /**
 858  * g_utf8_casefold:
 859  * @str: a UTF-8 encoded string
 860  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 861  *
 862  * Converts a string into a form that is independent of case. The
 863  * result will not correspond to any particular case, but can be
 864  * compared for equality or ordered with the results of calling
 865  * g_utf8_casefold() on other strings.
 866  *
 867  * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
 868  * only an approximation to the correct linguistic case insensitive
 869  * ordering, though it is a fairly good one. Getting this exactly
 870  * right would require a more sophisticated collation function that
 871  * takes case sensitivity into account. GLib does not currently
 872  * provide such a function.
 873  *
 874  * Return value: a newly allocated string, that is a
 875  *   case independent form of @str.
 876  **/
 877 gchar *
 878 g_utf8_casefold (const gchar *str,
 879                  gssize       len)
 880 {
 881   GString *result = g_string_new (NULL);
 882   const char *p;
 883
 884   p = str;
 885   while ((len < 0 || p < str + len) && *p)
 886     {
 887       gunichar ch = g_utf8_get_char (p);
 888
 889       int start = 0;
 890       int end = G_N_ELEMENTS (casefold_table);
 891
 892       if (ch >= casefold_table[start].ch &&
 893           ch <= casefold_table[end - 1].ch)
 894         {
 895           while (TRUE)
 896             {
 897               int half = (start + end) / 2;
 898               if (ch == casefold_table[half].ch)
 899                 {
 900                   g_string_append (result, casefold_table[half].data);
 901                   goto next;
 902                 }
 903               else if (half == start)
 904                 break;
 905               else if (ch > casefold_table[half].ch)
 906                 start = half;
 907               else
 908                 end = half;
 909             }
 910         }
 911
 912       g_string_append_unichar (result, g_unichar_tolower (ch));
 913
 914     next:
 915       p = g_utf8_next_char (p);
 916     }
 917
 918   return g_string_free (result, FALSE);
 919 }