glib/guniprop.c

   1 /* guniprop.c - Unicode character properties.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include <stddef.h>
  25 #include <string.h>
  26 #include <locale.h>
  27
  28 #include "glib.h"
  29 #include "gunichartables.h"
  30
  31
  32 #define ATTTABLE(Page, Char) \
  33   ((attr_table[Page] == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[attr_table[Page]][Char]))
  34
  35 #define TTYPE(Page, Char) \
  36   ((type_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
  37    ? (type_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
  38    : (type_data[type_table[Page]][Char]))
  39
  40
  41 #define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
  42
  43 #define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER       \
  44                        || (Type) == G_UNICODE_LETTER_NUMBER     \
  45                        || (Type) == G_UNICODE_OTHER_NUMBER)
  46
  47 #define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER     \
  48                        || (Type) == G_UNICODE_UPPERCASE_LETTER  \
  49                        || (Type) == G_UNICODE_TITLECASE_LETTER  \
  50                        || (Type) == G_UNICODE_MODIFIER_LETTER   \
  51                        || (Type) == G_UNICODE_OTHER_LETTER)
  52
  53 #define ISMARK(Type) ((Type) == G_UNICODE_NON_SPACING_MARK ||   \
  54                       (Type) == G_UNICODE_COMBINING_MARK ||     \
  55                       (Type) == G_UNICODE_ENCLOSING_MARK)
  56
  57
  58 /**
  59  * g_unichar_isalnum:
  60  * @c: a Unicode character
  61  *
  62  * Determines whether a character is alphanumeric.
  63  * Given some UTF-8 text, obtain a character value
  64  * with g_utf8_get_char().
  65  *
  66  * Return value: %TRUE if @c is an alphanumeric character
  67  **/
  68 gboolean
  69 g_unichar_isalnum (gunichar c)
  70 {
  71   int t = TYPE (c);
  72   return ISDIGIT (t) || ISALPHA (t);
  73 }
  74
  75 /**
  76  * g_unichar_isalpha:
  77  * @c: a Unicode character
  78  *
  79  * Determines whether a character is alphabetic (i.e. a letter).
  80  * Given some UTF-8 text, obtain a character value with
  81  * g_utf8_get_char().
  82  *
  83  * Return value: %TRUE if @c is an alphabetic character
  84  **/
  85 gboolean
  86 g_unichar_isalpha (gunichar c)
  87 {
  88   int t = TYPE (c);
  89   return ISALPHA (t);
  90 }
  91
  92
  93 /**
  94  * g_unichar_iscntrl:
  95  * @c: a Unicode character
  96  *
  97  * Determines whether a character is a control character.
  98  * Given some UTF-8 text, obtain a character value with
  99  * g_utf8_get_char().
 100  *
 101  * Return value: %TRUE if @c is a control character
 102  **/
 103 gboolean
 104 g_unichar_iscntrl (gunichar c)
 105 {
 106   return TYPE (c) == G_UNICODE_CONTROL;
 107 }
 108
 109 /**
 110  * g_unichar_isdigit:
 111  * @c: a Unicode character
 112  *
 113  * Determines whether a character is numeric (i.e. a digit).  This
 114  * covers ASCII 0-9 and also digits in other languages/scripts.  Given
 115  * some UTF-8 text, obtain a character value with g_utf8_get_char().
 116  *
 117  * Return value: %TRUE if @c is a digit
 118  **/
 119 gboolean
 120 g_unichar_isdigit (gunichar c)
 121 {
 122   return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
 123 }
 124
 125
 126 /**
 127  * g_unichar_isgraph:
 128  * @c: a Unicode character
 129  *
 130  * Determines whether a character is printable and not a space
 131  * (returns %FALSE for control characters, format characters, and
 132  * spaces). g_unichar_isprint() is similar, but returns %TRUE for
 133  * spaces. Given some UTF-8 text, obtain a character value with
 134  * g_utf8_get_char().
 135  *
 136  * Return value: %TRUE if @c is printable unless it's a space
 137  **/
 138 gboolean
 139 g_unichar_isgraph (gunichar c)
 140 {
 141   int t = TYPE (c);
 142   return (t != G_UNICODE_CONTROL
 143           && t != G_UNICODE_FORMAT
 144           && t != G_UNICODE_UNASSIGNED
 145           && t != G_UNICODE_PRIVATE_USE
 146           && t != G_UNICODE_SURROGATE
 147           && t != G_UNICODE_SPACE_SEPARATOR);
 148 }
 149
 150 /**
 151  * g_unichar_islower:
 152  * @c: a Unicode character
 153  *
 154  * Determines whether a character is a lowercase letter.
 155  * Given some UTF-8 text, obtain a character value with
 156  * g_utf8_get_char().
 157  *
 158  * Return value: %TRUE if @c is a lowercase letter
 159  **/
 160 gboolean
 161 g_unichar_islower (gunichar c)
 162 {
 163   return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
 164 }
 165
 166
 167 /**
 168  * g_unichar_isprint:
 169  * @c: a Unicode character
 170  *
 171  * Determines whether a character is printable.
 172  * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
 173  * Given some UTF-8 text, obtain a character value with
 174  * g_utf8_get_char().
 175  *
 176  * Return value: %TRUE if @c is printable
 177  **/
 178 gboolean
 179 g_unichar_isprint (gunichar c)
 180 {
 181   int t = TYPE (c);
 182   return (t != G_UNICODE_CONTROL
 183           && t != G_UNICODE_FORMAT
 184           && t != G_UNICODE_UNASSIGNED
 185           && t != G_UNICODE_PRIVATE_USE
 186           && t != G_UNICODE_SURROGATE);
 187 }
 188
 189 /**
 190  * g_unichar_ispunct:
 191  * @c: a Unicode character
 192  *
 193  * Determines whether a character is punctuation or a symbol.
 194  * Given some UTF-8 text, obtain a character value with
 195  * g_utf8_get_char().
 196  *
 197  * Return value: %TRUE if @c is a punctuation or symbol character
 198  **/
 199 gboolean
 200 g_unichar_ispunct (gunichar c)
 201 {
 202   int t = TYPE (c);
 203   return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
 204           || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
 205           || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
 206           || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL
 207           || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL
 208           || t == G_UNICODE_OTHER_SYMBOL);
 209 }
 210
 211 /**
 212  * g_unichar_isspace:
 213  * @c: a Unicode character
 214  *
 215  * Determines whether a character is a space, tab, or line separator
 216  * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
 217  * character value with g_utf8_get_char().
 218  *
 219  * (Note: don't use this to do word breaking; you have to use
 220  * Pango or equivalent to get word breaking right, the algorithm
 221  * is fairly complex.)
 222  *
 223  * Return value: %TRUE if @c is a punctuation character
 224  **/
 225 gboolean
 226 g_unichar_isspace (gunichar c)
 227 {
 228   switch (c)
 229     {
 230       /* special-case these since Unicode thinks they are not spaces */
 231     case '\t':
 232     case '\n':
 233     case '\r':
 234     case '\f':
 235       return TRUE;
 236       break;
 237
 238     default:
 239       {
 240         int t = TYPE (c);
 241         return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
 242                 || t == G_UNICODE_PARAGRAPH_SEPARATOR);
 243       }
 244       break;
 245     }
 246 }
 247
 248 /**
 249  * g_unichar_isupper:
 250  * @c: a Unicode character
 251  *
 252  * Determines if a character is uppercase.
 253  *
 254  * Return value: %TRUE if @c is an uppercase character
 255  **/
 256 gboolean
 257 g_unichar_isupper (gunichar c)
 258 {
 259   return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
 260 }
 261
 262 /**
 263  * g_unichar_istitle:
 264  * @c: a Unicode character
 265  *
 266  * Determines if a character is titlecase. Some characters in
 267  * Unicode which are composites, such as the DZ digraph
 268  * have three case variants instead of just two. The titlecase
 269  * form is used at the beginning of a word where only the
 270  * first letter is capitalized. The titlecase form of the DZ
 271  * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
 272  *
 273  * Return value: %TRUE if the character is titlecase
 274  **/
 275 gboolean
 276 g_unichar_istitle (gunichar c)
 277 {
 278   unsigned int i;
 279   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 280     if (title_table[i][0] == c)
 281       return 1;
 282   return 0;
 283 }
 284
 285 /**
 286  * g_unichar_isxdigit:
 287  * @c: a Unicode character.
 288  *
 289  * Determines if a character is a hexidecimal digit.
 290  *
 291  * Return value: %TRUE if the character is a hexadecimal digit
 292  **/
 293 gboolean
 294 g_unichar_isxdigit (gunichar c)
 295 {
 296   int t = TYPE (c);
 297   return ((c >= 'a' && c <= 'f')
 298           || (c >= 'A' && c <= 'F')
 299           || ISDIGIT (t));
 300 }
 301
 302 /**
 303  * g_unichar_isdefined:
 304  * @c: a Unicode character
 305  *
 306  * Determines if a given character is assigned in the Unicode
 307  * standard.
 308  *
 309  * Return value: %TRUE if the character has an assigned value
 310  **/
 311 gboolean
 312 g_unichar_isdefined (gunichar c)
 313 {
 314   int t = TYPE (c);
 315   return t != G_UNICODE_UNASSIGNED;
 316 }
 317
 318 /**
 319  * g_unichar_iswide:
 320  * @c: a Unicode character
 321  *
 322  * Determines if a character is typically rendered in a double-width
 323  * cell.
 324  *
 325  * Return value: %TRUE if the character is wide
 326  **/
 327 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
 328 gboolean
 329 g_unichar_iswide (gunichar c)
 330 {
 331   if (c < 0x1100)
 332     return FALSE;
 333
 334   return (c <= 0x115f  /* Hangul Jamo init. consonants */
 335           || c == 0x2329 || c == 0x232a     /* angle brackets */
 336           || (c >= 0x2e80 && c <= 0xa4cf && (c < 0x302a || c > 0x302f)
 337               && c != 0x303f && c != 0x3099 && c!= 0x309a) /* CJK ... Yi */
 338           || (c >= 0xac00 && c <= 0xd7a3)   /* Hangul Syllables */
 339           || (c >= 0xf900 && c <= 0xfaff)   /* CJK Compatibility Ideographs */
 340           || (c >= 0xfe30 && c <= 0xfe6f)   /* CJK Compatibility Forms */
 341           || (c >= 0xff00 && c <= 0xff60)   /* Fullwidth Forms */
 342           || (c >= 0xffe0 && c <= 0xffe6)   /* Fullwidth Forms */
 343           || (c >= 0x20000 && c <= 0x2fffd) /* CJK extra stuff */
 344           || (c >= 0x30000 && c <= 0x3fffd));
 345 }
 346
 347 /**
 348  * g_unichar_toupper:
 349  * @c: a Unicode character
 350  *
 351  * Converts a character to uppercase.
 352  *
 353  * Return value: the result of converting @c to uppercase.
 354  *               If @c is not an lowercase or titlecase character,
 355  *               or has no upper case equivalent @c is returned unchanged.
 356  **/
 357 gunichar
 358 g_unichar_toupper (gunichar c)
 359 {
 360   int t = TYPE (c);
 361   if (t == G_UNICODE_LOWERCASE_LETTER)
 362     {
 363       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 364       if (val >= 0xd800 && val < 0xdc00)
 365         {
 366           const guchar *p = special_case_table[val - 0xd800];
 367           return p[0] * 256 + p[1];
 368         }
 369       else
 370         return val ? val : c;
 371     }
 372   else if (t == G_UNICODE_TITLECASE_LETTER)
 373     {
 374       unsigned int i;
 375       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 376         {
 377           if (title_table[i][0] == c)
 378             return title_table[i][1];
 379         }
 380     }
 381   return c;
 382 }
 383
 384 /**
 385  * g_unichar_tolower:
 386  * @c: a Unicode character.
 387  *
 388  * Converts a character to lower case.
 389  *
 390  * Return value: the result of converting @c to lower case.
 391  *               If @c is not an upperlower or titlecase character,
 392  *               or has no lowercase equivalent @c is returned unchanged.
 393  **/
 394 gunichar
 395 g_unichar_tolower (gunichar c)
 396 {
 397   int t = TYPE (c);
 398   if (t == G_UNICODE_UPPERCASE_LETTER)
 399     {
 400       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 401       if (val >= 0xd800 && val < 0xdc00)
 402         {
 403           const guchar *p = special_case_table[val - 0xd800];
 404           return p[0] * 256 + p[1];
 405         }
 406       else
 407         return val ? val : c;
 408     }
 409   else if (t == G_UNICODE_TITLECASE_LETTER)
 410     {
 411       unsigned int i;
 412       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 413         {
 414           if (title_table[i][0] == c)
 415             return title_table[i][2];
 416         }
 417     }
 418   return c;
 419 }
 420
 421 /**
 422  * g_unichar_totitle:
 423  * @c: a Unicode character
 424  *
 425  * Converts a character to the titlecase.
 426  *
 427  * Return value: the result of converting @c to titlecase.
 428  *               If @c is not an uppercase or lowercase character,
 429  *               @c is returned unchanged.
 430  **/
 431 gunichar
 432 g_unichar_totitle (gunichar c)
 433 {
 434   unsigned int i;
 435   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 436     {
 437       if (title_table[i][0] == c || title_table[i][1] == c
 438           || title_table[i][2] == c)
 439         return title_table[i][0];
 440     }
 441   return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
 442           ? ATTTABLE (c >> 8, c & 0xff)
 443           : c);
 444 }
 445
 446 /**
 447  * g_unichar_digit_value:
 448  * @c: a Unicode character
 449  *
 450  * Determines the numeric value of a character as a decimal
 451  * digit.
 452  *
 453  * Return value: If @c is a decimal digit (according to
 454  * g_unichar_isdigit()), its numeric value. Otherwise, -1.
 455  **/
 456 int
 457 g_unichar_digit_value (gunichar c)
 458 {
 459   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 460     return ATTTABLE (c >> 8, c & 0xff);
 461   return -1;
 462 }
 463
 464 /**
 465  * g_unichar_xdigit_value:
 466  * @c: a Unicode character
 467  *
 468  * Determines the numeric value of a character as a hexidecimal
 469  * digit.
 470  *
 471  * Return value: If @c is a hex digit (according to
 472  * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
 473  **/
 474 int
 475 g_unichar_xdigit_value (gunichar c)
 476 {
 477   if (c >= 'A' && c <= 'F')
 478     return c - 'A' + 10;
 479   if (c >= 'a' && c <= 'f')
 480     return c - 'a' + 10;
 481   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 482     return ATTTABLE (c >> 8, c & 0xff);
 483   return -1;
 484 }
 485
 486 /**
 487  * g_unichar_type:
 488  * @c: a Unicode character
 489  *
 490  * Classifies a Unicode character by type.
 491  *
 492  * Return value: the type of the character.
 493  **/
 494 GUnicodeType
 495 g_unichar_type (gunichar c)
 496 {
 497   return TYPE (c);
 498 }
 499
 500 /*
 501  * Case mapping functions
 502  */
 503
 504 typedef enum {
 505   LOCALE_NORMAL,
 506   LOCALE_TURKIC,
 507   LOCALE_LITHUANIAN
 508 } LocaleType;
 509
 510 static LocaleType
 511 get_locale_type (void)
 512 {
 513   const char *locale = setlocale (LC_CTYPE, NULL);
 514
 515   switch (locale[0])
 516     {
 517    case 'a':
 518       if (locale[1] == 'z')
 519         return LOCALE_TURKIC;
 520       break;
 521     case 'l':
 522       if (locale[1] == 't')
 523         return LOCALE_LITHUANIAN;
 524       break;
 525     case 't':
 526       if (locale[1] == 'r')
 527         return LOCALE_TURKIC;
 528       break;
 529     }
 530
 531   return LOCALE_NORMAL;
 532 }
 533
 534 static int
 535 output_marks (const char **p_inout,
 536               char        *out_buffer,
 537               int          len,
 538               gboolean     remove_dot)
 539 {
 540   const char *p = *p_inout;
 541
 542   while (*p)
 543     {
 544       gunichar c = g_utf8_get_char (p);
 545       int t = TYPE(c);
 546
 547       if (ISMARK(t))
 548         {
 549           if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
 550             len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
 551           p = g_utf8_next_char (p);
 552         }
 553       else
 554         break;
 555     }
 556
 557   *p_inout = p;
 558   return len;
 559 }
 560
 561 static gsize
 562 output_special_case (gchar *out_buffer,
 563                      gsize  len,
 564                      int    index,
 565                      int    type,
 566                      int    which)
 567 {
 568   const guchar *p = special_case_table[index];
 569
 570   if (type != G_UNICODE_TITLECASE_LETTER)
 571     p += 2; /* +2 to skip over "best single match" */
 572
 573   if (which == 1)
 574     {
 575       while (p[0] || p[1])
 576         p += 2;
 577       p += 2;
 578     }
 579
 580   while (TRUE)
 581     {
 582       gunichar ch = p[0] * 256 + p[1];
 583       if (!ch)
 584         break;
 585
 586       len += g_unichar_to_utf8 (ch, out_buffer ? out_buffer + len : NULL);
 587       p += 2;
 588     }
 589
 590   return len;
 591 }
 592
 593 static gsize
 594 real_toupper (const gchar *str,
 595               gssize       max_len,
 596               gchar       *out_buffer,
 597               LocaleType   locale_type)
 598 {
 599   const gchar *p = str;
 600   const char *last = NULL;
 601   gsize len = 0;
 602   gboolean last_was_i = FALSE;
 603
 604   while ((max_len < 0 || p < str + max_len) && *p)
 605     {
 606       gunichar c = g_utf8_get_char (p);
 607       int t = TYPE (c);
 608       gunichar val;
 609
 610       last = p;
 611       p = g_utf8_next_char (p);
 612
 613       if (locale_type == LOCALE_LITHUANIAN)
 614         {
 615           if (c == 'i')
 616             last_was_i = TRUE;
 617           else
 618             {
 619               if (last_was_i)
 620                 {
 621                   /* Nasty, need to remove any dot above. Though
 622                    * I think only E WITH DOT ABOVE occurs in practice
 623                    * which could simplify this considerably.
 624                    */
 625                   gsize decomp_len, i;
 626                   gunichar *decomp;
 627
 628                   decomp = g_unicode_canonical_decomposition (c, &decomp_len);
 629                   for (i=0; i < decomp_len; i++)
 630                     {
 631                       if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
 632                         len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
 633                     }
 634                   g_free (decomp);
 635
 636                   len = output_marks (&p, out_buffer, len, TRUE);
 637
 638                   continue;
 639                 }
 640
 641               if (!ISMARK(t))
 642                 last_was_i = FALSE;
 643             }
 644         }
 645
 646       if (locale_type == LOCALE_TURKIC && c == 'i')
 647         {
 648           /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
 649           len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL);
 650         }
 651       else if (c == 0x0345)     /* COMBINING GREEK YPOGEGRAMMENI */
 652         {
 653           /* Nasty, need to move it after other combining marks .. this would go away if
 654            * we normalized first.
 655            */
 656           len = output_marks (&p, out_buffer, len, FALSE);
 657
 658           /* And output as GREEK CAPITAL LETTER IOTA */
 659           len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL);
 660         }
 661       else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 662         {
 663           val = ATTTABLE (c >> 8, c & 0xff);
 664
 665           if (val >= 0xd800 && val < 0xdc00)
 666             {
 667               len += output_special_case (out_buffer, len, val - 0xd800, t,
 668                                           t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
 669             }
 670           else
 671             {
 672               if (t == G_UNICODE_TITLECASE_LETTER)
 673                 {
 674                   unsigned int i;
 675                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 676                     {
 677                       if (title_table[i][0] == c)
 678                         val = title_table[i][1];
 679                     }
 680                 }
 681
 682               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 683             }
 684         }
 685       else
 686         {
 687           gsize char_len = g_utf8_skip[*(guchar *)last];
 688
 689           if (out_buffer)
 690             memcpy (out_buffer + len, last, char_len);
 691
 692           len += char_len;
 693         }
 694
 695     }
 696
 697   return len;
 698 }
 699
 700 /**
 701  * g_utf8_strup:
 702  * @str: a UTF-8 encoded string
 703  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 704  *
 705  * Converts all Unicode characters in the string that have a case
 706  * to uppercase. The exact manner that this is done depends
 707  * on the current locale, and may result in the number of
 708  * characters in the string increasing. (For instance, the
 709  * German ess-zet will be changed to SS.)
 710  *
 711  * Return value: a newly allocated string, with all characters
 712  *    converted to uppercase.
 713  **/
 714 gchar *
 715 g_utf8_strup (const gchar *str,
 716               gssize       len)
 717 {
 718   gsize result_len;
 719   LocaleType locale_type;
 720   gchar *result;
 721
 722   g_return_val_if_fail (str != NULL, NULL);
 723
 724   locale_type = get_locale_type ();
 725
 726   /*
 727    * We use a two pass approach to keep memory management simple
 728    */
 729   result_len = real_toupper (str, len, NULL, locale_type);
 730   result = g_malloc (result_len + 1);
 731   real_toupper (str, len, result, locale_type);
 732   result[result_len] = '\0';
 733
 734   return result;
 735 }
 736
 737 static gsize
 738 real_tolower (const gchar *str,
 739               gssize       max_len,
 740               gchar       *out_buffer,
 741               LocaleType   locale_type)
 742 {
 743   const gchar *p = str;
 744   const char *last = NULL;
 745   gsize len = 0;
 746
 747   while ((max_len < 0 || p < str + max_len) && *p)
 748     {
 749       gunichar c = g_utf8_get_char (p);
 750       int t = TYPE (c);
 751       gunichar val;
 752
 753       last = p;
 754       p = g_utf8_next_char (p);
 755
 756       if (locale_type == LOCALE_TURKIC && c == 'I')
 757         {
 758           /* I => LATIN SMALL LETTER DOTLESS I */
 759           len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL);
 760         }
 761       else if (c == 0x03A3)     /* GREEK CAPITAL LETTER SIGMA */
 762         {
 763           if ((max_len < 0 || p < str + max_len) && *p)
 764             {
 765               gunichar next_c = g_utf8_get_char (p);
 766               int next_type = TYPE(next_c);
 767
 768               /* SIGMA mapps differently depending on whether it is
 769                * final or not. The following simplified test would
 770                * fail in the case of combining marks following the
 771                * sigma, but I don't think that occurs in real text.
 772                * The test here matches that in ICU.
 773                */
 774               if (ISALPHA(next_type)) /* Lu,Ll,Lt,Lm,Lo */
 775                 val = 0x3c3;    /* GREEK SMALL SIGMA */
 776               else
 777                 val = 0x3c2;    /* GREEK SMALL FINAL SIGMA */
 778             }
 779           else
 780             val = 0x3c2;        /* GREEK SMALL FINAL SIGMA */
 781
 782           len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 783         }
 784       else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 785         {
 786           val = ATTTABLE (c >> 8, c & 0xff);
 787
 788           if (val >= 0xd800 && val < 0xdc00)
 789             {
 790               len += output_special_case (out_buffer, len, val - 0xd800, t, 0);
 791             }
 792           else
 793             {
 794               if (t == G_UNICODE_TITLECASE_LETTER)
 795                 {
 796                   unsigned int i;
 797                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 798                     {
 799                       if (title_table[i][0] == c)
 800                         val = title_table[i][2];
 801                     }
 802                 }
 803
 804               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 805             }
 806         }
 807       else
 808         {
 809           gsize char_len = g_utf8_skip[*(guchar *)last];
 810
 811           if (out_buffer)
 812             memcpy (out_buffer + len, last, char_len);
 813
 814           len += char_len;
 815         }
 816
 817     }
 818
 819   return len;
 820 }
 821
 822 /**
 823  * g_utf8_strdown:
 824  * @str: a UTF-8 encoded string
 825  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 826  *
 827  * Converts all Unicode characters in the string that have a case
 828  * to lowercase. The exact manner that this is done depends
 829  * on the current locale, and may result in the number of
 830  * characters in the string changing.
 831  *
 832  * Return value: a newly allocated string, with all characters
 833  *    converted to lowercase.
 834  **/
 835 gchar *
 836 g_utf8_strdown (const gchar *str,
 837                 gssize       len)
 838 {
 839   gsize result_len;
 840   LocaleType locale_type;
 841   gchar *result;
 842
 843   g_return_val_if_fail (str != NULL, NULL);
 844
 845   locale_type = get_locale_type ();
 846
 847   /*
 848    * We use a two pass approach to keep memory management simple
 849    */
 850   result_len = real_tolower (str, len, NULL, locale_type);
 851   result = g_malloc (result_len + 1);
 852   real_tolower (str, len, result, locale_type);
 853   result[result_len] = '\0';
 854
 855   return result;
 856 }
 857
 858 /**
 859  * g_utf8_casefold:
 860  * @str: a UTF-8 encoded string
 861  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 862  *
 863  * Converts a string into a form that is independent of case. The
 864  * result will not correspond to any particular case, but can be
 865  * compared for equality or ordered with the results of calling
 866  * g_utf8_casefold() on other strings.
 867  *
 868  * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
 869  * only an approximation to the correct linguistic case insensitive
 870  * ordering, though it is a fairly good one. Getting this exactly
 871  * right would require a more sophisticated collation function that
 872  * takes case sensitivity into account. GLib does not currently
 873  * provide such a function.
 874  *
 875  * Return value: a newly allocated string, that is a
 876  *   case independent form of @str.
 877  **/
 878 gchar *
 879 g_utf8_casefold (const gchar *str,
 880                  gssize       len)
 881 {
 882   GString *result = g_string_new (NULL);
 883   const char *p;
 884
 885   p = str;
 886   while ((len < 0 || p < str + len) && *p)
 887     {
 888       gunichar ch = g_utf8_get_char (p);
 889
 890       int start = 0;
 891       int end = G_N_ELEMENTS (casefold_table);
 892
 893       if (ch >= casefold_table[start].ch &&
 894           ch <= casefold_table[end - 1].ch)
 895         {
 896           while (TRUE)
 897             {
 898               int half = (start + end) / 2;
 899               if (ch == casefold_table[half].ch)
 900                 {
 901                   g_string_append (result, casefold_table[half].data);
 902                   goto next;
 903                 }
 904               else if (half == start)
 905                 break;
 906               else if (ch > casefold_table[half].ch)
 907                 start = half;
 908               else
 909                 end = half;
 910             }
 911         }
 912
 913       g_string_append_unichar (result, g_unichar_tolower (ch));
 914
 915     next:
 916       p = g_utf8_next_char (p);
 917     }
 918
 919   return g_string_free (result, FALSE);
 920 }