glib/guniprop.c

   1 /* guniprop.c - Unicode character properties.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include <stddef.h>
  25 #include <string.h>
  26 #include <locale.h>
  27
  28 #include "glib.h"
  29 #include "gunichartables.h"
  30
  31
/* Attribute lookup for the character at offset Char within the
 * 256-character page Page.  A page whose attr_table entry equals
 * G_UNICODE_MAX_TABLE_INDEX has no attribute row and yields 0.
 * Callers in this file read the value either as a case mapping or as
 * a digit value, depending on the character's type.  The macro does no
 * range check of its own; callers test TYPE() first.  */
#define ATTTABLE(Page, Char) \
  ((attr_table[Page] == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[attr_table[Page]][Char]))

/* Type lookup for the character at offset Char within page Page.
 * A type_table entry at or above G_UNICODE_MAX_TABLE_INDEX encodes one
 * type shared by the whole page (the entry minus that constant); any
 * smaller entry selects a per-character row of type_data.  */
#define TTYPE(Page, Char) \
  ((type_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table[Page]][Char]))


/* Type of an arbitrary character value: anything above
 * G_UNICODE_LAST_CHAR is reported as unassigned, everything else is
 * split into a page (high bits) and an offset (low 8 bits).  */
#define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))

/* True for every numeric type, not only decimal digits.  */
#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER       \
                       || (Type) == G_UNICODE_LETTER_NUMBER     \
                       || (Type) == G_UNICODE_OTHER_NUMBER)

/* True for every letter type (upper, lower, title, modifier, other).  */
#define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER     \
                       || (Type) == G_UNICODE_UPPERCASE_LETTER  \
                       || (Type) == G_UNICODE_TITLECASE_LETTER  \
                       || (Type) == G_UNICODE_MODIFIER_LETTER   \
                       || (Type) == G_UNICODE_OTHER_LETTER)

/* True for every mark type (non-spacing, combining, enclosing).  */
#define ISMARK(Type) ((Type) == G_UNICODE_NON_SPACING_MARK ||   \
                      (Type) == G_UNICODE_COMBINING_MARK ||     \
                      (Type) == G_UNICODE_ENCLOSING_MARK)
  58
  59
  60 /**
  61  * g_unichar_isalnum:
  62  * @c: a Unicode character
  63  *
  64  * Determines whether a character is alphanumeric.
  65  * Given some UTF-8 text, obtain a character value
  66  * with g_utf8_get_char().
  67  *
  68  * Return value: %TRUE if @c is an alphanumeric character
  69  **/
  70 gboolean
  71 g_unichar_isalnum (gunichar c)
  72 {
  73   int t = TYPE (c);
  74   return ISDIGIT (t) || ISALPHA (t);
  75 }
  76
  77 /**
  78  * g_unichar_isalpha:
  79  * @c: a Unicode character
  80  *
  81  * Determines whether a character is alphabetic (i.e. a letter).
  82  * Given some UTF-8 text, obtain a character value with
  83  * g_utf8_get_char().
  84  *
  85  * Return value: %TRUE if @c is an alphabetic character
  86  **/
  87 gboolean
  88 g_unichar_isalpha (gunichar c)
  89 {
  90   int t = TYPE (c);
  91   return ISALPHA (t);
  92 }
  93
  94
  95 /**
  96  * g_unichar_iscntrl:
  97  * @c: a Unicode character
  98  *
  99  * Determines whether a character is a control character.
 100  * Given some UTF-8 text, obtain a character value with
 101  * g_utf8_get_char().
 102  *
 103  * Return value: %TRUE if @c is a control character
 104  **/
 105 gboolean
 106 g_unichar_iscntrl (gunichar c)
 107 {
 108   return TYPE (c) == G_UNICODE_CONTROL;
 109 }
 110
 111 /**
 112  * g_unichar_isdigit:
 113  * @c: a Unicode character
 114  *
 115  * Determines whether a character is numeric (i.e. a digit).  This
 116  * covers ASCII 0-9 and also digits in other languages/scripts.  Given
 117  * some UTF-8 text, obtain a character value with g_utf8_get_char().
 118  *
 119  * Return value: %TRUE if @c is a digit
 120  **/
 121 gboolean
 122 g_unichar_isdigit (gunichar c)
 123 {
 124   return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
 125 }
 126
 127
 128 /**
 129  * g_unichar_isgraph:
 130  * @c: a Unicode character
 131  *
 132  * Determines whether a character is printable and not a space
 133  * (returns %FALSE for control characters, format characters, and
 134  * spaces). g_unichar_isprint() is similar, but returns %TRUE for
 135  * spaces. Given some UTF-8 text, obtain a character value with
 136  * g_utf8_get_char().
 137  *
 138  * Return value: %TRUE if @c is printable unless it's a space
 139  **/
 140 gboolean
 141 g_unichar_isgraph (gunichar c)
 142 {
 143   int t = TYPE (c);
 144   return (t != G_UNICODE_CONTROL
 145           && t != G_UNICODE_FORMAT
 146           && t != G_UNICODE_UNASSIGNED
 147           && t != G_UNICODE_PRIVATE_USE
 148           && t != G_UNICODE_SURROGATE
 149           && t != G_UNICODE_SPACE_SEPARATOR);
 150 }
 151
 152 /**
 153  * g_unichar_islower:
 154  * @c: a Unicode character
 155  *
 156  * Determines whether a character is a lowercase letter.
 157  * Given some UTF-8 text, obtain a character value with
 158  * g_utf8_get_char().
 159  *
 160  * Return value: %TRUE if @c is a lowercase letter
 161  **/
 162 gboolean
 163 g_unichar_islower (gunichar c)
 164 {
 165   return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
 166 }
 167
 168
 169 /**
 170  * g_unichar_isprint:
 171  * @c: a Unicode character
 172  *
 173  * Determines whether a character is printable.
 174  * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
 175  * Given some UTF-8 text, obtain a character value with
 176  * g_utf8_get_char().
 177  *
 178  * Return value: %TRUE if @c is printable
 179  **/
 180 gboolean
 181 g_unichar_isprint (gunichar c)
 182 {
 183   int t = TYPE (c);
 184   return (t != G_UNICODE_CONTROL
 185           && t != G_UNICODE_FORMAT
 186           && t != G_UNICODE_UNASSIGNED
 187           && t != G_UNICODE_PRIVATE_USE
 188           && t != G_UNICODE_SURROGATE);
 189 }
 190
 191 /**
 192  * g_unichar_ispunct:
 193  * @c: a Unicode character
 194  *
 195  * Determines whether a character is punctuation or a symbol.
 196  * Given some UTF-8 text, obtain a character value with
 197  * g_utf8_get_char().
 198  *
 199  * Return value: %TRUE if @c is a punctuation or symbol character
 200  **/
 201 gboolean
 202 g_unichar_ispunct (gunichar c)
 203 {
 204   int t = TYPE (c);
 205   return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
 206           || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
 207           || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
 208           || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL
 209           || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL
 210           || t == G_UNICODE_OTHER_SYMBOL);
 211 }
 212
 213 /**
 214  * g_unichar_isspace:
 215  * @c: a Unicode character
 216  *
 217  * Determines whether a character is a space, tab, or line separator
 218  * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
 219  * character value with g_utf8_get_char().
 220  *
 221  * (Note: don't use this to do word breaking; you have to use
 222  * Pango or equivalent to get word breaking right, the algorithm
 223  * is fairly complex.)
 224  *
 225  * Return value: %TRUE if @c is a punctuation character
 226  **/
 227 gboolean
 228 g_unichar_isspace (gunichar c)
 229 {
 230   switch (c)
 231     {
 232       /* special-case these since Unicode thinks they are not spaces */
 233     case '\t':
 234     case '\n':
 235     case '\r':
 236     case '\f':
 237       return TRUE;
 238       break;
 239
 240     default:
 241       {
 242         int t = TYPE (c);
 243         return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
 244                 || t == G_UNICODE_PARAGRAPH_SEPARATOR);
 245       }
 246       break;
 247     }
 248 }
 249
 250 /**
 251  * g_unichar_isupper:
 252  * @c: a Unicode character
 253  *
 254  * Determines if a character is uppercase.
 255  *
 256  * Return value: %TRUE if @c is an uppercase character
 257  **/
 258 gboolean
 259 g_unichar_isupper (gunichar c)
 260 {
 261   return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
 262 }
 263
 264 /**
 265  * g_unichar_istitle:
 266  * @c: a Unicode character
 267  *
 268  * Determines if a character is titlecase. Some characters in
 269  * Unicode which are composites, such as the DZ digraph
 270  * have three case variants instead of just two. The titlecase
 271  * form is used at the beginning of a word where only the
 272  * first letter is capitalized. The titlecase form of the DZ
 273  * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
 274  *
 275  * Return value: %TRUE if the character is titlecase
 276  **/
 277 gboolean
 278 g_unichar_istitle (gunichar c)
 279 {
 280   unsigned int i;
 281   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 282     if (title_table[i][0] == c)
 283       return 1;
 284   return 0;
 285 }
 286
 287 /**
 288  * g_unichar_isxdigit:
 289  * @c: a Unicode character.
 290  *
 291  * Determines if a character is a hexidecimal digit.
 292  *
 293  * Return value: %TRUE if the character is a hexadecimal digit
 294  **/
 295 gboolean
 296 g_unichar_isxdigit (gunichar c)
 297 {
 298   int t = TYPE (c);
 299   return ((c >= 'a' && c <= 'f')
 300           || (c >= 'A' && c <= 'F')
 301           || ISDIGIT (t));
 302 }
 303
 304 /**
 305  * g_unichar_isdefined:
 306  * @c: a Unicode character
 307  *
 308  * Determines if a given character is assigned in the Unicode
 309  * standard.
 310  *
 311  * Return value: %TRUE if the character has an assigned value
 312  **/
 313 gboolean
 314 g_unichar_isdefined (gunichar c)
 315 {
 316   int t = TYPE (c);
 317   return t != G_UNICODE_UNASSIGNED;
 318 }
 319
 320 /**
 321  * g_unichar_iswide:
 322  * @c: a Unicode character
 323  *
 324  * Determines if a character is typically rendered in a double-width
 325  * cell.
 326  *
 327  * Return value: %TRUE if the character is wide
 328  **/
 329 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
 330 gboolean
 331 g_unichar_iswide (gunichar c)
 332 {
 333   if (c < 0x1100)
 334     return FALSE;
 335
 336   return (c <= 0x115f  /* Hangul Jamo init. consonants */
 337           || c == 0x2329 || c == 0x232a     /* angle brackets */
 338           || (c >= 0x2e80 && c <= 0xa4cf && (c < 0x302a || c > 0x302f)
 339               && c != 0x303f && c != 0x3099 && c!= 0x309a) /* CJK ... Yi */
 340           || (c >= 0xac00 && c <= 0xd7a3)   /* Hangul Syllables */
 341           || (c >= 0xf900 && c <= 0xfaff)   /* CJK Compatibility Ideographs */
 342           || (c >= 0xfe30 && c <= 0xfe6f)   /* CJK Compatibility Forms */
 343           || (c >= 0xff00 && c <= 0xff60)   /* Fullwidth Forms */
 344           || (c >= 0xffe0 && c <= 0xffe6)   /* Fullwidth Forms */
 345           || (c >= 0x20000 && c <= 0x2fffd) /* CJK extra stuff */
 346           || (c >= 0x30000 && c <= 0x3fffd));
 347 }
 348
 349 /**
 350  * g_unichar_toupper:
 351  * @c: a Unicode character
 352  *
 353  * Converts a character to uppercase.
 354  *
 355  * Return value: the result of converting @c to uppercase.
 356  *               If @c is not an lowercase or titlecase character,
 357  *               or has no upper case equivalent @c is returned unchanged.
 358  **/
 359 gunichar
 360 g_unichar_toupper (gunichar c)
 361 {
 362   int t = TYPE (c);
 363   if (t == G_UNICODE_LOWERCASE_LETTER)
 364     {
 365       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 366       if (val >= 0xd800 && val < 0xdc00)
 367         {
 368           const guchar *p = special_case_table[val - 0xd800];
 369           return p[0] * 256 + p[1];
 370         }
 371       else
 372         return val ? val : c;
 373     }
 374   else if (t == G_UNICODE_TITLECASE_LETTER)
 375     {
 376       unsigned int i;
 377       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 378         {
 379           if (title_table[i][0] == c)
 380             return title_table[i][1];
 381         }
 382     }
 383   return c;
 384 }
 385
 386 /**
 387  * g_unichar_tolower:
 388  * @c: a Unicode character.
 389  *
 390  * Converts a character to lower case.
 391  *
 392  * Return value: the result of converting @c to lower case.
 393  *               If @c is not an upperlower or titlecase character,
 394  *               or has no lowercase equivalent @c is returned unchanged.
 395  **/
 396 gunichar
 397 g_unichar_tolower (gunichar c)
 398 {
 399   int t = TYPE (c);
 400   if (t == G_UNICODE_UPPERCASE_LETTER)
 401     {
 402       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 403       if (val >= 0xd800 && val < 0xdc00)
 404         {
 405           const guchar *p = special_case_table[val - 0xd800];
 406           return p[0] * 256 + p[1];
 407         }
 408       else
 409         return val ? val : c;
 410     }
 411   else if (t == G_UNICODE_TITLECASE_LETTER)
 412     {
 413       unsigned int i;
 414       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 415         {
 416           if (title_table[i][0] == c)
 417             return title_table[i][2];
 418         }
 419     }
 420   return c;
 421 }
 422
 423 /**
 424  * g_unichar_totitle:
 425  * @c: a Unicode character
 426  *
 427  * Converts a character to the titlecase.
 428  *
 429  * Return value: the result of converting @c to titlecase.
 430  *               If @c is not an uppercase or lowercase character,
 431  *               @c is returned unchanged.
 432  **/
 433 gunichar
 434 g_unichar_totitle (gunichar c)
 435 {
 436   unsigned int i;
 437   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 438     {
 439       if (title_table[i][0] == c || title_table[i][1] == c
 440           || title_table[i][2] == c)
 441         return title_table[i][0];
 442     }
 443   return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
 444           ? ATTTABLE (c >> 8, c & 0xff)
 445           : c);
 446 }
 447
 448 /**
 449  * g_unichar_digit_value:
 450  * @c: a Unicode character
 451  *
 452  * Determines the numeric value of a character as a decimal
 453  * digit.
 454  *
 455  * Return value: If @c is a decimal digit (according to
 456  * g_unichar_isdigit()), its numeric value. Otherwise, -1.
 457  **/
 458 int
 459 g_unichar_digit_value (gunichar c)
 460 {
 461   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 462     return ATTTABLE (c >> 8, c & 0xff);
 463   return -1;
 464 }
 465
 466 /**
 467  * g_unichar_xdigit_value:
 468  * @c: a Unicode character
 469  *
 470  * Determines the numeric value of a character as a hexidecimal
 471  * digit.
 472  *
 473  * Return value: If @c is a hex digit (according to
 474  * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
 475  **/
 476 int
 477 g_unichar_xdigit_value (gunichar c)
 478 {
 479   if (c >= 'A' && c <= 'F')
 480     return c - 'A' + 10;
 481   if (c >= 'a' && c <= 'f')
 482     return c - 'a' + 10;
 483   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 484     return ATTTABLE (c >> 8, c & 0xff);
 485   return -1;
 486 }
 487
 488 /**
 489  * g_unichar_type:
 490  * @c: a Unicode character
 491  *
 492  * Classifies a Unicode character by type.
 493  *
 494  * Return value: the type of the character.
 495  **/
 496 GUnicodeType
 497 g_unichar_type (gunichar c)
 498 {
 499   return TYPE (c);
 500 }
 501
 502 /*
 503  * Case mapping functions
 504  */
 505
 506 typedef enum {
 507   LOCALE_NORMAL,
 508   LOCALE_TURKIC,
 509   LOCALE_LITHUANIAN
 510 } LocaleType;
 511
 512 static LocaleType
 513 get_locale_type (void)
 514 {
 515   const char *locale = setlocale (LC_CTYPE, NULL);
 516
 517   switch (locale[0])
 518     {
 519    case 'a':
 520       if (locale[1] == 'z')
 521         return LOCALE_TURKIC;
 522       break;
 523     case 'l':
 524       if (locale[1] == 't')
 525         return LOCALE_LITHUANIAN;
 526       break;
 527     case 't':
 528       if (locale[1] == 'r')
 529         return LOCALE_TURKIC;
 530       break;
 531     }
 532
 533   return LOCALE_NORMAL;
 534 }
 535
 536 static int
 537 output_marks (const char **p_inout,
 538               char        *out_buffer,
 539               int          len,
 540               gboolean     remove_dot)
 541 {
 542   const char *p = *p_inout;
 543
 544   while (*p)
 545     {
 546       gunichar c = g_utf8_get_char (p);
 547       int t = TYPE(c);
 548
 549       if (ISMARK(t))
 550         {
 551           if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
 552             len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
 553           p = g_utf8_next_char (p);
 554         }
 555       else
 556         break;
 557     }
 558
 559   *p_inout = p;
 560   return len;
 561 }
 562
 563 static gsize
 564 output_special_case (gchar *out_buffer,
 565                      gsize  len,
 566                      int    index,
 567                      int    type,
 568                      int    which)
 569 {
 570   const guchar *p = special_case_table[index];
 571
 572   if (type != G_UNICODE_TITLECASE_LETTER)
 573     p += 2; /* +2 to skip over "best single match" */
 574
 575   if (which == 1)
 576     {
 577       while (p[0] || p[1])
 578         p += 2;
 579       p += 2;
 580     }
 581
 582   while (TRUE)
 583     {
 584       gunichar ch = p[0] * 256 + p[1];
 585       if (!ch)
 586         break;
 587
 588       len += g_unichar_to_utf8 (ch, out_buffer ? out_buffer + len : NULL);
 589       p += 2;
 590     }
 591
 592   return len;
 593 }
 594
 595 static gsize
 596 real_toupper (const gchar *str,
 597               gssize       max_len,
 598               gchar       *out_buffer,
 599               LocaleType   locale_type)
 600 {
 601   const gchar *p = str;
 602   const char *last = NULL;
 603   gsize len = 0;
 604   gboolean last_was_i = FALSE;
 605
 606   while ((max_len < 0 || p < str + max_len) && *p)
 607     {
 608       gunichar c = g_utf8_get_char (p);
 609       int t = TYPE (c);
 610       gunichar val;
 611
 612       last = p;
 613       p = g_utf8_next_char (p);
 614
 615       if (locale_type == LOCALE_LITHUANIAN)
 616         {
 617           if (c == 'i')
 618             last_was_i = TRUE;
 619           else
 620             {
 621               if (last_was_i)
 622                 {
 623                   /* Nasty, need to remove any dot above. Though
 624                    * I think only E WITH DOT ABOVE occurs in practice
 625                    * which could simplify this considerably.
 626                    */
 627                   gsize decomp_len, i;
 628                   gunichar *decomp;
 629
 630                   decomp = g_unicode_canonical_decomposition (c, &decomp_len);
 631                   for (i=0; i < decomp_len; i++)
 632                     {
 633                       if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
 634                         len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
 635                     }
 636                   g_free (decomp);
 637
 638                   len = output_marks (&p, out_buffer, len, TRUE);
 639
 640                   continue;
 641                 }
 642
 643               if (!ISMARK(t))
 644                 last_was_i = FALSE;
 645             }
 646         }
 647
 648       if (locale_type == LOCALE_TURKIC && c == 'i')
 649         {
 650           /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
 651           len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL);
 652         }
 653       else if (c == 0x0345)     /* COMBINING GREEK YPOGEGRAMMENI */
 654         {
 655           /* Nasty, need to move it after other combining marks .. this would go away if
 656            * we normalized first.
 657            */
 658           len = output_marks (&p, out_buffer, len, FALSE);
 659
 660           /* And output as GREEK CAPITAL LETTER IOTA */
 661           len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL);
 662         }
 663       else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 664         {
 665           val = ATTTABLE (c >> 8, c & 0xff);
 666
 667           if (val >= 0xd800 && val < 0xdc00)
 668             {
 669               len += output_special_case (out_buffer, len, val - 0xd800, t,
 670                                           t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
 671             }
 672           else
 673             {
 674               if (t == G_UNICODE_TITLECASE_LETTER)
 675                 {
 676                   unsigned int i;
 677                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 678                     {
 679                       if (title_table[i][0] == c)
 680                         val = title_table[i][1];
 681                     }
 682                 }
 683
 684               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 685             }
 686         }
 687       else
 688         {
 689           gsize char_len = g_utf8_skip[*(guchar *)last];
 690
 691           if (out_buffer)
 692             memcpy (out_buffer + len, last, char_len);
 693
 694           len += char_len;
 695         }
 696
 697     }
 698
 699   return len;
 700 }
 701
 702 /**
 703  * g_utf8_strup:
 704  * @str: a UTF-8 encoded string
 705  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 706  *
 707  * Converts all Unicode characters in the string that have a case
 708  * to uppercase. The exact manner that this is done depends
 709  * on the current locale, and may result in the number of
 710  * characters in the string increasing. (For instance, the
 711  * German ess-zet will be changed to SS.)
 712  *
 713  * Return value: a newly allocated string, with all characters
 714  *    converted to uppercase.
 715  **/
 716 gchar *
 717 g_utf8_strup (const gchar *str,
 718               gssize       len)
 719 {
 720   gsize result_len;
 721   LocaleType locale_type;
 722   gchar *result;
 723
 724   g_return_val_if_fail (str != NULL, NULL);
 725
 726   locale_type = get_locale_type ();
 727
 728   /*
 729    * We use a two pass approach to keep memory management simple
 730    */
 731   result_len = real_toupper (str, len, NULL, locale_type);
 732   result = g_malloc (result_len + 1);
 733   real_toupper (str, len, result, locale_type);
 734   result[result_len] = '\0';
 735
 736   return result;
 737 }
 738
 739 static gsize
 740 real_tolower (const gchar *str,
 741               gssize       max_len,
 742               gchar       *out_buffer,
 743               LocaleType   locale_type)
 744 {
 745   const gchar *p = str;
 746   const char *last = NULL;
 747   gsize len = 0;
 748
 749   while ((max_len < 0 || p < str + max_len) && *p)
 750     {
 751       gunichar c = g_utf8_get_char (p);
 752       int t = TYPE (c);
 753       gunichar val;
 754
 755       last = p;
 756       p = g_utf8_next_char (p);
 757
 758       if (locale_type == LOCALE_TURKIC && c == 'I')
 759         {
 760           /* I => LATIN SMALL LETTER DOTLESS I */
 761           len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL);
 762         }
 763       else if (c == 0x03A3)     /* GREEK CAPITAL LETTER SIGMA */
 764         {
 765           if ((max_len < 0 || p < str + max_len) && *p)
 766             {
 767               gunichar next_c = g_utf8_get_char (p);
 768               int next_type = TYPE(next_c);
 769
 770               /* SIGMA mapps differently depending on whether it is
 771                * final or not. The following simplified test would
 772                * fail in the case of combining marks following the
 773                * sigma, but I don't think that occurs in real text.
 774                * The test here matches that in ICU.
 775                */
 776               if (ISALPHA(next_type)) /* Lu,Ll,Lt,Lm,Lo */
 777                 val = 0x3c3;    /* GREEK SMALL SIGMA */
 778               else
 779                 val = 0x3c2;    /* GREEK SMALL FINAL SIGMA */
 780             }
 781           else
 782             val = 0x3c2;        /* GREEK SMALL FINAL SIGMA */
 783
 784           len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 785         }
 786       else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 787         {
 788           val = ATTTABLE (c >> 8, c & 0xff);
 789
 790           if (val >= 0xd800 && val < 0xdc00)
 791             {
 792               len += output_special_case (out_buffer, len, val - 0xd800, t, 0);
 793             }
 794           else
 795             {
 796               if (t == G_UNICODE_TITLECASE_LETTER)
 797                 {
 798                   unsigned int i;
 799                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 800                     {
 801                       if (title_table[i][0] == c)
 802                         val = title_table[i][2];
 803                     }
 804                 }
 805
 806               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 807             }
 808         }
 809       else
 810         {
 811           gsize char_len = g_utf8_skip[*(guchar *)last];
 812
 813           if (out_buffer)
 814             memcpy (out_buffer + len, last, char_len);
 815
 816           len += char_len;
 817         }
 818
 819     }
 820
 821   return len;
 822 }
 823
 824 /**
 825  * g_utf8_strdown:
 826  * @str: a UTF-8 encoded string
 827  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 828  *
 829  * Converts all Unicode characters in the string that have a case
 830  * to lowercase. The exact manner that this is done depends
 831  * on the current locale, and may result in the number of
 832  * characters in the string changing.
 833  *
 834  * Return value: a newly allocated string, with all characters
 835  *    converted to lowercase.
 836  **/
 837 gchar *
 838 g_utf8_strdown (const gchar *str,
 839                 gssize       len)
 840 {
 841   gsize result_len;
 842   LocaleType locale_type;
 843   gchar *result;
 844
 845   g_return_val_if_fail (str != NULL, NULL);
 846
 847   locale_type = get_locale_type ();
 848
 849   /*
 850    * We use a two pass approach to keep memory management simple
 851    */
 852   result_len = real_tolower (str, len, NULL, locale_type);
 853   result = g_malloc (result_len + 1);
 854   real_tolower (str, len, result, locale_type);
 855   result[result_len] = '\0';
 856
 857   return result;
 858 }
 859
 860 /**
 861  * g_utf8_casefold:
 862  * @str: a UTF-8 encoded string
 863  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 864  *
 865  * Converts a string into a form that is independent of case. The
 866  * result will not correspond to any particular case, but can be
 867  * compared for equality or ordered with the results of calling
 868  * g_utf8_casefold() on other strings.
 869  *
 870  * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
 871  * only an approximation to the correct linguistic case insensitive
 872  * ordering, though it is a fairly good one. Getting this exactly
 873  * right would require a more sophisticated collation function that
 874  * takes case sensitivity into account. GLib does not currently
 875  * provide such a function.
 876  *
 877  * Return value: a newly allocated string, that is a
 878  *   case independent form of @str.
 879  **/
 880 gchar *
 881 g_utf8_casefold (const gchar *str,
 882                  gssize       len)
 883 {
 884   GString *result = g_string_new (NULL);
 885   const char *p;
 886
 887   p = str;
 888   while ((len < 0 || p < str + len) && *p)
 889     {
 890       gunichar ch = g_utf8_get_char (p);
 891
 892       int start = 0;
 893       int end = G_N_ELEMENTS (casefold_table);
 894
 895       if (ch >= casefold_table[start].ch &&
 896           ch <= casefold_table[end - 1].ch)
 897         {
 898           while (TRUE)
 899             {
 900               int half = (start + end) / 2;
 901               if (ch == casefold_table[half].ch)
 902                 {
 903                   g_string_append (result, casefold_table[half].data);
 904                   goto next;
 905                 }
 906               else if (half == start)
 907                 break;
 908               else if (ch > casefold_table[half].ch)
 909                 start = half;
 910               else
 911                 end = half;
 912             }
 913         }
 914
 915       g_string_append_unichar (result, g_unichar_tolower (ch));
 916
 917     next:
 918       p = g_utf8_next_char (p);
 919     }
 920
 921   return g_string_free (result, FALSE);
 922 }