glib/guniprop.c

   1 /* guniprop.c - Unicode character properties.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include <stddef.h>
  25 #include <string.h>
  26 #include <locale.h>
  27
  28 #include "glib.h"
  29 #include "gunichartables.h"
  30 #include "gunicodeprivate.h"
  31 #include "galias.h"
  32
  33 #define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
  34                           ? attr_table_part1[Page] \
  35                           : attr_table_part2[(Page) - 0xe00])
  36
  37 #define ATTTABLE(Page, Char) \
  38   ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))
  39
  40 #define TTYPE_PART1(Page, Char) \
  41   ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
  42    ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
  43    : (type_data[type_table_part1[Page]][Char]))
  44
  45 #define TTYPE_PART2(Page, Char) \
  46   ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
  47    ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
  48    : (type_data[type_table_part2[Page]][Char]))
  49
  50 #define TYPE(Char) \
  51   (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
  52    ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
  53    : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
  54       ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
  55       : G_UNICODE_UNASSIGNED))
  56
  57
  58 #define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER       \
  59                        || (Type) == G_UNICODE_LETTER_NUMBER     \
  60                        || (Type) == G_UNICODE_OTHER_NUMBER)
  61
  62 #define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER     \
  63                        || (Type) == G_UNICODE_UPPERCASE_LETTER  \
  64                        || (Type) == G_UNICODE_TITLECASE_LETTER  \
  65                        || (Type) == G_UNICODE_MODIFIER_LETTER   \
  66                        || (Type) == G_UNICODE_OTHER_LETTER)
  67
  68 #define ISMARK(Type) ((Type) == G_UNICODE_NON_SPACING_MARK ||   \
  69                       (Type) == G_UNICODE_COMBINING_MARK ||     \
  70                       (Type) == G_UNICODE_ENCLOSING_MARK)
  71
  72
  73 /**
  74  * g_unichar_isalnum:
  75  * @c: a Unicode character
  76  *
  77  * Determines whether a character is alphanumeric.
  78  * Given some UTF-8 text, obtain a character value
  79  * with g_utf8_get_char().
  80  *
  81  * Return value: %TRUE if @c is an alphanumeric character
  82  **/
  83 gboolean
  84 g_unichar_isalnum (gunichar c)
  85 {
  86   int t = TYPE (c);
  87   return ISDIGIT (t) || ISALPHA (t);
  88 }
  89
  90 /**
  91  * g_unichar_isalpha:
  92  * @c: a Unicode character
  93  *
  94  * Determines whether a character is alphabetic (i.e. a letter).
  95  * Given some UTF-8 text, obtain a character value with
  96  * g_utf8_get_char().
  97  *
  98  * Return value: %TRUE if @c is an alphabetic character
  99  **/
 100 gboolean
 101 g_unichar_isalpha (gunichar c)
 102 {
 103   int t = TYPE (c);
 104   return ISALPHA (t);
 105 }
 106
 107
 108 /**
 109  * g_unichar_iscntrl:
 110  * @c: a Unicode character
 111  *
 112  * Determines whether a character is a control character.
 113  * Given some UTF-8 text, obtain a character value with
 114  * g_utf8_get_char().
 115  *
 116  * Return value: %TRUE if @c is a control character
 117  **/
 118 gboolean
 119 g_unichar_iscntrl (gunichar c)
 120 {
 121   return TYPE (c) == G_UNICODE_CONTROL;
 122 }
 123
 124 /**
 125  * g_unichar_isdigit:
 126  * @c: a Unicode character
 127  *
 128  * Determines whether a character is numeric (i.e. a digit).  This
 129  * covers ASCII 0-9 and also digits in other languages/scripts.  Given
 130  * some UTF-8 text, obtain a character value with g_utf8_get_char().
 131  *
 132  * Return value: %TRUE if @c is a digit
 133  **/
 134 gboolean
 135 g_unichar_isdigit (gunichar c)
 136 {
 137   return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
 138 }
 139
 140
 141 /**
 142  * g_unichar_isgraph:
 143  * @c: a Unicode character
 144  *
 145  * Determines whether a character is printable and not a space
 146  * (returns %FALSE for control characters, format characters, and
 147  * spaces). g_unichar_isprint() is similar, but returns %TRUE for
 148  * spaces. Given some UTF-8 text, obtain a character value with
 149  * g_utf8_get_char().
 150  *
 151  * Return value: %TRUE if @c is printable unless it's a space
 152  **/
 153 gboolean
 154 g_unichar_isgraph (gunichar c)
 155 {
 156   int t = TYPE (c);
 157   return (t != G_UNICODE_CONTROL
 158           && t != G_UNICODE_FORMAT
 159           && t != G_UNICODE_UNASSIGNED
 160           && t != G_UNICODE_PRIVATE_USE
 161           && t != G_UNICODE_SURROGATE
 162           && t != G_UNICODE_SPACE_SEPARATOR);
 163 }
 164
 165 /**
 166  * g_unichar_islower:
 167  * @c: a Unicode character
 168  *
 169  * Determines whether a character is a lowercase letter.
 170  * Given some UTF-8 text, obtain a character value with
 171  * g_utf8_get_char().
 172  *
 173  * Return value: %TRUE if @c is a lowercase letter
 174  **/
 175 gboolean
 176 g_unichar_islower (gunichar c)
 177 {
 178   return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
 179 }
 180
 181
 182 /**
 183  * g_unichar_isprint:
 184  * @c: a Unicode character
 185  *
 186  * Determines whether a character is printable.
 187  * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
 188  * Given some UTF-8 text, obtain a character value with
 189  * g_utf8_get_char().
 190  *
 191  * Return value: %TRUE if @c is printable
 192  **/
 193 gboolean
 194 g_unichar_isprint (gunichar c)
 195 {
 196   int t = TYPE (c);
 197   return (t != G_UNICODE_CONTROL
 198           && t != G_UNICODE_FORMAT
 199           && t != G_UNICODE_UNASSIGNED
 200           && t != G_UNICODE_PRIVATE_USE
 201           && t != G_UNICODE_SURROGATE);
 202 }
 203
 204 /**
 205  * g_unichar_ispunct:
 206  * @c: a Unicode character
 207  *
 208  * Determines whether a character is punctuation or a symbol.
 209  * Given some UTF-8 text, obtain a character value with
 210  * g_utf8_get_char().
 211  *
 212  * Return value: %TRUE if @c is a punctuation or symbol character
 213  **/
 214 gboolean
 215 g_unichar_ispunct (gunichar c)
 216 {
 217   int t = TYPE (c);
 218   return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
 219           || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
 220           || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
 221           || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL
 222           || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL
 223           || t == G_UNICODE_OTHER_SYMBOL);
 224 }
 225
 226 /**
 227  * g_unichar_isspace:
 228  * @c: a Unicode character
 229  *
 230  * Determines whether a character is a space, tab, or line separator
 231  * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
 232  * character value with g_utf8_get_char().
 233  *
 234  * (Note: don't use this to do word breaking; you have to use
 235  * Pango or equivalent to get word breaking right, the algorithm
 236  * is fairly complex.)
 237  *
 238  * Return value: %TRUE if @c is a punctuation character
 239  **/
 240 gboolean
 241 g_unichar_isspace (gunichar c)
 242 {
 243   switch (c)
 244     {
 245       /* special-case these since Unicode thinks they are not spaces */
 246     case '\t':
 247     case '\n':
 248     case '\r':
 249     case '\f':
 250       return TRUE;
 251       break;
 252
 253     default:
 254       {
 255         int t = TYPE (c);
 256         return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
 257                 || t == G_UNICODE_PARAGRAPH_SEPARATOR);
 258       }
 259       break;
 260     }
 261 }
 262
 263 /**
 264  * g_unichar_isupper:
 265  * @c: a Unicode character
 266  *
 267  * Determines if a character is uppercase.
 268  *
 269  * Return value: %TRUE if @c is an uppercase character
 270  **/
 271 gboolean
 272 g_unichar_isupper (gunichar c)
 273 {
 274   return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
 275 }
 276
 277 /**
 278  * g_unichar_istitle:
 279  * @c: a Unicode character
 280  *
 281  * Determines if a character is titlecase. Some characters in
 282  * Unicode which are composites, such as the DZ digraph
 283  * have three case variants instead of just two. The titlecase
 284  * form is used at the beginning of a word where only the
 285  * first letter is capitalized. The titlecase form of the DZ
 286  * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
 287  *
 288  * Return value: %TRUE if the character is titlecase
 289  **/
 290 gboolean
 291 g_unichar_istitle (gunichar c)
 292 {
 293   unsigned int i;
 294   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 295     if (title_table[i][0] == c)
 296       return 1;
 297   return 0;
 298 }
 299
 300 /**
 301  * g_unichar_isxdigit:
 302  * @c: a Unicode character.
 303  *
 304  * Determines if a character is a hexidecimal digit.
 305  *
 306  * Return value: %TRUE if the character is a hexadecimal digit
 307  **/
 308 gboolean
 309 g_unichar_isxdigit (gunichar c)
 310 {
 311   int t = TYPE (c);
 312   return ((c >= 'a' && c <= 'f')
 313           || (c >= 'A' && c <= 'F')
 314           || ISDIGIT (t));
 315 }
 316
 317 /**
 318  * g_unichar_isdefined:
 319  * @c: a Unicode character
 320  *
 321  * Determines if a given character is assigned in the Unicode
 322  * standard.
 323  *
 324  * Return value: %TRUE if the character has an assigned value
 325  **/
 326 gboolean
 327 g_unichar_isdefined (gunichar c)
 328 {
 329   int t = TYPE (c);
 330   return t != G_UNICODE_UNASSIGNED;
 331 }
 332
 333 /**
 334  * g_unichar_iswide:
 335  * @c: a Unicode character
 336  *
 337  * Determines if a character is typically rendered in a double-width
 338  * cell.
 339  *
 340  * Return value: %TRUE if the character is wide
 341  **/
 342 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
 343 gboolean
 344 g_unichar_iswide (gunichar c)
 345 {
 346   if (c < 0x1100)
 347     return FALSE;
 348
 349   return (c <= 0x115f  /* Hangul Jamo init. consonants */
 350           || c == 0x2329 || c == 0x232a     /* angle brackets */
 351           || (c >= 0x2e80 && c <= 0xa4cf && (c < 0x302a || c > 0x302f)
 352               && c != 0x303f && c != 0x3099 && c!= 0x309a) /* CJK ... Yi */
 353           || (c >= 0xac00 && c <= 0xd7a3)   /* Hangul Syllables */
 354           || (c >= 0xf900 && c <= 0xfaff)   /* CJK Compatibility Ideographs */
 355           || (c >= 0xfe30 && c <= 0xfe6f)   /* CJK Compatibility Forms */
 356           || (c >= 0xff00 && c <= 0xff60)   /* Fullwidth Forms */
 357           || (c >= 0xffe0 && c <= 0xffe6)   /* Fullwidth Forms */
 358           || (c >= 0x20000 && c <= 0x2fffd) /* CJK extra stuff */
 359           || (c >= 0x30000 && c <= 0x3fffd));
 360 }
 361
 362 /**
 363  * g_unichar_toupper:
 364  * @c: a Unicode character
 365  *
 366  * Converts a character to uppercase.
 367  *
 368  * Return value: the result of converting @c to uppercase.
 369  *               If @c is not an lowercase or titlecase character,
 370  *               or has no upper case equivalent @c is returned unchanged.
 371  **/
 372 gunichar
 373 g_unichar_toupper (gunichar c)
 374 {
 375   int t = TYPE (c);
 376   if (t == G_UNICODE_LOWERCASE_LETTER)
 377     {
 378       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 379       if (val >= 0x1000000)
 380         {
 381           const gchar *p = special_case_table + val - 0x1000000;
 382           return g_utf8_get_char (p);
 383         }
 384       else
 385         return val ? val : c;
 386     }
 387   else if (t == G_UNICODE_TITLECASE_LETTER)
 388     {
 389       unsigned int i;
 390       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 391         {
 392           if (title_table[i][0] == c)
 393             return title_table[i][1];
 394         }
 395     }
 396   return c;
 397 }
 398
 399 /**
 400  * g_unichar_tolower:
 401  * @c: a Unicode character.
 402  *
 403  * Converts a character to lower case.
 404  *
 405  * Return value: the result of converting @c to lower case.
 406  *               If @c is not an upperlower or titlecase character,
 407  *               or has no lowercase equivalent @c is returned unchanged.
 408  **/
 409 gunichar
 410 g_unichar_tolower (gunichar c)
 411 {
 412   int t = TYPE (c);
 413   if (t == G_UNICODE_UPPERCASE_LETTER)
 414     {
 415       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 416       if (val >= 0x1000000)
 417         {
 418           const gchar *p = special_case_table + val - 0x1000000;
 419           return g_utf8_get_char (p);
 420         }
 421       else
 422         return val ? val : c;
 423     }
 424   else if (t == G_UNICODE_TITLECASE_LETTER)
 425     {
 426       unsigned int i;
 427       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 428         {
 429           if (title_table[i][0] == c)
 430             return title_table[i][2];
 431         }
 432     }
 433   return c;
 434 }
 435
 436 /**
 437  * g_unichar_totitle:
 438  * @c: a Unicode character
 439  *
 440  * Converts a character to the titlecase.
 441  *
 442  * Return value: the result of converting @c to titlecase.
 443  *               If @c is not an uppercase or lowercase character,
 444  *               @c is returned unchanged.
 445  **/
 446 gunichar
 447 g_unichar_totitle (gunichar c)
 448 {
 449   unsigned int i;
 450   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 451     {
 452       if (title_table[i][0] == c || title_table[i][1] == c
 453           || title_table[i][2] == c)
 454         return title_table[i][0];
 455     }
 456   return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
 457           ? ATTTABLE (c >> 8, c & 0xff)
 458           : c);
 459 }
 460
 461 /**
 462  * g_unichar_digit_value:
 463  * @c: a Unicode character
 464  *
 465  * Determines the numeric value of a character as a decimal
 466  * digit.
 467  *
 468  * Return value: If @c is a decimal digit (according to
 469  * g_unichar_isdigit()), its numeric value. Otherwise, -1.
 470  **/
 471 int
 472 g_unichar_digit_value (gunichar c)
 473 {
 474   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 475     return ATTTABLE (c >> 8, c & 0xff);
 476   return -1;
 477 }
 478
 479 /**
 480  * g_unichar_xdigit_value:
 481  * @c: a Unicode character
 482  *
 483  * Determines the numeric value of a character as a hexidecimal
 484  * digit.
 485  *
 486  * Return value: If @c is a hex digit (according to
 487  * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
 488  **/
 489 int
 490 g_unichar_xdigit_value (gunichar c)
 491 {
 492   if (c >= 'A' && c <= 'F')
 493     return c - 'A' + 10;
 494   if (c >= 'a' && c <= 'f')
 495     return c - 'a' + 10;
 496   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 497     return ATTTABLE (c >> 8, c & 0xff);
 498   return -1;
 499 }
 500
 501 /**
 502  * g_unichar_type:
 503  * @c: a Unicode character
 504  *
 505  * Classifies a Unicode character by type.
 506  *
 507  * Return value: the type of the character.
 508  **/
 509 GUnicodeType
 510 g_unichar_type (gunichar c)
 511 {
 512   return TYPE (c);
 513 }
 514
 515 /*
 516  * Case mapping functions
 517  */
 518
 519 typedef enum {
 520   LOCALE_NORMAL,
 521   LOCALE_TURKIC,
 522   LOCALE_LITHUANIAN
 523 } LocaleType;
 524
 525 static LocaleType
 526 get_locale_type (void)
 527 {
 528 #ifdef G_OS_WIN32
 529   char *tem = g_win32_getlocale ();
 530   char locale[2];
 531
 532   locale[0] = tem[0];
 533   locale[1] = tem[1];
 534   g_free (tem);
 535 #else
 536   const char *locale = setlocale (LC_CTYPE, NULL);
 537 #endif
 538
 539   switch (locale[0])
 540     {
 541    case 'a':
 542       if (locale[1] == 'z')
 543         return LOCALE_TURKIC;
 544       break;
 545     case 'l':
 546       if (locale[1] == 't')
 547         return LOCALE_LITHUANIAN;
 548       break;
 549     case 't':
 550       if (locale[1] == 'r')
 551         return LOCALE_TURKIC;
 552       break;
 553     }
 554
 555   return LOCALE_NORMAL;
 556 }
 557
 558 static gint
 559 output_marks (const char **p_inout,
 560               char        *out_buffer,
 561               gboolean     remove_dot)
 562 {
 563   const char *p = *p_inout;
 564   gint len = 0;
 565
 566   while (*p)
 567     {
 568       gunichar c = g_utf8_get_char (p);
 569       int t = TYPE(c);
 570
 571       if (ISMARK(t))
 572         {
 573           if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
 574             len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
 575           p = g_utf8_next_char (p);
 576         }
 577       else
 578         break;
 579     }
 580
 581   *p_inout = p;
 582   return len;
 583 }
 584
 585 static gint
 586 output_special_case (gchar *out_buffer,
 587                      int    offset,
 588                      int    type,
 589                      int    which)
 590 {
 591   const gchar *p = special_case_table + offset;
 592   gint len;
 593
 594   if (type != G_UNICODE_TITLECASE_LETTER)
 595     p = g_utf8_next_char (p);
 596
 597   if (which == 1)
 598     p += strlen (p) + 1;
 599
 600   len = strlen (p);
 601   if (out_buffer)
 602     memcpy (out_buffer, p, len);
 603
 604   return len;
 605 }
 606
 607 static gsize
 608 real_toupper (const gchar *str,
 609               gssize       max_len,
 610               gchar       *out_buffer,
 611               LocaleType   locale_type)
 612 {
 613   const gchar *p = str;
 614   const char *last = NULL;
 615   gsize len = 0;
 616   gboolean last_was_i = FALSE;
 617
 618   while ((max_len < 0 || p < str + max_len) && *p)
 619     {
 620       gunichar c = g_utf8_get_char (p);
 621       int t = TYPE (c);
 622       gunichar val;
 623
 624       last = p;
 625       p = g_utf8_next_char (p);
 626
 627       if (locale_type == LOCALE_LITHUANIAN)
 628         {
 629           if (c == 'i')
 630             last_was_i = TRUE;
 631           else
 632             {
 633               if (last_was_i)
 634                 {
 635                   /* Nasty, need to remove any dot above. Though
 636                    * I think only E WITH DOT ABOVE occurs in practice
 637                    * which could simplify this considerably.
 638                    */
 639                   gsize decomp_len, i;
 640                   gunichar *decomp;
 641
 642                   decomp = g_unicode_canonical_decomposition (c, &decomp_len);
 643                   for (i=0; i < decomp_len; i++)
 644                     {
 645                       if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
 646                         len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
 647                     }
 648                   g_free (decomp);
 649
 650                   len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE);
 651
 652                   continue;
 653                 }
 654
 655               if (!ISMARK(t))
 656                 last_was_i = FALSE;
 657             }
 658         }
 659
 660       if (locale_type == LOCALE_TURKIC && c == 'i')
 661         {
 662           /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
 663           len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL);
 664         }
 665       else if (c == 0x0345)     /* COMBINING GREEK YPOGEGRAMMENI */
 666         {
 667           /* Nasty, need to move it after other combining marks .. this would go away if
 668            * we normalized first.
 669            */
 670           len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE);
 671
 672           /* And output as GREEK CAPITAL LETTER IOTA */
 673           len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL);
 674         }
 675       else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 676         {
 677           val = ATTTABLE (c >> 8, c & 0xff);
 678
 679           if (val >= 0x1000000)
 680             {
 681               len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t,
 682                                           t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
 683             }
 684           else
 685             {
 686               if (t == G_UNICODE_TITLECASE_LETTER)
 687                 {
 688                   unsigned int i;
 689                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 690                     {
 691                       if (title_table[i][0] == c)
 692                         val = title_table[i][1];
 693                     }
 694                 }
 695
 696               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 697             }
 698         }
 699       else
 700         {
 701           gsize char_len = g_utf8_skip[*(guchar *)last];
 702
 703           if (out_buffer)
 704             memcpy (out_buffer + len, last, char_len);
 705
 706           len += char_len;
 707         }
 708
 709     }
 710
 711   return len;
 712 }
 713
 714 /**
 715  * g_utf8_strup:
 716  * @str: a UTF-8 encoded string
 717  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 718  *
 719  * Converts all Unicode characters in the string that have a case
 720  * to uppercase. The exact manner that this is done depends
 721  * on the current locale, and may result in the number of
 722  * characters in the string increasing. (For instance, the
 723  * German ess-zet will be changed to SS.)
 724  *
 725  * Return value: a newly allocated string, with all characters
 726  *    converted to uppercase.
 727  **/
 728 gchar *
 729 g_utf8_strup (const gchar *str,
 730               gssize       len)
 731 {
 732   gsize result_len;
 733   LocaleType locale_type;
 734   gchar *result;
 735
 736   g_return_val_if_fail (str != NULL, NULL);
 737
 738   locale_type = get_locale_type ();
 739
 740   /*
 741    * We use a two pass approach to keep memory management simple
 742    */
 743   result_len = real_toupper (str, len, NULL, locale_type);
 744   result = g_malloc (result_len + 1);
 745   real_toupper (str, len, result, locale_type);
 746   result[result_len] = '\0';
 747
 748   return result;
 749 }
 750
 751 /* traverses the string checking for characters with combining class == 230
 752  * until a base character is found */
 753 static gboolean
 754 has_more_above (const gchar *str)
 755 {
 756   const gchar *p = str;
 757   gint combining_class;
 758
 759   while (*p)
 760     {
 761       combining_class = _g_unichar_combining_class (g_utf8_get_char (p));
 762       if (combining_class == 230)
 763         return TRUE;
 764       else if (combining_class == 0)
 765         break;
 766
 767       p = g_utf8_next_char (p);
 768     }
 769
 770   return FALSE;
 771 }
 772
 773 static gsize
 774 real_tolower (const gchar *str,
 775               gssize       max_len,
 776               gchar       *out_buffer,
 777               LocaleType   locale_type)
 778 {
 779   const gchar *p = str;
 780   const char *last = NULL;
 781   gsize len = 0;
 782
 783   while ((max_len < 0 || p < str + max_len) && *p)
 784     {
 785       gunichar c = g_utf8_get_char (p);
 786       int t = TYPE (c);
 787       gunichar val;
 788
 789       last = p;
 790       p = g_utf8_next_char (p);
 791
 792       if (locale_type == LOCALE_TURKIC && c == 'I')
 793         {
 794           if (g_utf8_get_char (p) == 0x0307)
 795             {
 796               /* I + COMBINING DOT ABOVE => i (U+0069) */
 797               len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL);
 798               p = g_utf8_next_char (p);
 799             }
 800           else
 801             {
 802               /* I => LATIN SMALL LETTER DOTLESS I */
 803               len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL);
 804             }
 805         }
 806       /* Introduce an explicit dot above when lowercasing capital I's and J's
 807        * whenever there are more accents above. [SpecialCasing.txt] */
 808       else if (locale_type == LOCALE_LITHUANIAN &&
 809                (c == 0x00cc || c == 0x00cd || c == 0x0128))
 810         {
 811           len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL);
 812           len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL);
 813
 814           switch (c)
 815             {
 816             case 0x00cc:
 817               len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL);
 818               break;
 819             case 0x00cd:
 820               len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL);
 821               break;
 822             case 0x0128:
 823               len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL);
 824               break;
 825             }
 826         }
 827       else if (locale_type == LOCALE_LITHUANIAN &&
 828                (c == 'I' || c == 'J' || c == 0x012e) &&
 829                has_more_above (p))
 830         {
 831           len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL);
 832           len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL);
 833         }
 834       else if (c == 0x03A3)     /* GREEK CAPITAL LETTER SIGMA */
 835         {
 836           if ((max_len < 0 || p < str + max_len) && *p)
 837             {
 838               gunichar next_c = g_utf8_get_char (p);
 839               int next_type = TYPE(next_c);
 840
 841               /* SIGMA mapps differently depending on whether it is
 842                * final or not. The following simplified test would
 843                * fail in the case of combining marks following the
 844                * sigma, but I don't think that occurs in real text.
 845                * The test here matches that in ICU.
 846                */
 847               if (ISALPHA(next_type)) /* Lu,Ll,Lt,Lm,Lo */
 848                 val = 0x3c3;    /* GREEK SMALL SIGMA */
 849               else
 850                 val = 0x3c2;    /* GREEK SMALL FINAL SIGMA */
 851             }
 852           else
 853             val = 0x3c2;        /* GREEK SMALL FINAL SIGMA */
 854
 855           len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 856         }
 857       else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 858         {
 859           val = ATTTABLE (c >> 8, c & 0xff);
 860
 861           if (val >= 0x1000000)
 862             {
 863               len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0);
 864             }
 865           else
 866             {
 867               if (t == G_UNICODE_TITLECASE_LETTER)
 868                 {
 869                   unsigned int i;
 870                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 871                     {
 872                       if (title_table[i][0] == c)
 873                         val = title_table[i][2];
 874                     }
 875                 }
 876
 877               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 878             }
 879         }
 880       else
 881         {
 882           gsize char_len = g_utf8_skip[*(guchar *)last];
 883
 884           if (out_buffer)
 885             memcpy (out_buffer + len, last, char_len);
 886
 887           len += char_len;
 888         }
 889
 890     }
 891
 892   return len;
 893 }
 894
 895 /**
 896  * g_utf8_strdown:
 897  * @str: a UTF-8 encoded string
 898  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 899  *
 900  * Converts all Unicode characters in the string that have a case
 901  * to lowercase. The exact manner that this is done depends
 902  * on the current locale, and may result in the number of
 903  * characters in the string changing.
 904  *
 905  * Return value: a newly allocated string, with all characters
 906  *    converted to lowercase.
 907  **/
 908 gchar *
 909 g_utf8_strdown (const gchar *str,
 910                 gssize       len)
 911 {
 912   gsize result_len;
 913   LocaleType locale_type;
 914   gchar *result;
 915
 916   g_return_val_if_fail (str != NULL, NULL);
 917
 918   locale_type = get_locale_type ();
 919
 920   /*
 921    * We use a two pass approach to keep memory management simple
 922    */
 923   result_len = real_tolower (str, len, NULL, locale_type);
 924   result = g_malloc (result_len + 1);
 925   real_tolower (str, len, result, locale_type);
 926   result[result_len] = '\0';
 927
 928   return result;
 929 }
 930
 931 /**
 932  * g_utf8_casefold:
 933  * @str: a UTF-8 encoded string
 934  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 935  *
 936  * Converts a string into a form that is independent of case. The
 937  * result will not correspond to any particular case, but can be
 938  * compared for equality or ordered with the results of calling
 939  * g_utf8_casefold() on other strings.
 940  *
 941  * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
 942  * only an approximation to the correct linguistic case insensitive
 943  * ordering, though it is a fairly good one. Getting this exactly
 944  * right would require a more sophisticated collation function that
 945  * takes case sensitivity into account. GLib does not currently
 946  * provide such a function.
 947  *
 948  * Return value: a newly allocated string, that is a
 949  *   case independent form of @str.
 950  **/
 951 gchar *
 952 g_utf8_casefold (const gchar *str,
 953                  gssize       len)
 954 {
 955   GString *result;
 956   const char *p;
 957
 958   g_return_val_if_fail (str != NULL, NULL);
 959
 960   result = g_string_new (NULL);
 961   p = str;
 962   while ((len < 0 || p < str + len) && *p)
 963     {
 964       gunichar ch = g_utf8_get_char (p);
 965
 966       int start = 0;
 967       int end = G_N_ELEMENTS (casefold_table);
 968
 969       if (ch >= casefold_table[start].ch &&
 970           ch <= casefold_table[end - 1].ch)
 971         {
 972           while (TRUE)
 973             {
 974               int half = (start + end) / 2;
 975               if (ch == casefold_table[half].ch)
 976                 {
 977                   g_string_append (result, casefold_table[half].data);
 978                   goto next;
 979                 }
 980               else if (half == start)
 981                 break;
 982               else if (ch > casefold_table[half].ch)
 983                 start = half;
 984               else
 985                 end = half;
 986             }
 987         }
 988
 989       g_string_append_unichar (result, g_unichar_tolower (ch));
 990
 991     next:
 992       p = g_utf8_next_char (p);
 993     }
 994
 995   return g_string_free (result, FALSE);
 996 }
 997
 998 /**
 999  * g_unichar_get_mirror_char:
1000  * @ch: a unicode character
1001  * @mirrored_ch: location to store the mirrored character
1002  *
1003  * In Unicode, some characters are <firstterm>mirrored</firstterm>. This
1004  * means that their images are mirrored horizontally in text that is laid
1005  * out from right to left. For instance, "(" would become its mirror image,
1006  * ")", in right-to-left text.
1007  *
1008  * If @ch has the Unicode mirrored property and there is another unicode
1009  * character that typically has a glyph that is the mirror image of @ch's
1010  * glyph, puts that character in the address pointed to by @mirrored_ch.
1011  *
1012  * Return value: %TRUE if @ch has a mirrored character and @mirrored_ch is
1013  * filled in, %FALSE otherwise
1014  *
1015  * Since: 2.4
1016  **/
1017 /* This code is adapted from FriBidi (http://fribidi.sourceforge.net/).
1018  * FriBidi is: Copyright (C) 1999,2000 Dov Grobgeld, and
1019  *             Copyright (C) 2001,2002 Behdad Esfahbod.
1020  */
1021 gboolean
1022 g_unichar_get_mirror_char (gunichar ch,
1023                            gunichar *mirrored_ch)
1024 {
1025   gint pos, step, size;
1026   gboolean found;
1027
1028   size = G_N_ELEMENTS (bidi_mirroring_table);
1029   pos = step = (size / 2) + 1;
1030
1031   while (step > 1)
1032     {
1033       gunichar cmp_ch = bidi_mirroring_table[pos].ch;
1034       step = (step + 1) / 2;
1035
1036       if (cmp_ch < ch)
1037         {
1038           pos += step;
1039           if (pos > size - 1)
1040             pos = size - 1;
1041         }
1042       else if (cmp_ch > ch)
1043         {
1044           pos -= step;
1045           if (pos < 0)
1046             pos = 0;
1047         }
1048       else
1049         break;
1050     }
1051   found = bidi_mirroring_table[pos].ch == ch;
1052   if (mirrored_ch)
1053     *mirrored_ch = found ? bidi_mirroring_table[pos].mirrored_ch : ch;
1054
1055   return found;
1056
1057 }
1058
1059 #define __G_UNIPROP_C__
1060 #include "galiasdef.c"