glib/guniprop.c

   1 /* guniprop.c - Unicode character properties.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include <stddef.h>
  25 #include <string.h>
  26 #include <locale.h>
  27
  28 #include "galias.h"
  29 #include "glib.h"
  30 #include "gunichartables.h"
  31 #include "gunicodeprivate.h"
  32
  33 #define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
  34                           ? attr_table_part1[Page] \
  35                           : attr_table_part2[(Page) - 0xe00])
  36
  37 #define ATTTABLE(Page, Char) \
  38   ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))
  39
  40 #define TTYPE_PART1(Page, Char) \
  41   ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
  42    ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
  43    : (type_data[type_table_part1[Page]][Char]))
  44
  45 #define TTYPE_PART2(Page, Char) \
  46   ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
  47    ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
  48    : (type_data[type_table_part2[Page]][Char]))
  49
  50 #define TYPE(Char) \
  51   (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
  52    ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
  53    : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
  54       ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
  55       : G_UNICODE_UNASSIGNED))
  56
  57
  58 #define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER       \
  59                        || (Type) == G_UNICODE_LETTER_NUMBER     \
  60                        || (Type) == G_UNICODE_OTHER_NUMBER)
  61
  62 #define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER     \
  63                        || (Type) == G_UNICODE_UPPERCASE_LETTER  \
  64                        || (Type) == G_UNICODE_TITLECASE_LETTER  \
  65                        || (Type) == G_UNICODE_MODIFIER_LETTER   \
  66                        || (Type) == G_UNICODE_OTHER_LETTER)
  67
  68 #define ISMARK(Type) ((Type) == G_UNICODE_NON_SPACING_MARK ||   \
  69                       (Type) == G_UNICODE_COMBINING_MARK ||     \
  70                       (Type) == G_UNICODE_ENCLOSING_MARK)
  71
  72
  73 /**
  74  * g_unichar_isalnum:
  75  * @c: a Unicode character
  76  *
  77  * Determines whether a character is alphanumeric.
  78  * Given some UTF-8 text, obtain a character value
  79  * with g_utf8_get_char().
  80  *
  81  * Return value: %TRUE if @c is an alphanumeric character
  82  **/
  83 gboolean
  84 g_unichar_isalnum (gunichar c)
  85 {
  86   int t = TYPE (c);
  87   return ISDIGIT (t) || ISALPHA (t);
  88 }
  89
  90 /**
  91  * g_unichar_isalpha:
  92  * @c: a Unicode character
  93  *
  94  * Determines whether a character is alphabetic (i.e. a letter).
  95  * Given some UTF-8 text, obtain a character value with
  96  * g_utf8_get_char().
  97  *
  98  * Return value: %TRUE if @c is an alphabetic character
  99  **/
 100 gboolean
 101 g_unichar_isalpha (gunichar c)
 102 {
 103   int t = TYPE (c);
 104   return ISALPHA (t);
 105 }
 106
 107
 108 /**
 109  * g_unichar_iscntrl:
 110  * @c: a Unicode character
 111  *
 112  * Determines whether a character is a control character.
 113  * Given some UTF-8 text, obtain a character value with
 114  * g_utf8_get_char().
 115  *
 116  * Return value: %TRUE if @c is a control character
 117  **/
 118 gboolean
 119 g_unichar_iscntrl (gunichar c)
 120 {
 121   return TYPE (c) == G_UNICODE_CONTROL;
 122 }
 123
 124 /**
 125  * g_unichar_isdigit:
 126  * @c: a Unicode character
 127  *
 128  * Determines whether a character is numeric (i.e. a digit).  This
 129  * covers ASCII 0-9 and also digits in other languages/scripts.  Given
 130  * some UTF-8 text, obtain a character value with g_utf8_get_char().
 131  *
 132  * Return value: %TRUE if @c is a digit
 133  **/
 134 gboolean
 135 g_unichar_isdigit (gunichar c)
 136 {
 137   return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
 138 }
 139
 140
 141 /**
 142  * g_unichar_isgraph:
 143  * @c: a Unicode character
 144  *
 145  * Determines whether a character is printable and not a space
 146  * (returns %FALSE for control characters, format characters, and
 147  * spaces). g_unichar_isprint() is similar, but returns %TRUE for
 148  * spaces. Given some UTF-8 text, obtain a character value with
 149  * g_utf8_get_char().
 150  *
 151  * Return value: %TRUE if @c is printable unless it's a space
 152  **/
 153 gboolean
 154 g_unichar_isgraph (gunichar c)
 155 {
 156   int t = TYPE (c);
 157   return (t != G_UNICODE_CONTROL
 158           && t != G_UNICODE_FORMAT
 159           && t != G_UNICODE_UNASSIGNED
 160           && t != G_UNICODE_PRIVATE_USE
 161           && t != G_UNICODE_SURROGATE
 162           && t != G_UNICODE_SPACE_SEPARATOR);
 163 }
 164
 165 /**
 166  * g_unichar_islower:
 167  * @c: a Unicode character
 168  *
 169  * Determines whether a character is a lowercase letter.
 170  * Given some UTF-8 text, obtain a character value with
 171  * g_utf8_get_char().
 172  *
 173  * Return value: %TRUE if @c is a lowercase letter
 174  **/
 175 gboolean
 176 g_unichar_islower (gunichar c)
 177 {
 178   return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
 179 }
 180
 181
 182 /**
 183  * g_unichar_isprint:
 184  * @c: a Unicode character
 185  *
 186  * Determines whether a character is printable.
 187  * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
 188  * Given some UTF-8 text, obtain a character value with
 189  * g_utf8_get_char().
 190  *
 191  * Return value: %TRUE if @c is printable
 192  **/
 193 gboolean
 194 g_unichar_isprint (gunichar c)
 195 {
 196   int t = TYPE (c);
 197   return (t != G_UNICODE_CONTROL
 198           && t != G_UNICODE_FORMAT
 199           && t != G_UNICODE_UNASSIGNED
 200           && t != G_UNICODE_PRIVATE_USE
 201           && t != G_UNICODE_SURROGATE);
 202 }
 203
 204 /**
 205  * g_unichar_ispunct:
 206  * @c: a Unicode character
 207  *
 208  * Determines whether a character is punctuation or a symbol.
 209  * Given some UTF-8 text, obtain a character value with
 210  * g_utf8_get_char().
 211  *
 212  * Return value: %TRUE if @c is a punctuation or symbol character
 213  **/
 214 gboolean
 215 g_unichar_ispunct (gunichar c)
 216 {
 217   int t = TYPE (c);
 218   return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
 219           || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
 220           || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
 221           || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL
 222           || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL
 223           || t == G_UNICODE_OTHER_SYMBOL);
 224 }
 225
 226 /**
 227  * g_unichar_isspace:
 228  * @c: a Unicode character
 229  *
 230  * Determines whether a character is a space, tab, or line separator
 231  * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
 232  * character value with g_utf8_get_char().
 233  *
 234  * (Note: don't use this to do word breaking; you have to use
 235  * Pango or equivalent to get word breaking right, the algorithm
 236  * is fairly complex.)
 237  *
 238  * Return value: %TRUE if @c is a punctuation character
 239  **/
 240 gboolean
 241 g_unichar_isspace (gunichar c)
 242 {
 243   switch (c)
 244     {
 245       /* special-case these since Unicode thinks they are not spaces */
 246     case '\t':
 247     case '\n':
 248     case '\r':
 249     case '\f':
 250       return TRUE;
 251       break;
 252
 253     default:
 254       {
 255         int t = TYPE (c);
 256         return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
 257                 || t == G_UNICODE_PARAGRAPH_SEPARATOR);
 258       }
 259       break;
 260     }
 261 }
 262
 263 /**
 264  * g_unichar_isupper:
 265  * @c: a Unicode character
 266  *
 267  * Determines if a character is uppercase.
 268  *
 269  * Return value: %TRUE if @c is an uppercase character
 270  **/
 271 gboolean
 272 g_unichar_isupper (gunichar c)
 273 {
 274   return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
 275 }
 276
 277 /**
 278  * g_unichar_istitle:
 279  * @c: a Unicode character
 280  *
 281  * Determines if a character is titlecase. Some characters in
 282  * Unicode which are composites, such as the DZ digraph
 283  * have three case variants instead of just two. The titlecase
 284  * form is used at the beginning of a word where only the
 285  * first letter is capitalized. The titlecase form of the DZ
 286  * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
 287  *
 288  * Return value: %TRUE if the character is titlecase
 289  **/
 290 gboolean
 291 g_unichar_istitle (gunichar c)
 292 {
 293   unsigned int i;
 294   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 295     if (title_table[i][0] == c)
 296       return 1;
 297   return 0;
 298 }
 299
 300 /**
 301  * g_unichar_isxdigit:
 302  * @c: a Unicode character.
 303  *
 304  * Determines if a character is a hexidecimal digit.
 305  *
 306  * Return value: %TRUE if the character is a hexadecimal digit
 307  **/
 308 gboolean
 309 g_unichar_isxdigit (gunichar c)
 310 {
 311   int t = TYPE (c);
 312   return ((c >= 'a' && c <= 'f')
 313           || (c >= 'A' && c <= 'F')
 314           || ISDIGIT (t));
 315 }
 316
 317 /**
 318  * g_unichar_isdefined:
 319  * @c: a Unicode character
 320  *
 321  * Determines if a given character is assigned in the Unicode
 322  * standard.
 323  *
 324  * Return value: %TRUE if the character has an assigned value
 325  **/
 326 gboolean
 327 g_unichar_isdefined (gunichar c)
 328 {
 329   int t = TYPE (c);
 330   return t != G_UNICODE_UNASSIGNED;
 331 }
 332
 333 /**
 334  * g_unichar_iswide:
 335  * @c: a Unicode character
 336  *
 337  * Determines if a character is typically rendered in a double-width
 338  * cell.
 339  *
 340  * Return value: %TRUE if the character is wide
 341  **/
 342 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
 343 gboolean
 344 g_unichar_iswide (gunichar c)
 345 {
 346   if (c < 0x1100)
 347     return FALSE;
 348
 349   return (c <= 0x115f  /* Hangul Jamo init. consonants */
 350           || c == 0x2329 || c == 0x232a     /* angle brackets */
 351           || (c >= 0x2e80 && c <= 0xa4cf && (c < 0x302a || c > 0x302f)
 352               && c != 0x303f && c != 0x3099 && c!= 0x309a) /* CJK ... Yi */
 353           || (c >= 0xac00 && c <= 0xd7a3)   /* Hangul Syllables */
 354           || (c >= 0xf900 && c <= 0xfaff)   /* CJK Compatibility Ideographs */
 355           || (c >= 0xfe30 && c <= 0xfe6f)   /* CJK Compatibility Forms */
 356           || (c >= 0xff00 && c <= 0xff60)   /* Fullwidth Forms */
 357           || (c >= 0xffe0 && c <= 0xffe6)   /* Fullwidth Forms */
 358           || (c >= 0x20000 && c <= 0x2fffd) /* CJK extra stuff */
 359           || (c >= 0x30000 && c <= 0x3fffd));
 360 }
 361
 362 /**
 363  * g_unichar_toupper:
 364  * @c: a Unicode character
 365  *
 366  * Converts a character to uppercase.
 367  *
 368  * Return value: the result of converting @c to uppercase.
 369  *               If @c is not an lowercase or titlecase character,
 370  *               or has no upper case equivalent @c is returned unchanged.
 371  **/
 372 gunichar
 373 g_unichar_toupper (gunichar c)
 374 {
 375   int t = TYPE (c);
 376   if (t == G_UNICODE_LOWERCASE_LETTER)
 377     {
 378       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 379       if (val >= 0x1000000)
 380         {
 381           const gchar *p = special_case_table + val - 0x1000000;
 382           return g_utf8_get_char (p);
 383         }
 384       else
 385         return val ? val : c;
 386     }
 387   else if (t == G_UNICODE_TITLECASE_LETTER)
 388     {
 389       unsigned int i;
 390       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 391         {
 392           if (title_table[i][0] == c)
 393             return title_table[i][1];
 394         }
 395     }
 396   return c;
 397 }
 398
 399 /**
 400  * g_unichar_tolower:
 401  * @c: a Unicode character.
 402  *
 403  * Converts a character to lower case.
 404  *
 405  * Return value: the result of converting @c to lower case.
 406  *               If @c is not an upperlower or titlecase character,
 407  *               or has no lowercase equivalent @c is returned unchanged.
 408  **/
 409 gunichar
 410 g_unichar_tolower (gunichar c)
 411 {
 412   int t = TYPE (c);
 413   if (t == G_UNICODE_UPPERCASE_LETTER)
 414     {
 415       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 416       if (val >= 0x1000000)
 417         {
 418           const gchar *p = special_case_table + val - 0x1000000;
 419           return g_utf8_get_char (p);
 420         }
 421       else
 422         return val ? val : c;
 423     }
 424   else if (t == G_UNICODE_TITLECASE_LETTER)
 425     {
 426       unsigned int i;
 427       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 428         {
 429           if (title_table[i][0] == c)
 430             return title_table[i][2];
 431         }
 432     }
 433   return c;
 434 }
 435
 436 /**
 437  * g_unichar_totitle:
 438  * @c: a Unicode character
 439  *
 440  * Converts a character to the titlecase.
 441  *
 442  * Return value: the result of converting @c to titlecase.
 443  *               If @c is not an uppercase or lowercase character,
 444  *               @c is returned unchanged.
 445  **/
 446 gunichar
 447 g_unichar_totitle (gunichar c)
 448 {
 449   unsigned int i;
 450   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 451     {
 452       if (title_table[i][0] == c || title_table[i][1] == c
 453           || title_table[i][2] == c)
 454         return title_table[i][0];
 455     }
 456   return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
 457           ? ATTTABLE (c >> 8, c & 0xff)
 458           : c);
 459 }
 460
 461 /**
 462  * g_unichar_digit_value:
 463  * @c: a Unicode character
 464  *
 465  * Determines the numeric value of a character as a decimal
 466  * digit.
 467  *
 468  * Return value: If @c is a decimal digit (according to
 469  * g_unichar_isdigit()), its numeric value. Otherwise, -1.
 470  **/
 471 int
 472 g_unichar_digit_value (gunichar c)
 473 {
 474   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 475     return ATTTABLE (c >> 8, c & 0xff);
 476   return -1;
 477 }
 478
 479 /**
 480  * g_unichar_xdigit_value:
 481  * @c: a Unicode character
 482  *
 483  * Determines the numeric value of a character as a hexidecimal
 484  * digit.
 485  *
 486  * Return value: If @c is a hex digit (according to
 487  * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
 488  **/
 489 int
 490 g_unichar_xdigit_value (gunichar c)
 491 {
 492   if (c >= 'A' && c <= 'F')
 493     return c - 'A' + 10;
 494   if (c >= 'a' && c <= 'f')
 495     return c - 'a' + 10;
 496   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 497     return ATTTABLE (c >> 8, c & 0xff);
 498   return -1;
 499 }
 500
 501 /**
 502  * g_unichar_type:
 503  * @c: a Unicode character
 504  *
 505  * Classifies a Unicode character by type.
 506  *
 507  * Return value: the type of the character.
 508  **/
 509 GUnicodeType
 510 g_unichar_type (gunichar c)
 511 {
 512   return TYPE (c);
 513 }
 514
 515 /*
 516  * Case mapping functions
 517  */
 518
 519 typedef enum {
 520   LOCALE_NORMAL,
 521   LOCALE_TURKIC,
 522   LOCALE_LITHUANIAN
 523 } LocaleType;
 524
 525 static LocaleType
 526 get_locale_type (void)
 527 {
 528   const char *locale = setlocale (LC_CTYPE, NULL);
 529
 530   switch (locale[0])
 531     {
 532    case 'a':
 533       if (locale[1] == 'z')
 534         return LOCALE_TURKIC;
 535       break;
 536     case 'l':
 537       if (locale[1] == 't')
 538         return LOCALE_LITHUANIAN;
 539       break;
 540     case 't':
 541       if (locale[1] == 'r')
 542         return LOCALE_TURKIC;
 543       break;
 544     }
 545
 546   return LOCALE_NORMAL;
 547 }
 548
 549 static gint
 550 output_marks (const char **p_inout,
 551               char        *out_buffer,
 552               gboolean     remove_dot)
 553 {
 554   const char *p = *p_inout;
 555   gint len = 0;
 556
 557   while (*p)
 558     {
 559       gunichar c = g_utf8_get_char (p);
 560       int t = TYPE(c);
 561
 562       if (ISMARK(t))
 563         {
 564           if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
 565             len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
 566           p = g_utf8_next_char (p);
 567         }
 568       else
 569         break;
 570     }
 571
 572   *p_inout = p;
 573   return len;
 574 }
 575
 576 static gint
 577 output_special_case (gchar *out_buffer,
 578                      int    offset,
 579                      int    type,
 580                      int    which)
 581 {
 582   const gchar *p = special_case_table + offset;
 583   gint len;
 584
 585   if (type != G_UNICODE_TITLECASE_LETTER)
 586     p = g_utf8_next_char (p);
 587
 588   if (which == 1)
 589     p += strlen (p) + 1;
 590
 591   len = strlen (p);
 592   if (out_buffer)
 593     memcpy (out_buffer, p, len);
 594
 595   return len;
 596 }
 597
 598 static gsize
 599 real_toupper (const gchar *str,
 600               gssize       max_len,
 601               gchar       *out_buffer,
 602               LocaleType   locale_type)
 603 {
 604   const gchar *p = str;
 605   const char *last = NULL;
 606   gsize len = 0;
 607   gboolean last_was_i = FALSE;
 608
 609   while ((max_len < 0 || p < str + max_len) && *p)
 610     {
 611       gunichar c = g_utf8_get_char (p);
 612       int t = TYPE (c);
 613       gunichar val;
 614
 615       last = p;
 616       p = g_utf8_next_char (p);
 617
 618       if (locale_type == LOCALE_LITHUANIAN)
 619         {
 620           if (c == 'i')
 621             last_was_i = TRUE;
 622           else
 623             {
 624               if (last_was_i)
 625                 {
 626                   /* Nasty, need to remove any dot above. Though
 627                    * I think only E WITH DOT ABOVE occurs in practice
 628                    * which could simplify this considerably.
 629                    */
 630                   gsize decomp_len, i;
 631                   gunichar *decomp;
 632
 633                   decomp = g_unicode_canonical_decomposition (c, &decomp_len);
 634                   for (i=0; i < decomp_len; i++)
 635                     {
 636                       if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
 637                         len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
 638                     }
 639                   g_free (decomp);
 640
 641                   len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE);
 642
 643                   continue;
 644                 }
 645
 646               if (!ISMARK(t))
 647                 last_was_i = FALSE;
 648             }
 649         }
 650
 651       if (locale_type == LOCALE_TURKIC && c == 'i')
 652         {
 653           /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
 654           len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL);
 655         }
 656       else if (c == 0x0345)     /* COMBINING GREEK YPOGEGRAMMENI */
 657         {
 658           /* Nasty, need to move it after other combining marks .. this would go away if
 659            * we normalized first.
 660            */
 661           len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE);
 662
 663           /* And output as GREEK CAPITAL LETTER IOTA */
 664           len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL);
 665         }
 666       else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 667         {
 668           val = ATTTABLE (c >> 8, c & 0xff);
 669
 670           if (val >= 0x1000000)
 671             {
 672               len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t,
 673                                           t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
 674             }
 675           else
 676             {
 677               if (t == G_UNICODE_TITLECASE_LETTER)
 678                 {
 679                   unsigned int i;
 680                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 681                     {
 682                       if (title_table[i][0] == c)
 683                         val = title_table[i][1];
 684                     }
 685                 }
 686
 687               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 688             }
 689         }
 690       else
 691         {
 692           gsize char_len = g_utf8_skip[*(guchar *)last];
 693
 694           if (out_buffer)
 695             memcpy (out_buffer + len, last, char_len);
 696
 697           len += char_len;
 698         }
 699
 700     }
 701
 702   return len;
 703 }
 704
 705 /**
 706  * g_utf8_strup:
 707  * @str: a UTF-8 encoded string
 708  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 709  *
 710  * Converts all Unicode characters in the string that have a case
 711  * to uppercase. The exact manner that this is done depends
 712  * on the current locale, and may result in the number of
 713  * characters in the string increasing. (For instance, the
 714  * German ess-zet will be changed to SS.)
 715  *
 716  * Return value: a newly allocated string, with all characters
 717  *    converted to uppercase.
 718  **/
 719 gchar *
 720 g_utf8_strup (const gchar *str,
 721               gssize       len)
 722 {
 723   gsize result_len;
 724   LocaleType locale_type;
 725   gchar *result;
 726
 727   g_return_val_if_fail (str != NULL, NULL);
 728
 729   locale_type = get_locale_type ();
 730
 731   /*
 732    * We use a two pass approach to keep memory management simple
 733    */
 734   result_len = real_toupper (str, len, NULL, locale_type);
 735   result = g_malloc (result_len + 1);
 736   real_toupper (str, len, result, locale_type);
 737   result[result_len] = '\0';
 738
 739   return result;
 740 }
 741
 742 /* traverses the string checking for characters with combining class == 230
 743  * until a base character is found */
 744 static gboolean
 745 has_more_above (const gchar *str)
 746 {
 747   const gchar *p = str;
 748   gint combining_class;
 749
 750   while (*p)
 751     {
 752       combining_class = _g_unichar_combining_class (g_utf8_get_char (p));
 753       if (combining_class == 230)
 754         return TRUE;
 755       else if (combining_class == 0)
 756         break;
 757
 758       p = g_utf8_next_char (p);
 759     }
 760
 761   return FALSE;
 762 }
 763
 764 static gsize
 765 real_tolower (const gchar *str,
 766               gssize       max_len,
 767               gchar       *out_buffer,
 768               LocaleType   locale_type)
 769 {
 770   const gchar *p = str;
 771   const char *last = NULL;
 772   gsize len = 0;
 773
 774   while ((max_len < 0 || p < str + max_len) && *p)
 775     {
 776       gunichar c = g_utf8_get_char (p);
 777       int t = TYPE (c);
 778       gunichar val;
 779
 780       last = p;
 781       p = g_utf8_next_char (p);
 782
 783       if (locale_type == LOCALE_TURKIC && c == 'I')
 784         {
 785           if (g_utf8_get_char (p) == 0x0307)
 786             {
 787               /* I + COMBINING DOT ABOVE => i (U+0069) */
 788               len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL);
 789               p = g_utf8_next_char (p);
 790             }
 791           else
 792             {
 793               /* I => LATIN SMALL LETTER DOTLESS I */
 794               len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL);
 795             }
 796         }
 797       /* Introduce an explicit dot above when lowercasing capital I's and J's
 798        * whenever there are more accents above. [SpecialCasing.txt] */
 799       else if (locale_type == LOCALE_LITHUANIAN &&
 800                (c == 0x00cc || c == 0x00cd || c == 0x0128))
 801         {
 802           len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL);
 803           len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL);
 804
 805           switch (c)
 806             {
 807             case 0x00cc:
 808               len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL);
 809               break;
 810             case 0x00cd:
 811               len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL);
 812               break;
 813             case 0x0128:
 814               len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL);
 815               break;
 816             }
 817         }
 818       else if (locale_type == LOCALE_LITHUANIAN &&
 819                (c == 'I' || c == 'J' || c == 0x012e) &&
 820                has_more_above (p))
 821         {
 822           len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL);
 823           len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL);
 824         }
 825       else if (c == 0x03A3)     /* GREEK CAPITAL LETTER SIGMA */
 826         {
 827           if ((max_len < 0 || p < str + max_len) && *p)
 828             {
 829               gunichar next_c = g_utf8_get_char (p);
 830               int next_type = TYPE(next_c);
 831
 832               /* SIGMA mapps differently depending on whether it is
 833                * final or not. The following simplified test would
 834                * fail in the case of combining marks following the
 835                * sigma, but I don't think that occurs in real text.
 836                * The test here matches that in ICU.
 837                */
 838               if (ISALPHA(next_type)) /* Lu,Ll,Lt,Lm,Lo */
 839                 val = 0x3c3;    /* GREEK SMALL SIGMA */
 840               else
 841                 val = 0x3c2;    /* GREEK SMALL FINAL SIGMA */
 842             }
 843           else
 844             val = 0x3c2;        /* GREEK SMALL FINAL SIGMA */
 845
 846           len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 847         }
 848       else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 849         {
 850           val = ATTTABLE (c >> 8, c & 0xff);
 851
 852           if (val >= 0x1000000)
 853             {
 854               len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0);
 855             }
 856           else
 857             {
 858               if (t == G_UNICODE_TITLECASE_LETTER)
 859                 {
 860                   unsigned int i;
 861                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 862                     {
 863                       if (title_table[i][0] == c)
 864                         val = title_table[i][2];
 865                     }
 866                 }
 867
 868               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 869             }
 870         }
 871       else
 872         {
 873           gsize char_len = g_utf8_skip[*(guchar *)last];
 874
 875           if (out_buffer)
 876             memcpy (out_buffer + len, last, char_len);
 877
 878           len += char_len;
 879         }
 880
 881     }
 882
 883   return len;
 884 }
 885
 886 /**
 887  * g_utf8_strdown:
 888  * @str: a UTF-8 encoded string
 889  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 890  *
 891  * Converts all Unicode characters in the string that have a case
 892  * to lowercase. The exact manner that this is done depends
 893  * on the current locale, and may result in the number of
 894  * characters in the string changing.
 895  *
 896  * Return value: a newly allocated string, with all characters
 897  *    converted to lowercase.
 898  **/
 899 gchar *
 900 g_utf8_strdown (const gchar *str,
 901                 gssize       len)
 902 {
 903   gsize result_len;
 904   LocaleType locale_type;
 905   gchar *result;
 906
 907   g_return_val_if_fail (str != NULL, NULL);
 908
 909   locale_type = get_locale_type ();
 910
 911   /*
 912    * We use a two pass approach to keep memory management simple
 913    */
 914   result_len = real_tolower (str, len, NULL, locale_type);
 915   result = g_malloc (result_len + 1);
 916   real_tolower (str, len, result, locale_type);
 917   result[result_len] = '\0';
 918
 919   return result;
 920 }
 921
 922 /**
 923  * g_utf8_casefold:
 924  * @str: a UTF-8 encoded string
 925  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 926  *
 927  * Converts a string into a form that is independent of case. The
 928  * result will not correspond to any particular case, but can be
 929  * compared for equality or ordered with the results of calling
 930  * g_utf8_casefold() on other strings.
 931  *
 932  * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
 933  * only an approximation to the correct linguistic case insensitive
 934  * ordering, though it is a fairly good one. Getting this exactly
 935  * right would require a more sophisticated collation function that
 936  * takes case sensitivity into account. GLib does not currently
 937  * provide such a function.
 938  *
 939  * Return value: a newly allocated string, that is a
 940  *   case independent form of @str.
 941  **/
 942 gchar *
 943 g_utf8_casefold (const gchar *str,
 944                  gssize       len)
 945 {
 946   GString *result;
 947   const char *p;
 948
 949   g_return_val_if_fail (str != NULL, NULL);
 950
 951   result = g_string_new (NULL);
 952   p = str;
 953   while ((len < 0 || p < str + len) && *p)
 954     {
 955       gunichar ch = g_utf8_get_char (p);
 956
 957       int start = 0;
 958       int end = G_N_ELEMENTS (casefold_table);
 959
 960       if (ch >= casefold_table[start].ch &&
 961           ch <= casefold_table[end - 1].ch)
 962         {
 963           while (TRUE)
 964             {
 965               int half = (start + end) / 2;
 966               if (ch == casefold_table[half].ch)
 967                 {
 968                   g_string_append (result, casefold_table[half].data);
 969                   goto next;
 970                 }
 971               else if (half == start)
 972                 break;
 973               else if (ch > casefold_table[half].ch)
 974                 start = half;
 975               else
 976                 end = half;
 977             }
 978         }
 979
 980       g_string_append_unichar (result, g_unichar_tolower (ch));
 981
 982     next:
 983       p = g_utf8_next_char (p);
 984     }
 985
 986   return g_string_free (result, FALSE);
 987 }
 988
 989 /**
 990  * g_unichar_get_mirror_char:
 991  * @ch: a unicode character
 992  * @mirrored_ch: location to store the mirrored character
 993  *
 994  * In Unicode, some characters are <firstterm>mirrored</firstterm>. This
 995  * means that their images are mirrored horizontally in text that is laid
 996  * out from right to left. For instance, "(" would become its mirror image,
 997  * ")", in right-to-left text.
 998  *
 999  * If @ch has the Unicode mirrored property and there is another unicode
1000  * character that typically has a glyph that is the mirror image of @ch's
1001  * glyph, puts that character in the address pointed to by @mirrored_ch.
1002  *
1003  * Return value: %TRUE if @ch has a mirrored character and @mirrored_ch is
1004  * filled in, %FALSE otherwise
1005  *
1006  * Since: 2.4
1007  **/
1008 /* This code is adapted from FriBidi (http://fribidi.sourceforge.net/).
1009  * FriBidi is: Copyright (C) 1999,2000 Dov Grobgeld, and
1010  *             Copyright (C) 2001,2002 Behdad Esfahbod.
1011  */
1012 gboolean
1013 g_unichar_get_mirror_char (gunichar ch,
1014                            gunichar *mirrored_ch)
1015 {
1016   gint pos, step, size;
1017   gboolean found;
1018
1019   size = G_N_ELEMENTS (bidi_mirroring_table);
1020   pos = step = (size / 2) + 1;
1021
1022   while (step > 1)
1023     {
1024       gunichar cmp_ch = bidi_mirroring_table[pos].ch;
1025       step = (step + 1) / 2;
1026
1027       if (cmp_ch < ch)
1028         {
1029           pos += step;
1030           if (pos > size - 1)
1031             pos = size - 1;
1032         }
1033       else if (cmp_ch > ch)
1034         {
1035           pos -= step;
1036           if (pos < 0)
1037             pos = 0;
1038         }
1039       else
1040         break;
1041     }
1042   found = bidi_mirroring_table[pos].ch == ch;
1043   if (mirrored_ch)
1044     *mirrored_ch = found ? bidi_mirroring_table[pos].mirrored_ch : ch;
1045
1046   return found;
1047
1048 }