glib/gunicollate.c

   1 /* gunicollate.c - Collation
   2  *
   3  *  Copyright 2001,2005 Red Hat, Inc.
   4  *
   5  * The Gnome Library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * The Gnome Library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
  17  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18  *   Boston, MA 02111-1307, USA.
  19  */
  20
  21 #include "config.h"
  22
  23 #include <locale.h>
  24 #include <string.h>
  25 #ifdef __STDC_ISO_10646__
  26 #include <wchar.h>
  27 #endif
  28
  29 #ifdef HAVE_CARBON
  30 #include <CoreServices/CoreServices.h>
  31 #endif
  32
  33 #include "glib.h"
  34 #include "gunicodeprivate.h"
  35
  36 #ifdef _MSC_VER
  37 /* Workaround for bug in MSVCR80.DLL */
  38 static gsize
  39 msc_strxfrm_wrapper (char       *string1,
  40                      const char *string2,
  41                      gsize       count)
  42 {
  43   if (!string1 || count <= 0)
  44     {
  45       char tmp;
  46
  47       return strxfrm (&tmp, string2, 1);
  48     }
  49   return strxfrm (string1, string2, count);
  50 }
  51 #define strxfrm msc_strxfrm_wrapper
  52 #endif
  53
  54 /**
  55  * g_utf8_collate:
  56  * @str1: a UTF-8 encoded string
  57  * @str2: a UTF-8 encoded string
  58  *
  59  * Compares two strings for ordering using the linguistically
  60  * correct rules for the <link linkend="setlocale">current locale</link>.
  61  * When sorting a large number of strings, it will be significantly
  62  * faster to obtain collation keys with g_utf8_collate_key() and
  63  * compare the keys with strcmp() when sorting instead of sorting
  64  * the original strings.
  65  *
  66  * Return value: &lt; 0 if @str1 compares before @str2,
  67  *   0 if they compare equal, &gt; 0 if @str1 compares after @str2.
  68  **/
  69 gint
  70 g_utf8_collate (const gchar *str1,
  71                 const gchar *str2)
  72 {
  73   gint result;
  74
  75 #ifdef HAVE_CARBON
  76
  77   UniChar *str1_utf16;
  78   UniChar *str2_utf16;
  79   glong len1;
  80   glong len2;
  81   SInt32 retval = 0;
  82
  83   g_return_val_if_fail (str1 != NULL, 0);
  84   g_return_val_if_fail (str2 != NULL, 0);
  85
  86   str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
  87   str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
  88
  89   UCCompareTextDefault (kUCCollateStandardOptions,
  90                         str1_utf16, len1, str2_utf16, len2,
  91                         NULL, &retval);
  92   result = retval;
  93
  94   g_free (str2_utf16);
  95   g_free (str1_utf16);
  96
  97 #elif defined(__STDC_ISO_10646__)
  98
  99   gunichar *str1_norm;
 100   gunichar *str2_norm;
 101
 102   g_return_val_if_fail (str1 != NULL, 0);
 103   g_return_val_if_fail (str2 != NULL, 0);
 104
 105   str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
 106   str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
 107
 108   result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
 109
 110   g_free (str1_norm);
 111   g_free (str2_norm);
 112
 113 #else /* !__STDC_ISO_10646__ */
 114
 115   const gchar *charset;
 116   gchar *str1_norm;
 117   gchar *str2_norm;
 118
 119   g_return_val_if_fail (str1 != NULL, 0);
 120   g_return_val_if_fail (str2 != NULL, 0);
 121
 122   str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
 123   str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
 124
 125   if (g_get_charset (&charset))
 126     {
 127       result = strcoll (str1_norm, str2_norm);
 128     }
 129   else
 130     {
 131       gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 132       gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 133
 134       if (str1_locale && str2_locale)
 135         result =  strcoll (str1_locale, str2_locale);
 136       else if (str1_locale)
 137         result = -1;
 138       else if (str2_locale)
 139         result = 1;
 140       else
 141         result = strcmp (str1_norm, str2_norm);
 142
 143       g_free (str1_locale);
 144       g_free (str2_locale);
 145     }
 146
 147   g_free (str1_norm);
 148   g_free (str2_norm);
 149
 150 #endif /* __STDC_ISO_10646__ */
 151
 152   return result;
 153 }
 154
 155 #if defined(__STDC_ISO_10646__) || defined(HAVE_CARBON)
 156 /* We need UTF-8 encoding of numbers to encode the weights if
 157  * we are using wcsxfrm. However, we aren't encoding Unicode
 158  * characters, so we can't simply use g_unichar_to_utf8.
 159  *
 160  * The following routine is taken (with modification) from GNU
 161  * libc's strxfrm routine:
 162  *
 163  * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
 164  * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
 165  */
 166 static inline int
 167 utf8_encode (char *buf, wchar_t val)
 168 {
 169   int retval;
 170
 171   if (val < 0x80)
 172     {
 173       if (buf)
 174         *buf++ = (char) val;
 175       retval = 1;
 176     }
 177   else
 178     {
 179       int step;
 180
 181       for (step = 2; step < 6; ++step)
 182         if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
 183           break;
 184       retval = step;
 185
 186       if (buf)
 187         {
 188           *buf = (unsigned char) (~0xff >> step);
 189           --step;
 190           do
 191             {
 192               buf[step] = 0x80 | (val & 0x3f);
 193               val >>= 6;
 194             }
 195           while (--step > 0);
 196           *buf |= val;
 197         }
 198     }
 199
 200   return retval;
 201 }
 202 #endif /* __STDC_ISO_10646__ || HAVE_CARBON */
 203
 204 #ifdef HAVE_CARBON
 205
 206 static gchar *
 207 collate_key_to_string (UCCollationValue *key,
 208                        gsize             key_len)
 209 {
 210   gchar *result;
 211   gsize result_len;
 212   gsize i;
 213
 214   /* Pretty smart algorithm here: ignore first eight bytes of the
 215    * collation key. It doesn't produce results equivalent to
 216    * UCCompareCollationKeys's, but the difference seems to be only
 217    * that UCCompareCollationKeys in some cases produces 0 where our
 218    * comparison gets -1 or 1. */
 219
 220   if (key_len * sizeof (UCCollationValue) <= 8)
 221     return g_strdup ("");
 222
 223   result_len = 0;
 224   for (i = 8; i < key_len * sizeof (UCCollationValue); i++)
 225     /* there may be nul bytes, encode byteval+1 */
 226     result_len += utf8_encode (NULL, *((guchar*)key + i) + 1);
 227
 228   result = g_malloc (result_len + 1);
 229   result_len = 0;
 230   for (i = 8; i < key_len * sizeof (UCCollationValue); i++)
 231     result_len += utf8_encode (result + result_len, *((guchar*)key + i) + 1);
 232
 233   result[result_len] = 0;
 234   return result;
 235 }
 236
 237 static gchar *
 238 carbon_collate_key_with_collator (const gchar *str,
 239                                   gssize       len,
 240                                   CollatorRef  collator)
 241 {
 242   UniChar *str_utf16 = NULL;
 243   glong len_utf16;
 244   OSStatus ret;
 245   UCCollationValue staticbuf[512];
 246   UCCollationValue *freeme = NULL;
 247   UCCollationValue *buf;
 248   ItemCount buf_len;
 249   ItemCount key_len;
 250   ItemCount try_len;
 251   gchar *result = NULL;
 252
 253   str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
 254   try_len = len_utf16 * 5 + 2;
 255
 256   if (try_len <= sizeof staticbuf)
 257     {
 258       buf = staticbuf;
 259       buf_len = sizeof staticbuf;
 260     }
 261   else
 262     {
 263       freeme = g_new (UCCollationValue, try_len);
 264       buf = freeme;
 265       buf_len = try_len;
 266     }
 267
 268   ret = UCGetCollationKey (collator, str_utf16, len_utf16,
 269                            buf_len, &key_len, buf);
 270
 271   if (ret == kCollateBufferTooSmall)
 272     {
 273       freeme = g_renew (UCCollationValue, freeme, try_len * 2);
 274       buf = freeme;
 275       buf_len = try_len * 2;
 276       ret = UCGetCollationKey (collator, str_utf16, len_utf16,
 277                                buf_len, &key_len, buf);
 278     }
 279
 280   if (ret == 0)
 281     result = collate_key_to_string (buf, key_len);
 282   else
 283     result = g_strdup ("");
 284
 285   g_free (freeme);
 286   g_free (str_utf16);
 287   return result;
 288 }
 289
 290 static gchar *
 291 carbon_collate_key (const gchar *str,
 292                     gssize       len)
 293 {
 294   static CollatorRef collator;
 295
 296   if (G_UNLIKELY (!collator))
 297     {
 298       UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
 299
 300       if (!collator)
 301         {
 302           static gboolean been_here;
 303           if (!been_here)
 304             g_warning ("%s: UCCreateCollator failed", G_STRLOC);
 305           been_here = TRUE;
 306           return g_strdup ("");
 307         }
 308     }
 309
 310   return carbon_collate_key_with_collator (str, len, collator);
 311 }
 312
 313 static gchar *
 314 carbon_collate_key_for_filename (const gchar *str,
 315                                  gssize       len)
 316 {
 317   static CollatorRef collator;
 318
 319   if (G_UNLIKELY (!collator))
 320     {
 321       /* http://developer.apple.com/qa/qa2004/qa1159.html */
 322       UCCreateCollator (NULL, 0,
 323                         kUCCollateComposeInsensitiveMask
 324                          | kUCCollateWidthInsensitiveMask
 325                          | kUCCollateCaseInsensitiveMask
 326                          | kUCCollateDigitsOverrideMask
 327                          | kUCCollateDigitsAsNumberMask
 328                          | kUCCollatePunctuationSignificantMask,
 329                         &collator);
 330
 331       if (!collator)
 332         {
 333           static gboolean been_here;
 334           if (!been_here)
 335             g_warning ("%s: UCCreateCollator failed", G_STRLOC);
 336           been_here = TRUE;
 337           return g_strdup ("");
 338         }
 339     }
 340
 341   return carbon_collate_key_with_collator (str, len, collator);
 342 }
 343
 344 #endif /* HAVE_CARBON */
 345
 346 /**
 347  * g_utf8_collate_key:
 348  * @str: a UTF-8 encoded string.
 349  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 350  *
 351  * Converts a string into a collation key that can be compared
 352  * with other collation keys produced by the same function using
 353  * strcmp().
 354  *
 355  * The results of comparing the collation keys of two strings
 356  * with strcmp() will always be the same as comparing the two
 357  * original keys with g_utf8_collate().
 358  *
 359  * Note that this function depends on the
 360  * <link linkend="setlocale">current locale</link>.
 361  *
 362  * Return value: a newly allocated string. This string should
 363  *   be freed with g_free() when you are done with it.
 364  **/
 365 gchar *
 366 g_utf8_collate_key (const gchar *str,
 367                     gssize       len)
 368 {
 369   gchar *result;
 370
 371 #ifdef HAVE_CARBON
 372
 373   g_return_val_if_fail (str != NULL, NULL);
 374   result = carbon_collate_key (str, len);
 375
 376 #elif defined(__STDC_ISO_10646__)
 377
 378   gsize xfrm_len;
 379   gunichar *str_norm;
 380   wchar_t *result_wc;
 381   gsize i;
 382   gsize result_len = 0;
 383
 384   g_return_val_if_fail (str != NULL, NULL);
 385
 386   str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
 387
 388   xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
 389   result_wc = g_new (wchar_t, xfrm_len + 1);
 390   wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
 391
 392   for (i=0; i < xfrm_len; i++)
 393     result_len += utf8_encode (NULL, result_wc[i]);
 394
 395   result = g_malloc (result_len + 1);
 396   result_len = 0;
 397   for (i=0; i < xfrm_len; i++)
 398     result_len += utf8_encode (result + result_len, result_wc[i]);
 399
 400   result[result_len] = '\0';
 401
 402   g_free (result_wc);
 403   g_free (str_norm);
 404
 405   return result;
 406 #else /* !__STDC_ISO_10646__ */
 407
 408   gsize xfrm_len;
 409   const gchar *charset;
 410   gchar *str_norm;
 411
 412   g_return_val_if_fail (str != NULL, NULL);
 413
 414   str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
 415
 416   result = NULL;
 417
 418   if (g_get_charset (&charset))
 419     {
 420       xfrm_len = strxfrm (NULL, str_norm, 0);
 421       if (xfrm_len >= 0 && xfrm_len < G_MAXINT - 2)
 422         {
 423           result = g_malloc (xfrm_len + 1);
 424           strxfrm (result, str_norm, xfrm_len + 1);
 425         }
 426     }
 427   else
 428     {
 429       gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 430
 431       if (str_locale)
 432         {
 433           xfrm_len = strxfrm (NULL, str_locale, 0);
 434           if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
 435             {
 436               g_free (str_locale);
 437               str_locale = NULL;
 438             }
 439         }
 440       if (str_locale)
 441         {
 442           result = g_malloc (xfrm_len + 2);
 443           result[0] = 'A';
 444           strxfrm (result + 1, str_locale, xfrm_len + 1);
 445
 446           g_free (str_locale);
 447         }
 448     }
 449
 450   if (!result)
 451     {
 452       xfrm_len = strlen (str_norm);
 453       result = g_malloc (xfrm_len + 2);
 454       result[0] = 'B';
 455       memcpy (result + 1, str_norm, xfrm_len);
 456       result[xfrm_len+1] = '\0';
 457     }
 458
 459   g_free (str_norm);
 460 #endif /* __STDC_ISO_10646__ */
 461
 462   return result;
 463 }
 464
 465 /* This is a collation key that is very very likely to sort before any
 466    collation key that libc strxfrm generates. We use this before any
 467    special case (dot or number) to make sure that its sorted before
 468    anything else.
 469  */
 470 #define COLLATION_SENTINEL "\1\1\1"
 471
 472 /**
 473  * g_utf8_collate_key_for_filename:
 474  * @str: a UTF-8 encoded string.
 475  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 476  *
 477  * Converts a string into a collation key that can be compared
 478  * with other collation keys produced by the same function using strcmp().
 479  *
 480  * In order to sort filenames correctly, this function treats the dot '.'
 481  * as a special case. Most dictionary orderings seem to consider it
 482  * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
 483  * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
 484  * would like to treat numbers intelligently so that "file1" "file10" "file5"
 485  * is sorted as "file1" "file5" "file10".
 486  *
 487  * Note that this function depends on the
 488  * <link linkend="setlocale">current locale</link>.
 489  *
 490  * Return value: a newly allocated string. This string should
 491  *   be freed with g_free() when you are done with it.
 492  *
 493  * Since: 2.8
 494  */
 495 gchar*
 496 g_utf8_collate_key_for_filename (const gchar *str,
 497                                  gssize       len)
 498 {
 499 #ifndef HAVE_CARBON
 500   GString *result;
 501   GString *append;
 502   const gchar *p;
 503   const gchar *prev;
 504   const gchar *end;
 505   gchar *collate_key;
 506   gint digits;
 507   gint leading_zeros;
 508
 509   /*
 510    * How it works:
 511    *
 512    * Split the filename into collatable substrings which do
 513    * not contain [.0-9] and special-cased substrings. The collatable
 514    * substrings are run through the normal g_utf8_collate_key() and the
 515    * resulting keys are concatenated with keys generated from the
 516    * special-cased substrings.
 517    *
 518    * Special cases: Dots are handled by replacing them with '\1' which
 519    * implies that short dot-delimited substrings are before long ones,
 520    * e.g.
 521    *
 522    *   a\1a   (a.a)
 523    *   a-\1a  (a-.a)
 524    *   aa\1a  (aa.a)
 525    *
 526    * Numbers are handled by prepending to each number d-1 superdigits
 527    * where d = number of digits in the number and SUPERDIGIT is a
 528    * character with an integer value higher than any digit (for instance
 529    * ':'). This ensures that single-digit numbers are sorted before
 530    * double-digit numbers which in turn are sorted separately from
 531    * triple-digit numbers, etc. To avoid strange side-effects when
 532    * sorting strings that already contain SUPERDIGITs, a '\2'
 533    * is also prepended, like this
 534    *
 535    *   file\21      (file1)
 536    *   file\25      (file5)
 537    *   file\2:10    (file10)
 538    *   file\2:26    (file26)
 539    *   file\2::100  (file100)
 540    *   file:foo     (file:foo)
 541    *
 542    * This has the side-effect of sorting numbers before everything else (except
 543    * dots), but this is probably OK.
 544    *
 545    * Leading digits are ignored when doing the above. To discriminate
 546    * numbers which differ only in the number of leading digits, we append
 547    * the number of leading digits as a byte at the very end of the collation
 548    * key.
 549    *
 550    * To try avoid conflict with any collation key sequence generated by libc we
 551    * start each switch to a special cased part with a sentinel that hopefully
 552    * will sort before anything libc will generate.
 553    */
 554
 555   if (len < 0)
 556     len = strlen (str);
 557
 558   result = g_string_sized_new (len * 2);
 559   append = g_string_sized_new (0);
 560
 561   end = str + len;
 562
 563   /* No need to use utf8 functions, since we're only looking for ascii chars */
 564   for (prev = p = str; p < end; p++)
 565     {
 566       switch (*p)
 567         {
 568         case '.':
 569           if (prev != p)
 570             {
 571               collate_key = g_utf8_collate_key (prev, p - prev);
 572               g_string_append (result, collate_key);
 573               g_free (collate_key);
 574             }
 575
 576           g_string_append (result, COLLATION_SENTINEL "\1");
 577
 578           /* skip the dot */
 579           prev = p + 1;
 580           break;
 581
 582         case '0':
 583         case '1':
 584         case '2':
 585         case '3':
 586         case '4':
 587         case '5':
 588         case '6':
 589         case '7':
 590         case '8':
 591         case '9':
 592           if (prev != p)
 593             {
 594               collate_key = g_utf8_collate_key (prev, p - prev);
 595               g_string_append (result, collate_key);
 596               g_free (collate_key);
 597             }
 598
 599           g_string_append (result, COLLATION_SENTINEL "\2");
 600
 601           prev = p;
 602
 603           /* write d-1 colons */
 604           if (*p == '0')
 605             {
 606               leading_zeros = 1;
 607               digits = 0;
 608             }
 609           else
 610             {
 611               leading_zeros = 0;
 612               digits = 1;
 613             }
 614
 615           while (++p < end)
 616             {
 617               if (*p == '0' && !digits)
 618                 ++leading_zeros;
 619               else if (g_ascii_isdigit(*p))
 620                 ++digits;
 621               else
 622                 {
 623                   /* count an all-zero sequence as
 624                    * one digit plus leading zeros
 625                    */
 626                   if (!digits)
 627                     {
 628                       ++digits;
 629                       --leading_zeros;
 630                     }
 631                   break;
 632                 }
 633             }
 634
 635           while (digits > 1)
 636             {
 637               g_string_append_c (result, ':');
 638               --digits;
 639             }
 640
 641           if (leading_zeros > 0)
 642             {
 643               g_string_append_c (append, (char)leading_zeros);
 644               prev += leading_zeros;
 645             }
 646
 647           /* write the number itself */
 648           g_string_append_len (result, prev, p - prev);
 649
 650           prev = p;
 651           --p;    /* go one step back to avoid disturbing outer loop */
 652           break;
 653
 654         default:
 655           /* other characters just accumulate */
 656           break;
 657         }
 658     }
 659
 660   if (prev != p)
 661     {
 662       collate_key = g_utf8_collate_key (prev, p - prev);
 663       g_string_append (result, collate_key);
 664       g_free (collate_key);
 665     }
 666
 667   g_string_append (result, append->str);
 668   g_string_free (append, TRUE);
 669
 670   return g_string_free (result, FALSE);
 671 #else /* HAVE_CARBON */
 672   return carbon_collate_key_for_filename (str, len);
 673 #endif
 674 }