glib/gunicollate.c

   1 /* gunicollate.c - Collation
   2  *
   3  *  Copyright 2001,2005 Red Hat, Inc.
   4  *
   5  * The Gnome Library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * The Gnome Library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
  17  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18  *   Boston, MA 02111-1307, USA.
  19  */
  20
  21 #include "config.h"
  22
  23 #include <locale.h>
  24 #include <string.h>
  25 #ifdef __STDC_ISO_10646__
  26 #include <wchar.h>
  27 #endif
  28
  29 #include "glib.h"
  30 #include "gunicodeprivate.h"
  31 #include "galias.h"
  32
  33 #ifdef _MSC_VER
  34 /* Workaround for bug in MSVCR80.DLL */
  35 static gsize
  36 msc_strxfrm_wrapper (char       *string1,
  37                      const char *string2,
  38                      gsize       count)
  39 {
  40   if (!string1 || count <= 0)
  41     {
  42       char tmp;
  43
  44       return strxfrm (&tmp, string2, 1);
  45     }
  46   return strxfrm (string1, string2, count);
  47 }
  48 #define strxfrm msc_strxfrm_wrapper
  49 #endif
  50
  51 /**
  52  * g_utf8_collate:
  53  * @str1: a UTF-8 encoded string
  54  * @str2: a UTF-8 encoded string
  55  *
  56  * Compares two strings for ordering using the linguistically
  57  * correct rules for the <link linkend="setlocale">current locale</link>.
  58  * When sorting a large number of strings, it will be significantly
  59  * faster to obtain collation keys with g_utf8_collate_key() and
  60  * compare the keys with strcmp() when sorting instead of sorting
  61  * the original strings.
  62  *
  63  * Return value: &lt; 0 if @str1 compares before @str2,
  64  *   0 if they compare equal, &gt; 0 if @str1 compares after @str2.
  65  **/
  66 gint
  67 g_utf8_collate (const gchar *str1,
  68                 const gchar *str2)
  69 {
  70   gint result;
  71
  72 #ifdef __STDC_ISO_10646__
  73
  74   gunichar *str1_norm;
  75   gunichar *str2_norm;
  76
  77   g_return_val_if_fail (str1 != NULL, 0);
  78   g_return_val_if_fail (str2 != NULL, 0);
  79
  80   str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
  81   str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
  82
  83   result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
  84
  85   g_free (str1_norm);
  86   g_free (str2_norm);
  87
  88 #else /* !__STDC_ISO_10646__ */
  89
  90   const gchar *charset;
  91   gchar *str1_norm;
  92   gchar *str2_norm;
  93
  94   g_return_val_if_fail (str1 != NULL, 0);
  95   g_return_val_if_fail (str2 != NULL, 0);
  96
  97   str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
  98   str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
  99
 100   if (g_get_charset (&charset))
 101     {
 102       result = strcoll (str1_norm, str2_norm);
 103     }
 104   else
 105     {
 106       gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 107       gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 108
 109       if (str1_locale && str2_locale)
 110         result =  strcoll (str1_locale, str2_locale);
 111       else if (str1_locale)
 112         result = -1;
 113       else if (str2_locale)
 114         result = 1;
 115       else
 116         result = strcmp (str1_norm, str2_norm);
 117
 118       g_free (str1_locale);
 119       g_free (str2_locale);
 120     }
 121
 122   g_free (str1_norm);
 123   g_free (str2_norm);
 124
 125 #endif /* __STDC_ISO_10646__ */
 126
 127   return result;
 128 }
 129
 130 #ifdef __STDC_ISO_10646__
 131 /* We need UTF-8 encoding of numbers to encode the weights if
 132  * we are using wcsxfrm. However, we aren't encoding Unicode
 133  * characters, so we can't simply use g_unichar_to_utf8.
 134  *
 135  * The following routine is taken (with modification) from GNU
 136  * libc's strxfrm routine:
 137  *
 138  * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
 139  * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
 140  */
 141 static inline int
 142 utf8_encode (char *buf, wchar_t val)
 143 {
 144   int retval;
 145
 146   if (val < 0x80)
 147     {
 148       if (buf)
 149         *buf++ = (char) val;
 150       retval = 1;
 151     }
 152   else
 153     {
 154       int step;
 155
 156       for (step = 2; step < 6; ++step)
 157         if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
 158           break;
 159       retval = step;
 160
 161       if (buf)
 162         {
 163           *buf = (unsigned char) (~0xff >> step);
 164           --step;
 165           do
 166             {
 167               buf[step] = 0x80 | (val & 0x3f);
 168               val >>= 6;
 169             }
 170           while (--step > 0);
 171           *buf |= val;
 172         }
 173     }
 174
 175   return retval;
 176 }
 177 #endif /* __STDC_ISO_10646__ */
 178
 179 /**
 180  * g_utf8_collate_key:
 181  * @str: a UTF-8 encoded string.
 182  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 183  *
 184  * Converts a string into a collation key that can be compared
 185  * with other collation keys produced by the same function using
 186  * strcmp().
 187  *
 188  * The results of comparing the collation keys of two strings
 189  * with strcmp() will always be the same as comparing the two
 190  * original keys with g_utf8_collate().
 191  *
 192  * Note that this function depends on the
 193  * <link linkend="setlocale">current locale</link>.
 194  *
 195  * Return value: a newly allocated string. This string should
 196  *   be freed with g_free() when you are done with it.
 197  **/
 198 gchar *
 199 g_utf8_collate_key (const gchar *str,
 200                     gssize       len)
 201 {
 202   gchar *result;
 203   gsize xfrm_len;
 204
 205 #ifdef __STDC_ISO_10646__
 206
 207   gunichar *str_norm;
 208   wchar_t *result_wc;
 209   gsize i;
 210   gsize result_len = 0;
 211
 212   g_return_val_if_fail (str != NULL, NULL);
 213
 214   str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
 215
 216   xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
 217   result_wc = g_new (wchar_t, xfrm_len + 1);
 218   wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
 219
 220   for (i=0; i < xfrm_len; i++)
 221     result_len += utf8_encode (NULL, result_wc[i]);
 222
 223   result = g_malloc (result_len + 1);
 224   result_len = 0;
 225   for (i=0; i < xfrm_len; i++)
 226     result_len += utf8_encode (result + result_len, result_wc[i]);
 227
 228   result[result_len] = '\0';
 229
 230   g_free (result_wc);
 231   g_free (str_norm);
 232
 233   return result;
 234 #else /* !__STDC_ISO_10646__ */
 235
 236   const gchar *charset;
 237   gchar *str_norm;
 238
 239   g_return_val_if_fail (str != NULL, NULL);
 240
 241   str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
 242
 243   if (g_get_charset (&charset))
 244     {
 245       xfrm_len = strxfrm (NULL, str_norm, 0);
 246       result = g_malloc (xfrm_len + 1);
 247       strxfrm (result, str_norm, xfrm_len + 1);
 248     }
 249   else
 250     {
 251       gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 252
 253       if (str_locale)
 254         {
 255           xfrm_len = strxfrm (NULL, str_locale, 0);
 256           if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
 257             {
 258               g_free (str_locale);
 259               str_locale = NULL;
 260             }
 261         }
 262       if (str_locale)
 263         {
 264           result = g_malloc (xfrm_len + 2);
 265           result[0] = 'A';
 266           strxfrm (result + 1, str_locale, xfrm_len + 1);
 267
 268           g_free (str_locale);
 269         }
 270       else
 271         {
 272           xfrm_len = strlen (str_norm);
 273           result = g_malloc (xfrm_len + 2);
 274           result[0] = 'B';
 275           memcpy (result + 1, str_norm, xfrm_len);
 276           result[xfrm_len+1] = '\0';
 277         }
 278     }
 279
 280   g_free (str_norm);
 281 #endif /* __STDC_ISO_10646__ */
 282
 283   return result;
 284 }
 285
 286 /* This is a collation key that is very very likely to sort before any
 287    collation key that libc strxfrm generates. We use this before any
 288    special case (dot or number) to make sure that its sorted before
 289    anything else.
 290  */
 291 #define COLLATION_SENTINEL "\1\1\1"
 292
 293 /**
 294  * g_utf8_collate_key_for_filename:
 295  * @str: a UTF-8 encoded string.
 296  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 297  *
 298  * Converts a string into a collation key that can be compared
 299  * with other collation keys produced by the same function using strcmp().
 300  *
 301  * In order to sort filenames correctly, this function treats the dot '.'
 302  * as a special case. Most dictionary orderings seem to consider it
 303  * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
 304  * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
 305  * would like to treat numbers intelligently so that "file1" "file10" "file5"
 306  * is sorted as "file1" "file5" "file10".
 307  *
 308  * Note that this function depends on the
 309  * <link linkend="setlocale">current locale</link>.
 310  *
 311  * Return value: a newly allocated string. This string should
 312  *   be freed with g_free() when you are done with it.
 313  *
 314  * Since: 2.8
 315  */
 316 gchar*
 317 g_utf8_collate_key_for_filename (const gchar *str,
 318                                  gssize       len)
 319 {
 320   GString *result;
 321   GString *append;
 322   const gchar *p;
 323   const gchar *prev;
 324   const gchar *end;
 325   gchar *collate_key;
 326   gint digits;
 327   gint leading_zeros;
 328
 329   /*
 330    * How it works:
 331    *
 332    * Split the filename into collatable substrings which do
 333    * not contain [.0-9] and special-cased substrings. The collatable
 334    * substrings are run through the normal g_utf8_collate_key() and the
 335    * resulting keys are concatenated with keys generated from the
 336    * special-cased substrings.
 337    *
 338    * Special cases: Dots are handled by replacing them with '\1' which
 339    * implies that short dot-delimited substrings are before long ones,
 340    * e.g.
 341    *
 342    *   a\1a   (a.a)
 343    *   a-\1a  (a-.a)
 344    *   aa\1a  (aa.a)
 345    *
 346    * Numbers are handled by prepending to each number d-1 superdigits
 347    * where d = number of digits in the number and SUPERDIGIT is a
 348    * character with an integer value higher than any digit (for instance
 349    * ':'). This ensures that single-digit numbers are sorted before
 350    * double-digit numbers which in turn are sorted separately from
 351    * triple-digit numbers, etc. To avoid strange side-effects when
 352    * sorting strings that already contain SUPERDIGITs, a '\2'
 353    * is also prepended, like this
 354    *
 355    *   file\21      (file1)
 356    *   file\25      (file5)
 357    *   file\2:10    (file10)
 358    *   file\2:26    (file26)
 359    *   file\2::100  (file100)
 360    *   file:foo     (file:foo)
 361    *
 362    * This has the side-effect of sorting numbers before everything else (except
 363    * dots), but this is probably OK.
 364    *
 365    * Leading digits are ignored when doing the above. To discriminate
 366    * numbers which differ only in the number of leading digits, we append
 367    * the number of leading digits as a byte at the very end of the collation
 368    * key.
 369    *
 370    * To try avoid conflict with any collation key sequence generated by libc we
 371    * start each switch to a special cased part with a sentinel that hopefully
 372    * will sort before anything libc will generate.
 373    */
 374
 375   if (len < 0)
 376     len = strlen (str);
 377
 378   result = g_string_sized_new (len * 2);
 379   append = g_string_sized_new (0);
 380
 381   end = str + len;
 382
 383   /* No need to use utf8 functions, since we're only looking for ascii chars */
 384   for (prev = p = str; p < end; p++)
 385     {
 386       switch (*p)
 387         {
 388         case '.':
 389           if (prev != p)
 390             {
 391               collate_key = g_utf8_collate_key (prev, p - prev);
 392               g_string_append (result, collate_key);
 393               g_free (collate_key);
 394             }
 395
 396           g_string_append (result, COLLATION_SENTINEL "\1");
 397
 398           /* skip the dot */
 399           prev = p + 1;
 400           break;
 401
 402         case '0':
 403         case '1':
 404         case '2':
 405         case '3':
 406         case '4':
 407         case '5':
 408         case '6':
 409         case '7':
 410         case '8':
 411         case '9':
 412           if (prev != p)
 413             {
 414               collate_key = g_utf8_collate_key (prev, p - prev);
 415               g_string_append (result, collate_key);
 416               g_free (collate_key);
 417             }
 418
 419           g_string_append (result, COLLATION_SENTINEL "\2");
 420
 421           prev = p;
 422
 423           /* write d-1 colons */
 424           if (*p == '0')
 425             {
 426               leading_zeros = 1;
 427               digits = 0;
 428             }
 429           else
 430             {
 431               leading_zeros = 0;
 432               digits = 1;
 433             }
 434
 435           while (++p < end)
 436             {
 437               if (*p == '0' && !digits)
 438                 ++leading_zeros;
 439               else if (g_ascii_isdigit(*p))
 440                 ++digits;
 441               else
 442                 {
 443                   /* count an all-zero sequence as
 444                    * one digit plus leading zeros
 445                    */
 446                   if (!digits)
 447                     {
 448                       ++digits;
 449                       --leading_zeros;
 450                     }
 451                   break;
 452                 }
 453             }
 454
 455           while (digits > 1)
 456             {
 457               g_string_append_c (result, ':');
 458               --digits;
 459             }
 460
 461           if (leading_zeros > 0)
 462             {
 463               g_string_append_c (append, (char)leading_zeros);
 464               prev += leading_zeros;
 465             }
 466
 467           /* write the number itself */
 468           g_string_append_len (result, prev, p - prev);
 469
 470           prev = p;
 471           --p;    /* go one step back to avoid disturbing outer loop */
 472           break;
 473
 474         default:
 475           /* other characters just accumulate */
 476           break;
 477         }
 478     }
 479
 480   if (prev != p)
 481     {
 482       collate_key = g_utf8_collate_key (prev, p - prev);
 483       g_string_append (result, collate_key);
 484       g_free (collate_key);
 485     }
 486
 487   g_string_append (result, append->str);
 488   g_string_free (append, TRUE);
 489
 490   return g_string_free (result, FALSE);
 491 }
 492
 493
 494 #define __G_UNICOLLATE_C__
 495 #include "galiasdef.c"