glib/gunicollate.c

   1 /* gunicollate.c - Collation
   2  *
   3  *  Copyright 2001,2005 Red Hat, Inc.
   4  *
   5  * The Gnome Library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * The Gnome Library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
  17  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18  *   Boston, MA 02111-1307, USA.
  19  */
  20
  21 #include "config.h"
  22
  23 #include <locale.h>
  24 #include <string.h>
  25 #ifdef __STDC_ISO_10646__
  26 #include <wchar.h>
  27 #endif
  28
  29 #include "glib.h"
  30 #include "gunicodeprivate.h"
  31 #include "galias.h"
  32
  33 /**
  34  * g_utf8_collate:
  35  * @str1: a UTF-8 encoded string
  36  * @str2: a UTF-8 encoded string
  37  *
  38  * Compares two strings for ordering using the linguistically
  39  * correct rules for the current locale. When sorting a large
  40  * number of strings, it will be significantly faster to
  41  * obtain collation keys with g_utf8_collate_key() and
  42  * compare the keys with strcmp() when
  43  * sorting instead of sorting the original strings.
  44  *
  45  * Return value: &lt; 0 if @str1 compares before @str2,
  46  *   0 if they compare equal, &gt; 0 if @str1 compares after @str2.
  47  **/
  48 gint
  49 g_utf8_collate (const gchar *str1,
  50                 const gchar *str2)
  51 {
  52   gint result;
  53
  54 #ifdef __STDC_ISO_10646__
  55
  56   gunichar *str1_norm;
  57   gunichar *str2_norm;
  58
  59   g_return_val_if_fail (str1 != NULL, 0);
  60   g_return_val_if_fail (str2 != NULL, 0);
  61
  62   str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
  63   str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
  64
  65   result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
  66
  67   g_free (str1_norm);
  68   g_free (str2_norm);
  69
  70 #else /* !__STDC_ISO_10646__ */
  71
  72   const gchar *charset;
  73   gchar *str1_norm;
  74   gchar *str2_norm;
  75
  76   g_return_val_if_fail (str1 != NULL, 0);
  77   g_return_val_if_fail (str2 != NULL, 0);
  78
  79   str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
  80   str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
  81
  82   if (g_get_charset (&charset))
  83     {
  84       result = strcoll (str1_norm, str2_norm);
  85     }
  86   else
  87     {
  88       gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
  89       gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
  90
  91       if (str1_locale && str2_locale)
  92         result =  strcoll (str1_locale, str2_locale);
  93       else if (str1_locale)
  94         result = -1;
  95       else if (str2_locale)
  96         result = 1;
  97       else
  98         result = strcmp (str1_norm, str2_norm);
  99
 100       g_free (str1_locale);
 101       g_free (str2_locale);
 102     }
 103
 104   g_free (str1_norm);
 105   g_free (str2_norm);
 106
 107 #endif /* __STDC_ISO_10646__ */
 108
 109   return result;
 110 }
 111
 112 #ifdef __STDC_ISO_10646__
 113 /* We need UTF-8 encoding of numbers to encode the weights if
 114  * we are using wcsxfrm. However, we aren't encoding Unicode
 115  * characters, so we can't simply use g_unichar_to_utf8.
 116  *
 117  * The following routine is taken (with modification) from GNU
 118  * libc's strxfrm routine:
 119  *
 120  * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
 121  * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
 122  */
 123 static inline int
 124 utf8_encode (char *buf, wchar_t val)
 125 {
 126   int retval;
 127
 128   if (val < 0x80)
 129     {
 130       if (buf)
 131         *buf++ = (char) val;
 132       retval = 1;
 133     }
 134   else
 135     {
 136       int step;
 137
 138       for (step = 2; step < 6; ++step)
 139         if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
 140           break;
 141       retval = step;
 142
 143       if (buf)
 144         {
 145           *buf = (unsigned char) (~0xff >> step);
 146           --step;
 147           do
 148             {
 149               buf[step] = 0x80 | (val & 0x3f);
 150               val >>= 6;
 151             }
 152           while (--step > 0);
 153           *buf |= val;
 154         }
 155     }
 156
 157   return retval;
 158 }
 159 #endif /* __STDC_ISO_10646__ */
 160
 161 /**
 162  * g_utf8_collate_key:
 163  * @str: a UTF-8 encoded string.
 164  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 165  *
 166  * Converts a string into a collation key that can be compared
 167  * with other collation keys produced by the same function using
 168  * strcmp().
 169  * The results of comparing the collation keys of two strings
 170  * with strcmp() will always be the same as
 171  * comparing the two original keys with g_utf8_collate().
 172  *
 173  * Return value: a newly allocated string. This string should
 174  *   be freed with g_free() when you are done with it.
 175  **/
 176 gchar *
 177 g_utf8_collate_key (const gchar *str,
 178                     gssize       len)
 179 {
 180   gchar *result;
 181   size_t xfrm_len;
 182
 183 #ifdef __STDC_ISO_10646__
 184
 185   gunichar *str_norm;
 186   wchar_t *result_wc;
 187   size_t i;
 188   size_t result_len = 0;
 189
 190   g_return_val_if_fail (str != NULL, NULL);
 191
 192   str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
 193
 194   setlocale (LC_COLLATE, "");
 195
 196   xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
 197   result_wc = g_new (wchar_t, xfrm_len + 1);
 198   wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
 199
 200   for (i=0; i < xfrm_len; i++)
 201     result_len += utf8_encode (NULL, result_wc[i]);
 202
 203   result = g_malloc (result_len + 1);
 204   result_len = 0;
 205   for (i=0; i < xfrm_len; i++)
 206     result_len += utf8_encode (result + result_len, result_wc[i]);
 207
 208   result[result_len] = '\0';
 209
 210   g_free (result_wc);
 211   g_free (str_norm);
 212
 213   return result;
 214 #else /* !__STDC_ISO_10646__ */
 215
 216   const gchar *charset;
 217   gchar *str_norm;
 218
 219   g_return_val_if_fail (str != NULL, NULL);
 220
 221   str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
 222
 223   if (g_get_charset (&charset))
 224     {
 225       xfrm_len = strxfrm (NULL, str_norm, 0);
 226       result = g_malloc (xfrm_len + 1);
 227       strxfrm (result, str_norm, xfrm_len + 1);
 228     }
 229   else
 230     {
 231       gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 232
 233       if (str_locale)
 234         {
 235           xfrm_len = strxfrm (NULL, str_locale, 0);
 236           if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
 237             {
 238               g_free (str_locale);
 239               str_locale = NULL;
 240             }
 241         }
 242       if (str_locale)
 243         {
 244           result = g_malloc (xfrm_len + 2);
 245           result[0] = 'A';
 246           strxfrm (result + 1, str_locale, xfrm_len + 1);
 247
 248           g_free (str_locale);
 249         }
 250       else
 251         {
 252           xfrm_len = strlen (str_norm);
 253           result = g_malloc (xfrm_len + 2);
 254           result[0] = 'B';
 255           memcpy (result + 1, str_norm, xfrm_len);
 256           result[xfrm_len+1] = '\0';
 257         }
 258     }
 259
 260   g_free (str_norm);
 261 #endif /* __STDC_ISO_10646__ */
 262
 263   return result;
 264 }
 265
 266 /* This is a collation key that is very very likely to sort before any
 267    collation key that libc strxfrm generates. We use this before any
 268    special case (dot or number) to make sure that its sorted before
 269    anything else.
 270  */
 271 #define COLLATION_SENTINEL "\1\1\1"
 272
 273 /**
 274  * g_utf8_collate_key_for_filename:
 275  * @str: a UTF-8 encoded string.
 276  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 277  *
 278  * Converts a string into a collation key that can be compared
 279  * with other collation keys produced by the same function using strcmp().
 280  *
 281  * In order to sort filenames correctly, this function treats the dot '.'
 282  * as a special case. Most dictionary orderings seem to consider it
 283  * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
 284  * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
 285  * would like to treat numbers intelligently so that "file1" "file10" "file5"
 286  * is sorted as "file1" "file5" "file10".
 287  *
 288  * Return value: a newly allocated string. This string should
 289  *   be freed with g_free() when you are done with it.
 290  *
 291  * Since: 2.8
 292  */
 293 gchar*
 294 g_utf8_collate_key_for_filename (const gchar *str,
 295                                  gssize       len)
 296 {
 297   GString *result;
 298   GString *append;
 299   const gchar *p;
 300   const gchar *prev;
 301   const gchar *end;
 302   gchar *collate_key;
 303   gint digits;
 304   gint leading_zeros;
 305
 306   /*
 307    * How it works:
 308    *
 309    * Split the filename into collatable substrings which do
 310    * not contain [.0-9] and special-cased substrings. The collatable
 311    * substrings are run through the normal g_utf8_collate_key() and the
 312    * resulting keys are concatenated with keys generated from the
 313    * special-cased substrings.
 314    *
 315    * Special cases: Dots are handled by replacing them with '\1' which
 316    * implies that short dot-delimited substrings are before long ones,
 317    * e.g.
 318    *
 319    *   a\1a   (a.a)
 320    *   a-\1a  (a-.a)
 321    *   aa\1a  (aa.a)
 322    *
 323    * Numbers are handled by prepending to each number d-1 superdigits
 324    * where d = number of digits in the number and SUPERDIGIT is a
 325    * character with an integer value higher than any digit (for instance
 326    * ':'). This ensures that single-digit numbers are sorted before
 327    * double-digit numbers which in turn are sorted separately from
 328    * triple-digit numbers, etc. To avoid strange side-effects when
 329    * sorting strings that already contain SUPERDIGITs, a '\2'
 330    * is also prepended, like this
 331    *
 332    *   file\21      (file1)
 333    *   file\25      (file5)
 334    *   file\2:10    (file10)
 335    *   file\2:26    (file26)
 336    *   file\2::100  (file100)
 337    *   file:foo     (file:foo)
 338    *
 339    * This has the side-effect of sorting numbers before everything else (except
 340    * dots), but this is probably OK.
 341    *
 342    * Leading digits are ignored when doing the above. To discriminate
 343    * numbers which differ only in the number of leading digits, we append
 344    * the number of leading digits as a byte at the very end of the collation
 345    * key.
 346    *
 347    * To try avoid conflict with any collation key sequence generated by libc we
 348    * start each switch to a special cased part with a sentinel that hopefully
 349    * will sort before anything libc will generate.
 350    */
 351
 352   if (len < 0)
 353     len = strlen (str);
 354
 355   result = g_string_sized_new (len * 2);
 356   append = g_string_sized_new (0);
 357
 358   end = str + len;
 359
 360   /* No need to use utf8 functions, since we're only looking for ascii chars */
 361   for (prev = p = str; p < end; p++)
 362     {
 363       switch (*p)
 364         {
 365         case '.':
 366           if (prev != p)
 367             {
 368               collate_key = g_utf8_collate_key (prev, p - prev);
 369               g_string_append (result, collate_key);
 370               g_free (collate_key);
 371             }
 372
 373           g_string_append (result, COLLATION_SENTINEL "\1");
 374
 375           /* skip the dot */
 376           prev = p + 1;
 377           break;
 378
 379         case '0':
 380         case '1':
 381         case '2':
 382         case '3':
 383         case '4':
 384         case '5':
 385         case '6':
 386         case '7':
 387         case '8':
 388         case '9':
 389           if (prev != p)
 390             {
 391               collate_key = g_utf8_collate_key (prev, p - prev);
 392               g_string_append (result, collate_key);
 393               g_free (collate_key);
 394             }
 395
 396           g_string_append (result, COLLATION_SENTINEL "\2");
 397
 398           prev = p;
 399
 400           /* write d-1 colons */
 401           if (*p == '0')
 402             {
 403               leading_zeros = 1;
 404               digits = 0;
 405             }
 406           else
 407             {
 408               leading_zeros = 0;
 409               digits = 1;
 410             }
 411
 412           while (++p < end)
 413             {
 414               if (*p == '0' && !digits)
 415                 ++leading_zeros;
 416               else if (g_ascii_isdigit(*p))
 417                 ++digits;
 418               else
 419                 {
 420                   /* count an all-zero sequence as
 421                    * one digit plus leading zeros
 422                    */
 423                   if (!digits)
 424                     {
 425                       ++digits;
 426                       --leading_zeros;
 427                     }
 428                   break;
 429                 }
 430             }
 431
 432           while (digits > 1)
 433             {
 434               g_string_append_c (result, ':');
 435               --digits;
 436             }
 437
 438           if (leading_zeros > 0)
 439             {
 440               g_string_append_c (append, (char)leading_zeros);
 441               prev += leading_zeros;
 442             }
 443
 444           /* write the number itself */
 445           g_string_append_len (result, prev, p - prev);
 446
 447           prev = p;
 448           --p;    /* go one step back to avoid disturbing outer loop */
 449           break;
 450
 451         default:
 452           /* other characters just accumulate */
 453           break;
 454         }
 455     }
 456
 457   if (prev != p)
 458     {
 459       collate_key = g_utf8_collate_key (prev, p - prev);
 460       g_string_append (result, collate_key);
 461       g_free (collate_key);
 462     }
 463
 464   g_string_append (result, append->str);
 465   g_string_free (append, TRUE);
 466
 467   return g_string_free (result, FALSE);
 468 }
 469
 470
 471 #define __G_UNICOLLATE_C__
 472 #include "galiasdef.c"