1 /* gunicollate.c - Collation
3 * Copyright 2001,2005 Red Hat, Inc.
5 * The Gnome Library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * The Gnome Library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with the Gnome Library; see the file COPYING.LIB. If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
25 #ifdef __STDC_ISO_10646__
30 #include <CoreServices/CoreServices.h>
34 #include "gunicodeprivate.h"
37 /* Workaround for bug in MSVCR80.DLL */
/* Wrapper around strxfrm().  When string1 is NULL or count is not
 * positive, the caller arguments are not forwarded: strxfrm() is called
 * with a one-byte scratch destination instead.  Otherwise the call is
 * forwarded unchanged.  In both cases the strxfrm() result is returned.
 * NOTE(review): the remaining parameters and the declaration of tmp are
 * on lines not shown here -- confirm their types. */
39 msc_strxfrm_wrapper (char *string1,
43 if (!string1 || count <= 0)
/* Length-query path: transform into the scratch byte, size 1. */
47 return strxfrm (&tmp, string2, 1);
/* Normal path: pass the caller arguments straight through. */
49 return strxfrm (string1, string2, count);
/* While this macro is in effect, later uses of the name strxfrm expand
 * to the wrapper above. */
51 #define strxfrm msc_strxfrm_wrapper
56 * @str1: a UTF-8 encoded string
57 * @str2: a UTF-8 encoded string
59 * Compares two strings for ordering using the linguistically
60 * correct rules for the <link linkend="setlocale">current locale</link>.
61 * When sorting a large number of strings, it will be significantly
62 * faster to obtain collation keys with g_utf8_collate_key() and
63 * compare the keys with strcmp() when sorting instead of sorting
64 * the original strings.
66 * Return value: < 0 if @str1 compares before @str2,
67 * 0 if they compare equal, > 0 if @str1 compares after @str2.
/* Compares str1 and str2 for ordering.  Three implementations are
 * selected by the preprocessor: a Carbon one (UCCompareTextDefault on
 * UTF-16), a wide-character one (wcscoll), and a multibyte fallback
 * (strcoll / strcmp).  Each variant returns 0 through
 * g_return_val_if_fail() when either argument is NULL.
 * NOTE(review): declarations, cleanup and return statements are on
 * lines not shown here. */
70 g_utf8_collate (const gchar *str1,
83 g_return_val_if_fail (str1 != NULL, 0);
84 g_return_val_if_fail (str2 != NULL, 0);
/* Carbon variant: convert both inputs to UTF-16, keeping the converted
 * lengths in len1 / len2 for the comparison call. */
86 str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
87 str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
89 UCCompareTextDefault (kUCCollateStandardOptions,
90 str1_utf16, len1, str2_utf16, len2,
97 #elif defined(__STDC_ISO_10646__)
102 g_return_val_if_fail (str1 != NULL, 0);
103 g_return_val_if_fail (str2 != NULL, 0);
/* Wide-character variant: normalize both strings with
 * G_NORMALIZE_ALL_COMPOSE, then let wcscoll() order them.  The
 * normalized buffers are cast to wchar_t *. */
105 str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
106 str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
108 result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
113 #else /* !__STDC_ISO_10646__ */
115 const gchar *charset;
119 g_return_val_if_fail (str1 != NULL, 0);
120 g_return_val_if_fail (str2 != NULL, 0);
/* Fallback variant: normalize to composed UTF-8 first. */
122 str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
123 str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
/* When g_get_charset() returns true the normalized strings go to
 * strcoll() directly (per the GLib documentation a true return means
 * the locale charset is UTF-8). */
125 if (g_get_charset (&charset))
127 result = strcoll (str1_norm, str2_norm);
/* Otherwise convert both strings from UTF-8 to the locale charset. */
131 gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
132 gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
/* Both converted: compare in the locale charset.  If only one
 * converted, the result is set on lines not shown here.  If neither
 * converted, fall back to a bytewise strcmp() of the normalized UTF-8. */
134 if (str1_locale && str2_locale)
135 result = strcoll (str1_locale, str2_locale);
136 else if (str1_locale)
138 else if (str2_locale)
141 result = strcmp (str1_norm, str2_norm);
143 g_free (str1_locale);
144 g_free (str2_locale);
150 #endif /* __STDC_ISO_10646__ */
155 #if defined(__STDC_ISO_10646__) || defined(HAVE_CARBON)
156 /* We need UTF-8 encoding of numbers to encode the weights if
157 * we are using wcsxfrm. However, we aren't encoding Unicode
158 * characters, so we can't simply use g_unichar_to_utf8.
160 * The following routine is taken (with modification) from GNU
161 * libc's strxfrm routine:
163 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
164 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
/* Encodes the integer val into buf as a UTF-8 style multi-byte
 * sequence.  Callers in this file add the return value to a running
 * length and also call it with a NULL buf, so it presumably returns the
 * encoded byte count and tolerates NULL -- the lines that establish
 * this are not shown here; confirm. */
167 utf8_encode (char *buf, wchar_t val)
/* Pick the sequence length: try 2..5 bytes and test whether val has no
 * bits set at or above bit 5 * step + 1. */
181 for (step = 2; step < 6; ++step)
182 if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
/* Lead byte: shifting ~0xff right by step leaves the top step bits of
 * the low byte set. */
188 *buf = (unsigned char) (~0xff >> step);
/* Continuation byte: 10xxxxxx carrying the low six bits of val. */
192 buf[step] = 0x80 | (val & 0x3f);
202 #endif /* __STDC_ISO_10646__ || HAVE_CARBON */
/* Turns a Carbon collation key (key_len elements of UCCollationValue)
 * into a NUL-terminated string.  The key is treated as raw bytes; the
 * first eight bytes are skipped and every remaining byte b is encoded
 * as utf8_encode (b + 1).  A key of eight bytes or fewer yields a newly
 * allocated empty string. */
207 collate_key_to_string (UCCollationValue *key,
214 /* Pretty smart algorithm here: ignore first eight bytes of the
215 * collation key. It doesn't produce results equivalent to
216 * UCCompareCollationKeys's, but the difference seems to be only
217 * that UCCompareCollationKeys in some cases produces 0 where our
218 * comparison gets -1 or 1. */
220 if (key_len * sizeof (UCCollationValue) <= 8)
221 return g_strdup ("");
/* Pass 1: measure the encoded size by calling utf8_encode with a NULL
 * buffer. */
224 for (i = 8; i < key_len * sizeof (UCCollationValue); i++)
225 /* there may be nul bytes, encode byteval+1 */
226 result_len += utf8_encode (NULL, *((guchar*)key + i) + 1);
/* One extra byte for the terminating NUL. */
228 result = g_malloc (result_len + 1);
/* Pass 2: write the encoding, using result_len as the write offset.
 * NOTE(review): result_len is presumably reset to 0 on a line not shown
 * here -- confirm. */
230 for (i = 8; i < key_len * sizeof (UCCollationValue); i++)
231 result_len += utf8_encode (result + result_len, *((guchar*)key + i) + 1);
233 result[result_len] = 0;
/* Builds a collation key string for str using the given Carbon
 * collator: converts str to UTF-16, asks UCGetCollationKey for the key
 * (retrying once with a larger heap buffer when the first buffer is too
 * small), then converts the key with collate_key_to_string().  An empty
 * string is produced on another path whose condition is not shown here. */
238 carbon_collate_key_with_collator (const gchar *str,
240 CollatorRef collator)
242 UniChar *str_utf16 = NULL;
/* Stack buffer tried first; freeme holds the heap buffer, if any. */
245 UCCollationValue staticbuf[512];
246 UCCollationValue *freeme = NULL;
247 UCCollationValue *buf;
251 gchar *result = NULL;
253 str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
/* Initial size guess: five key elements per UTF-16 unit, plus two. */
254 try_len = len_utf16 * 5 + 2;
/* NOTE(review): try_len is used as an element count in g_new below,
 * but sizeof staticbuf is a byte count, and that byte count is also
 * stored in buf_len.  Looks like a unit mismatch -- confirm against the
 * UCGetCollationKey documentation whether the size argument counts
 * elements. */
256 if (try_len <= sizeof staticbuf)
259 buf_len = sizeof staticbuf;
263 freeme = g_new (UCCollationValue, try_len);
268 ret = UCGetCollationKey (collator, str_utf16, len_utf16,
269 buf_len, &key_len, buf);
/* Buffer too small: grow to twice the guess and retry once.  freeme is
 * still NULL here when the stack buffer was used first. */
271 if (ret == kCollateBufferTooSmall)
273 freeme = g_renew (UCCollationValue, freeme, try_len * 2);
275 buf_len = try_len * 2;
276 ret = UCGetCollationKey (collator, str_utf16, len_utf16,
277 buf_len, &key_len, buf);
/* The condition choosing between these two assignments is not shown
 * here; presumably success versus failure of the call above. */
281 result = collate_key_to_string (buf, key_len);
283 result = g_strdup ("");
/* Returns the collation key for str using a collator created with
 * kUCCollateStandardOptions.  The collator lives in a function-local
 * static and is created on the first call that finds it unset. */
291 carbon_collate_key (const gchar *str,
294 static CollatorRef collator;
296 if (G_UNLIKELY (!collator))
298 UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
/* Failure path: warn and return a newly allocated empty string.
 * been_here presumably limits the warning to one emission; the lines
 * that test and set it are not shown here. */
302 static gboolean been_here;
304 g_warning ("%s: UCCreateCollator failed", G_STRLOC);
306 return g_strdup ("");
310 return carbon_collate_key_with_collator (str, len, collator);
/* Filename variant of carbon_collate_key(): same lazy creation of a
 * function-local static collator, but built with the option mask
 * listed below instead of kUCCollateStandardOptions. */
314 carbon_collate_key_for_filename (const gchar *str,
317 static CollatorRef collator;
319 if (G_UNLIKELY (!collator))
321 /* http://developer.apple.com/qa/qa2004/qa1159.html */
322 UCCreateCollator (NULL, 0,
323 kUCCollateComposeInsensitiveMask
324 | kUCCollateWidthInsensitiveMask
325 | kUCCollateCaseInsensitiveMask
326 | kUCCollateDigitsOverrideMask
327 | kUCCollateDigitsAsNumberMask
328 | kUCCollatePunctuationSignificantMask,
/* Failure path: warn and return a newly allocated empty string.
 * been_here presumably limits the warning to one emission; the lines
 * that test and set it are not shown here. */
333 static gboolean been_here;
335 g_warning ("%s: UCCreateCollator failed", G_STRLOC);
337 return g_strdup ("");
341 return carbon_collate_key_with_collator (str, len, collator);
344 #endif /* HAVE_CARBON */
347 * g_utf8_collate_key:
348 * @str: a UTF-8 encoded string.
349 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
351 * Converts a string into a collation key that can be compared
352 * with other collation keys produced by the same function using
355 * The results of comparing the collation keys of two strings
356 * with strcmp() will always be the same as comparing the two
357 * original strings with g_utf8_collate().
359 * Note that this function depends on the
360 * <link linkend="setlocale">current locale</link>.
362 * Return value: a newly allocated string. This string should
363 * be freed with g_free() when you are done with it.
/* Produces a collation key for str.  Three implementations are selected
 * by the preprocessor: Carbon (delegates to carbon_collate_key), wide
 * character (wcsxfrm output re-encoded byte-wise with utf8_encode), and
 * a multibyte fallback (strxfrm).  Each variant returns NULL through
 * g_return_val_if_fail() when str is NULL.
 * NOTE(review): declarations, cleanup and return statements are on
 * lines not shown here. */
366 g_utf8_collate_key (const gchar *str,
373 g_return_val_if_fail (str != NULL, NULL);
374 result = carbon_collate_key (str, len);
376 #elif defined(__STDC_ISO_10646__)
382 gsize result_len = 0;
384 g_return_val_if_fail (str != NULL, NULL);
386 str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
/* Query the transformed length first, then transform into a buffer with
 * room for the terminator.
 * NOTE(review): no check of the wcsxfrm return value is visible on
 * these lines -- confirm how an error return is handled. */
388 xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
389 result_wc = g_new (wchar_t, xfrm_len + 1);
390 wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
/* Pass 1: measure the encoded size of the wide-character key. */
392 for (i=0; i < xfrm_len; i++)
393 result_len += utf8_encode (NULL, result_wc[i]);
395 result = g_malloc (result_len + 1);
/* Pass 2: write the encoding, using result_len as the write offset.
 * NOTE(review): result_len is presumably reset to 0 on a line not shown
 * here -- confirm. */
397 for (i=0; i < xfrm_len; i++)
398 result_len += utf8_encode (result + result_len, result_wc[i]);
400 result[result_len] = '\0';
406 #else /* !__STDC_ISO_10646__ */
409 const gchar *charset;
412 g_return_val_if_fail (str != NULL, NULL);
414 str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
/* g_get_charset() true: transform the normalized UTF-8 directly, but
 * only when the reported length passes the range check below. */
418 if (g_get_charset (&charset))
420 xfrm_len = strxfrm (NULL, str_norm, 0);
421 if (xfrm_len >= 0 && xfrm_len < G_MAXINT - 2)
423 result = g_malloc (xfrm_len + 1);
424 strxfrm (result, str_norm, xfrm_len + 1);
/* Otherwise convert to the locale charset before transforming. */
429 gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
433 xfrm_len = strxfrm (NULL, str_locale, 0);
434 if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
/* The key is written starting at result + 1, leaving byte 0 for a value
 * assigned on a line not shown here (presumably a marker). */
442 result = g_malloc (xfrm_len + 2);
444 strxfrm (result + 1, str_locale, xfrm_len + 1);
/* Last resort: copy the normalized string itself after one leading
 * byte and terminate it; byte 0 is again assigned on a line not shown. */
452 xfrm_len = strlen (str_norm);
453 result = g_malloc (xfrm_len + 2);
455 memcpy (result + 1, str_norm, xfrm_len);
456 result[xfrm_len+1] = '\0';
460 #endif /* __STDC_ISO_10646__ */
465 /* This is a collation key that is very very likely to sort before any
466 collation key that libc strxfrm generates. We use this before any
467 special case (dot or number) to make sure that it is sorted before
470 #define COLLATION_SENTINEL "\1\1\1"
473 * g_utf8_collate_key_for_filename:
474 * @str: a UTF-8 encoded string.
475 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
477 * Converts a string into a collation key that can be compared
478 * with other collation keys produced by the same function using strcmp().
480 * In order to sort filenames correctly, this function treats the dot '.'
481 * as a special case. Most dictionary orderings seem to consider it
482 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
483 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
484 * would like to treat numbers intelligently so that "file1" "file10" "file5"
485 * is sorted as "file1" "file5" "file10".
487 * Note that this function depends on the
488 * <link linkend="setlocale">current locale</link>.
490 * Return value: a newly allocated string. This string should
491 * be freed with g_free() when you are done with it.
/* Builds a filename collation key.  In the non-Carbon build the input
 * is scanned byte by byte from str up to end.  Runs of ordinary text
 * are passed through g_utf8_collate_key() and appended to result, and
 * the special cases append COLLATION_SENTINEL followed by "\1" or "\2".
 * Digit runs additionally append ':' characters and the digits
 * themselves.  Leading-zero counts are collected in a second GString
 * (append) that is concatenated onto result at the very end.  In the
 * Carbon build the work is delegated to
 * carbon_collate_key_for_filename().
 * NOTE(review): declarations, the branch conditions of the scan loop
 * and several statements are on lines not shown here. */
496 g_utf8_collate_key_for_filename (const gchar *str,
512 * Split the filename into collatable substrings which do
513 * not contain [.0-9] and special-cased substrings. The collatable
514 * substrings are run through the normal g_utf8_collate_key() and the
515 * resulting keys are concatenated with keys generated from the
516 * special-cased substrings.
518 * Special cases: Dots are handled by replacing them with '\1' which
519 * implies that short dot-delimited substrings are before long ones,
526 * Numbers are handled by prepending to each number d-1 superdigits
527 * where d = number of digits in the number and SUPERDIGIT is a
528 * character with an integer value higher than any digit (for instance
529 * ':'). This ensures that single-digit numbers are sorted before
530 * double-digit numbers which in turn are sorted separately from
531 * triple-digit numbers, etc. To avoid strange side-effects when
532 * sorting strings that already contain SUPERDIGITs, a '\2'
533 * is also prepended, like this
539 * file\2::100 (file100)
540 * file:foo (file:foo)
542 * This has the side-effect of sorting numbers before everything else (except
543 * dots), but this is probably OK.
545 * Leading digits are ignored when doing the above. To discriminate
546 * numbers which differ only in the number of leading digits, we append
547 * the number of leading digits as a byte at the very end of the collation
550 * To try avoid conflict with any collation key sequence generated by libc we
551 * start each switch to a special cased part with a sentinel that hopefully
552 * will sort before anything libc will generate.
558 result = g_string_sized_new (len * 2);
559 append = g_string_sized_new (0);
563 /* No need to use utf8 functions, since we're only looking for ascii chars */
564 for (prev = p = str; p < end; p++)
/* Flush the pending text segment [prev, p) as an ordinary collation
 * key before emitting the first special-case marker. */
571 collate_key = g_utf8_collate_key (prev, p - prev);
572 g_string_append (result, collate_key);
573 g_free (collate_key);
576 g_string_append (result, COLLATION_SENTINEL "\1");
/* Same flush, this time before the second special-case marker. */
594 collate_key = g_utf8_collate_key (prev, p - prev);
595 g_string_append (result, collate_key);
596 g_free (collate_key);
599 g_string_append (result, COLLATION_SENTINEL "\2");
603 /* write d-1 colons */
/* A '0' seen before any counted digit is handled separately from other
 * digits; the statements of both branches are not shown here. */
617 if (*p == '0' && !digits)
619 else if (g_ascii_isdigit(*p))
623 /* count an all-zero sequence as
624 * one digit plus leading zeros
637 g_string_append_c (result, ':');
641 if (leading_zeros > 0)
643 g_string_append_c (append, (char)leading_zeros);
644 prev += leading_zeros;
647 /* write the number itself */
/* NOTE(review): the (char) leading_zeros byte stored in append a few
 * lines up is later copied with g_string_append (result, append->str),
 * a NUL-terminated string call, so a count whose low byte is 0 would
 * cut the suffix short -- confirm whether that matters. */
648 g_string_append_len (result, prev, p - prev);
651 --p; /* go one step back to avoid disturbing outer loop */
655 /* other characters just accumulate */
/* Flush whatever text segment remains after the scan. */
662 collate_key = g_utf8_collate_key (prev, p - prev);
663 g_string_append (result, collate_key);
664 g_free (collate_key);
/* Attach the collected leading-zero bytes, release the helper string
 * and hand the character data of result to the caller. */
667 g_string_append (result, append->str);
668 g_string_free (append, TRUE);
670 return g_string_free (result, FALSE);
671 #else /* HAVE_CARBON */
672 return carbon_collate_key_for_filename (str, len);