1 /* gunicollate.c - Collation
3 * Copyright 2001,2005 Red Hat, Inc.
5 * The Gnome Library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * The Gnome Library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with the Gnome Library; see the file COPYING.LIB. If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
25 #ifdef __STDC_ISO_10646__
30 #include <CoreServices/CoreServices.h>
35 #include "gunicodeprivate.h"
37 #include "gstrfuncs.h"
38 #include "gtestutils.h"
39 #ifndef __STDC_ISO_10646__
45 /* Workaround for bug in MSVCR80.DLL */
47 msc_strxfrm_wrapper (char *string1,
51 if (!string1 || count <= 0)
55 return strxfrm (&tmp, string2, 1);
57 return strxfrm (string1, string2, count);
59 #define strxfrm msc_strxfrm_wrapper
64 * @str1: a UTF-8 encoded string
65 * @str2: a UTF-8 encoded string
67 * Compares two strings for ordering using the linguistically
68 * correct rules for the <link linkend="setlocale">current locale</link>.
69 * When sorting a large number of strings, it will be significantly
70 * faster to obtain collation keys with g_utf8_collate_key() and
71 * compare the keys with strcmp() when sorting instead of sorting
72 * the original strings.
74 * Return value: < 0 if @str1 compares before @str2,
75 * 0 if they compare equal, > 0 if @str1 compares after @str2.
78 g_utf8_collate (const gchar *str1,
91 g_return_val_if_fail (str1 != NULL, 0);
92 g_return_val_if_fail (str2 != NULL, 0);
94 str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
95 str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
97 UCCompareTextDefault (kUCCollateStandardOptions,
98 str1_utf16, len1, str2_utf16, len2,
105 #elif defined(__STDC_ISO_10646__)
110 g_return_val_if_fail (str1 != NULL, 0);
111 g_return_val_if_fail (str2 != NULL, 0);
113 str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
114 str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
116 result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
121 #else /* !__STDC_ISO_10646__ */
123 const gchar *charset;
127 g_return_val_if_fail (str1 != NULL, 0);
128 g_return_val_if_fail (str2 != NULL, 0);
130 str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
131 str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
133 if (g_get_charset (&charset))
135 result = strcoll (str1_norm, str2_norm);
139 gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
140 gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
142 if (str1_locale && str2_locale)
143 result = strcoll (str1_locale, str2_locale);
144 else if (str1_locale)
146 else if (str2_locale)
149 result = strcmp (str1_norm, str2_norm);
151 g_free (str1_locale);
152 g_free (str2_locale);
158 #endif /* __STDC_ISO_10646__ */
163 #if defined(__STDC_ISO_10646__) || defined(HAVE_CARBON)
164 /* We need UTF-8 encoding of numbers to encode the weights if
165 * we are using wcsxfrm. However, we aren't encoding Unicode
166 * characters, so we can't simply use g_unichar_to_utf8.
168 * The following routine is taken (with modification) from GNU
169 * libc's strxfrm routine:
171 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
172 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
175 utf8_encode (char *buf, wchar_t val)
189 for (step = 2; step < 6; ++step)
190 if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
196 *buf = (unsigned char) (~0xff >> step);
200 buf[step] = 0x80 | (val & 0x3f);
210 #endif /* __STDC_ISO_10646__ || HAVE_CARBON */
215 collate_key_to_string (UCCollationValue *key,
222 /* Pretty smart algorithm here: ignore first eight bytes of the
223 * collation key. It doesn't produce results equivalent to
224 * UCCompareCollationKeys's, but the difference seems to be only
225 * that UCCompareCollationKeys in some cases produces 0 where our
226 * comparison gets -1 or 1. */
228 if (key_len * sizeof (UCCollationValue) <= 8)
229 return g_strdup ("");
232 for (i = 8; i < key_len * sizeof (UCCollationValue); i++)
233 /* there may be nul bytes, encode byteval+1 */
234 result_len += utf8_encode (NULL, *((guchar*)key + i) + 1);
236 result = g_malloc (result_len + 1);
238 for (i = 8; i < key_len * sizeof (UCCollationValue); i++)
239 result_len += utf8_encode (result + result_len, *((guchar*)key + i) + 1);
241 result[result_len] = 0;
246 carbon_collate_key_with_collator (const gchar *str,
248 CollatorRef collator)
250 UniChar *str_utf16 = NULL;
253 UCCollationValue staticbuf[512];
254 UCCollationValue *freeme = NULL;
255 UCCollationValue *buf;
259 gchar *result = NULL;
261 str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
262 try_len = len_utf16 * 5 + 2;
264 if (try_len <= sizeof staticbuf)
267 buf_len = sizeof staticbuf;
271 freeme = g_new (UCCollationValue, try_len);
276 ret = UCGetCollationKey (collator, str_utf16, len_utf16,
277 buf_len, &key_len, buf);
279 if (ret == kCollateBufferTooSmall)
281 freeme = g_renew (UCCollationValue, freeme, try_len * 2);
283 buf_len = try_len * 2;
284 ret = UCGetCollationKey (collator, str_utf16, len_utf16,
285 buf_len, &key_len, buf);
289 result = collate_key_to_string (buf, key_len);
291 result = g_strdup ("");
299 carbon_collate_key (const gchar *str,
302 static CollatorRef collator;
304 if (G_UNLIKELY (!collator))
306 UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
310 static gboolean been_here;
312 g_warning ("%s: UCCreateCollator failed", G_STRLOC);
314 return g_strdup ("");
318 return carbon_collate_key_with_collator (str, len, collator);
322 carbon_collate_key_for_filename (const gchar *str,
325 static CollatorRef collator;
327 if (G_UNLIKELY (!collator))
329 /* http://developer.apple.com/qa/qa2004/qa1159.html */
330 UCCreateCollator (NULL, 0,
331 kUCCollateComposeInsensitiveMask
332 | kUCCollateWidthInsensitiveMask
333 | kUCCollateCaseInsensitiveMask
334 | kUCCollateDigitsOverrideMask
335 | kUCCollateDigitsAsNumberMask
336 | kUCCollatePunctuationSignificantMask,
341 static gboolean been_here;
343 g_warning ("%s: UCCreateCollator failed", G_STRLOC);
345 return g_strdup ("");
349 return carbon_collate_key_with_collator (str, len, collator);
352 #endif /* HAVE_CARBON */
355 * g_utf8_collate_key:
356 * @str: a UTF-8 encoded string.
357 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
359 * Converts a string into a collation key that can be compared
360 * with other collation keys produced by the same function using
363 * The results of comparing the collation keys of two strings
364 * with strcmp() will always be the same as comparing the two
365 * original strings with g_utf8_collate().
367 * Note that this function depends on the
368 * <link linkend="setlocale">current locale</link>.
370 * Return value: a newly allocated string. This string should
371 * be freed with g_free() when you are done with it.
374 g_utf8_collate_key (const gchar *str,
381 g_return_val_if_fail (str != NULL, NULL);
382 result = carbon_collate_key (str, len);
384 #elif defined(__STDC_ISO_10646__)
390 gsize result_len = 0;
392 g_return_val_if_fail (str != NULL, NULL);
394 str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
396 xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
397 result_wc = g_new (wchar_t, xfrm_len + 1);
398 wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
400 for (i=0; i < xfrm_len; i++)
401 result_len += utf8_encode (NULL, result_wc[i]);
403 result = g_malloc (result_len + 1);
405 for (i=0; i < xfrm_len; i++)
406 result_len += utf8_encode (result + result_len, result_wc[i]);
408 result[result_len] = '\0';
414 #else /* !__STDC_ISO_10646__ */
417 const gchar *charset;
420 g_return_val_if_fail (str != NULL, NULL);
422 str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
426 if (g_get_charset (&charset))
428 xfrm_len = strxfrm (NULL, str_norm, 0);
429 if (xfrm_len >= 0 && xfrm_len < G_MAXINT - 2)
431 result = g_malloc (xfrm_len + 1);
432 strxfrm (result, str_norm, xfrm_len + 1);
437 gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
441 xfrm_len = strxfrm (NULL, str_locale, 0);
442 if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
450 result = g_malloc (xfrm_len + 2);
452 strxfrm (result + 1, str_locale, xfrm_len + 1);
460 xfrm_len = strlen (str_norm);
461 result = g_malloc (xfrm_len + 2);
463 memcpy (result + 1, str_norm, xfrm_len);
464 result[xfrm_len+1] = '\0';
468 #endif /* __STDC_ISO_10646__ */
473 /* This is a collation key that is very very likely to sort before any
474 collation key that libc strxfrm generates. We use this before any
475 special case (dot or number) to make sure that it's sorted before
478 #define COLLATION_SENTINEL "\1\1\1"
481 * g_utf8_collate_key_for_filename:
482 * @str: a UTF-8 encoded string.
483 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
485 * Converts a string into a collation key that can be compared
486 * with other collation keys produced by the same function using strcmp().
488 * In order to sort filenames correctly, this function treats the dot '.'
489 * as a special case. Most dictionary orderings seem to consider it
490 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
491 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
492 * would like to treat numbers intelligently so that "file1" "file10" "file5"
493 * is sorted as "file1" "file5" "file10".
495 * Note that this function depends on the
496 * <link linkend="setlocale">current locale</link>.
498 * Return value: a newly allocated string. This string should
499 * be freed with g_free() when you are done with it.
504 g_utf8_collate_key_for_filename (const gchar *str,
520 * Split the filename into collatable substrings which do
521 * not contain [.0-9] and special-cased substrings. The collatable
522 * substrings are run through the normal g_utf8_collate_key() and the
523 * resulting keys are concatenated with keys generated from the
524 * special-cased substrings.
526 * Special cases: Dots are handled by replacing them with '\1' which
527 * implies that short dot-delimited substrings are before long ones,
534 * Numbers are handled by prepending to each number d-1 superdigits
535 * where d = number of digits in the number and SUPERDIGIT is a
536 * character with an integer value higher than any digit (for instance
537 * ':'). This ensures that single-digit numbers are sorted before
538 * double-digit numbers which in turn are sorted separately from
539 * triple-digit numbers, etc. To avoid strange side-effects when
540 * sorting strings that already contain SUPERDIGITs, a '\2'
541 * is also prepended, like this
547 * file\2::100 (file100)
548 * file:foo (file:foo)
550 * This has the side-effect of sorting numbers before everything else (except
551 * dots), but this is probably OK.
553 * Leading zeros are ignored when doing the above. To discriminate
554 * numbers which differ only in the number of leading zeros, we append
555 * the number of leading zeros as a byte at the very end of the collation
558 * To try to avoid conflict with any collation key sequence generated by libc we
559 * start each switch to a special cased part with a sentinel that hopefully
560 * will sort before anything libc will generate.
566 result = g_string_sized_new (len * 2);
567 append = g_string_sized_new (0);
571 /* No need to use utf8 functions, since we're only looking for ascii chars */
572 for (prev = p = str; p < end; p++)
579 collate_key = g_utf8_collate_key (prev, p - prev);
580 g_string_append (result, collate_key);
581 g_free (collate_key);
584 g_string_append (result, COLLATION_SENTINEL "\1");
602 collate_key = g_utf8_collate_key (prev, p - prev);
603 g_string_append (result, collate_key);
604 g_free (collate_key);
607 g_string_append (result, COLLATION_SENTINEL "\2");
611 /* write d-1 colons */
625 if (*p == '0' && !digits)
627 else if (g_ascii_isdigit(*p))
631 /* count an all-zero sequence as
632 * one digit plus leading zeros
645 g_string_append_c (result, ':');
649 if (leading_zeros > 0)
651 g_string_append_c (append, (char)leading_zeros);
652 prev += leading_zeros;
655 /* write the number itself */
656 g_string_append_len (result, prev, p - prev);
659 --p; /* go one step back to avoid disturbing outer loop */
663 /* other characters just accumulate */
670 collate_key = g_utf8_collate_key (prev, p - prev);
671 g_string_append (result, collate_key);
672 g_free (collate_key);
675 g_string_append (result, append->str);
676 g_string_free (append, TRUE);
678 return g_string_free (result, FALSE);
679 #else /* HAVE_CARBON */
680 return carbon_collate_key_for_filename (str, len);