1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Copyright (C) 2013 Intel Corporation
 * This library is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU Lesser General Public License as published by
7 * the Free Software Foundation.
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
11 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * You should have received a copy of the GNU Lesser General Public License
15 * along with this library; if not, see <http://www.gnu.org/licenses/>.
17 * Author: Tristan Van Berkom <tristanvb@openismus.com>
22 * @include: libedataserver/libedataserver.h
23 * @short_description: Collation services for locale sensitive sorting
25 * The #ECollator is a wrapper object around ICU collation services and
26 * provides features to sort words in locale specific ways. The collator
27 * also provides some API for determining features of the active alphabet
28 * in the user's locale, and which words should be sorted under which
29 * letter in the user's alphabet.
40 #include <unicode/uclean.h>
41 #include <unicode/ucol.h>
42 #include <unicode/ustring.h>
44 #include "e-collator.h"
45 #include "e-alphabet-index-private.h"
46 #include "e-transliterator-private.h"
48 #define CONVERT_BUFFER_LEN 512
49 #define COLLATION_KEY_BUFFER_LEN 1024
50 #define LOCALE_BUFFER_LEN 256
52 #define ENABLE_DEBUGGING 0
54 G_DEFINE_QUARK (e-collator-error-quark, e_collator_error)
56 G_DEFINE_BOXED_TYPE (ECollator,
64 volatile gint ref_count;
66 EAlphabetIndex *alpha_index;
73 ECxxTransliterator *transliterator;
76 /*****************************************************
77 * ICU Helper Functions *
78 *****************************************************/
#if ENABLE_DEBUGGING
/*
 * Debugging aid: prints every locale ICU reports as available to stderr,
 * along with its display name and the keywords it supports.
 *
 * Each ICU call gets a fresh UErrorCode: ICU functions return immediately
 * when handed a status that already holds a failure, so sharing one status
 * across the loop would silence every locale after the first error.
 */
static void
print_available_locales (void)
{
	UErrorCode init_status = U_ZERO_ERROR;
	gint n_locales, i;

	u_init (&init_status);

	g_printerr ("List of available locales (default locale is: %s)\n", uloc_getDefault ());

	n_locales = uloc_countAvailable ();
	for (i = 0; i < n_locales; i++) {
		const gchar *locale_id = uloc_getAvailable (i);
		UErrorCode name_status = U_ZERO_ERROR;
		UErrorCode open_status = U_ZERO_ERROR;
		UEnumeration *keywords;
		const gchar *keyword;

		/* Zero filled and used with one element held back, so that both
		 * buffers stay terminated even when a name is truncated or the
		 * lookup fails outright.
		 */
		UChar display_name[100] = { 0 };
		gchar printable[100 * 4] = { 0 };

		uloc_getDisplayName (
			locale_id, NULL,
			display_name, G_N_ELEMENTS (display_name) - 1,
			&name_status);

		/* Convert to a printable string */
		u_austrncpy (printable, display_name, sizeof (printable) - 1);

		/* Print the available locale */
		g_printerr ("\t%s - %s", locale_id, printable);

		keywords = uloc_openKeywords (locale_id, &open_status);
		if (keywords) {
			UErrorCode kstatus = U_ZERO_ERROR;

			g_printerr ("[");

			while ((keyword = uenum_next (keywords, NULL, &kstatus)) != NULL)
				g_printerr (" %s ", keyword);

			g_printerr ("]");

			uenum_close (keywords);
		}

		g_printerr ("\n");
	}
}
#endif
123 canonicalize_locale (const gchar *posix_locale,
124 gchar **language_code,
125 gchar **country_code,
128 UErrorCode status = U_ZERO_ERROR;
129 gchar locale_buffer[LOCALE_BUFFER_LEN];
130 gchar language_buffer[8];
131 gchar country_buffer[8];
135 const gchar *collation_type = NULL;
137 len = uloc_canonicalize (posix_locale, locale_buffer, LOCALE_BUFFER_LEN, &status);
139 if (U_FAILURE (status)) {
141 error, E_COLLATOR_ERROR,
142 E_COLLATOR_ERROR_INVALID_LOCALE,
143 "Failed to interpret locale '%s' (%s)",
145 u_errorName (status));
149 if (len > LOCALE_BUFFER_LEN) {
150 icu_locale = g_malloc (len);
152 uloc_canonicalize (posix_locale, icu_locale, len, &status);
154 icu_locale = g_strndup (locale_buffer, len);
157 status = U_ZERO_ERROR;
158 len = uloc_getLanguage (icu_locale, language_buffer, 8, &status);
159 if (U_FAILURE (status)) {
161 error, E_COLLATOR_ERROR,
162 E_COLLATOR_ERROR_INVALID_LOCALE,
163 "Failed to interpret language for locale '%s': %s",
165 u_errorName (status));
170 status = U_ZERO_ERROR;
171 len = uloc_getCountry (icu_locale, country_buffer, 8, &status);
172 if (U_FAILURE (status)) {
174 error, E_COLLATOR_ERROR,
175 E_COLLATOR_ERROR_INVALID_LOCALE,
176 "Failed to interpret country for locale '%s': %s",
178 u_errorName (status));
183 /* Add 'phonebook' tailoring to certain locales */
185 (strcmp (language_buffer, "de") == 0 ||
186 strcmp (language_buffer, "fi") == 0)) {
188 collation_type = "phonebook";
191 if (collation_type != NULL)
192 final_locale = g_strconcat (icu_locale, "@collation=", collation_type, NULL);
194 final_locale = icu_locale;
201 *language_code = g_strdup (language_buffer);
204 *country_code = g_strdup (country_buffer);
209 /* All purpose character encoding function, encodes text
210 * to a UChar from UTF-8 and first ensures that the string
214 convert_to_ustring (const gchar *string,
221 UErrorCode status = U_ZERO_ERROR;
222 const gchar *source_utf8;
223 gchar *alloc_utf8 = NULL;
224 gint converted_len = 0;
225 UChar *converted_buffer;
227 /* First make sure we're dealing with utf8 */
228 if (g_utf8_validate (string, -1, NULL))
229 source_utf8 = string;
231 alloc_utf8 = e_util_utf8_make_valid (string);
232 source_utf8 = alloc_utf8;
235 /* First pass, try converting to UChar in the given buffer */
236 converted_buffer = u_strFromUTF8Lenient (
244 /* Set the result length right away... */
245 *result_len = converted_len;
247 if (U_FAILURE (status)) {
248 converted_buffer = NULL;
252 /* Second pass, allocate a buffer big enough and then convert */
253 if (converted_len > buffer_len) {
254 *free_me = g_new (UChar, converted_len);
256 converted_buffer = u_strFromUTF8Lenient (
264 if (U_FAILURE (status)) {
267 converted_buffer = NULL;
275 if (U_FAILURE (status))
277 error, E_COLLATOR_ERROR,
278 E_COLLATOR_ERROR_CONVERSION,
279 "Error occured while converting character encoding (%s)",
280 u_errorName (status));
282 return converted_buffer;
285 /*****************************************************
287 *****************************************************/
291 * @locale: The locale under which to sort
292 * @error: (allow-none): A location to store a #GError from the #E_COLLATOR_ERROR domain
294 * Creates a new #ECollator for the given @locale,
295 * the returned collator should be freed with e_collator_unref().
297 * Returns: (transfer full): A newly created #ECollator.
302 e_collator_new (const gchar *locale,
305 return e_collator_new_interpret_country (locale, NULL, error);
309 * e_collator_new_interpret_country:
310 * @locale: The locale under which to sort
311 * @country_code: (allow-none) (out) (transfer full): A location to store the interpreted country code from @locale
312 * @error: (allow-none): A location to store a #GError from the #E_COLLATOR_ERROR domain
314 * Creates a new #ECollator for the given @locale,
315 * the returned collator should be freed with e_collator_unref().
317 * In addition, this also reliably interprets the country
318 * code from the @locale string and stores it to @country_code.
320 * Returns: (transfer full): A newly created #ECollator.
325 e_collator_new_interpret_country (const gchar *locale,
326 gchar **country_code,
331 UErrorCode status = U_ZERO_ERROR;
333 gchar *language_code = NULL;
334 gchar *local_country_code = NULL;
336 g_return_val_if_fail (locale && locale[0], NULL);
339 print_available_locales ();
342 icu_locale = canonicalize_locale (
350 coll = ucol_open (icu_locale, &status);
352 if (U_FAILURE (status)) {
354 error, E_COLLATOR_ERROR,
355 E_COLLATOR_ERROR_OPEN,
356 "Unable to open collator for locale '%s' (%s)",
358 u_errorName (status));
360 g_free (language_code);
361 g_free (local_country_code);
369 ucol_setStrength (coll, UCOL_DEFAULT_STRENGTH);
371 collator = g_slice_new0 (ECollator);
372 collator->coll = coll;
373 collator->ref_count = 1;
375 /* In Chinese we use transliteration services to sort latin
376 * names interleaved with Chinese names in a latin AlphabeticIndex
378 if (g_strcmp0 (language_code, "zh") == 0)
379 collator->transliterator = _e_transliterator_cxx_new ("Han-Latin");
381 collator->alpha_index = _e_alphabet_index_cxx_new_for_language (language_code);
382 collator->labels = _e_alphabet_index_cxx_get_labels (
383 collator->alpha_index,
385 &collator->underflow,
387 &collator->overflow);
389 g_free (language_code);
392 *country_code = local_country_code;
394 g_free (local_country_code);
401 * @collator: An #ECollator
403 * Increases the reference count of @collator.
405 * Returns: (transfer full): @collator
410 e_collator_ref (ECollator *collator)
412 g_return_val_if_fail (collator != NULL, NULL);
414 g_atomic_int_inc (&collator->ref_count);
421 * @collator: An #ECollator
423 * Decreases the reference count of @collator.
424 * If the reference count reaches 0 then the collator is freed
429 e_collator_unref (ECollator *collator)
431 g_return_if_fail (collator != NULL);
433 if (g_atomic_int_dec_and_test (&collator->ref_count)) {
436 ucol_close (collator->coll);
438 _e_alphabet_index_cxx_free (collator->alpha_index);
439 g_strfreev (collator->labels);
441 /* The transliterator is only used for specialized sorting in some locales,
442 * notably Chinese locales
444 if (collator->transliterator)
445 _e_transliterator_cxx_free (collator->transliterator);
447 g_slice_free (ECollator, collator);
452 * e_collator_generate_key:
453 * @collator: An #ECollator
454 * @str: The string to generate a collation key for
455 * @error: (allow-none): A location to store a #GError from the #E_COLLATOR_ERROR domain
457 * Generates a collation key for @str, the result of comparing
458 * two collation keys with strcmp() will be the same result
459 * of calling e_collator_collate() on the same original strings.
461 * This function will first ensure that @str is valid UTF-8 encoded.
463 * Returns: (transfer full): A collation key for @str, or %NULL on failure with @error set.
468 e_collator_generate_key (ECollator *collator,
472 UChar source_buffer[CONVERT_BUFFER_LEN];
473 UChar *free_me = NULL;
475 gchar stack_buffer[COLLATION_KEY_BUFFER_LEN];
476 gchar *collation_key;
477 gint key_len, source_len = 0;
479 gchar *translit_str = NULL;
480 const gchar *input_str;
482 g_return_val_if_fail (collator != NULL, NULL);
483 g_return_val_if_fail (str != NULL, NULL);
485 /* We may need to perform a conversion before generating the sort key */
486 if (collator->transliterator) {
487 translit_str = _e_transliterator_cxx_transliterate (collator->transliterator, str);
488 input_str = translit_str;
493 source = convert_to_ustring (
502 g_free (translit_str);
506 /* Get the numerical index for this string */
507 alphabet_index = _e_alphabet_index_cxx_get_index (collator->alpha_index, input_str);
509 /* First try to generate a key in a predefined buffer size */
510 key_len = ucol_getSortKey (
511 collator->coll, source, source_len,
512 (guchar *) stack_buffer, COLLATION_KEY_BUFFER_LEN);
514 if (key_len > COLLATION_KEY_BUFFER_LEN) {
516 /* Stack buffer wasn't large enough, regenerate into a new buffer
517 * (add a byte for a trailing NULL char)
519 * Note we allocate 4 extra chars to hold the prefixed alphabetic
520 * index into the first 4 charachters (the 5th extra char is the trailing
523 collation_key = g_malloc (key_len + 5);
525 /* Format the alphabetic index into the first 4 chars */
526 snprintf (collation_key, 4, "%03d-", alphabet_index);
528 /* Get the sort key and put it in &collation_key[4] */
530 collator->coll, source, source_len,
531 (guchar *)(collation_key + 4), key_len);
533 /* Just being paranoid, make sure we're null terminated since the API
534 * doesn't specify if the result length is null character inclusive
536 collation_key[key_len + 4] = '\0';
538 GString *string = g_string_new (NULL);
540 /* Format the alphabetic index into the first 4 chars */
541 g_string_append_printf (string, "%03d-", alphabet_index);
543 /* Insert the rest of the sort key from the stack buffer into the allocated buffer */
544 g_string_insert_len (string, 4, stack_buffer, key_len);
546 collation_key = g_string_free (string, FALSE);
550 g_free (translit_str);
552 return (gchar *) collation_key;
556 * e_collator_generate_key_for_index:
557 * @collator: An #ECollator
558 * @index: An index into the alphabetic labels
560 * Generates a sort key for the given alphabetic @index.
562 * The generated sort key is guaranteed to sort below
563 * any sort keys for words beginning with any variant of
566 * For instance, a sort key generated for the index 5 of
567 * a latin alphabet, where the fifth index is 'E' will sort
568 * below any sort keys generated for words starting with
569 * the characters 'e', 'E', 'é', 'É', 'è' or 'È'. It will also
570 * sort above any sort keys generated for words starting with
571 * the characters 'd' or 'D'.
573 * Returns: (transfer full): A sort key for the given index
578 e_collator_generate_key_for_index (ECollator *collator,
581 g_return_val_if_fail (collator != NULL, NULL);
582 g_return_val_if_fail (index >= 0 && index < collator->n_labels, NULL);
584 return g_strdup_printf ("%03d", index);
588 * e_collator_collate:
589 * @collator: An #ECollator
590 * @str_a: (allow-none): A string to compare
591 * @str_b: (allow-none): The string to compare with @str_a
592 * @result: (out): A location to store the comparison result
593 * @error: (allow-none): A location to store a #GError from the #E_COLLATOR_ERROR domain
595 * Compares @str_a with @str_b, the order of strings is determined by the parameters of @collator.
597 * The @result will be set to integer less than, equal to, or greater than zero if @str_a is found,
598 * respectively, to be less than, to match, or be greater than @str_b.
600 * Either @str_a or @str_b can be %NULL, %NULL strings are considered to sort below other strings.
602 * This function will first ensure that both strings are valid UTF-8.
604 * Returns: %TRUE on success, otherwise if %FALSE is returned then @error will be set.
609 e_collator_collate (ECollator *collator,
615 gchar *sort_key_a, *sort_key_b;
617 g_return_val_if_fail (collator != NULL, -1);
618 g_return_val_if_fail (result != NULL, -1);
620 if (!str_a || !str_b) {
621 *result = g_strcmp0 (str_a, str_b);
625 sort_key_a = e_collator_generate_key (collator, str_a, error);
629 sort_key_b = e_collator_generate_key (collator, str_b, error);
635 *result = strcmp (sort_key_a, sort_key_b);
644 * e_collator_get_index_labels:
645 * @collator: An #ECollator
646 * @n_labels: (out): The number of labels/indexes available for @collator
647 * @underflow: (allow-none) (out): The underflow index, for any words which sort below the active alphabet(s)
648 * @inflow: (allow-none) (out): The inflow index, for any words which sort between the active alphabets (if there is more than one)
649 * @overflow: (allow-none) (out): The overflow index, for any words which sort above the active alphabet(s)
651 * Fetches the displayable labels and index positions for the active alphabet.
653 * Returns: (array zero-terminated=1) (element-type utf8) (transfer none):
654 * The array of displayable labels for each index in the active alphabet(s).
659 e_collator_get_index_labels (ECollator *collator,
665 g_return_val_if_fail (collator != NULL, NULL);
668 *n_labels = collator->n_labels;
670 *underflow = collator->underflow;
672 *inflow = collator->inflow;
674 *overflow = collator->overflow;
676 return (const gchar *const *) collator->labels;
680 * e_collator_get_index:
681 * @collator: An #ECollator
684 * Checks which index, as determined by e_collator_get_index_labels(),
685 * that @str should sort under.
687 * Returns: The alphabetic index under which @str would sort
692 e_collator_get_index (ECollator *collator,
696 gchar *translit_str = NULL;
697 const gchar *input_str;
699 g_return_val_if_fail (collator != NULL, -1);
700 g_return_val_if_fail (str != NULL, -1);
702 /* We may need to perform a conversion before generating the sort key */
703 if (collator->transliterator) {
704 translit_str = _e_transliterator_cxx_transliterate (collator->transliterator, str);
705 input_str = translit_str;
710 index = _e_alphabet_index_cxx_get_index (collator->alpha_index, input_str);
712 g_free (translit_str);