From e6cc98c9cd2e08a0520dd14cd06b7cacf197a36c Mon Sep 17 00:00:00 2001 From: Matthias Clasen Date: Wed, 8 Jun 2005 05:22:05 +0000 Subject: [PATCH] New function to calculate collation keys which are more suitable for 2005-06-08 Matthias Clasen * glib/glib.symbols: * glib/gunicode.h: * glib/gunicollate.c (g_utf8_collate_key_for_filename): New function to calculate collation keys which are more suitable for sorting filenames. (#172690, Ole Laursen) --- ChangeLog | 8 ++ ChangeLog.pre-2-10 | 8 ++ ChangeLog.pre-2-12 | 8 ++ ChangeLog.pre-2-8 | 8 ++ docs/reference/ChangeLog | 4 + docs/reference/glib/glib-sections.txt | 1 + glib/glib.symbols | 1 + glib/gunicode.h | 4 +- glib/gunicollate.c | 200 +++++++++++++++++++++++++++++++++- 9 files changed, 239 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index a671ba5..cc3eabb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2005-06-08 Matthias Clasen + + * glib/glib.symbols: + * glib/gunicode.h: + * glib/gunicollate.c (g_utf8_collate_key_for_filename): + New function to calculate collation keys which are more + suitable for sorting filenames. (#172690, Ole Laursen) + Fri May 27 17:18:00 2005 Manish Singh * glib/goption.c (parse_short_option): initialize option_name diff --git a/ChangeLog.pre-2-10 b/ChangeLog.pre-2-10 index a671ba5..cc3eabb 100644 --- a/ChangeLog.pre-2-10 +++ b/ChangeLog.pre-2-10 @@ -1,3 +1,11 @@ +2005-06-08 Matthias Clasen + + * glib/glib.symbols: + * glib/gunicode.h: + * glib/gunicollate.c (g_utf8_collate_key_for_filename): + New function to calculate collation keys which are more + suitable for sorting filenames. 
(#172690, Ole Laursen) + Fri May 27 17:18:00 2005 Manish Singh * glib/goption.c (parse_short_option): initialize option_name diff --git a/ChangeLog.pre-2-12 b/ChangeLog.pre-2-12 index a671ba5..cc3eabb 100644 --- a/ChangeLog.pre-2-12 +++ b/ChangeLog.pre-2-12 @@ -1,3 +1,11 @@ +2005-06-08 Matthias Clasen + + * glib/glib.symbols: + * glib/gunicode.h: + * glib/gunicollate.c (g_utf8_collate_key_for_filename): + New function to calculate collation keys which are more + suitable for sorting filenames. (#172690, Ole Laursen) + Fri May 27 17:18:00 2005 Manish Singh * glib/goption.c (parse_short_option): initialize option_name diff --git a/ChangeLog.pre-2-8 b/ChangeLog.pre-2-8 index a671ba5..cc3eabb 100644 --- a/ChangeLog.pre-2-8 +++ b/ChangeLog.pre-2-8 @@ -1,3 +1,11 @@ +2005-06-08 Matthias Clasen + + * glib/glib.symbols: + * glib/gunicode.h: + * glib/gunicollate.c (g_utf8_collate_key_for_filename): + New function to calculate collation keys which are more + suitable for sorting filenames. (#172690, Ole Laursen) + Fri May 27 17:18:00 2005 Manish Singh * glib/goption.c (parse_short_option): initialize option_name diff --git a/docs/reference/ChangeLog b/docs/reference/ChangeLog index f5c398a..0c965b3 100644 --- a/docs/reference/ChangeLog +++ b/docs/reference/ChangeLog @@ -1,3 +1,7 @@ +2005-06-07 Matthias Clasen + + * glib/glib-sections.txt: Add g_utf8_collate_key_for_filename. 
+ 2005-05-25 Mathieu Lacage * gobject/tut_*.xml: fix lots of typos, diff --git a/docs/reference/glib/glib-sections.txt b/docs/reference/glib/glib-sections.txt index 9a5f8bb..b918fe8 100644 --- a/docs/reference/glib/glib-sections.txt +++ b/docs/reference/glib/glib-sections.txt @@ -2169,6 +2169,7 @@ g_utf8_normalize GNormalizeMode g_utf8_collate g_utf8_collate_key +g_utf8_collate_key_for_filename g_utf8_to_utf16 diff --git a/glib/glib.symbols b/glib/glib.symbols index 00831ec..376c232 100644 --- a/glib/glib.symbols +++ b/glib/glib.symbols @@ -1073,6 +1073,7 @@ g_unichar_break_type G_GNUC_CONST #if IN_FILE(__G_UNICOLLATE_C__) g_utf8_collate g_utf8_collate_key G_GNUC_MALLOC +g_utf8_collate_key_for_filename G_GNUC_MALLOC #endif #endif diff --git a/glib/gunicode.h b/glib/gunicode.h index 4ed37a5..7c93f90 100644 --- a/glib/gunicode.h +++ b/glib/gunicode.h @@ -1,7 +1,7 @@ /* gunicode.h - Unicode manipulation functions * * Copyright (C) 1999, 2000 Tom Tromey - * Copyright 2000 Red Hat, Inc. + * Copyright 2000, 2005 Red Hat, Inc. * * The Gnome Library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License as @@ -278,6 +278,8 @@ gint g_utf8_collate (const gchar *str1, const gchar *str2); gchar *g_utf8_collate_key (const gchar *str, gssize len) G_GNUC_MALLOC; +gchar *g_utf8_collate_key_for_filename (const gchar *str, + gssize len) G_GNUC_MALLOC; gboolean g_unichar_get_mirror_char (gunichar ch, gunichar *mirrored_ch); diff --git a/glib/gunicollate.c b/glib/gunicollate.c index f3e2857..2e66842 100644 --- a/glib/gunicollate.c +++ b/glib/gunicollate.c @@ -1,6 +1,6 @@ /* gunicollate.c - Collation * - * Copyright 2001 Red Hat, Inc. + * Copyright 2001,2005 Red Hat, Inc. 
* * The Gnome Library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License as @@ -164,7 +164,8 @@ utf8_encode (char *buf, wchar_t val) * @len: length of @str, in bytes, or -1 if @str is nul-terminated. * * Converts a string into a collation key that can be compared - * with other collation keys using strcmp(). + * with other collation keys produced by the same function using + * strcmp(). * The results of comparing the collation keys of two strings * with strcmp() will always be the same as * comparing the two original keys with g_utf8_collate(). @@ -262,5 +263,200 @@ g_utf8_collate_key (const gchar *str, return result; } +/* This is a collation key that is very very likely to sort before any + collation key that libc strxfrm generates. We use this before any + special case (dot or number) to make sure that it is sorted before + anything else. + */ +#define COLLATION_SENTINEL "\1\1\1" + +/** + * g_utf8_collate_key_for_filename: + * @str: a UTF-8 encoded string. + * @len: length of @str, in bytes, or -1 if @str is nul-terminated. + * + * Converts a string into a collation key that can be compared + * with other collation keys produced by the same function using strcmp(). + * + * In order to sort filenames correctly, this function treats the dot '.' + * as a special case. Most dictionary orderings seem to consider it + * insignificant, thus producing the ordering "event.c" "eventgenerator.c" + * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we + * would like to treat numbers intelligently so that "file1" "file10" "file5" + * is sorted as "file1" "file5" "file10". + * + * Return value: a newly allocated string. This string should + * be freed with g_free() when you are done with it. 
+ *
+ * Since: 2.8
+ */
+gchar*
+g_utf8_collate_key_for_filename (const gchar *str,
+                                 gssize       len)
+{
+  GString *result;
+  GString *append;
+  const gchar *p;
+  const gchar *prev;
+  const gchar *end;
+  gchar *collate_key;
+  gint digits;
+  gint leading_zeros;
+
+  /*
+   * How it works:
+   *
+   * Split the filename into collatable substrings which do
+   * not contain [.0-9] and special-cased substrings. The collatable
+   * substrings are run through the normal g_utf8_collate_key() and the
+   * resulting keys are concatenated with keys generated from the
+   * special-cased substrings.
+   *
+   * Special cases: Dots are handled by replacing them with '\1' which
+   * implies that short dot-delimited substrings are before long ones,
+   * e.g.
+   *
+   *   a\1a   (a.a)
+   *   a-\1a  (a-.a)
+   *   aa\1a  (aa.a)
+   *
+   * Numbers are handled by prepending to each number d-1 superdigits
+   * where d = number of digits in the number and SUPERDIGIT is a
+   * character with an integer value higher than any digit (for instance
+   * ':'). This ensures that single-digit numbers are sorted before
+   * double-digit numbers which in turn are sorted separately from
+   * triple-digit numbers, etc. To avoid strange side-effects when
+   * sorting strings that already contain SUPERDIGITs, a '\2'
+   * is also prepended, like this
+   *
+   *   file\21      (file1)
+   *   file\25      (file5)
+   *   file\2:10    (file10)
+   *   file\2:26    (file26)
+   *   file\2::100  (file100)
+   *   file:foo     (file:foo)
+   *
+   * This has the side-effect of sorting numbers before everything else (except
+   * dots), but this is probably OK.
+   *
+   * Leading zeros are ignored when doing the above. To discriminate
+   * numbers which differ only in the number of leading zeros, we append
+   * the number of leading zeros as a byte at the very end of the collation
+   * key. A run made up only of zeros is the number 0 plus leading zeros.
+   *
+   * To try to avoid conflict with any collation key sequence generated by libc
+   * we start each switch to a special cased part with a sentinel that hopefully
+   * will sort before anything libc will generate.
+   */
+
+  if (len < 0)
+    len = strlen (str);
+
+  result = g_string_sized_new (len * 2);
+  append = g_string_sized_new (0);
+
+  /* Honour len: never look at bytes at or beyond str + len. */
+  end = str + len;
+
+  /* No need to use utf8 functions, since we're only looking for ascii chars */
+  for (prev = p = str; p < end; p++)
+    {
+      switch (*p)
+        {
+        case '.':
+          if (prev != p)
+            {
+              collate_key = g_utf8_collate_key (prev, p - prev);
+              g_string_append (result, collate_key);
+              g_free (collate_key);
+            }
+
+          g_string_append (result, COLLATION_SENTINEL "\1");
+
+          /* skip the dot */
+          prev = p + 1;
+          break;
+
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+          if (prev != p)
+            {
+              collate_key = g_utf8_collate_key (prev, p - prev);
+              g_string_append (result, collate_key);
+              g_free (collate_key);
+            }
+
+          g_string_append (result, COLLATION_SENTINEL "\2");
+
+          prev = p;
+
+          /* count the leading zeros and significant digits of this run */
+          leading_zeros = 0;
+          digits = 0;
+          for (; p < end && g_ascii_isdigit (*p); p++)
+            {
+              if (*p == '0' && !digits)
+                ++leading_zeros;
+              else
+                ++digits;
+            }
+
+          /* "0", "00", ... would otherwise leave no digit to write */
+          if (!digits)
+            {
+              digits = 1;
+              --leading_zeros;
+            }
+
+          /* write d-1 colons */
+          while (digits > 1)
+            {
+              g_string_append_c (result, ':');
+              --digits;
+            }
+
+          /* remember the zeros for the key's tail and skip them */
+          if (leading_zeros > 0)
+            {
+              g_string_append_c (append, (char)leading_zeros);
+              prev += leading_zeros;
+            }
+
+          /* write the number itself */
+          g_string_append_len (result, prev, p - prev);
+
+          prev = p;
+          --p; /* go one step back to avoid disturbing outer loop */
+          break;
+
+        default:
+          /* other characters just accumulate */
+          break;
+        }
+    }
+
+  /* flush the trailing collatable substring, if any */
+  if (prev != p)
+    {
+      collate_key = g_utf8_collate_key (prev, p - prev);
+      g_string_append (result, collate_key);
+      g_free (collate_key);
+    }
+
+  g_string_append (result, append->str);
+  g_string_free (append, TRUE);
+
+  return g_string_free (result, FALSE);
+}
+
+
 #define __G_UNICOLLATE_C__
 #include "galiasdef.c"
-- 
2.7.4