From a73c66b1344c852c18279ba43e951cc71c5839a3 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Thu, 14 Jul 2011 16:18:30 -0400 Subject: [PATCH] Add tests for Unicode canonical composition/decomposition Also update compose()/decompose() API corner cases and docs. --- glib/gunidecomp.c | 25 ++++++++++-- glib/tests/utf8-misc.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 128 insertions(+), 4 deletions(-) diff --git a/glib/gunidecomp.c b/glib/gunidecomp.c index 0e892a1..7caeb5d 100644 --- a/glib/gunidecomp.c +++ b/glib/gunidecomp.c @@ -616,13 +616,20 @@ compose_hangul_step (gunichar a, * @b: return location for the second component of @ch * * Performs a single decomposition step of the - * Unicode character normalization algorithm. + * Unicode canonical decomposition algorithm. * * This function does not include compatibility * decompositions. It does, however, include algorithmic * Hangul Jamo decomposition, as well as 'singleton' * decompositions which replace a character by a single - * other character. In this case, *@b will be set to zero. + * other character. In the case of singletons *@b will + * be set to zero. + * + * If @ch is not decomposable, *@a is set to @ch and *@b + * is set to zero. + * + * See UAX#15 + * for details. * * Returns: %TRUE if the character could be decomposed * @@ -661,6 +668,9 @@ g_unichar_decompose (gunichar ch, } } + *a = ch; + *b = 0; + return FALSE; } @@ -671,7 +681,7 @@ g_unichar_decompose (gunichar ch, * @ch: return location for the composed character * * Performs a single composition step of the - * Unicode character normalization algorithm. + * Unicode canonical composition algorithm. * * This function does not perform algorithmic composition * for Hangul characters, and does not include compatibility @@ -679,6 +689,13 @@ g_unichar_decompose (gunichar ch, * compositions which replace a character by a single * other character. To obtain these, pass zero for @b. * + * This function includes algorithmic Hangul Jamo composition. + * + * If @a and @b do not compose a new character, @ch is set to zero. + * + * See UAX#15 + * for details. + * * Returns: %TRUE if the characters could be composed * * Since: 2.30 @@ -715,5 +732,7 @@ g_unichar_compose (gunichar a, } } + *ch = 0; + return FALSE; } diff --git a/glib/tests/utf8-misc.c b/glib/tests/utf8-misc.c index 5d23f8b..778afad 100644 --- a/glib/tests/utf8-misc.c +++ b/glib/tests/utf8-misc.c @@ -436,7 +436,6 @@ test_title (void) g_assert (g_unichar_istitle (0x01c5)); g_assert (g_unichar_istitle (0x1f88)); g_assert (g_unichar_istitle (0x1fcc)); - g_assert (!g_unichar_ismark ('a')); g_assert (g_unichar_totitle (0x01c6) == 0x01c5); g_assert (g_unichar_totitle (0x01c4) == 0x01c5); @@ -529,6 +528,109 @@ test_wide (void) } }; +static void +test_compose (void) +{ + gunichar ch; + + /* Not composable */ + g_assert (!g_unichar_compose (0x0041, 0x0042, &ch) && ch == 0); + g_assert (!g_unichar_compose (0x0041, 0, &ch) && ch == 0); + g_assert (!g_unichar_compose (0x0066, 0x0069, &ch) && ch == 0); + + /* Singletons should not compose */ + g_assert (!g_unichar_compose (0x212B, 0, &ch) && ch == 0); + g_assert (!g_unichar_compose (0x00C5, 0, &ch) && ch == 0); + g_assert (!g_unichar_compose (0x2126, 0, &ch) && ch == 0); + g_assert (!g_unichar_compose (0x03A9, 0, &ch) && ch == 0); + + /* Pairs */ + g_assert (g_unichar_compose (0x0041, 0x030A, &ch) && ch == 0x00C5); + g_assert (g_unichar_compose (0x006F, 0x0302, &ch) && ch == 0x00F4); + g_assert (g_unichar_compose (0x1E63, 0x0307, &ch) && ch == 0x1E69); + g_assert (g_unichar_compose (0x0073, 0x0323, &ch) && ch == 0x1E63); + g_assert (g_unichar_compose (0x0064, 0x0307, &ch) && ch == 0x1E0B); + g_assert (g_unichar_compose (0x0064, 0x0323, &ch) && ch == 0x1E0D); + + /* Hangul */ + g_assert (g_unichar_compose (0xD4CC, 0x11B6, &ch) && ch == 0xD4DB); + g_assert (g_unichar_compose (0x1111, 0x1171, &ch) && ch == 0xD4CC); + g_assert (g_unichar_compose (0xCE20, 0x11B8, &ch) && ch == 0xCE31); + g_assert (g_unichar_compose (0x110E, 0x1173, &ch) && ch == 0xCE20); +} + +static void +test_decompose (void) +{ + gunichar a, b; + + /* Not decomposable */ + g_assert (!g_unichar_decompose (0x0041, &a, &b) && a == 0x0041 && b == 0); + g_assert (!g_unichar_decompose (0xFB01, &a, &b) && a == 0xFB01 && b == 0); + + /* Singletons */ + g_assert (g_unichar_decompose (0x212B, &a, &b) && a == 0x00C5 && b == 0); + g_assert (g_unichar_decompose (0x2126, &a, &b) && a == 0x03A9 && b == 0); + + /* Pairs */ + g_assert (g_unichar_decompose (0x00C5, &a, &b) && a == 0x0041 && b == 0x030A); + g_assert (g_unichar_decompose (0x00F4, &a, &b) && a == 0x006F && b == 0x0302); + g_assert (g_unichar_decompose (0x1E69, &a, &b) && a == 0x1E63 && b == 0x0307); + g_assert (g_unichar_decompose (0x1E63, &a, &b) && a == 0x0073 && b == 0x0323); + g_assert (g_unichar_decompose (0x1E0B, &a, &b) && a == 0x0064 && b == 0x0307); + g_assert (g_unichar_decompose (0x1E0D, &a, &b) && a == 0x0064 && b == 0x0323); + + /* Hangul */ + g_assert (g_unichar_decompose (0xD4DB, &a, &b) && a == 0xD4CC && b == 0x11B6); + g_assert (g_unichar_decompose (0xD4CC, &a, &b) && a == 0x1111 && b == 0x1171); + g_assert (g_unichar_decompose (0xCE31, &a, &b) && a == 0xCE20 && b == 0x11B8); + g_assert (g_unichar_decompose (0xCE20, &a, &b) && a == 0x110E && b == 0x1173); +} + +static void +test_canonical_decomposition (void) +{ + gunichar *decomp; + gsize len; + +#define TEST_DECOMP(ch, expected_len, a, b, c, d) \ + decomp = g_unicode_canonical_decomposition (ch, &len); \ + g_assert_cmpint (expected_len, ==, len); \ + if (expected_len >= 1) g_assert_cmphex (decomp[0], ==, a); \ + if (expected_len >= 2) g_assert_cmphex (decomp[1], ==, b); \ + if (expected_len >= 3) g_assert_cmphex (decomp[2], ==, c); \ + if (expected_len >= 4) g_assert_cmphex (decomp[3], ==, d); \ + g_free (d); + +#define TEST0(ch) TEST_DECOMP (ch, 1, ch, 0, 0, 0) +#define TEST1(ch, a) TEST_DECOMP (ch, 1, a, 0, 0, 0) +#define TEST2(ch, a, b) TEST_DECOMP (ch, 2, a, b, 0, 0) +#define TEST3(ch, a, b, c) TEST_DECOMP (ch, 3, a, b, c, 0) +#define TEST4(ch, a, b, c, d) TEST_DECOMP (ch, 4, a, b, c, d) + + /* Not decomposable */ + TEST0 (0x0041); + TEST0 (0xFB01); + + /* Singletons */ + TEST2 (0x212B, 0x0041, 0x030A); + TEST1 (0x2126, 0x03A9); + + /* General */ + TEST2 (0x00C5, 0x0041, 0x030A); + TEST2 (0x00F4, 0x006F, 0x0302); + TEST3 (0x1E69, 0x0073, 0x0323, 0x0307); + TEST2 (0x1E63, 0x0073, 0x0323); + TEST2 (0x1E0B, 0x0064, 0x0307); + TEST2 (0x1E0D, 0x0064, 0x0323); + + /* Hangul */ + TEST3 (0xD4DB, 0x1111, 0x1171, 0x11B6); + TEST2 (0xD4CC, 0x1111, 0x1171); + TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8); + TEST2 (0xCE20, 0x110E, 0x1173); +} + int main (int argc, char *argv[]) @@ -549,6 +651,9 @@ main (int argc, g_test_add_func ("/unicode/mark", test_mark); g_test_add_func ("/unicode/title", test_title); g_test_add_func ("/unicode/wide", test_wide); + g_test_add_func ("/unicode/compose", test_compose); + g_test_add_func ("/unicode/decompose", test_decompose); + g_test_add_func ("/unicode/canonical-decomposition", test_canonical_decomposition); return g_test_run(); } -- 2.7.4