glib/tests/unicode.c

   1 /* Unit tests for utilities
   2  * Copyright (C) 2010 Red Hat, Inc.
   3  * Copyright (C) 2011 Google, Inc.
   4  *
   5  * SPDX-License-Identifier: LicenseRef-old-glib-tests
   6  *
   7  * This work is provided "as is"; redistribution and modification
   8  * in whole or in part, in any medium, physical or electronic is
   9  * permitted without restriction.
  10  *
  11  * This work is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14  *
  15  * In no event shall the authors or contributors be liable for any
  16  * direct, indirect, incidental, special, exemplary, or consequential
  17  * damages (including, but not limited to, procurement of substitute
  18  * goods or services; loss of use, data, or profits; or business
  19  * interruption) however caused and on any theory of liability, whether
  20  * in contract, strict liability, or tort (including negligence or
  21  * otherwise) arising in any way out of the use of this software, even
  22  * if advised of the possibility of such damage.
  23  *
  24  * Author: Matthias Clasen, Behdad Esfahbod
  25  */
  26
  27 /* We are testing some deprecated APIs here */
  28 #ifndef GLIB_DISABLE_DEPRECATION_WARNINGS
  29 #define GLIB_DISABLE_DEPRECATION_WARNINGS
  30 #endif
  31
  32 #include <locale.h>
  33 #include <stdio.h>
  34
  35 #include "glib.h"
  36
  37 #include "glib/gunidecomp.h"
  38
  39 #ifdef G_OS_WIN32
  40 #include <windows.h>
  41 #endif
  42
  43 static void
  44 save_and_clear_env (const char  *name,
  45                     char       **save)
  46 {
  47   *save = g_strdup (g_getenv (name));
  48   g_unsetenv (name);
  49 }
  50
  51 /* Test that g_unichar_validate() returns the correct value for various
  52  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
  53 static void
  54 test_unichar_validate (void)
  55 {
  56   g_assert_true (g_unichar_validate ('j'));
  57   g_assert_true (g_unichar_validate (8356));
  58   g_assert_true (g_unichar_validate (8356));
  59   g_assert_true (g_unichar_validate (0xFDD1));
  60   g_assert_true (g_unichar_validate (917760));
  61   g_assert_false (g_unichar_validate (0x110000));
  62 }
  63
  64 /* Test that g_unichar_type() returns the correct value for various
  65  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
  66 static void
  67 test_unichar_character_type (void)
  68 {
  69   guint i;
  70   struct {
  71     GUnicodeType type;
  72     gunichar     c;
  73   } examples[] = {
  74     { G_UNICODE_CONTROL,              0x000D },
  75     { G_UNICODE_FORMAT,               0x200E },
  76      /* G_UNICODE_UNASSIGNED */
  77     { G_UNICODE_PRIVATE_USE,          0xE000 },
  78     { G_UNICODE_SURROGATE,            0xD800 },
  79     { G_UNICODE_LOWERCASE_LETTER,     0x0061 },
  80     { G_UNICODE_MODIFIER_LETTER,      0x02B0 },
  81     { G_UNICODE_OTHER_LETTER,         0x3400 },
  82     { G_UNICODE_TITLECASE_LETTER,     0x01C5 },
  83     { G_UNICODE_UPPERCASE_LETTER,     0xFF21 },
  84     { G_UNICODE_SPACING_MARK,         0x0903 },
  85     { G_UNICODE_ENCLOSING_MARK,       0x20DD },
  86     { G_UNICODE_NON_SPACING_MARK,     0xA806 },
  87     { G_UNICODE_DECIMAL_NUMBER,       0xFF10 },
  88     { G_UNICODE_LETTER_NUMBER,        0x16EE },
  89     { G_UNICODE_OTHER_NUMBER,         0x17F0 },
  90     { G_UNICODE_CONNECT_PUNCTUATION,  0x005F },
  91     { G_UNICODE_DASH_PUNCTUATION,     0x058A },
  92     { G_UNICODE_CLOSE_PUNCTUATION,    0x0F3B },
  93     { G_UNICODE_FINAL_PUNCTUATION,    0x2019 },
  94     { G_UNICODE_INITIAL_PUNCTUATION,  0x2018 },
  95     { G_UNICODE_OTHER_PUNCTUATION,    0x2016 },
  96     { G_UNICODE_OPEN_PUNCTUATION,     0x0F3A },
  97     { G_UNICODE_CURRENCY_SYMBOL,      0x20A0 },
  98     { G_UNICODE_MODIFIER_SYMBOL,      0x309B },
  99     { G_UNICODE_MATH_SYMBOL,          0xFB29 },
 100     { G_UNICODE_OTHER_SYMBOL,         0x00A6 },
 101     { G_UNICODE_LINE_SEPARATOR,       0x2028 },
 102     { G_UNICODE_PARAGRAPH_SEPARATOR,  0x2029 },
 103     { G_UNICODE_SPACE_SEPARATOR,      0x202F },
 104   };
 105
 106   for (i = 0; i < G_N_ELEMENTS (examples); i++)
 107     {
 108       g_assert_cmpint (g_unichar_type (examples[i].c), ==, examples[i].type);
 109     }
 110
 111   /*** Testing TYPE() border cases ***/
 112   g_assert_cmpint (g_unichar_type (0x3FF5), ==, 0x07);
 113   /* U+FFEFF Plane 15 Private Use */
 114   g_assert_cmpint (g_unichar_type (0xFFEFF), ==, 0x03);
 115   /* U+E0001 Language Tag */
 116   g_assert_cmpint (g_unichar_type (0xE0001), ==, 0x01);
 117   g_assert_cmpint (g_unichar_type (G_UNICODE_LAST_CHAR), ==, 0x02);
 118   g_assert_cmpint (g_unichar_type (G_UNICODE_LAST_CHAR + 1), ==, 0x02);
 119   g_assert_cmpint (g_unichar_type (G_UNICODE_LAST_CHAR_PART1), ==, 0x02);
 120   g_assert_cmpint (g_unichar_type (G_UNICODE_LAST_CHAR_PART1 + 1), ==, 0x02);
 121 }
 122
 123 /* Test that g_unichar_break_type() returns the correct value for various
 124  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 125 static void
 126 test_unichar_break_type (void)
 127 {
 128   guint i;
 129   struct {
 130     GUnicodeBreakType type;
 131     gunichar          c;
 132   } examples[] = {
 133     { G_UNICODE_BREAK_MANDATORY,           0x2028 },
 134     { G_UNICODE_BREAK_CARRIAGE_RETURN,     0x000D },
 135     { G_UNICODE_BREAK_LINE_FEED,           0x000A },
 136     { G_UNICODE_BREAK_COMBINING_MARK,      0x0300 },
 137     { G_UNICODE_BREAK_SURROGATE,           0xD800 },
 138     { G_UNICODE_BREAK_ZERO_WIDTH_SPACE,    0x200B },
 139     { G_UNICODE_BREAK_INSEPARABLE,         0x2024 },
 140     { G_UNICODE_BREAK_NON_BREAKING_GLUE,   0x00A0 },
 141     { G_UNICODE_BREAK_CONTINGENT,          0xFFFC },
 142     { G_UNICODE_BREAK_SPACE,               0x0020 },
 143     { G_UNICODE_BREAK_AFTER,               0x05BE },
 144     { G_UNICODE_BREAK_BEFORE,              0x02C8 },
 145     { G_UNICODE_BREAK_BEFORE_AND_AFTER,    0x2014 },
 146     { G_UNICODE_BREAK_HYPHEN,              0x002D },
 147     { G_UNICODE_BREAK_NON_STARTER,         0x17D6 },
 148     { G_UNICODE_BREAK_OPEN_PUNCTUATION,    0x0028 },
 149     { G_UNICODE_BREAK_CLOSE_PARENTHESIS,   0x0029 },
 150     { G_UNICODE_BREAK_CLOSE_PUNCTUATION,   0x007D },
 151     { G_UNICODE_BREAK_QUOTATION,           0x0022 },
 152     { G_UNICODE_BREAK_EXCLAMATION,         0x0021 },
 153     { G_UNICODE_BREAK_IDEOGRAPHIC,         0x2E80 },
 154     { G_UNICODE_BREAK_NUMERIC,             0x0030 },
 155     { G_UNICODE_BREAK_INFIX_SEPARATOR,     0x002C },
 156     { G_UNICODE_BREAK_SYMBOL,              0x002F },
 157     { G_UNICODE_BREAK_ALPHABETIC,          0x0023 },
 158     { G_UNICODE_BREAK_PREFIX,              0x0024 },
 159     { G_UNICODE_BREAK_POSTFIX,             0x0025 },
 160     { G_UNICODE_BREAK_COMPLEX_CONTEXT,     0x0E01 },
 161     { G_UNICODE_BREAK_AMBIGUOUS,           0x00F7 },
 162     { G_UNICODE_BREAK_UNKNOWN,             0xE000 },
 163     { G_UNICODE_BREAK_NEXT_LINE,           0x0085 },
 164     { G_UNICODE_BREAK_WORD_JOINER,         0x2060 },
 165     { G_UNICODE_BREAK_HANGUL_L_JAMO,       0x1100 },
 166     { G_UNICODE_BREAK_HANGUL_V_JAMO,       0x1160 },
 167     { G_UNICODE_BREAK_HANGUL_T_JAMO,       0x11A8 },
 168     { G_UNICODE_BREAK_HANGUL_LV_SYLLABLE,  0xAC00 },
 169     { G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE, 0xAC01 },
 170     { G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER, 0x3041 },
 171     { G_UNICODE_BREAK_HEBREW_LETTER,                0x05D0 },
 172     { G_UNICODE_BREAK_REGIONAL_INDICATOR,           0x1F1F6 },
 173     { G_UNICODE_BREAK_EMOJI_BASE,          0x1F466 },
 174     { G_UNICODE_BREAK_EMOJI_MODIFIER,      0x1F3FB },
 175     { G_UNICODE_BREAK_ZERO_WIDTH_JOINER,   0x200D },
 176   };
 177
 178   for (i = 0; i < G_N_ELEMENTS (examples); i++)
 179     {
 180       g_assert_cmpint (g_unichar_break_type (examples[i].c), ==, examples[i].type);
 181     }
 182 }
 183
 184 /* Test that g_unichar_get_script() returns the correct value for various
 185  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 186 static void
 187 test_unichar_script (void)
 188 {
 189   guint i;
 190   struct {
 191     GUnicodeScript script;
 192     gunichar          c;
 193   } examples[] = {
 194     { G_UNICODE_SCRIPT_COMMON,                  0x002A },
 195     { G_UNICODE_SCRIPT_INHERITED,               0x1CED },
 196     { G_UNICODE_SCRIPT_INHERITED,               0x0670 },
 197     { G_UNICODE_SCRIPT_ARABIC,                  0x060D },
 198     { G_UNICODE_SCRIPT_ARMENIAN,                0x0559 },
 199     { G_UNICODE_SCRIPT_BENGALI,                 0x09CD },
 200     { G_UNICODE_SCRIPT_BOPOMOFO,                0x31B6 },
 201     { G_UNICODE_SCRIPT_CHEROKEE,                0x13A2 },
 202     { G_UNICODE_SCRIPT_COPTIC,                  0x2CFD },
 203     { G_UNICODE_SCRIPT_CYRILLIC,                0x0482 },
 204     { G_UNICODE_SCRIPT_DESERET,                0x10401 },
 205     { G_UNICODE_SCRIPT_DEVANAGARI,              0x094D },
 206     { G_UNICODE_SCRIPT_ETHIOPIC,                0x1258 },
 207     { G_UNICODE_SCRIPT_GEORGIAN,                0x10FC },
 208     { G_UNICODE_SCRIPT_GOTHIC,                 0x10341 },
 209     { G_UNICODE_SCRIPT_GREEK,                   0x0375 },
 210     { G_UNICODE_SCRIPT_GUJARATI,                0x0A83 },
 211     { G_UNICODE_SCRIPT_GURMUKHI,                0x0A3C },
 212     { G_UNICODE_SCRIPT_HAN,                     0x3005 },
 213     { G_UNICODE_SCRIPT_HANGUL,                  0x1100 },
 214     { G_UNICODE_SCRIPT_HEBREW,                  0x05BF },
 215     { G_UNICODE_SCRIPT_HIRAGANA,                0x309F },
 216     { G_UNICODE_SCRIPT_KANNADA,                 0x0CBC },
 217     { G_UNICODE_SCRIPT_KATAKANA,                0x30FF },
 218     { G_UNICODE_SCRIPT_KHMER,                   0x17DD },
 219     { G_UNICODE_SCRIPT_LAO,                     0x0EDD },
 220     { G_UNICODE_SCRIPT_LATIN,                   0x0061 },
 221     { G_UNICODE_SCRIPT_MALAYALAM,               0x0D3D },
 222     { G_UNICODE_SCRIPT_MONGOLIAN,               0x1843 },
 223     { G_UNICODE_SCRIPT_MYANMAR,                 0x1031 },
 224     { G_UNICODE_SCRIPT_OGHAM,                   0x169C },
 225     { G_UNICODE_SCRIPT_OLD_ITALIC,             0x10322 },
 226     { G_UNICODE_SCRIPT_ORIYA,                   0x0B3C },
 227     { G_UNICODE_SCRIPT_RUNIC,                   0x16EF },
 228     { G_UNICODE_SCRIPT_SINHALA,                 0x0DBD },
 229     { G_UNICODE_SCRIPT_SYRIAC,                  0x0711 },
 230     { G_UNICODE_SCRIPT_TAMIL,                   0x0B82 },
 231     { G_UNICODE_SCRIPT_TELUGU,                  0x0C03 },
 232     { G_UNICODE_SCRIPT_THAANA,                  0x07B1 },
 233     { G_UNICODE_SCRIPT_THAI,                    0x0E31 },
 234     { G_UNICODE_SCRIPT_TIBETAN,                 0x0FD4 },
 235     { G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL,     0x1400 },
 236     { G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL,     0x1401 },
 237     { G_UNICODE_SCRIPT_YI,                      0xA015 },
 238     { G_UNICODE_SCRIPT_TAGALOG,                 0x1700 },
 239     { G_UNICODE_SCRIPT_HANUNOO,                 0x1720 },
 240     { G_UNICODE_SCRIPT_BUHID,                   0x1740 },
 241     { G_UNICODE_SCRIPT_TAGBANWA,                0x1760 },
 242     { G_UNICODE_SCRIPT_BRAILLE,                 0x2800 },
 243     { G_UNICODE_SCRIPT_CYPRIOT,                0x10808 },
 244     { G_UNICODE_SCRIPT_LIMBU,                   0x1932 },
 245     { G_UNICODE_SCRIPT_OSMANYA,                0x10480 },
 246     { G_UNICODE_SCRIPT_SHAVIAN,                0x10450 },
 247     { G_UNICODE_SCRIPT_LINEAR_B,               0x10000 },
 248     { G_UNICODE_SCRIPT_TAI_LE,                  0x1950 },
 249     { G_UNICODE_SCRIPT_UGARITIC,               0x1039F },
 250     { G_UNICODE_SCRIPT_NEW_TAI_LUE,             0x1980 },
 251     { G_UNICODE_SCRIPT_BUGINESE,                0x1A1F },
 252     { G_UNICODE_SCRIPT_GLAGOLITIC,              0x2C00 },
 253     { G_UNICODE_SCRIPT_TIFINAGH,                0x2D6F },
 254     { G_UNICODE_SCRIPT_SYLOTI_NAGRI,            0xA800 },
 255     { G_UNICODE_SCRIPT_OLD_PERSIAN,            0x103D0 },
 256     { G_UNICODE_SCRIPT_KHAROSHTHI,             0x10A3F },
 257     { G_UNICODE_SCRIPT_UNKNOWN,              0x1111111 },
 258     { G_UNICODE_SCRIPT_BALINESE,                0x1B04 },
 259     { G_UNICODE_SCRIPT_CUNEIFORM,              0x12000 },
 260     { G_UNICODE_SCRIPT_PHOENICIAN,             0x10900 },
 261     { G_UNICODE_SCRIPT_PHAGS_PA,                0xA840 },
 262     { G_UNICODE_SCRIPT_NKO,                     0x07C0 },
 263     { G_UNICODE_SCRIPT_KAYAH_LI,                0xA900 },
 264     { G_UNICODE_SCRIPT_LEPCHA,                  0x1C00 },
 265     { G_UNICODE_SCRIPT_REJANG,                  0xA930 },
 266     { G_UNICODE_SCRIPT_SUNDANESE,               0x1B80 },
 267     { G_UNICODE_SCRIPT_SAURASHTRA,              0xA880 },
 268     { G_UNICODE_SCRIPT_CHAM,                    0xAA00 },
 269     { G_UNICODE_SCRIPT_OL_CHIKI,                0x1C50 },
 270     { G_UNICODE_SCRIPT_VAI,                     0xA500 },
 271     { G_UNICODE_SCRIPT_CARIAN,                 0x102A0 },
 272     { G_UNICODE_SCRIPT_LYCIAN,                 0x10280 },
 273     { G_UNICODE_SCRIPT_LYDIAN,                 0x1093F },
 274     { G_UNICODE_SCRIPT_AVESTAN,                0x10B00 },
 275     { G_UNICODE_SCRIPT_BAMUM,                   0xA6A0 },
 276     { G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS,   0x13000 },
 277     { G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC,       0x10840 },
 278     { G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI,  0x10B60 },
 279     { G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN, 0x10B40 },
 280     { G_UNICODE_SCRIPT_JAVANESE,                0xA980 },
 281     { G_UNICODE_SCRIPT_KAITHI,                 0x11082 },
 282     { G_UNICODE_SCRIPT_LISU,                    0xA4D0 },
 283     { G_UNICODE_SCRIPT_MEETEI_MAYEK,            0xABE5 },
 284     { G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN,      0x10A60 },
 285     { G_UNICODE_SCRIPT_OLD_TURKIC,             0x10C00 },
 286     { G_UNICODE_SCRIPT_SAMARITAN,               0x0800 },
 287     { G_UNICODE_SCRIPT_TAI_THAM,                0x1A20 },
 288     { G_UNICODE_SCRIPT_TAI_VIET,                0xAA80 },
 289     { G_UNICODE_SCRIPT_BATAK,                   0x1BC0 },
 290     { G_UNICODE_SCRIPT_BRAHMI,                 0x11000 },
 291     { G_UNICODE_SCRIPT_MANDAIC,                 0x0840 },
 292     { G_UNICODE_SCRIPT_CHAKMA,                 0x11100 },
 293     { G_UNICODE_SCRIPT_MEROITIC_CURSIVE,       0x109A0 },
 294     { G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS,   0x10980 },
 295     { G_UNICODE_SCRIPT_MIAO,                   0x16F00 },
 296     { G_UNICODE_SCRIPT_SHARADA,                0x11180 },
 297     { G_UNICODE_SCRIPT_SORA_SOMPENG,           0x110D0 },
 298     { G_UNICODE_SCRIPT_TAKRI,                  0x11680 },
 299     { G_UNICODE_SCRIPT_BASSA_VAH,              0x16AD0 },
 300     { G_UNICODE_SCRIPT_CAUCASIAN_ALBANIAN,     0x10530 },
 301     { G_UNICODE_SCRIPT_DUPLOYAN,               0x1BC00 },
 302     { G_UNICODE_SCRIPT_ELBASAN,                0x10500 },
 303     { G_UNICODE_SCRIPT_GRANTHA,                0x11301 },
 304     { G_UNICODE_SCRIPT_KHOJKI,                 0x11200 },
 305     { G_UNICODE_SCRIPT_KHUDAWADI,              0x112B0 },
 306     { G_UNICODE_SCRIPT_LINEAR_A,               0x10600 },
 307     { G_UNICODE_SCRIPT_MAHAJANI,               0x11150 },
 308     { G_UNICODE_SCRIPT_MANICHAEAN,             0x10AC0 },
 309     { G_UNICODE_SCRIPT_MENDE_KIKAKUI,          0x1E800 },
 310     { G_UNICODE_SCRIPT_MODI,                   0x11600 },
 311     { G_UNICODE_SCRIPT_MRO,                    0x16A40 },
 312     { G_UNICODE_SCRIPT_NABATAEAN,              0x10880 },
 313     { G_UNICODE_SCRIPT_OLD_NORTH_ARABIAN,      0x10A80 },
 314     { G_UNICODE_SCRIPT_OLD_PERMIC,             0x10350 },
 315     { G_UNICODE_SCRIPT_PAHAWH_HMONG,           0x16B00 },
 316     { G_UNICODE_SCRIPT_PALMYRENE,              0x10860 },
 317     { G_UNICODE_SCRIPT_PAU_CIN_HAU,            0x11AC0 },
 318     { G_UNICODE_SCRIPT_PSALTER_PAHLAVI,        0x10B80 },
 319     { G_UNICODE_SCRIPT_SIDDHAM,                0x11580 },
 320     { G_UNICODE_SCRIPT_TIRHUTA,                0x11480 },
 321     { G_UNICODE_SCRIPT_WARANG_CITI,            0x118A0 },
 322     { G_UNICODE_SCRIPT_CHEROKEE,               0x0AB71 },
 323     { G_UNICODE_SCRIPT_HATRAN,                 0x108E0 },
 324     { G_UNICODE_SCRIPT_OLD_HUNGARIAN,          0x10C80 },
 325     { G_UNICODE_SCRIPT_MULTANI,                0x11280 },
 326     { G_UNICODE_SCRIPT_AHOM,                   0x11700 },
 327     { G_UNICODE_SCRIPT_CUNEIFORM,              0x12480 },
 328     { G_UNICODE_SCRIPT_ANATOLIAN_HIEROGLYPHS,  0x14400 },
 329     { G_UNICODE_SCRIPT_SIGNWRITING,            0x1D800 },
 330     { G_UNICODE_SCRIPT_ADLAM,                  0x1E900 },
 331     { G_UNICODE_SCRIPT_BHAIKSUKI,              0x11C00 },
 332     { G_UNICODE_SCRIPT_MARCHEN,                0x11C70 },
 333     { G_UNICODE_SCRIPT_NEWA,                   0x11400 },
 334     { G_UNICODE_SCRIPT_OSAGE,                  0x104B0 },
 335     { G_UNICODE_SCRIPT_TANGUT,                 0x16FE0 },
 336     { G_UNICODE_SCRIPT_MASARAM_GONDI,          0x11D00 },
 337     { G_UNICODE_SCRIPT_NUSHU,                  0x1B170 },
 338     { G_UNICODE_SCRIPT_SOYOMBO,                0x11A50 },
 339     { G_UNICODE_SCRIPT_ZANABAZAR_SQUARE,       0x11A00 },
 340     { G_UNICODE_SCRIPT_DOGRA,                  0x11800 },
 341     { G_UNICODE_SCRIPT_GUNJALA_GONDI,          0x11D60 },
 342     { G_UNICODE_SCRIPT_HANIFI_ROHINGYA,        0x10D00 },
 343     { G_UNICODE_SCRIPT_MAKASAR,                0x11EE0 },
 344     { G_UNICODE_SCRIPT_MEDEFAIDRIN,            0x16E40 },
 345     { G_UNICODE_SCRIPT_OLD_SOGDIAN,            0x10F00 },
 346     { G_UNICODE_SCRIPT_SOGDIAN,                0x10F30 },
 347     { G_UNICODE_SCRIPT_ELYMAIC,                0x10FE0 },
 348     { G_UNICODE_SCRIPT_NANDINAGARI,            0x119A0 },
 349     { G_UNICODE_SCRIPT_NYIAKENG_PUACHUE_HMONG, 0x1E100 },
 350     { G_UNICODE_SCRIPT_WANCHO,                 0x1E2C0 },
 351     { G_UNICODE_SCRIPT_CHORASMIAN,             0x10FB0 },
 352     { G_UNICODE_SCRIPT_DIVES_AKURU,            0x11900 },
 353     { G_UNICODE_SCRIPT_KHITAN_SMALL_SCRIPT,    0x18B00 },
 354     { G_UNICODE_SCRIPT_YEZIDI,                 0x10E80 },
 355     { G_UNICODE_SCRIPT_CYPRO_MINOAN,           0x12F90 },
 356     { G_UNICODE_SCRIPT_OLD_UYGHUR,             0x10F70 },
 357     { G_UNICODE_SCRIPT_TANGSA,                 0x16A70 },
 358     { G_UNICODE_SCRIPT_TOTO,                   0x1E290 },
 359     { G_UNICODE_SCRIPT_VITHKUQI,               0x10570 },
 360     { G_UNICODE_SCRIPT_KAWI,                   0x11F00 },
 361     { G_UNICODE_SCRIPT_NAG_MUNDARI,            0x1E4D0 },
 362   };
 363   for (i = 0; i < G_N_ELEMENTS (examples); i++)
 364     g_assert_cmpint (g_unichar_get_script (examples[i].c), ==, examples[i].script);
 365 }
 366
 367 /* Test that g_unichar_combining_class() returns the correct value for
 368  * various ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 369 static void
 370 test_combining_class (void)
 371 {
 372   guint i;
 373   struct {
 374     gint class;
 375     gunichar          c;
 376   } examples[] = {
 377     {   0, 0x0020 },
 378     {   1, 0x0334 },
 379     {   7, 0x093C },
 380     {   8, 0x3099 },
 381     {   9, 0x094D },
 382     {  10, 0x05B0 },
 383     {  11, 0x05B1 },
 384     {  12, 0x05B2 },
 385     {  13, 0x05B3 },
 386     {  14, 0x05B4 },
 387     {  15, 0x05B5 },
 388     {  16, 0x05B6 },
 389     {  17, 0x05B7 },
 390     {  18, 0x05B8 },
 391     {  19, 0x05B9 },
 392     {  20, 0x05BB },
 393     {  21, 0x05BC },
 394     {  22, 0x05BD },
 395     {  23, 0x05BF },
 396     {  24, 0x05C1 },
 397     {  25, 0x05C2 },
 398     {  26, 0xFB1E },
 399     {  27, 0x064B },
 400     {  28, 0x064C },
 401     {  29, 0x064D },
 402     /* ... */
 403     { 228, 0x05AE },
 404     { 230, 0x0300 },
 405     { 232, 0x302C },
 406     { 233, 0x0362 },
 407     { 234, 0x0360 },
 408     { 234, 0x1DCD },
 409     { 240, 0x0345 }
 410   };
 411   for (i = 0; i < G_N_ELEMENTS (examples); i++)
 412     {
 413       g_assert_cmpint (g_unichar_combining_class (examples[i].c), ==, examples[i].class);
 414     }
 415 }
 416
 417 /* Test that g_unichar_get_mirror() returns the correct value for various
 418  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 419 static void
 420 test_mirror (void)
 421 {
 422   gunichar mirror;
 423
 424   g_assert_true (g_unichar_get_mirror_char ('(', &mirror));
 425   g_assert_cmpint (mirror, ==, ')');
 426   g_assert_true (g_unichar_get_mirror_char (')', &mirror));
 427   g_assert_cmpint (mirror, ==, '(');
 428   g_assert_true (g_unichar_get_mirror_char ('{', &mirror));
 429   g_assert_cmpint (mirror, ==, '}');
 430   g_assert_true (g_unichar_get_mirror_char ('}', &mirror));
 431   g_assert_cmpint (mirror, ==, '{');
 432   g_assert_true (g_unichar_get_mirror_char (0x208D, &mirror));
 433   g_assert_cmpint (mirror, ==, 0x208E);
 434   g_assert_true (g_unichar_get_mirror_char (0x208E, &mirror));
 435   g_assert_cmpint (mirror, ==, 0x208D);
 436   g_assert_false (g_unichar_get_mirror_char ('a', &mirror));
 437 }
 438
 439 /* Test that g_utf8_strup() returns the correct value for various
 440  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 441 static void
 442 test_strup (void)
 443 {
 444   char *str_up = NULL;
 445   const char *str = "AaZz09x;\x03\x45"
 446     "\xEF\xBD\x81"  /* Unichar 'A' (U+FF21) */
 447     "\xEF\xBC\xA1"; /* Unichar 'a' (U+FF41) */
 448
 449   /* Testing degenerated cases */
 450   if (g_test_undefined ())
 451     {
 452       g_test_expect_message (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
 453                              "*assertion*!= NULL*");
 454       str_up = g_utf8_strup (NULL, 0);
 455       g_test_assert_expected_messages ();
 456     }
 457
 458   str_up = g_utf8_strup (str, strlen (str));
 459   /* Tricky, comparing two unicode strings with an ASCII function */
 460   g_assert_cmpstr (str_up, ==, "AAZZ09X;\003E\357\274\241\357\274\241");
 461   g_free (str_up);
 462
 463   str_up = g_utf8_strup ("", 0);
 464   g_assert_cmpstr (str_up, ==, "");
 465   g_free (str_up);
 466 }
 467
 468 /* Test that g_utf8_strdown() returns the correct value for various
 469  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 470 static void
 471 test_strdown (void)
 472 {
 473   char *str_down = NULL;
 474   const char *str = "AaZz09x;\x03\x07"
 475     "\xEF\xBD\x81"  /* Unichar 'A' (U+FF21) */
 476     "\xEF\xBC\xA1"; /* Unichar 'a' (U+FF41) */
 477
 478   /* Testing degenerated cases */
 479   if (g_test_undefined ())
 480     {
 481       g_test_expect_message (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
 482                              "*assertion*!= NULL*");
 483       str_down = g_utf8_strdown (NULL, 0);
 484       g_test_assert_expected_messages ();
 485     }
 486
 487   str_down = g_utf8_strdown (str, strlen (str));
 488   /* Tricky, comparing two unicode strings with an ASCII function */
 489   g_assert_cmpstr (str_down, ==, "aazz09x;\003\007\357\275\201\357\275\201");
 490   g_free (str_down);
 491
 492   str_down = g_utf8_strdown ("", 0);
 493   g_assert_cmpstr (str_down, ==, "");
 494   g_free (str_down);
 495 }
 496
 497 /* Test that g_utf8_strup() and g_utf8_strdown() return the correct
 498  * value for Turkish 'i' with and without dot above. */
 499 static void
 500 test_turkish_strupdown (void)
 501 {
 502   char *str_up = NULL;
 503   char *str_down = NULL;
 504   const char *str = "iII"
 505                     "\xcc\x87"  /* COMBINING DOT ABOVE (U+307) */
 506                     "\xc4\xb1"  /* LATIN SMALL LETTER DOTLESS I (U+131) */
 507                     "\xc4\xb0"; /* LATIN CAPITAL LETTER I WITH DOT ABOVE (U+130) */
 508   char *oldlocale;
 509   char *old_lc_all, *old_lc_messages, *old_lang;
 510 #ifdef G_OS_WIN32
 511   LCID old_lcid;
 512 #endif
 513
 514   /* interferes with g_win32_getlocale() */
 515   save_and_clear_env ("LC_ALL", &old_lc_all);
 516   save_and_clear_env ("LC_MESSAGES", &old_lc_messages);
 517   save_and_clear_env ("LANG", &old_lang);
 518
 519   oldlocale = g_strdup (setlocale (LC_ALL, "tr_TR"));
 520   if (oldlocale == NULL)
 521     {
 522       g_test_skip ("locale tr_TR not available");
 523       return;
 524     }
 525
 526 #ifdef G_OS_WIN32
 527   old_lcid = GetThreadLocale ();
 528   SetThreadLocale (MAKELCID (MAKELANGID (LANG_TURKISH, SUBLANG_TURKISH_TURKEY), SORT_DEFAULT));
 529 #endif
 530
 531   str_up = g_utf8_strup (str, strlen (str));
 532   str_down = g_utf8_strdown (str, strlen (str));
 533   /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE,
 534    * I => I,
 535    * I + COMBINING DOT ABOVE => I + COMBINING DOT ABOVE,
 536    * LATIN SMALL LETTER DOTLESS I => I,
 537    * LATIN CAPITAL LETTER I WITH DOT ABOVE => LATIN CAPITAL LETTER I WITH DOT ABOVE */
 538   g_assert_cmpstr (str_up, ==, "\xc4\xb0II\xcc\x87I\xc4\xb0");
 539   /* i => i,
 540    * I => LATIN SMALL LETTER DOTLESS I,
 541    * I + COMBINING DOT ABOVE => i,
 542    * LATIN SMALL LETTER DOTLESS I => LATIN SMALL LETTER DOTLESS I,
 543    * LATIN CAPITAL LETTER I WITH DOT ABOVE => i */
 544   g_assert_cmpstr (str_down, ==, "i\xc4\xb1i\xc4\xb1i");
 545   g_free (str_up);
 546   g_free (str_down);
 547
 548   setlocale (LC_ALL, oldlocale);
 549 #ifdef G_OS_WIN32
 550   SetThreadLocale (old_lcid);
 551 #endif
 552   g_free (oldlocale);
 553   if (old_lc_all)
 554     g_setenv ("LC_ALL", old_lc_all, TRUE);
 555   if (old_lc_messages)
 556     g_setenv ("LC_MESSAGES", old_lc_messages, TRUE);
 557   if (old_lang)
 558     g_setenv ("LANG", old_lang, TRUE);
 559   g_free (old_lc_all);
 560   g_free (old_lc_messages);
 561   g_free (old_lang);
 562 }
 563
 564 /* Test that g_utf8_casefold() returns the correct value for various
 565  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 566 static void
 567 test_casefold (void)
 568 {
 569   char *str_casefold = NULL;
 570   const char *str = "AaZz09x;"
 571     "\xEF\xBD\x81"  /* Unichar 'A' (U+FF21) */
 572     "\xEF\xBC\xA1"; /* Unichar 'a' (U+FF41) */
 573
 574   /* Testing degenerated cases */
 575   if (g_test_undefined ())
 576     {
 577       g_test_expect_message (G_LOG_DOMAIN, G_LOG_LEVEL_CRITICAL,
 578                              "*assertion*!= NULL*");
 579       str_casefold = g_utf8_casefold (NULL, 0);
 580       g_test_assert_expected_messages ();
 581     }
 582
 583   str_casefold = g_utf8_casefold (str, strlen (str));
 584   /* Tricky, comparing two unicode strings with an ASCII function */
 585   g_assert_cmpstr (str_casefold, ==, "aazz09x;\357\275\201\357\275\201");
 586   g_free (str_casefold);
 587
 588   str_casefold = g_utf8_casefold ("", 0);
 589   g_assert_cmpstr (str_casefold, ==, "");
 590   g_free (str_casefold);
 591 }
 592
 593 static void
 594 test_casemap_and_casefold (void)
 595 {
 596   FILE *infile;
 597   char buffer[1024];
 598   char **strings;
 599   char *filename;
 600   const char *locale;
 601   const char *test;
 602   const char *expected;
 603   char *convert;
 604   char *current_locale = setlocale (LC_CTYPE, NULL);
 605   char *old_lc_all, *old_lc_messages, *old_lang;
 606 #ifdef G_OS_WIN32
 607   LCID old_lcid;
 608
 609   old_lcid = GetThreadLocale ();
 610 #endif
 611
 612   /* interferes with g_win32_getlocale() */
 613   save_and_clear_env ("LC_ALL", &old_lc_all);
 614   save_and_clear_env ("LC_MESSAGES", &old_lc_messages);
 615   save_and_clear_env ("LANG", &old_lang);
 616
 617   filename = g_test_build_filename (G_TEST_DIST, "casemap.txt", NULL);
 618   infile = fopen (filename, "r");
 619   g_assert (infile != NULL);
 620
 621   while (fgets (buffer, sizeof (buffer), infile))
 622     {
 623       if (buffer[0] == '#')
 624         continue;
 625
 626       strings = g_strsplit (buffer, "\t", -1);
 627       locale = strings[0];
 628       if (!locale[0])
 629         locale = "C";
 630
 631       if (strcmp (locale, current_locale) != 0)
 632         {
 633           setlocale (LC_CTYPE, locale);
 634           current_locale = setlocale (LC_CTYPE, NULL);
 635
 636           if (strncmp (current_locale, locale, 2) != 0)
 637             {
 638               g_test_message ("Cannot set locale to %s, skipping", locale);
 639               goto next;
 640             }
 641         }
 642
 643 #ifdef G_OS_WIN32
 644       if (strstr (locale, "lt_LT"))
 645         SetThreadLocale (MAKELCID (MAKELANGID (LANG_LITHUANIAN, SUBLANG_LITHUANIAN), SORT_DEFAULT));
 646       else if (strstr (locale, "tr_TR"))
 647         SetThreadLocale (MAKELCID (MAKELANGID (LANG_TURKISH, SUBLANG_TURKISH_TURKEY), SORT_DEFAULT));
 648       else
 649         SetThreadLocale (old_lcid);
 650 #endif
 651
 652       test = strings[1];
 653
 654       /* gen-casemap-txt.py uses an empty string when a single
 655        * character doesn't have an equivalent in a particular case;
 656        * since that behavior is nonsense for multicharacter strings,
 657        * it would make more sense to put the expected result ... the
 658        * original character unchanged. But for now, we just work
 659        * around it here and take the empty string to mean "same as
 660        * original"
 661        */
 662
 663       convert = g_utf8_strup (test, -1);
 664       expected = strings[4][0] ? strings[4] : test;
 665       g_assert_cmpstr (convert, ==, expected);
 666       g_free (convert);
 667
 668       convert = g_utf8_strdown (test, -1);
 669       expected = strings[2][0] ? strings[2] : test;
 670       g_assert_cmpstr (convert, ==, expected);
 671       g_free (convert);
 672
 673     next:
 674       g_strfreev (strings);
 675     }
 676
 677   fclose (infile);
 678
 679   g_free (filename);
 680   filename = g_test_build_filename (G_TEST_DIST, "casefold.txt", NULL);
 681
 682   infile = fopen (filename, "r");
 683   g_assert (infile != NULL);
 684
 685   while (fgets (buffer, sizeof (buffer), infile))
 686     {
 687       if (buffer[0] == '#')
 688         continue;
 689
 690       buffer[strlen (buffer) - 1] = '\0';
 691       strings = g_strsplit (buffer, "\t", -1);
 692
 693       test = strings[0];
 694
 695       convert = g_utf8_casefold (test, -1);
 696       g_assert_cmpstr (convert, ==, strings[1]);
 697       g_free (convert);
 698
 699       g_strfreev (strings);
 700     }
 701
 702   fclose (infile);
 703   g_free (filename);
 704
 705   if (old_lc_all)
 706     g_setenv ("LC_ALL", old_lc_all, TRUE);
 707   if (old_lc_messages)
 708     g_setenv ("LC_MESSAGES", old_lc_messages, TRUE);
 709   if (old_lang)
 710     g_setenv ("LANG", old_lang, TRUE);
 711   g_free (old_lc_all);
 712   g_free (old_lc_messages);
 713   g_free (old_lang);
 714 #ifdef G_OS_WIN32
 715   SetThreadLocale (old_lcid);
 716 #endif
 717 }
 718
 719 /* Test that g_unichar_ismark() returns the correct value for various
 720  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 721 static void
 722 test_mark (void)
 723 {
 724   g_assert_true (g_unichar_ismark (0x0903));
 725   g_assert_true (g_unichar_ismark (0x20DD));
 726   g_assert_true (g_unichar_ismark (0xA806));
 727   g_assert_false (g_unichar_ismark ('a'));
 728
 729   /*** Testing TYPE() border cases ***/
 730   g_assert_false (g_unichar_ismark (0x3FF5));
 731   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
 732   g_assert_false (g_unichar_ismark (0xFFEFF));
 733   /* U+E0001 Language Tag */
 734   g_assert_false (g_unichar_ismark (0xE0001));
 735   g_assert_false (g_unichar_ismark (G_UNICODE_LAST_CHAR));
 736   g_assert_false (g_unichar_ismark (G_UNICODE_LAST_CHAR + 1));
 737   g_assert_false (g_unichar_ismark (G_UNICODE_LAST_CHAR_PART1));
 738   g_assert_false (g_unichar_ismark (G_UNICODE_LAST_CHAR_PART1 + 1));
 739 }
 740
 741 /* Test that g_unichar_isspace() returns the correct value for various
 742  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 743 static void
 744 test_space (void)
 745 {
 746   g_assert_false (g_unichar_isspace ('a'));
 747   g_assert_true (g_unichar_isspace (' '));
 748   g_assert_true (g_unichar_isspace ('\t'));
 749   g_assert_true (g_unichar_isspace ('\n'));
 750   g_assert_true (g_unichar_isspace ('\r'));
 751   g_assert_true (g_unichar_isspace ('\f'));
 752   g_assert_false (g_unichar_isspace (0xff41)); /* Unicode fullwidth 'a' */
 753   g_assert_true (g_unichar_isspace (0x202F)); /* Unicode space separator */
 754   g_assert_true (g_unichar_isspace (0x2028)); /* Unicode line separator */
 755   g_assert_true (g_unichar_isspace (0x2029)); /* Unicode paragraph separator */
 756
 757   /*** Testing TYPE() border cases ***/
 758   g_assert_false (g_unichar_isspace (0x3FF5));
 759   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
 760   g_assert_false (g_unichar_isspace (0xFFEFF));
 761   /* U+E0001 Language Tag */
 762   g_assert_false (g_unichar_isspace (0xE0001));
 763   g_assert_false (g_unichar_isspace (G_UNICODE_LAST_CHAR));
 764   g_assert_false (g_unichar_isspace (G_UNICODE_LAST_CHAR + 1));
 765   g_assert_false (g_unichar_isspace (G_UNICODE_LAST_CHAR_PART1));
 766   g_assert_false (g_unichar_isspace (G_UNICODE_LAST_CHAR_PART1 + 1));
 767 }
 768
 769 /* Test that g_unichar_isalnum() returns the correct value for various
 770  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 771 static void
 772 test_alnum (void)
 773 {
 774   g_assert_false (g_unichar_isalnum (' '));
 775   g_assert_true (g_unichar_isalnum ('a'));
 776   g_assert_true (g_unichar_isalnum ('z'));
 777   g_assert_true (g_unichar_isalnum ('0'));
 778   g_assert_true (g_unichar_isalnum ('9'));
 779   g_assert_true (g_unichar_isalnum ('A'));
 780   g_assert_true (g_unichar_isalnum ('Z'));
 781   g_assert_false (g_unichar_isalnum ('-'));
 782   g_assert_false (g_unichar_isalnum ('*'));
 783   g_assert_true (g_unichar_isalnum (0xFF21));  /* Unichar fullwidth 'A' */
 784   g_assert_true (g_unichar_isalnum (0xFF3A));  /* Unichar fullwidth 'Z' */
 785   g_assert_true (g_unichar_isalnum (0xFF41));  /* Unichar fullwidth 'a' */
 786   g_assert_true (g_unichar_isalnum (0xFF5A));  /* Unichar fullwidth 'z' */
 787   g_assert_true (g_unichar_isalnum (0xFF10));  /* Unichar fullwidth '0' */
 788   g_assert_true (g_unichar_isalnum (0xFF19));  /* Unichar fullwidth '9' */
 789   g_assert_false (g_unichar_isalnum (0xFF0A)); /* Unichar fullwidth '*' */
 790
 791   /*** Testing TYPE() border cases ***/
 792   g_assert_true (g_unichar_isalnum (0x3FF5));
 793   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
 794   g_assert_false (g_unichar_isalnum (0xFFEFF));
 795   /* U+E0001 Language Tag */
 796   g_assert_false (g_unichar_isalnum (0xE0001));
 797   g_assert_false (g_unichar_isalnum (G_UNICODE_LAST_CHAR));
 798   g_assert_false (g_unichar_isalnum (G_UNICODE_LAST_CHAR + 1));
 799   g_assert_false (g_unichar_isalnum (G_UNICODE_LAST_CHAR_PART1));
 800   g_assert_false (g_unichar_isalnum (G_UNICODE_LAST_CHAR_PART1 + 1));
 801 }
 802
 803 /* Test that g_unichar_isalpha() returns the correct value for various
 804  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 805 static void
 806 test_alpha (void)
 807 {
 808   g_assert_false (g_unichar_isalpha (' '));
 809   g_assert_true (g_unichar_isalpha ('a'));
 810   g_assert_true (g_unichar_isalpha ('z'));
 811   g_assert_false (g_unichar_isalpha ('0'));
 812   g_assert_false (g_unichar_isalpha ('9'));
 813   g_assert_true (g_unichar_isalpha ('A'));
 814   g_assert_true (g_unichar_isalpha ('Z'));
 815   g_assert_false (g_unichar_isalpha ('-'));
 816   g_assert_false (g_unichar_isalpha ('*'));
 817   g_assert_true (g_unichar_isalpha (0xFF21));  /* Unichar fullwidth 'A' */
 818   g_assert_true (g_unichar_isalpha (0xFF3A));  /* Unichar fullwidth 'Z' */
 819   g_assert_true (g_unichar_isalpha (0xFF41));  /* Unichar fullwidth 'a' */
 820   g_assert_true (g_unichar_isalpha (0xFF5A));  /* Unichar fullwidth 'z' */
 821   g_assert_false (g_unichar_isalpha (0xFF10)); /* Unichar fullwidth '0' */
 822   g_assert_false (g_unichar_isalpha (0xFF19)); /* Unichar fullwidth '9' */
 823   g_assert_false (g_unichar_isalpha (0xFF0A)); /* Unichar fullwidth '*' */
 824
 825   /*** Testing TYPE() border cases ***/
 826   g_assert_true (g_unichar_isalpha (0x3FF5));
 827   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
 828   g_assert_false (g_unichar_isalpha (0xFFEFF));
 829   /* U+E0001 Language Tag */
 830   g_assert_false (g_unichar_isalpha (0xE0001));
 831   g_assert_false (g_unichar_isalpha (G_UNICODE_LAST_CHAR));
 832   g_assert_false (g_unichar_isalpha (G_UNICODE_LAST_CHAR + 1));
 833   g_assert_false (g_unichar_isalpha (G_UNICODE_LAST_CHAR_PART1));
 834   g_assert_false (g_unichar_isalpha (G_UNICODE_LAST_CHAR_PART1 + 1));
 835 }
 836
 837 /* Test that g_unichar_isdigit() returns the correct value for various
 838  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 839 static void
 840 test_digit (void)
 841 {
 842   g_assert_false (g_unichar_isdigit (' '));
 843   g_assert_false (g_unichar_isdigit ('a'));
 844   g_assert_true (g_unichar_isdigit ('0'));
 845   g_assert_true (g_unichar_isdigit ('9'));
 846   g_assert_false (g_unichar_isdigit ('A'));
 847   g_assert_false (g_unichar_isdigit ('-'));
 848   g_assert_false (g_unichar_isdigit ('*'));
 849   g_assert_false (g_unichar_isdigit (0xFF21)); /* Unichar fullwidth 'A' */
 850   g_assert_false (g_unichar_isdigit (0xFF3A)); /* Unichar fullwidth 'Z' */
 851   g_assert_false (g_unichar_isdigit (0xFF41)); /* Unichar fullwidth 'a' */
 852   g_assert_false (g_unichar_isdigit (0xFF5A)); /* Unichar fullwidth 'z' */
 853   g_assert_true (g_unichar_isdigit (0xFF10));  /* Unichar fullwidth '0' */
 854   g_assert_true (g_unichar_isdigit (0xFF19));  /* Unichar fullwidth '9' */
 855   g_assert_false (g_unichar_isdigit (0xFF0A)); /* Unichar fullwidth '*' */
 856
 857   /*** Testing TYPE() border cases ***/
 858   g_assert_false (g_unichar_isdigit (0x3FF5));
 859   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
 860   g_assert_false (g_unichar_isdigit (0xFFEFF));
 861   /* U+E0001 Language Tag */
 862   g_assert_false (g_unichar_isdigit (0xE0001));
 863   g_assert_false (g_unichar_isdigit (G_UNICODE_LAST_CHAR));
 864   g_assert_false (g_unichar_isdigit (G_UNICODE_LAST_CHAR + 1));
 865   g_assert_false (g_unichar_isdigit (G_UNICODE_LAST_CHAR_PART1));
 866   g_assert_false (g_unichar_isdigit (G_UNICODE_LAST_CHAR_PART1 + 1));
 867 }
 868
 869 /* Test that g_unichar_digit_value() returns the correct value for various
 870  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 871 static void
 872 test_digit_value (void)
 873 {
 874   g_assert_cmpint (g_unichar_digit_value (' '), ==, -1);
 875   g_assert_cmpint (g_unichar_digit_value ('a'), ==, -1);
 876   g_assert_cmpint (g_unichar_digit_value ('0'), ==, 0);
 877   g_assert_cmpint (g_unichar_digit_value ('9'), ==, 9);
 878   g_assert_cmpint (g_unichar_digit_value ('A'), ==, -1);
 879   g_assert_cmpint (g_unichar_digit_value ('-'), ==, -1);
 880   g_assert_cmpint (g_unichar_digit_value (0xFF21), ==, -1); /* Unichar 'A' */
 881   g_assert_cmpint (g_unichar_digit_value (0xFF3A), ==, -1); /* Unichar 'Z' */
 882   g_assert_cmpint (g_unichar_digit_value (0xFF41), ==, -1); /* Unichar 'a' */
 883   g_assert_cmpint (g_unichar_digit_value (0xFF5A), ==, -1); /* Unichar 'z' */
 884   g_assert_cmpint (g_unichar_digit_value (0xFF10), ==, 0);  /* Unichar '0' */
 885   g_assert_cmpint (g_unichar_digit_value (0xFF19), ==, 9);  /* Unichar '9' */
 886   g_assert_cmpint (g_unichar_digit_value (0xFF0A), ==, -1); /* Unichar '*' */
 887
 888   /*** Testing TYPE() border cases ***/
 889   g_assert_cmpint (g_unichar_digit_value (0x3FF5), ==, -1);
 890    /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
 891   g_assert_cmpint (g_unichar_digit_value (0xFFEFF), ==, -1);
 892   /* U+E0001 Language Tag */
 893   g_assert_cmpint (g_unichar_digit_value (0xE0001), ==, -1);
 894   g_assert_cmpint (g_unichar_digit_value (G_UNICODE_LAST_CHAR), ==, -1);
 895   g_assert_cmpint (g_unichar_digit_value (G_UNICODE_LAST_CHAR + 1), ==, -1);
 896   g_assert_cmpint (g_unichar_digit_value (G_UNICODE_LAST_CHAR_PART1), ==, -1);
 897   g_assert_cmpint (g_unichar_digit_value (G_UNICODE_LAST_CHAR_PART1 + 1), ==, -1);
 898 }
 899
 900 /* Test that g_unichar_isxdigit() returns the correct value for various
 901  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 902 static void
 903 test_xdigit (void)
 904 {
 905   g_assert_false (g_unichar_isxdigit (' '));
 906   g_assert_true (g_unichar_isxdigit ('a'));
 907   g_assert_true (g_unichar_isxdigit ('f'));
 908   g_assert_false (g_unichar_isxdigit ('g'));
 909   g_assert_false (g_unichar_isxdigit ('z'));
 910   g_assert_true (g_unichar_isxdigit ('0'));
 911   g_assert_true (g_unichar_isxdigit ('9'));
 912   g_assert_true (g_unichar_isxdigit ('A'));
 913   g_assert_true (g_unichar_isxdigit ('F'));
 914   g_assert_false (g_unichar_isxdigit ('G'));
 915   g_assert_false (g_unichar_isxdigit ('Z'));
 916   g_assert_false (g_unichar_isxdigit ('-'));
 917   g_assert_false (g_unichar_isxdigit ('*'));
 918   g_assert_true (g_unichar_isxdigit (0xFF21));  /* Unichar fullwidth 'A' */
 919   g_assert_true (g_unichar_isxdigit (0xFF26));  /* Unichar fullwidth 'F' */
 920   g_assert_false (g_unichar_isxdigit (0xFF27)); /* Unichar fullwidth 'G' */
 921   g_assert_false (g_unichar_isxdigit (0xFF3A)); /* Unichar fullwidth 'Z' */
 922   g_assert_true (g_unichar_isxdigit (0xFF41));  /* Unichar fullwidth 'a' */
 923   g_assert_true (g_unichar_isxdigit (0xFF46));  /* Unichar fullwidth 'f' */
 924   g_assert_false (g_unichar_isxdigit (0xFF47)); /* Unichar fullwidth 'g' */
 925   g_assert_false (g_unichar_isxdigit (0xFF5A)); /* Unichar fullwidth 'z' */
 926   g_assert_true (g_unichar_isxdigit (0xFF10));  /* Unichar fullwidth '0' */
 927   g_assert_true (g_unichar_isxdigit (0xFF19));  /* Unichar fullwidth '9' */
 928   g_assert_false (g_unichar_isxdigit (0xFF0A)); /* Unichar fullwidth '*' */
 929
 930   /*** Testing TYPE() border cases ***/
 931   g_assert_false (g_unichar_isxdigit (0x3FF5));
 932   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
 933   g_assert_false (g_unichar_isxdigit (0xFFEFF));
 934   /* U+E0001 Language Tag */
 935   g_assert_false (g_unichar_isxdigit (0xE0001));
 936   g_assert_false (g_unichar_isxdigit (G_UNICODE_LAST_CHAR));
 937   g_assert_false (g_unichar_isxdigit (G_UNICODE_LAST_CHAR + 1));
 938   g_assert_false (g_unichar_isxdigit (G_UNICODE_LAST_CHAR_PART1));
 939   g_assert_false (g_unichar_isxdigit (G_UNICODE_LAST_CHAR_PART1 + 1));
 940 }
 941
 942 /* Test that g_unichar_xdigit_value() returns the correct value for various
 943  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 944 static void
 945 test_xdigit_value (void)
 946 {
 947   g_assert_cmpint (g_unichar_xdigit_value (' '), ==, -1);
 948   g_assert_cmpint (g_unichar_xdigit_value ('a'), ==, 10);
 949   g_assert_cmpint (g_unichar_xdigit_value ('f'), ==, 15);
 950   g_assert_cmpint (g_unichar_xdigit_value ('g'), ==, -1);
 951   g_assert_cmpint (g_unichar_xdigit_value ('0'), ==, 0);
 952   g_assert_cmpint (g_unichar_xdigit_value ('9'), ==, 9);
 953   g_assert_cmpint (g_unichar_xdigit_value ('A'), ==, 10);
 954   g_assert_cmpint (g_unichar_xdigit_value ('F'), ==, 15);
 955   g_assert_cmpint (g_unichar_xdigit_value ('G'), ==, -1);
 956   g_assert_cmpint (g_unichar_xdigit_value ('-'), ==, -1);
 957   g_assert_cmpint (g_unichar_xdigit_value (0xFF21), ==, 10); /* Unichar 'A' */
 958   g_assert_cmpint (g_unichar_xdigit_value (0xFF26), ==, 15); /* Unichar 'F' */
 959   g_assert_cmpint (g_unichar_xdigit_value (0xFF27), ==, -1); /* Unichar 'G' */
 960   g_assert_cmpint (g_unichar_xdigit_value (0xFF3A), ==, -1); /* Unichar 'Z' */
 961   g_assert_cmpint (g_unichar_xdigit_value (0xFF41), ==, 10); /* Unichar 'a' */
 962   g_assert_cmpint (g_unichar_xdigit_value (0xFF46), ==, 15); /* Unichar 'f' */
 963   g_assert_cmpint (g_unichar_xdigit_value (0xFF47), ==, -1); /* Unichar 'g' */
 964   g_assert_cmpint (g_unichar_xdigit_value (0xFF5A), ==, -1); /* Unichar 'z' */
 965   g_assert_cmpint (g_unichar_xdigit_value (0xFF10), ==, 0);  /* Unichar '0' */
 966   g_assert_cmpint (g_unichar_xdigit_value (0xFF19), ==, 9);  /* Unichar '9' */
 967   g_assert_cmpint (g_unichar_xdigit_value (0xFF0A), ==, -1); /* Unichar '*' */
 968
 969   /*** Testing TYPE() border cases ***/
 970   g_assert_cmpint (g_unichar_xdigit_value (0x3FF5), ==, -1);
 971    /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
 972   g_assert_cmpint (g_unichar_xdigit_value (0xFFEFF), ==, -1);
 973   /* U+E0001 Language Tag */
 974   g_assert_cmpint (g_unichar_xdigit_value (0xE0001), ==, -1);
 975   g_assert_cmpint (g_unichar_xdigit_value (G_UNICODE_LAST_CHAR), ==, -1);
 976   g_assert_cmpint (g_unichar_xdigit_value (G_UNICODE_LAST_CHAR + 1), ==, -1);
 977   g_assert_cmpint (g_unichar_xdigit_value (G_UNICODE_LAST_CHAR_PART1), ==, -1);
 978   g_assert_cmpint (g_unichar_xdigit_value (G_UNICODE_LAST_CHAR_PART1 + 1), ==, -1);
 979 }
 980
 981 /* Test that g_unichar_ispunct() returns the correct value for various
 982  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
 983 static void
 984 test_punctuation (void)
 985 {
 986   g_assert_false (g_unichar_ispunct (' '));
 987   g_assert_false (g_unichar_ispunct ('a'));
 988   g_assert_true (g_unichar_ispunct ('.'));
 989   g_assert_true (g_unichar_ispunct (','));
 990   g_assert_true (g_unichar_ispunct (';'));
 991   g_assert_true (g_unichar_ispunct (':'));
 992   g_assert_true (g_unichar_ispunct ('-'));
 993
 994   g_assert_false (g_unichar_ispunct (0xFF21)); /* Unichar fullwidth 'A' */
 995   g_assert_true (g_unichar_ispunct (0x005F));  /* Unichar fullwidth '.' */
 996   g_assert_true (g_unichar_ispunct (0x058A));  /* Unichar fullwidth '-' */
 997
 998   /*** Testing TYPE() border cases ***/
 999   g_assert_false (g_unichar_ispunct (0x3FF5));
1000   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1001   g_assert_false (g_unichar_ispunct (0xFFEFF));
1002   /* U+E0001 Language Tag */
1003   g_assert_false (g_unichar_ispunct (0xE0001));
1004   g_assert_false (g_unichar_ispunct (G_UNICODE_LAST_CHAR));
1005   g_assert_false (g_unichar_ispunct (G_UNICODE_LAST_CHAR + 1));
1006   g_assert_false (g_unichar_ispunct (G_UNICODE_LAST_CHAR_PART1));
1007   g_assert_false (g_unichar_ispunct (G_UNICODE_LAST_CHAR_PART1 + 1));
1008 }
1009
1010 /* Test that g_unichar_iscntrl() returns the correct value for various
1011  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1012 static void
1013 test_cntrl (void)
1014 {
1015   g_assert_true (g_unichar_iscntrl (0x08));
1016   g_assert_false (g_unichar_iscntrl ('a'));
1017   g_assert_true (g_unichar_iscntrl (0x007F)); /* Unichar fullwidth <del> */
1018   g_assert_true (g_unichar_iscntrl (0x009F)); /* Unichar fullwidth control */
1019
1020   /*** Testing TYPE() border cases ***/
1021   g_assert_false (g_unichar_iscntrl (0x3FF5));
1022   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1023   g_assert_false (g_unichar_iscntrl (0xFFEFF));
1024   /* U+E0001 Language Tag */
1025   g_assert_false (g_unichar_iscntrl (0xE0001));
1026   g_assert_false (g_unichar_iscntrl (G_UNICODE_LAST_CHAR));
1027   g_assert_false (g_unichar_iscntrl (G_UNICODE_LAST_CHAR + 1));
1028   g_assert_false (g_unichar_iscntrl (G_UNICODE_LAST_CHAR_PART1));
1029   g_assert_false (g_unichar_iscntrl (G_UNICODE_LAST_CHAR_PART1 + 1));
1030 }
1031
1032 /* Test that g_unichar_isgraph() returns the correct value for various
1033  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1034 static void
1035 test_graph (void)
1036 {
1037   g_assert_false (g_unichar_isgraph (0x08));
1038   g_assert_false (g_unichar_isgraph (' '));
1039   g_assert_true (g_unichar_isgraph ('a'));
1040   g_assert_true (g_unichar_isgraph ('0'));
1041   g_assert_true (g_unichar_isgraph ('9'));
1042   g_assert_true (g_unichar_isgraph ('A'));
1043   g_assert_true (g_unichar_isgraph ('-'));
1044   g_assert_true (g_unichar_isgraph ('*'));
1045   g_assert_true (g_unichar_isgraph (0xFF21));  /* Unichar fullwidth 'A' */
1046   g_assert_true (g_unichar_isgraph (0xFF3A));  /* Unichar fullwidth 'Z' */
1047   g_assert_true (g_unichar_isgraph (0xFF41));  /* Unichar fullwidth 'a' */
1048   g_assert_true (g_unichar_isgraph (0xFF5A));  /* Unichar fullwidth 'z' */
1049   g_assert_true (g_unichar_isgraph (0xFF10));  /* Unichar fullwidth '0' */
1050   g_assert_true (g_unichar_isgraph (0xFF19));  /* Unichar fullwidth '9' */
1051   g_assert_true (g_unichar_isgraph (0xFF0A));  /* Unichar fullwidth '*' */
1052   g_assert_false (g_unichar_isgraph (0x007F)); /* Unichar fullwidth <del> */
1053   g_assert_false (g_unichar_isgraph (0x009F)); /* Unichar fullwidth control */
1054
1055   /*** Testing TYPE() border cases ***/
1056   g_assert_true (g_unichar_isgraph (0x3FF5));
1057   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1058   g_assert_true (g_unichar_isgraph (0xFFEFF));
1059   /* U+E0001 Language Tag */
1060   g_assert_false (g_unichar_isgraph (0xE0001));
1061   g_assert_false (g_unichar_isgraph (G_UNICODE_LAST_CHAR));
1062   g_assert_false (g_unichar_isgraph (G_UNICODE_LAST_CHAR + 1));
1063   g_assert_false (g_unichar_isgraph (G_UNICODE_LAST_CHAR_PART1));
1064   g_assert_false (g_unichar_isgraph (G_UNICODE_LAST_CHAR_PART1 + 1));
1065 }
1066
1067 /* Test that g_unichar_iszerowidth() returns the correct value for various
1068  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1069 static void
1070 test_zerowidth (void)
1071 {
1072   g_assert_false (g_unichar_iszerowidth (0x00AD));
1073   g_assert_false (g_unichar_iszerowidth (0x115F));
1074   g_assert_true (g_unichar_iszerowidth (0x1160));
1075   g_assert_true (g_unichar_iszerowidth (0x11AA));
1076   g_assert_true (g_unichar_iszerowidth (0x11FF));
1077   g_assert_false (g_unichar_iszerowidth (0x1200));
1078   g_assert_false (g_unichar_iszerowidth (0x200A));
1079   g_assert_true (g_unichar_iszerowidth (0x200B));
1080   g_assert_true (g_unichar_iszerowidth (0x200C));
1081   g_assert_true (g_unichar_iszerowidth (0x591));
1082
1083   /*** Testing TYPE() border cases ***/
1084   g_assert_false (g_unichar_iszerowidth (0x3FF5));
1085   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1086   g_assert_false (g_unichar_iszerowidth (0xFFEFF));
1087   /* U+E0001 Language Tag */
1088   g_assert_true (g_unichar_iszerowidth (0xE0001));
1089   g_assert_false (g_unichar_iszerowidth (G_UNICODE_LAST_CHAR));
1090   g_assert_false (g_unichar_iszerowidth (G_UNICODE_LAST_CHAR + 1));
1091   g_assert_false (g_unichar_iszerowidth (G_UNICODE_LAST_CHAR_PART1));
1092   g_assert_false (g_unichar_iszerowidth (G_UNICODE_LAST_CHAR_PART1 + 1));
1093
1094   /* Hangul Jamo Extended-B block, containing jungseong and jongseong for
1095    * Old Korean */
1096   g_assert_true (g_unichar_iszerowidth (0xD7B0));
1097   g_assert_true (g_unichar_iszerowidth (0xD7FB));
1098 }
1099
1100 /* Test that g_unichar_istitle() returns the correct value for various
1101  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1102 static void
1103 test_title (void)
1104 {
1105   g_assert_true (g_unichar_istitle (0x01c5));
1106   g_assert_true (g_unichar_istitle (0x1f88));
1107   g_assert_true (g_unichar_istitle (0x1fcc));
1108   g_assert_false (g_unichar_istitle ('a'));
1109   g_assert_false (g_unichar_istitle ('A'));
1110   g_assert_false (g_unichar_istitle (';'));
1111
1112   /*** Testing TYPE() border cases ***/
1113   g_assert_false (g_unichar_istitle (0x3FF5));
1114   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1115   g_assert_false (g_unichar_istitle (0xFFEFF));
1116   /* U+E0001 Language Tag */
1117   g_assert_false (g_unichar_istitle (0xE0001));
1118   g_assert_false (g_unichar_istitle (G_UNICODE_LAST_CHAR));
1119   g_assert_false (g_unichar_istitle (G_UNICODE_LAST_CHAR + 1));
1120   g_assert_false (g_unichar_istitle (G_UNICODE_LAST_CHAR_PART1));
1121   g_assert_false (g_unichar_istitle (G_UNICODE_LAST_CHAR_PART1 + 1));
1122
1123   g_assert_cmphex (g_unichar_totitle (0x0000), ==, 0x0000);
1124   g_assert_cmphex (g_unichar_totitle (0x01c6), ==, 0x01c5);
1125   g_assert_cmphex (g_unichar_totitle (0x01c4), ==, 0x01c5);
1126   g_assert_cmphex (g_unichar_totitle (0x01c5), ==, 0x01c5);
1127   g_assert_cmphex (g_unichar_totitle (0x1f80), ==, 0x1f88);
1128   g_assert_cmphex (g_unichar_totitle (0x1f88), ==, 0x1f88);
1129   g_assert_cmphex (g_unichar_totitle ('a'), ==, 'A');
1130   g_assert_cmphex (g_unichar_totitle ('A'), ==, 'A');
1131
1132   /*** Testing TYPE() border cases ***/
1133   g_assert_cmphex (g_unichar_totitle (0x3FF5), ==, 0x3FF5);
1134   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1135   g_assert_cmphex (g_unichar_totitle (0xFFEFF), ==, 0xFFEFF);
1136   g_assert_cmphex (g_unichar_totitle (0xDFFFF), ==, 0xDFFFF);
1137   /* U+E0001 Language Tag */
1138   g_assert_cmphex (g_unichar_totitle (0xE0001), ==, 0xE0001);
1139   g_assert_cmphex (g_unichar_totitle (G_UNICODE_LAST_CHAR), ==,
1140                    G_UNICODE_LAST_CHAR);
1141   g_assert_cmphex (g_unichar_totitle (G_UNICODE_LAST_CHAR + 1), ==,
1142                    (G_UNICODE_LAST_CHAR + 1));
1143   g_assert_cmphex (g_unichar_totitle (G_UNICODE_LAST_CHAR_PART1), ==,
1144                    (G_UNICODE_LAST_CHAR_PART1));
1145   g_assert_cmphex (g_unichar_totitle (G_UNICODE_LAST_CHAR_PART1 + 1), ==,
1146                    (G_UNICODE_LAST_CHAR_PART1 + 1));
1147 }
1148
1149 /* Test that g_unichar_isupper() returns the correct value for various
1150  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1151 static void
1152 test_upper (void)
1153 {
1154   g_assert_false (g_unichar_isupper (' '));
1155   g_assert_false (g_unichar_isupper ('0'));
1156   g_assert_false (g_unichar_isupper ('a'));
1157   g_assert_true (g_unichar_isupper ('A'));
1158   g_assert_false (g_unichar_isupper (0xff41)); /* Unicode fullwidth 'a' */
1159   g_assert_true (g_unichar_isupper (0xff21)); /* Unicode fullwidth 'A' */
1160
1161   /*** Testing TYPE() border cases ***/
1162   g_assert_false (g_unichar_isupper (0x3FF5));
1163   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1164   g_assert_false (g_unichar_isupper (0xFFEFF));
1165   /* U+E0001 Language Tag */
1166   g_assert_false (g_unichar_isupper (0xE0001));
1167   g_assert_false (g_unichar_isupper (G_UNICODE_LAST_CHAR));
1168   g_assert_false (g_unichar_isupper (G_UNICODE_LAST_CHAR + 1));
1169   g_assert_false (g_unichar_isupper (G_UNICODE_LAST_CHAR_PART1));
1170   g_assert_false (g_unichar_isupper (G_UNICODE_LAST_CHAR_PART1 + 1));
1171 }
1172
1173 /* Test that g_unichar_islower() returns the correct value for various
1174  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1175 static void
1176 test_lower (void)
1177 {
1178   g_assert_false (g_unichar_islower (' '));
1179   g_assert_false (g_unichar_islower ('0'));
1180   g_assert_true (g_unichar_islower ('a'));
1181   g_assert_false (g_unichar_islower ('A'));
1182   g_assert_true (g_unichar_islower (0xff41)); /* Unicode fullwidth 'a' */
1183   g_assert_false (g_unichar_islower (0xff21)); /* Unicode fullwidth 'A' */
1184
1185   /*** Testing TYPE() border cases ***/
1186   g_assert_false (g_unichar_islower (0x3FF5));
1187   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1188   g_assert_false (g_unichar_islower (0xFFEFF));
1189   /* U+E0001 Language Tag */
1190   g_assert_false (g_unichar_islower (0xE0001));
1191   g_assert_false (g_unichar_islower (G_UNICODE_LAST_CHAR));
1192   g_assert_false (g_unichar_islower (G_UNICODE_LAST_CHAR + 1));
1193   g_assert_false (g_unichar_islower (G_UNICODE_LAST_CHAR_PART1));
1194   g_assert_false (g_unichar_islower (G_UNICODE_LAST_CHAR_PART1 + 1));
1195 }
1196
1197 /* Test that g_unichar_isprint() returns the correct value for various
1198  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1199 static void
1200 test_print (void)
1201 {
1202   g_assert_true (g_unichar_isprint (' '));
1203   g_assert_true (g_unichar_isprint ('0'));
1204   g_assert_true (g_unichar_isprint ('a'));
1205   g_assert_true (g_unichar_isprint ('A'));
1206   g_assert_true (g_unichar_isprint (0xff41)); /* Unicode fullwidth 'a' */
1207   g_assert_true (g_unichar_isprint (0xff21)); /* Unicode fullwidth 'A' */
1208
1209   /*** Testing TYPE() border cases ***/
1210   g_assert_true (g_unichar_isprint (0x3FF5));
1211   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1212   g_assert_true (g_unichar_isprint (0xFFEFF));
1213   /* U+E0001 Language Tag */
1214   g_assert_false (g_unichar_isprint (0xE0001));
1215   g_assert_false (g_unichar_isprint (G_UNICODE_LAST_CHAR));
1216   g_assert_false (g_unichar_isprint (G_UNICODE_LAST_CHAR + 1));
1217   g_assert_false (g_unichar_isprint (G_UNICODE_LAST_CHAR_PART1));
1218   g_assert_false (g_unichar_isprint (G_UNICODE_LAST_CHAR_PART1 + 1));
1219 }
1220
1221 /* Test that g_unichar_toupper() and g_unichar_tolower() return the
1222  * correct values for various ASCII and Unicode alphabetic, numeric,
1223  * and other, codepoints. */
1224 static void
1225 test_cases (void)
1226 {
1227   g_assert_cmphex (g_unichar_toupper (0x0), ==, 0x0);
1228   g_assert_cmphex (g_unichar_tolower (0x0), ==, 0x0);
1229   g_assert_cmphex (g_unichar_toupper ('a'), ==, 'A');
1230   g_assert_cmphex (g_unichar_toupper ('A'), ==, 'A');
1231   /* Unicode fullwidth 'a' == 'A' */
1232   g_assert_cmphex (g_unichar_toupper (0xff41), ==, 0xff21);
1233   /* Unicode fullwidth 'A' == 'A' */
1234   g_assert_cmphex (g_unichar_toupper (0xff21), ==, 0xff21);
1235   g_assert_cmphex (g_unichar_toupper (0x01C5), ==, 0x01C4);
1236   g_assert_cmphex (g_unichar_toupper (0x01C6), ==, 0x01C4);
1237   g_assert_cmphex (g_unichar_tolower ('A'), ==, 'a');
1238   g_assert_cmphex (g_unichar_tolower ('a'), ==, 'a');
1239   /* Unicode fullwidth 'A' == 'a' */
1240   g_assert_cmphex (g_unichar_tolower (0xff21), ==, 0xff41);
1241   /* Unicode fullwidth 'a' == 'a' */
1242   g_assert_cmphex (g_unichar_tolower (0xff41), ==, 0xff41);
1243   g_assert_cmphex (g_unichar_tolower (0x01C4), ==, 0x01C6);
1244   g_assert_cmphex (g_unichar_tolower (0x01C5), ==, 0x01C6);
1245   g_assert_cmphex (g_unichar_tolower (0x1F8A), ==, 0x1F82);
1246   g_assert_cmphex (g_unichar_totitle (0x1F8A), ==, 0x1F8A);
1247   g_assert_cmphex (g_unichar_toupper (0x1F8A), ==, 0x1F8A);
1248   g_assert_cmphex (g_unichar_tolower (0x1FB2), ==, 0x1FB2);
1249   g_assert_cmphex (g_unichar_toupper (0x1FB2), ==, 0x1FB2);
1250
1251   /* U+130 is a special case, it's a 'I' with a dot on top */
1252   g_assert_cmphex (g_unichar_tolower (0x130), ==, 0x69);
1253
1254   /* Testing ATTTABLE() border cases */
1255   g_assert_cmphex (g_unichar_toupper (0x1D6FE), ==, 0x1D6FE);
1256
1257   /*** Testing TYPE() border cases ***/
1258   g_assert_cmphex (g_unichar_toupper (0x3FF5), ==, 0x3FF5);
1259   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1260   g_assert_cmphex (g_unichar_toupper (0xFFEFF), ==, 0xFFEFF);
1261   g_assert_cmphex (g_unichar_toupper (0xDFFFF), ==, 0xDFFFF);
1262   /* U+E0001 Language Tag */
1263   g_assert_cmphex (g_unichar_toupper (0xE0001), ==, 0xE0001);
1264   g_assert_cmphex (g_unichar_toupper (G_UNICODE_LAST_CHAR), ==,
1265                    G_UNICODE_LAST_CHAR);
1266   g_assert_cmphex (g_unichar_toupper (G_UNICODE_LAST_CHAR + 1), ==,
1267                    (G_UNICODE_LAST_CHAR + 1));
1268   g_assert_cmphex (g_unichar_toupper (G_UNICODE_LAST_CHAR_PART1), ==,
1269                    (G_UNICODE_LAST_CHAR_PART1));
1270   g_assert_cmphex (g_unichar_toupper (G_UNICODE_LAST_CHAR_PART1 + 1), ==,
1271                    (G_UNICODE_LAST_CHAR_PART1 + 1));
1272
1273   /* Testing ATTTABLE() border cases */
1274   g_assert_cmphex (g_unichar_tolower (0x1D6FA), ==, 0x1D6FA);
1275
1276   /*** Testing TYPE() border cases ***/
1277   g_assert_cmphex (g_unichar_tolower (0x3FF5), ==, 0x3FF5);
1278   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1279   g_assert_cmphex (g_unichar_tolower (0xFFEFF), ==, 0xFFEFF);
1280   g_assert_cmphex (g_unichar_tolower (0xDFFFF), ==, 0xDFFFF);
1281   /* U+E0001 Language Tag */
1282   g_assert_cmphex (g_unichar_tolower (0xE0001), ==, 0xE0001);
1283   g_assert_cmphex (g_unichar_tolower (G_UNICODE_LAST_CHAR), ==,
1284                    G_UNICODE_LAST_CHAR);
1285   g_assert_cmphex (g_unichar_tolower (G_UNICODE_LAST_CHAR + 1), ==,
1286                    (G_UNICODE_LAST_CHAR + 1));
1287   g_assert_cmphex (g_unichar_tolower (G_UNICODE_LAST_CHAR_PART1), ==,
1288                    G_UNICODE_LAST_CHAR_PART1);
1289   g_assert_cmphex (g_unichar_tolower (G_UNICODE_LAST_CHAR_PART1 + 1), ==,
1290                    (G_UNICODE_LAST_CHAR_PART1 + 1));
1291 }
1292
1293 /* Test that g_unichar_isdefined() returns the correct value for various
1294  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1295 static void
1296 test_defined (void)
1297 {
1298   g_assert_true (g_unichar_isdefined (0x0903));
1299   g_assert_true (g_unichar_isdefined (0x20DD));
1300   g_assert_true (g_unichar_isdefined (0x20BA));
1301   g_assert_true (g_unichar_isdefined (0xA806));
1302   g_assert_true (g_unichar_isdefined ('a'));
1303   g_assert_false (g_unichar_isdefined (0x10C49));
1304   g_assert_false (g_unichar_isdefined (0x169D));
1305
1306   /*** Testing TYPE() border cases ***/
1307   g_assert_true (g_unichar_isdefined (0x3FF5));
1308   /* U+FFEFF Plane 15 Private Use (needed to be > G_UNICODE_MAX_TABLE_INDEX) */
1309   g_assert_true (g_unichar_isdefined (0xFFEFF));
1310   g_assert_false (g_unichar_isdefined (0xDFFFF));
1311   /* U+E0001 Language Tag */
1312   g_assert_true (g_unichar_isdefined (0xE0001));
1313   g_assert_false (g_unichar_isdefined (G_UNICODE_LAST_CHAR));
1314   g_assert_false (g_unichar_isdefined (G_UNICODE_LAST_CHAR + 1));
1315   g_assert_false (g_unichar_isdefined (G_UNICODE_LAST_CHAR_PART1));
1316   g_assert_false (g_unichar_isdefined (G_UNICODE_LAST_CHAR_PART1 + 1));
1317 }
1318
1319 /* Test that g_unichar_iswide() returns the correct value for various
1320  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1321 static void
1322 test_wide (void)
1323 {
1324   guint i;
1325   struct {
1326     gunichar c;
1327     enum {
1328       NOT_WIDE,
1329       WIDE_CJK,
1330       WIDE
1331     } wide;
1332   } examples[] = {
1333     /* Neutral */
1334     {   0x0000, NOT_WIDE },
1335     {   0x0483, NOT_WIDE },
1336     {   0x0641, NOT_WIDE },
1337     {   0xFFFC, NOT_WIDE },
1338     {  0x10000, NOT_WIDE },
1339     {  0xE0001, NOT_WIDE },
1340     {  0x2FFFE, NOT_WIDE },
1341     {  0x3FFFE, NOT_WIDE },
1342
1343     /* Narrow */
1344     {   0x0020, NOT_WIDE },
1345     {   0x0041, NOT_WIDE },
1346     {   0x27E6, NOT_WIDE },
1347
1348     /* Halfwidth */
1349     {   0x20A9, NOT_WIDE },
1350     {   0xFF61, NOT_WIDE },
1351     {   0xFF69, NOT_WIDE },
1352     {   0xFFEE, NOT_WIDE },
1353
1354     /* Ambiguous */
1355     {   0x00A1, WIDE_CJK },
1356     {   0x00BE, WIDE_CJK },
1357     {   0x02DD, WIDE_CJK },
1358     {   0x2020, WIDE_CJK },
1359     {   0xFFFD, WIDE_CJK },
1360     {   0x00A1, WIDE_CJK },
1361     {  0x1F100, WIDE_CJK },
1362     {  0xE0100, WIDE_CJK },
1363     { 0x100000, WIDE_CJK },
1364     { 0x10FFFD, WIDE_CJK },
1365
1366     /* Fullwidth */
1367     {   0x3000, WIDE },
1368     {   0xFF60, WIDE },
1369
1370     /* Wide */
1371     {   0x2329, WIDE },
1372     {   0x3001, WIDE },
1373     {   0xFE69, WIDE },
1374     {  0x30000, WIDE },
1375     {  0x3FFFD, WIDE },
1376
1377     /* Default Wide blocks */
1378     {   0x4DBF, WIDE },
1379     {   0x9FFF, WIDE },
1380     {   0xFAFF, WIDE },
1381     {  0x2A6DF, WIDE },
1382     {  0x2B73F, WIDE },
1383     {  0x2B81F, WIDE },
1384     {  0x2FA1F, WIDE },
1385
1386     /* Uniode-5.2 character additions */
1387     /* Wide */
1388     {   0x115F, WIDE },
1389
1390     /* Uniode-6.0 character additions */
1391     /* Wide */
1392     {  0x2B740, WIDE },
1393     {  0x1B000, WIDE },
1394
1395     { 0x111111, NOT_WIDE }
1396   };
1397
1398   for (i = 0; i < G_N_ELEMENTS (examples); i++)
1399     {
1400       g_assert_cmpint (g_unichar_iswide (examples[i].c), ==,
1401                        (examples[i].wide == WIDE));
1402       g_assert_cmpint (g_unichar_iswide_cjk (examples[i].c), ==,
1403                        (examples[i].wide != NOT_WIDE));
1404     }
1405 };
1406
1407 /* Test that g_unichar_compose() returns the correct value for various
1408  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1409 static void
1410 test_compose (void)
1411 {
1412   gunichar ch;
1413
1414   /* Not composable */
1415   g_assert_false (g_unichar_compose (0x0041, 0x0042, &ch) && ch == 0);
1416   g_assert_false (g_unichar_compose (0x0041, 0, &ch) && ch == 0);
1417   g_assert_false (g_unichar_compose (0x0066, 0x0069, &ch) && ch == 0);
1418
1419   /* Tricky non-composable */
1420   g_assert_false (g_unichar_compose (0x0308, 0x0301, &ch) && ch == 0); /* !0x0344 */
1421   g_assert_false (g_unichar_compose (0x0F71, 0x0F72, &ch) && ch == 0); /* !0x0F73 */
1422
1423   /* Singletons should not compose */
1424   g_assert_false (g_unichar_compose (0x212B, 0, &ch) && ch == 0);
1425   g_assert_false (g_unichar_compose (0x00C5, 0, &ch) && ch == 0);
1426   g_assert_false (g_unichar_compose (0x2126, 0, &ch) && ch == 0);
1427   g_assert_false (g_unichar_compose (0x03A9, 0, &ch) && ch == 0);
1428
1429   /* Pairs */
1430   g_assert_true (g_unichar_compose (0x0041, 0x030A, &ch) && ch == 0x00C5);
1431   g_assert_true (g_unichar_compose (0x006F, 0x0302, &ch) && ch == 0x00F4);
1432   g_assert_true (g_unichar_compose (0x1E63, 0x0307, &ch) && ch == 0x1E69);
1433   g_assert_true (g_unichar_compose (0x0073, 0x0323, &ch) && ch == 0x1E63);
1434   g_assert_true (g_unichar_compose (0x0064, 0x0307, &ch) && ch == 0x1E0B);
1435   g_assert_true (g_unichar_compose (0x0064, 0x0323, &ch) && ch == 0x1E0D);
1436
1437   /* Hangul */
1438   g_assert_true (g_unichar_compose (0xD4CC, 0x11B6, &ch) && ch == 0xD4DB);
1439   g_assert_true (g_unichar_compose (0x1111, 0x1171, &ch) && ch == 0xD4CC);
1440   g_assert_true (g_unichar_compose (0xCE20, 0x11B8, &ch) && ch == 0xCE31);
1441   g_assert_true (g_unichar_compose (0x110E, 0x1173, &ch) && ch == 0xCE20);
1442 }
1443
1444 /* Test that g_unichar_decompose() returns the correct value for various
1445  * ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1446 static void
1447 test_decompose (void)
1448 {
1449   gunichar a, b;
1450
1451   /* Not decomposable */
1452   g_assert_false (g_unichar_decompose (0x0041, &a, &b) && a == 0x0041 && b == 0);
1453   g_assert_false (g_unichar_decompose (0xFB01, &a, &b) && a == 0xFB01 && b == 0);
1454
1455   /* Singletons */
1456   g_assert_true (g_unichar_decompose (0x212B, &a, &b) && a == 0x00C5 && b == 0);
1457   g_assert_true (g_unichar_decompose (0x2126, &a, &b) && a == 0x03A9 && b == 0);
1458
1459   /* Tricky pairs */
1460   g_assert_true (g_unichar_decompose (0x0344, &a, &b) && a == 0x0308 && b == 0x0301);
1461   g_assert_true (g_unichar_decompose (0x0F73, &a, &b) && a == 0x0F71 && b == 0x0F72);
1462
1463   /* Pairs */
1464   g_assert_true (g_unichar_decompose (0x00C5, &a, &b) && a == 0x0041 && b == 0x030A);
1465   g_assert_true (g_unichar_decompose (0x00F4, &a, &b) && a == 0x006F && b == 0x0302);
1466   g_assert_true (g_unichar_decompose (0x1E69, &a, &b) && a == 0x1E63 && b == 0x0307);
1467   g_assert_true (g_unichar_decompose (0x1E63, &a, &b) && a == 0x0073 && b == 0x0323);
1468   g_assert_true (g_unichar_decompose (0x1E0B, &a, &b) && a == 0x0064 && b == 0x0307);
1469   g_assert_true (g_unichar_decompose (0x1E0D, &a, &b) && a == 0x0064 && b == 0x0323);
1470
1471   /* Hangul */
1472   g_assert_true (g_unichar_decompose (0xD4DB, &a, &b) && a == 0xD4CC && b == 0x11B6);
1473   g_assert_true (g_unichar_decompose (0xD4CC, &a, &b) && a == 0x1111 && b == 0x1171);
1474   g_assert_true (g_unichar_decompose (0xCE31, &a, &b) && a == 0xCE20 && b == 0x11B8);
1475   g_assert_true (g_unichar_decompose (0xCE20, &a, &b) && a == 0x110E && b == 0x1173);
1476 }
1477
1478 /* Test that g_unichar_fully_decompose() returns the correct value for
1479  * various ASCII and Unicode alphabetic, numeric, and other, codepoints. */
1480 static void
1481 test_fully_decompose_canonical (void)
1482 {
1483   gunichar decomp[5];
1484   gsize len;
1485
1486 #define TEST_DECOMP(ch, expected_len, a, b, c, d) \
1487   len = g_unichar_fully_decompose (ch, FALSE, decomp, G_N_ELEMENTS (decomp)); \
1488   g_assert_cmpint (expected_len, ==, len); \
1489   if (expected_len >= 1) g_assert_cmphex (decomp[0], ==, a); \
1490   if (expected_len >= 2) g_assert_cmphex (decomp[1], ==, b); \
1491   if (expected_len >= 3) g_assert_cmphex (decomp[2], ==, c); \
1492   if (expected_len >= 4) g_assert_cmphex (decomp[3], ==, d); \
1493
1494 #define TEST0(ch)               TEST_DECOMP (ch, 1, ch, 0, 0, 0)
1495 #define TEST1(ch, a)            TEST_DECOMP (ch, 1, a, 0, 0, 0)
1496 #define TEST2(ch, a, b)         TEST_DECOMP (ch, 2, a, b, 0, 0)
1497 #define TEST3(ch, a, b, c)      TEST_DECOMP (ch, 3, a, b, c, 0)
1498 #define TEST4(ch, a, b, c, d)   TEST_DECOMP (ch, 4, a, b, c, d)
1499
1500   /* Not decomposable */
1501   TEST0 (0x0041);
1502   TEST0 (0xFB01);
1503
1504   /* Singletons */
1505   TEST2 (0x212B, 0x0041, 0x030A);
1506   TEST1 (0x2126, 0x03A9);
1507
1508   /* Tricky pairs */
1509   TEST2 (0x0344, 0x0308, 0x0301);
1510   TEST2 (0x0F73, 0x0F71, 0x0F72);
1511
1512   /* General */
1513   TEST2 (0x00C5, 0x0041, 0x030A);
1514   TEST2 (0x00F4, 0x006F, 0x0302);
1515   TEST3 (0x1E69, 0x0073, 0x0323, 0x0307);
1516   TEST2 (0x1E63, 0x0073, 0x0323);
1517   TEST2 (0x1E0B, 0x0064, 0x0307);
1518   TEST2 (0x1E0D, 0x0064, 0x0323);
1519
1520   /* Hangul */
1521   TEST3 (0xD4DB, 0x1111, 0x1171, 0x11B6);
1522   TEST2 (0xD4CC, 0x1111, 0x1171);
1523   TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8);
1524   TEST2 (0xCE20, 0x110E, 0x1173);
1525
1526 #undef TEST_DECOMP
1527 }
1528
1529 /* Test that g_unicode_canonical_decomposition() returns the correct
1530  * value for various ASCII and Unicode alphabetic, numeric, and other,
1531  * codepoints. */
1532 static void
1533 test_canonical_decomposition (void)
1534 {
1535   gunichar *decomp;
1536   gsize len;
1537
1538 #define TEST_DECOMP(ch, expected_len, a, b, c, d) \
1539   decomp = g_unicode_canonical_decomposition (ch, &len); \
1540   g_assert_cmpint (expected_len, ==, len); \
1541   if (expected_len >= 1) g_assert_cmphex (decomp[0], ==, a); \
1542   if (expected_len >= 2) g_assert_cmphex (decomp[1], ==, b); \
1543   if (expected_len >= 3) g_assert_cmphex (decomp[2], ==, c); \
1544   if (expected_len >= 4) g_assert_cmphex (decomp[3], ==, d); \
1545   g_free (decomp);
1546
1547 #define TEST0(ch)               TEST_DECOMP (ch, 1, ch, 0, 0, 0)
1548 #define TEST1(ch, a)            TEST_DECOMP (ch, 1, a, 0, 0, 0)
1549 #define TEST2(ch, a, b)         TEST_DECOMP (ch, 2, a, b, 0, 0)
1550 #define TEST3(ch, a, b, c)      TEST_DECOMP (ch, 3, a, b, c, 0)
1551 #define TEST4(ch, a, b, c, d)   TEST_DECOMP (ch, 4, a, b, c, d)
1552
1553   /* Not decomposable */
1554   TEST0 (0x0041);
1555   TEST0 (0xFB01);
1556
1557   /* Singletons */
1558   TEST2 (0x212B, 0x0041, 0x030A);
1559   TEST1 (0x2126, 0x03A9);
1560
1561   /* Tricky pairs */
1562   TEST2 (0x0344, 0x0308, 0x0301);
1563   TEST2 (0x0F73, 0x0F71, 0x0F72);
1564
1565   /* General */
1566   TEST2 (0x00C5, 0x0041, 0x030A);
1567   TEST2 (0x00F4, 0x006F, 0x0302);
1568   TEST3 (0x1E69, 0x0073, 0x0323, 0x0307);
1569   TEST2 (0x1E63, 0x0073, 0x0323);
1570   TEST2 (0x1E0B, 0x0064, 0x0307);
1571   TEST2 (0x1E0D, 0x0064, 0x0323);
1572
1573   /* Hangul */
1574   TEST3 (0xD4DB, 0x1111, 0x1171, 0x11B6);
1575   TEST2 (0xD4CC, 0x1111, 0x1171);
1576   TEST3 (0xCE31, 0x110E, 0x1173, 0x11B8);
1577   TEST2 (0xCE20, 0x110E, 0x1173);
1578
1579 #undef TEST_DECOMP
1580 }
1581
1582 /* Test that g_unichar_decompose() whenever encouttering a char ch
1583  * decomposes into a and b, b itself won't decompose any further. */
1584 static void
1585 test_decompose_tail (void)
1586 {
1587   gunichar ch, a, b, c, d;
1588
1589   /* Test that whenever a char ch decomposes into a and b, b itself
1590    * won't decompose any further. */
1591
1592   for (ch = 0; ch < 0x110000; ch++)
1593     if (g_unichar_decompose (ch, &a, &b))
1594       g_assert_false (g_unichar_decompose (b, &c, &d));
1595     else
1596       {
1597         g_assert_cmpuint (a, ==, ch);
1598         g_assert_cmpuint (b, ==, 0);
1599       }
1600 }
1601
1602 /* Test that all canonical decompositions of g_unichar_fully_decompose()
1603  * are at most 4 in length, and compatibility decompositions are
1604  * at most 18 in length. */
1605 static void
1606 test_fully_decompose_len (void)
1607 {
1608   gunichar ch;
1609
1610   /* Test that all canonical decompositions are at most 4 in length,
1611    * and compatibility decompositions are at most 18 in length.
1612    */
1613
1614   for (ch = 0; ch < 0x110000; ch++) {
1615     g_assert_cmpint (g_unichar_fully_decompose (ch, FALSE, NULL, 0), <=, 4);
1616     g_assert_cmpint (g_unichar_fully_decompose (ch, TRUE,  NULL, 0), <=, 18);
1617   }
1618 }
1619
1620 /* Check various examples from Unicode Annex #15 for NFD and NFC
1621  * normalization.
1622  */
1623 static void
1624 test_normalization (void)
1625 {
1626   const struct {
1627     const char *source;
1628     const char *nfd;
1629     const char *nfc;
1630   } tests[] = {
1631     // Singletons
1632     { "\xe2\x84\xab", "A\xcc\x8a", "Å" }, // U+212B ANGSTROM SIGN
1633     { "\xe2\x84\xa6", "Ω", "Ω" }, // U+2126 OHM SIGN
1634     // Canonical Composites
1635     { "Å", "A\xcc\x8a", "Å" }, // U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE
1636     { "ô", "o\xcc\x82", "ô" }, // U+00F4 LATIN SMALL LETTER O WITH CIRCUMFLEX
1637     // Multiple Combining Marks
1638     { "\xe1\xb9\xa9", "s\xcc\xa3\xcc\x87", "ṩ" }, // U+1E69 LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE
1639     { "\xe1\xb8\x8b\xcc\xa3", "d\xcc\xa3\xcc\x87", "ḍ̇" },
1640     { "q\xcc\x87\xcc\xa3", "q\xcc\xa3\xcc\x87", "q̣̇" },
1641     // Compatibility Composites
1642     { "ﬁ", "ﬁ", "ﬁ" }, // U+FB01 LATIN SMALL LIGATURE FI
1643     { "2\xe2\x81\xb5", "2\xe2\x81\xb5", "2⁵" },
1644     { "\xe1\xba\x9b\xcc\xa3", "\xc5\xbf\xcc\xa3\xcc\x87", "ẛ̣" },
1645
1646     // Tests for behavior with reordered marks
1647     { "s\xcc\x87\xcc\xa3", "s\xcc\xa3\xcc\x87", "ṩ" },
1648     { "α\xcc\x94\xcd\x82", "α\xcc\x94\xcd\x82", "ἇ" },
1649     { "α\xcd\x82\xcc\x94", "α\xcd\x82\xcc\x94", "ᾶ\xcc\x94" },
1650   };
1651   gsize i;
1652
1653   for (i = 0; i < G_N_ELEMENTS (tests); i++)
1654     {
1655       char *nfd, *nfc;
1656
1657       nfd = g_utf8_normalize (tests[i].source, -1, G_NORMALIZE_NFD);
1658       g_assert_cmpstr (nfd, ==, tests[i].nfd);
1659
1660       nfc = g_utf8_normalize (tests[i].nfd, -1, G_NORMALIZE_NFC);
1661       g_assert_cmpstr (nfc, ==, tests[i].nfc);
1662
1663       g_free (nfd);
1664       g_free (nfc);
1665     }
1666 }
1667
1668 static void
1669 test_iso15924 (void)
1670 {
1671   const struct {
1672     GUnicodeScript script;
1673     char four_letter_code[5];
1674   } data[] = {
1675     { G_UNICODE_SCRIPT_COMMON,             "Zyyy" },
1676     { G_UNICODE_SCRIPT_INHERITED,          "Zinh" },
1677     { G_UNICODE_SCRIPT_MATH,               "Zmth" },
1678     { G_UNICODE_SCRIPT_ARABIC,             "Arab" },
1679     { G_UNICODE_SCRIPT_ARMENIAN,           "Armn" },
1680     { G_UNICODE_SCRIPT_BENGALI,            "Beng" },
1681     { G_UNICODE_SCRIPT_BOPOMOFO,           "Bopo" },
1682     { G_UNICODE_SCRIPT_CHEROKEE,           "Cher" },
1683     { G_UNICODE_SCRIPT_COPTIC,             "Copt" },
1684     { G_UNICODE_SCRIPT_CYRILLIC,           "Cyrl" },
1685     { G_UNICODE_SCRIPT_DESERET,            "Dsrt" },
1686     { G_UNICODE_SCRIPT_DEVANAGARI,         "Deva" },
1687     { G_UNICODE_SCRIPT_ETHIOPIC,           "Ethi" },
1688     { G_UNICODE_SCRIPT_GEORGIAN,           "Geor" },
1689     { G_UNICODE_SCRIPT_GOTHIC,             "Goth" },
1690     { G_UNICODE_SCRIPT_GREEK,              "Grek" },
1691     { G_UNICODE_SCRIPT_GUJARATI,           "Gujr" },
1692     { G_UNICODE_SCRIPT_GURMUKHI,           "Guru" },
1693     { G_UNICODE_SCRIPT_HAN,                "Hani" },
1694     { G_UNICODE_SCRIPT_HANGUL,             "Hang" },
1695     { G_UNICODE_SCRIPT_HEBREW,             "Hebr" },
1696     { G_UNICODE_SCRIPT_HIRAGANA,           "Hira" },
1697     { G_UNICODE_SCRIPT_KANNADA,            "Knda" },
1698     { G_UNICODE_SCRIPT_KATAKANA,           "Kana" },
1699     { G_UNICODE_SCRIPT_KHMER,              "Khmr" },
1700     { G_UNICODE_SCRIPT_LAO,                "Laoo" },
1701     { G_UNICODE_SCRIPT_LATIN,              "Latn" },
1702     { G_UNICODE_SCRIPT_MALAYALAM,          "Mlym" },
1703     { G_UNICODE_SCRIPT_MONGOLIAN,          "Mong" },
1704     { G_UNICODE_SCRIPT_MYANMAR,            "Mymr" },
1705     { G_UNICODE_SCRIPT_OGHAM,              "Ogam" },
1706     { G_UNICODE_SCRIPT_OLD_ITALIC,         "Ital" },
1707     { G_UNICODE_SCRIPT_ORIYA,              "Orya" },
1708     { G_UNICODE_SCRIPT_RUNIC,              "Runr" },
1709     { G_UNICODE_SCRIPT_SINHALA,            "Sinh" },
1710     { G_UNICODE_SCRIPT_SYRIAC,             "Syrc" },
1711     { G_UNICODE_SCRIPT_TAMIL,              "Taml" },
1712     { G_UNICODE_SCRIPT_TELUGU,             "Telu" },
1713     { G_UNICODE_SCRIPT_THAANA,             "Thaa" },
1714     { G_UNICODE_SCRIPT_THAI,               "Thai" },
1715     { G_UNICODE_SCRIPT_TIBETAN,            "Tibt" },
1716     { G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL, "Cans" },
1717     { G_UNICODE_SCRIPT_YI,                 "Yiii" },
1718     { G_UNICODE_SCRIPT_TAGALOG,            "Tglg" },
1719     { G_UNICODE_SCRIPT_HANUNOO,            "Hano" },
1720     { G_UNICODE_SCRIPT_BUHID,              "Buhd" },
1721     { G_UNICODE_SCRIPT_TAGBANWA,           "Tagb" },
1722
1723     /* Unicode-4.0 additions */
1724     { G_UNICODE_SCRIPT_BRAILLE,            "Brai" },
1725     { G_UNICODE_SCRIPT_CYPRIOT,            "Cprt" },
1726     { G_UNICODE_SCRIPT_LIMBU,              "Limb" },
1727     { G_UNICODE_SCRIPT_OSMANYA,            "Osma" },
1728     { G_UNICODE_SCRIPT_SHAVIAN,            "Shaw" },
1729     { G_UNICODE_SCRIPT_LINEAR_B,           "Linb" },
1730     { G_UNICODE_SCRIPT_TAI_LE,             "Tale" },
1731     { G_UNICODE_SCRIPT_UGARITIC,           "Ugar" },
1732
1733     /* Unicode-4.1 additions */
1734     { G_UNICODE_SCRIPT_NEW_TAI_LUE,        "Talu" },
1735     { G_UNICODE_SCRIPT_BUGINESE,           "Bugi" },
1736     { G_UNICODE_SCRIPT_GLAGOLITIC,         "Glag" },
1737     { G_UNICODE_SCRIPT_TIFINAGH,           "Tfng" },
1738     { G_UNICODE_SCRIPT_SYLOTI_NAGRI,       "Sylo" },
1739     { G_UNICODE_SCRIPT_OLD_PERSIAN,        "Xpeo" },
1740     { G_UNICODE_SCRIPT_KHAROSHTHI,         "Khar" },
1741
1742     /* Unicode-5.0 additions */
1743     { G_UNICODE_SCRIPT_UNKNOWN,            "Zzzz" },
1744     { G_UNICODE_SCRIPT_BALINESE,           "Bali" },
1745     { G_UNICODE_SCRIPT_CUNEIFORM,          "Xsux" },
1746     { G_UNICODE_SCRIPT_PHOENICIAN,         "Phnx" },
1747     { G_UNICODE_SCRIPT_PHAGS_PA,           "Phag" },
1748     { G_UNICODE_SCRIPT_NKO,                "Nkoo" },
1749
1750     /* Unicode-5.1 additions */
1751     { G_UNICODE_SCRIPT_KAYAH_LI,           "Kali" },
1752     { G_UNICODE_SCRIPT_LEPCHA,             "Lepc" },
1753     { G_UNICODE_SCRIPT_REJANG,             "Rjng" },
1754     { G_UNICODE_SCRIPT_SUNDANESE,          "Sund" },
1755     { G_UNICODE_SCRIPT_SAURASHTRA,         "Saur" },
1756     { G_UNICODE_SCRIPT_CHAM,               "Cham" },
1757     { G_UNICODE_SCRIPT_OL_CHIKI,           "Olck" },
1758     { G_UNICODE_SCRIPT_VAI,                "Vaii" },
1759     { G_UNICODE_SCRIPT_CARIAN,             "Cari" },
1760     { G_UNICODE_SCRIPT_LYCIAN,             "Lyci" },
1761     { G_UNICODE_SCRIPT_LYDIAN,             "Lydi" },
1762
1763     /* Unicode-5.2 additions */
1764     { G_UNICODE_SCRIPT_AVESTAN,                "Avst" },
1765     { G_UNICODE_SCRIPT_BAMUM,                  "Bamu" },
1766     { G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS,   "Egyp" },
1767     { G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC,       "Armi" },
1768     { G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI,  "Phli" },
1769     { G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN, "Prti" },
1770     { G_UNICODE_SCRIPT_JAVANESE,               "Java" },
1771     { G_UNICODE_SCRIPT_KAITHI,                 "Kthi" },
1772     { G_UNICODE_SCRIPT_LISU,                   "Lisu" },
1773     { G_UNICODE_SCRIPT_MEETEI_MAYEK,           "Mtei" },
1774     { G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN,      "Sarb" },
1775     { G_UNICODE_SCRIPT_OLD_TURKIC,             "Orkh" },
1776     { G_UNICODE_SCRIPT_SAMARITAN,              "Samr" },
1777     { G_UNICODE_SCRIPT_TAI_THAM,               "Lana" },
1778     { G_UNICODE_SCRIPT_TAI_VIET,               "Tavt" },
1779
1780     /* Unicode-6.0 additions */
1781     { G_UNICODE_SCRIPT_BATAK,                  "Batk" },
1782     { G_UNICODE_SCRIPT_BRAHMI,                 "Brah" },
1783     { G_UNICODE_SCRIPT_MANDAIC,                "Mand" },
1784
1785     /* Unicode-6.1 additions */
1786     { G_UNICODE_SCRIPT_CHAKMA,                 "Cakm" },
1787     { G_UNICODE_SCRIPT_MEROITIC_CURSIVE,       "Merc" },
1788     { G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS,   "Mero" },
1789     { G_UNICODE_SCRIPT_MIAO,                   "Plrd" },
1790     { G_UNICODE_SCRIPT_SHARADA,                "Shrd" },
1791     { G_UNICODE_SCRIPT_SORA_SOMPENG,           "Sora" },
1792     { G_UNICODE_SCRIPT_TAKRI,                  "Takr" },
1793
1794     /* Unicode 7.0 additions */
1795     { G_UNICODE_SCRIPT_BASSA_VAH,              "Bass" },
1796     { G_UNICODE_SCRIPT_CAUCASIAN_ALBANIAN,     "Aghb" },
1797     { G_UNICODE_SCRIPT_DUPLOYAN,               "Dupl" },
1798     { G_UNICODE_SCRIPT_ELBASAN,                "Elba" },
1799     { G_UNICODE_SCRIPT_GRANTHA,                "Gran" },
1800     { G_UNICODE_SCRIPT_KHOJKI,                 "Khoj" },
1801     { G_UNICODE_SCRIPT_KHUDAWADI,              "Sind" },
1802     { G_UNICODE_SCRIPT_LINEAR_A,               "Lina" },
1803     { G_UNICODE_SCRIPT_MAHAJANI,               "Mahj" },
1804     { G_UNICODE_SCRIPT_MANICHAEAN,             "Mani" },
1805     { G_UNICODE_SCRIPT_MENDE_KIKAKUI,          "Mend" },
1806     { G_UNICODE_SCRIPT_MODI,                   "Modi" },
1807     { G_UNICODE_SCRIPT_MRO,                    "Mroo" },
1808     { G_UNICODE_SCRIPT_NABATAEAN,              "Nbat" },
1809     { G_UNICODE_SCRIPT_OLD_NORTH_ARABIAN,      "Narb" },
1810     { G_UNICODE_SCRIPT_OLD_PERMIC,             "Perm" },
1811     { G_UNICODE_SCRIPT_PAHAWH_HMONG,           "Hmng" },
1812     { G_UNICODE_SCRIPT_PALMYRENE,              "Palm" },
1813     { G_UNICODE_SCRIPT_PAU_CIN_HAU,            "Pauc" },
1814     { G_UNICODE_SCRIPT_PSALTER_PAHLAVI,        "Phlp" },
1815     { G_UNICODE_SCRIPT_SIDDHAM,                "Sidd" },
1816     { G_UNICODE_SCRIPT_TIRHUTA,                "Tirh" },
1817     { G_UNICODE_SCRIPT_WARANG_CITI,            "Wara" },
1818
1819     /* Unicode 8.0 additions */
1820     { G_UNICODE_SCRIPT_AHOM,                   "Ahom" },
1821     { G_UNICODE_SCRIPT_ANATOLIAN_HIEROGLYPHS,  "Hluw" },
1822     { G_UNICODE_SCRIPT_HATRAN,                 "Hatr" },
1823     { G_UNICODE_SCRIPT_MULTANI,                "Mult" },
1824     { G_UNICODE_SCRIPT_OLD_HUNGARIAN,          "Hung" },
1825     { G_UNICODE_SCRIPT_SIGNWRITING,            "Sgnw" },
1826
1827     /* Unicode 9.0 additions */
1828     { G_UNICODE_SCRIPT_ADLAM,                  "Adlm" },
1829     { G_UNICODE_SCRIPT_BHAIKSUKI,              "Bhks" },
1830     { G_UNICODE_SCRIPT_MARCHEN,                "Marc" },
1831     { G_UNICODE_SCRIPT_NEWA,                   "Newa" },
1832     { G_UNICODE_SCRIPT_OSAGE,                  "Osge" },
1833     { G_UNICODE_SCRIPT_TANGUT,                 "Tang" },
1834
1835     /* Unicode 10.0 additions */
1836     { G_UNICODE_SCRIPT_MASARAM_GONDI,          "Gonm" },
1837     { G_UNICODE_SCRIPT_NUSHU,                  "Nshu" },
1838     { G_UNICODE_SCRIPT_SOYOMBO,                "Soyo" },
1839     { G_UNICODE_SCRIPT_ZANABAZAR_SQUARE,       "Zanb" },
1840
1841     /* Unicode 11.0 additions */
1842     { G_UNICODE_SCRIPT_DOGRA,                  "Dogr" },
1843     { G_UNICODE_SCRIPT_GUNJALA_GONDI,          "Gong" },
1844     { G_UNICODE_SCRIPT_HANIFI_ROHINGYA,        "Rohg" },
1845     { G_UNICODE_SCRIPT_MAKASAR,                "Maka" },
1846     { G_UNICODE_SCRIPT_MEDEFAIDRIN,            "Medf" },
1847     { G_UNICODE_SCRIPT_OLD_SOGDIAN,            "Sogo" },
1848     { G_UNICODE_SCRIPT_SOGDIAN,                "Sogd" },
1849
1850     /* Unicode 12.0 additions */
1851     { G_UNICODE_SCRIPT_ELYMAIC,                "Elym" },
1852     { G_UNICODE_SCRIPT_NANDINAGARI,            "Nand" },
1853     { G_UNICODE_SCRIPT_NYIAKENG_PUACHUE_HMONG, "Hmnp" },
1854     { G_UNICODE_SCRIPT_WANCHO,                 "Wcho" },
1855
1856     /* Unicode 13.0 additions */
1857     { G_UNICODE_SCRIPT_CHORASMIAN,             "Chrs" },
1858     { G_UNICODE_SCRIPT_DIVES_AKURU,            "Diak" },
1859     { G_UNICODE_SCRIPT_KHITAN_SMALL_SCRIPT,    "Kits" },
1860     { G_UNICODE_SCRIPT_YEZIDI,                 "Yezi" },
1861
1862     /* Unicode 14.0 additions */
1863     { G_UNICODE_SCRIPT_CYPRO_MINOAN,           "Cpmn" },
1864     { G_UNICODE_SCRIPT_OLD_UYGHUR,             "Ougr" },
1865     { G_UNICODE_SCRIPT_TANGSA,                 "Tnsa" },
1866     { G_UNICODE_SCRIPT_TOTO,                   "Toto" },
1867     { G_UNICODE_SCRIPT_VITHKUQI,               "Vith" },
1868
1869     /* Unicode 15.0 additions */
1870     { G_UNICODE_SCRIPT_KAWI,                   "Kawi" },
1871     { G_UNICODE_SCRIPT_NAG_MUNDARI,            "Nagm" },
1872   };
1873   guint i;
1874
1875   g_assert_cmphex (0, ==,
1876                    g_unicode_script_to_iso15924 (G_UNICODE_SCRIPT_INVALID_CODE));
1877   g_assert_cmphex (0x5A7A7A7A, ==, g_unicode_script_to_iso15924 (1000));
1878   g_assert_cmphex (0x41726162, ==,
1879                    g_unicode_script_to_iso15924 (G_UNICODE_SCRIPT_ARABIC));
1880
1881   g_assert_cmphex (G_UNICODE_SCRIPT_INVALID_CODE, ==,
1882                    g_unicode_script_from_iso15924 (0));
1883   g_assert_cmphex (G_UNICODE_SCRIPT_UNKNOWN, ==,
1884                    g_unicode_script_from_iso15924 (0x12345678));
1885
1886 #define PACK(a,b,c,d) \
1887   ((guint32)((((guint8)(a))<<24)|(((guint8)(b))<<16)|(((guint8)(c))<<8)|((guint8)(d))))
1888
1889   for (i = 0; i < G_N_ELEMENTS (data); i++)
1890     {
1891       guint32 code = PACK (data[i].four_letter_code[0],
1892                            data[i].four_letter_code[1],
1893                            data[i].four_letter_code[2],
1894                            data[i].four_letter_code[3]);
1895
1896       g_test_message ("Testing script %s (code %u)", data[i].four_letter_code, code);
1897       g_assert_cmphex (g_unicode_script_to_iso15924 (data[i].script), ==, code);
1898       g_assert_cmpint (g_unicode_script_from_iso15924 (code), ==, data[i].script);
1899     }
1900
1901 #undef PACK
1902 }
1903
1904 static void
1905 test_normalize (void)
1906 {
1907   guint i;
1908   typedef struct
1909   {
1910     const gchar *str;
1911     const gchar *nfd;
1912     const gchar *nfc;
1913     const gchar *nfkd;
1914     const gchar *nfkc;
1915   } Test;
1916   Test tests[] = {
1917     { "Äffin", "A\u0308ffin", "Äffin", "A\u0308ffin", "Äffin" },
1918     { "Ä\uFB03n", "A\u0308\uFB03n", "Ä\uFB03n", "A\u0308ffin", "Äffin" },
1919     { "Henry IV", "Henry IV", "Henry IV", "Henry IV", "Henry IV" },
1920     { "Henry \u2163", "Henry \u2163", "Henry \u2163", "Henry IV", "Henry IV" },
1921     { "non-utf\x88", NULL, NULL, NULL, NULL },
1922     { "", "", "", "", "" },
1923   };
1924
1925 #define TEST(str, mode, expected)                         \
1926   {                                                       \
1927     gchar *normalized = g_utf8_normalize (str, -1, mode); \
1928     g_assert_cmpstr (normalized, ==, expected);           \
1929     g_free (normalized);                                  \
1930   }
1931
1932   for (i = 0; i < G_N_ELEMENTS (tests); i++)
1933     {
1934       TEST (tests[i].str, G_NORMALIZE_NFD, tests[i].nfd);
1935       TEST (tests[i].str, G_NORMALIZE_NFC, tests[i].nfc);
1936       TEST (tests[i].str, G_NORMALIZE_NFKD, tests[i].nfkd);
1937       TEST (tests[i].str, G_NORMALIZE_NFKC, tests[i].nfkc);
1938     }
1939
1940 #undef TEST
1941 }
1942
1943 int
1944 main (int   argc,
1945       char *argv[])
1946 {
1947   g_test_init (&argc, &argv, NULL);
1948
1949   g_test_add_func ("/unicode/alnum", test_alnum);
1950   g_test_add_func ("/unicode/alpha", test_alpha);
1951   g_test_add_func ("/unicode/break-type", test_unichar_break_type);
1952   g_test_add_func ("/unicode/canonical-decomposition", test_canonical_decomposition);
1953   g_test_add_func ("/unicode/casefold", test_casefold);
1954   g_test_add_func ("/unicode/casemap_and_casefold", test_casemap_and_casefold);
1955   g_test_add_func ("/unicode/cases", test_cases);
1956   g_test_add_func ("/unicode/character-type", test_unichar_character_type);
1957   g_test_add_func ("/unicode/cntrl", test_cntrl);
1958   g_test_add_func ("/unicode/combining-class", test_combining_class);
1959   g_test_add_func ("/unicode/compose", test_compose);
1960   g_test_add_func ("/unicode/decompose", test_decompose);
1961   g_test_add_func ("/unicode/decompose-tail", test_decompose_tail);
1962   g_test_add_func ("/unicode/defined", test_defined);
1963   g_test_add_func ("/unicode/digit", test_digit);
1964   g_test_add_func ("/unicode/digit-value", test_digit_value);
1965   g_test_add_func ("/unicode/fully-decompose-canonical", test_fully_decompose_canonical);
1966   g_test_add_func ("/unicode/fully-decompose-len", test_fully_decompose_len);
1967   g_test_add_func ("/unicode/normalization", test_normalization);
1968   g_test_add_func ("/unicode/graph", test_graph);
1969   g_test_add_func ("/unicode/iso15924", test_iso15924);
1970   g_test_add_func ("/unicode/lower", test_lower);
1971   g_test_add_func ("/unicode/mark", test_mark);
1972   g_test_add_func ("/unicode/mirror", test_mirror);
1973   g_test_add_func ("/unicode/print", test_print);
1974   g_test_add_func ("/unicode/punctuation", test_punctuation);
1975   g_test_add_func ("/unicode/script", test_unichar_script);
1976   g_test_add_func ("/unicode/space", test_space);
1977   g_test_add_func ("/unicode/strdown", test_strdown);
1978   g_test_add_func ("/unicode/strup", test_strup);
1979   g_test_add_func ("/unicode/turkish-strupdown", test_turkish_strupdown);
1980   g_test_add_func ("/unicode/title", test_title);
1981   g_test_add_func ("/unicode/upper", test_upper);
1982   g_test_add_func ("/unicode/validate", test_unichar_validate);
1983   g_test_add_func ("/unicode/wide", test_wide);
1984   g_test_add_func ("/unicode/xdigit", test_xdigit);
1985   g_test_add_func ("/unicode/xdigit-value", test_xdigit_value);
1986   g_test_add_func ("/unicode/zero-width", test_zerowidth);
1987   g_test_add_func ("/unicode/normalize", test_normalize);
1988
1989   return g_test_run();
1990 }