fs/unicode/utf8-selftest.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel module for testing utf-8 support.
   4  *
   5  * Copyright 2017 Collabora Ltd.
   6  */
   7
   8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   9
  10 #include <linux/module.h>
  11 #include <linux/printk.h>
  12 #include <linux/unicode.h>
  13 #include <linux/dcache.h>
  14
  15 #include "utf8n.h"
  16
  17 unsigned int failed_tests;
  18 unsigned int total_tests;
  19
  20 /* Tests will be based on this version. */
  21 #define UTF8_LATEST     UNICODE_AGE(12, 1, 0)
  22
  23 #define _test(cond, func, line, fmt, ...) do {                          \
  24                 total_tests++;                                          \
  25                 if (!cond) {                                            \
  26                         failed_tests++;                                 \
  27                         pr_err("test %s:%d Failed: %s%s",               \
  28                                func, line, #cond, (fmt?":":"."));       \
  29                         if (fmt)                                        \
  30                                 pr_err(fmt, ##__VA_ARGS__);             \
  31                 }                                                       \
  32         } while (0)
  33 #define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
  34 #define test(cond) _test(cond, __func__, __LINE__, "")
  35
  36 static const struct {
  37         /* UTF-8 strings in this vector _must_ be NULL-terminated. */
  38         unsigned char str[10];
  39         unsigned char dec[10];
  40 } nfdi_test_data[] = {
  41         /* Trivial sequence */
  42         {
  43                 /* "ABba" decomposes to itself */
  44                 .str = "aBba",
  45                 .dec = "aBba",
  46         },
  47         /* Simple equivalent sequences */
  48         {
  49                /* 'VULGAR FRACTION ONE QUARTER' cannot decompose to
  50                   'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on
  51                   canonical decomposition */
  52                .str = {0xc2, 0xbc, 0x00},
  53                .dec = {0xc2, 0xbc, 0x00},
  54         },
  55         {
  56                 /* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
  57                    'LETTER A' + 'COMBINING DIAERESIS' */
  58                 .str = {0xc3, 0xa4, 0x00},
  59                 .dec = {0x61, 0xcc, 0x88, 0x00},
  60         },
  61         {
  62                 /* 'LATIN SMALL LETTER LJ' can't decompose to
  63                    'LETTER L' + 'LETTER J' on canonical decomposition */
  64                 .str = {0xC7, 0x89, 0x00},
  65                 .dec = {0xC7, 0x89, 0x00},
  66         },
  67         {
  68                 /* GREEK ANO TELEIA decomposes to MIDDLE DOT */
  69                 .str = {0xCE, 0x87, 0x00},
  70                 .dec = {0xC2, 0xB7, 0x00}
  71         },
  72         /* Canonical ordering */
  73         {
  74                 /* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes
  75                    to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */
  76                 .str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0},
  77                 .dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0},
  78         },
  79         {
  80                 /* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
  81                    decomposes to
  82                    'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */
  83                 .str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00},
  84
  85                 .dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00},
  86         },
  87
  88 };
  89
  90 static const struct {
  91         /* UTF-8 strings in this vector _must_ be NULL-terminated. */
  92         unsigned char str[30];
  93         unsigned char ncf[30];
  94 } nfdicf_test_data[] = {
  95         /* Trivial sequences */
  96         {
  97                 /* "ABba" folds to lowercase */
  98                 .str = {0x41, 0x42, 0x62, 0x61, 0x00},
  99                 .ncf = {0x61, 0x62, 0x62, 0x61, 0x00},
 100         },
 101         {
 102                 /* All ASCII folds to lower-case */
 103                 .str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
 104                 .ncf = "abcdefghijklmnopqrstuvwxyz0.1",
 105         },
 106         {
 107                 /* LATIN SMALL LETTER SHARP S folds to
 108                    LATIN SMALL LETTER S + LATIN SMALL LETTER S */
 109                 .str = {0xc3, 0x9f, 0x00},
 110                 .ncf = {0x73, 0x73, 0x00},
 111         },
 112         {
 113                 /* LATIN CAPITAL LETTER A WITH RING ABOVE folds to
 114                    LATIN SMALL LETTER A + COMBINING RING ABOVE */
 115                 .str = {0xC3, 0x85, 0x00},
 116                 .ncf = {0x61, 0xcc, 0x8a, 0x00},
 117         },
 118         /* Introduced by UTF-8.0.0. */
 119         /* Cherokee letters are interesting test-cases because they fold
 120            to upper-case.  Before 8.0.0, Cherokee lowercase were
 121            undefined, thus, the folding from LC is not stable between
 122            7.0.0 -> 8.0.0, but it is from UC. */
 123         {
 124                 /* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
 125                 .str = {0xea, 0xad, 0xb0, 0x00},
 126                 .ncf = {0xe1, 0x8e, 0xa0, 0x00},
 127         },
 128         {
 129                 /* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
 130                 .str = {0xe1, 0x8f, 0xb8, 0x00},
 131                 .ncf = {0xe1, 0x8f, 0xb0, 0x00},
 132         },
 133         {
 134                 /* OLD HUNGARIAN CAPITAL LETTER AMB folds to
 135                    OLD HUNGARIAN SMALL LETTER AMB */
 136                 .str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
 137                 .ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
 138         },
 139         /* Introduced by UTF-9.0.0. */
 140         {
 141                 /* OSAGE CAPITAL LETTER CHA folds to
 142                    OSAGE SMALL LETTER CHA */
 143                 .str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
 144                 .ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
 145         },
 146         {
 147                 /* LATIN CAPITAL LETTER SMALL CAPITAL I folds to
 148                    LATIN LETTER SMALL CAPITAL I */
 149                 .str = {0xea, 0x9e, 0xae, 0x00},
 150                 .ncf = {0xc9, 0xaa, 0x00},
 151         },
 152         /* Introduced by UTF-11.0.0. */
 153         {
 154                 /* GEORGIAN SMALL LETTER AN folds to GEORGIAN MTAVRULI
 155                    CAPITAL LETTER AN */
 156                 .str = {0xe1, 0xb2, 0x90, 0x00},
 157                 .ncf = {0xe1, 0x83, 0x90, 0x00},
 158         }
 159 };
 160
 161 static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n,
 162                 const char *s)
 163 {
 164         return utf8nlen(um, n, s, (size_t)-1);
 165 }
 166
 167 static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um,
 168                 enum utf8_normalization n, const char *s)
 169 {
 170         return utf8ncursor(u8c, um, n, s, (unsigned int)-1);
 171 }
 172
 173 static void check_utf8_nfdi(struct unicode_map *um)
 174 {
 175         int i;
 176         struct utf8cursor u8c;
 177
 178         for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
 179                 int len = strlen(nfdi_test_data[i].str);
 180                 int nlen = strlen(nfdi_test_data[i].dec);
 181                 int j = 0;
 182                 unsigned char c;
 183
 184                 test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen));
 185                 test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) ==
 186                         nlen));
 187
 188                 if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0)
 189                         pr_err("can't create cursor\n");
 190
 191                 while ((c = utf8byte(&u8c)) > 0) {
 192                         test_f((c == nfdi_test_data[i].dec[j]),
 193                                "Unexpected byte 0x%x should be 0x%x\n",
 194                                c, nfdi_test_data[i].dec[j]);
 195                         j++;
 196                 }
 197
 198                 test((j == nlen));
 199         }
 200 }
 201
 202 static void check_utf8_nfdicf(struct unicode_map *um)
 203 {
 204         int i;
 205         struct utf8cursor u8c;
 206
 207         for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
 208                 int len = strlen(nfdicf_test_data[i].str);
 209                 int nlen = strlen(nfdicf_test_data[i].ncf);
 210                 int j = 0;
 211                 unsigned char c;
 212
 213                 test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) ==
 214                                 nlen));
 215                 test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) ==
 216                                 nlen));
 217
 218                 if (utf8cursor(&u8c, um, UTF8_NFDICF,
 219                                 nfdicf_test_data[i].str) < 0)
 220                         pr_err("can't create cursor\n");
 221
 222                 while ((c = utf8byte(&u8c)) > 0) {
 223                         test_f((c == nfdicf_test_data[i].ncf[j]),
 224                                "Unexpected byte 0x%x should be 0x%x\n",
 225                                c, nfdicf_test_data[i].ncf[j]);
 226                         j++;
 227                 }
 228
 229                 test((j == nlen));
 230         }
 231 }
 232
 233 static void check_utf8_comparisons(struct unicode_map *table)
 234 {
 235         int i;
 236
 237         for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
 238                 const struct qstr s1 = {.name = nfdi_test_data[i].str,
 239                                         .len = sizeof(nfdi_test_data[i].str)};
 240                 const struct qstr s2 = {.name = nfdi_test_data[i].dec,
 241                                         .len = sizeof(nfdi_test_data[i].dec)};
 242
 243                 test_f(!utf8_strncmp(table, &s1, &s2),
 244                        "%s %s comparison mismatch\n", s1.name, s2.name);
 245         }
 246
 247         for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
 248                 const struct qstr s1 = {.name = nfdicf_test_data[i].str,
 249                                         .len = sizeof(nfdicf_test_data[i].str)};
 250                 const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
 251                                         .len = sizeof(nfdicf_test_data[i].ncf)};
 252
 253                 test_f(!utf8_strncasecmp(table, &s1, &s2),
 254                        "%s %s comparison mismatch\n", s1.name, s2.name);
 255         }
 256 }
 257
 258 static void check_supported_versions(struct unicode_map *um)
 259 {
 260         /* Unicode 7.0.0 should be supported. */
 261         test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
 262
 263         /* Unicode 9.0.0 should be supported. */
 264         test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
 265
 266         /* Unicode 1x.0.0 (the latest version) should be supported. */
 267         test(utf8version_is_supported(um, UTF8_LATEST));
 268
 269         /* Next versions don't exist. */
 270         test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
 271         test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
 272         test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
 273 }
 274
 275 static int __init init_test_ucd(void)
 276 {
 277         struct unicode_map *um;
 278
 279         failed_tests = 0;
 280         total_tests = 0;
 281
 282         um = utf8_load(UTF8_LATEST);
 283         if (IS_ERR(um)) {
 284                 pr_err("%s: Unable to load utf8 table.\n", __func__);
 285                 return PTR_ERR(um);
 286         }
 287
 288         check_supported_versions(um);
 289         check_utf8_nfdi(um);
 290         check_utf8_nfdicf(um);
 291         check_utf8_comparisons(um);
 292
 293         if (!failed_tests)
 294                 pr_info("All %u tests passed\n", total_tests);
 295         else
 296                 pr_err("%u out of %u tests failed\n", failed_tests,
 297                        total_tests);
 298         utf8_unload(um);
 299         return 0;
 300 }
 301
 302 static void __exit exit_test_ucd(void)
 303 {
 304 }
 305
 306 module_init(init_test_ucd);
 307 module_exit(exit_test_ucd);
 308
 309 MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
 310 MODULE_LICENSE("GPL");