Utilities/cmlibuv/src/idna.c

   1 /* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
   2  *
   3  * Permission to use, copy, modify, and/or distribute this software for any
   4  * purpose with or without fee is hereby granted, provided that the above
   5  * copyright notice and this permission notice appear in all copies.
   6  *
   7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
   9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  10  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  12  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  13  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  14  */
  15
  16 /* Derived from https://github.com/bnoordhuis/punycode
  17  * but updated to support IDNA 2008.
  18  */
  19
  20 #include "uv.h"
  21 #include "idna.h"
  22 #include <assert.h>
  23 #include <string.h>
  24
  25 static unsigned uv__utf8_decode1_slow(const char** p,
  26                                       const char* pe,
  27                                       unsigned a) {
  28   unsigned b;
  29   unsigned c;
  30   unsigned d;
  31   unsigned min;
  32
  33   if (a > 0xF7)
  34     return -1;
  35
  36   switch (pe - *p) {
  37   default:
  38     if (a > 0xEF) {
  39       min = 0x10000;
  40       a = a & 7;
  41       b = (unsigned char) *(*p)++;
  42       c = (unsigned char) *(*p)++;
  43       d = (unsigned char) *(*p)++;
  44       break;
  45     }
  46     /* Fall through. */
  47   case 2:
  48     if (a > 0xDF) {
  49       min = 0x800;
  50       b = 0x80 | (a & 15);
  51       c = (unsigned char) *(*p)++;
  52       d = (unsigned char) *(*p)++;
  53       a = 0;
  54       break;
  55     }
  56     /* Fall through. */
  57   case 1:
  58     if (a > 0xBF) {
  59       min = 0x80;
  60       b = 0x80;
  61       c = 0x80 | (a & 31);
  62       d = (unsigned char) *(*p)++;
  63       a = 0;
  64       break;
  65     }
  66     /* Fall through. */
  67   case 0:
  68     return -1;  /* Invalid continuation byte. */
  69   }
  70
  71   if (0x80 != (0xC0 & (b ^ c ^ d)))
  72     return -1;  /* Invalid sequence. */
  73
  74   b &= 63;
  75   c &= 63;
  76   d &= 63;
  77   a = (a << 18) | (b << 12) | (c << 6) | d;
  78
  79   if (a < min)
  80     return -1;  /* Overlong sequence. */
  81
  82   if (a > 0x10FFFF)
  83     return -1;  /* Four-byte sequence > U+10FFFF. */
  84
  85   if (a >= 0xD800 && a <= 0xDFFF)
  86     return -1;  /* Surrogate pair. */
  87
  88   return a;
  89 }
  90
  91 unsigned uv__utf8_decode1(const char** p, const char* pe) {
  92   unsigned a;
  93
  94   assert(*p < pe);
  95
  96   a = (unsigned char) *(*p)++;
  97
  98   if (a < 128)
  99     return a;  /* ASCII, common case. */
 100
 101   return uv__utf8_decode1_slow(p, pe, a);
 102 }
 103
 104 static int uv__idna_toascii_label(const char* s, const char* se,
 105                                   char** d, char* de) {
 106   static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
 107   const char* ss;
 108   unsigned c;
 109   unsigned h;
 110   unsigned k;
 111   unsigned n;
 112   unsigned m;
 113   unsigned q;
 114   unsigned t;
 115   unsigned x;
 116   unsigned y;
 117   unsigned bias;
 118   unsigned delta;
 119   unsigned todo;
 120   int first;
 121
 122   h = 0;
 123   ss = s;
 124   todo = 0;
 125
 126   /* Note: after this loop we've visited all UTF-8 characters and know
 127    * they're legal so we no longer need to check for decode errors.
 128    */
 129   while (s < se) {
 130     c = uv__utf8_decode1(&s, se);
 131
 132     if (c == -1u)
 133       return UV_EINVAL;
 134
 135     if (c < 128)
 136       h++;
 137     else
 138       todo++;
 139   }
 140
 141   /* Only write "xn--" when there are non-ASCII characters. */
 142   if (todo > 0) {
 143     if (*d < de) *(*d)++ = 'x';
 144     if (*d < de) *(*d)++ = 'n';
 145     if (*d < de) *(*d)++ = '-';
 146     if (*d < de) *(*d)++ = '-';
 147   }
 148
 149   /* Write ASCII characters. */
 150   x = 0;
 151   s = ss;
 152   while (s < se) {
 153     c = uv__utf8_decode1(&s, se);
 154     assert(c != -1u);
 155
 156     if (c > 127)
 157       continue;
 158
 159     if (*d < de)
 160       *(*d)++ = c;
 161
 162     if (++x == h)
 163       break;  /* Visited all ASCII characters. */
 164   }
 165
 166   if (todo == 0)
 167     return h;
 168
 169   /* Only write separator when we've written ASCII characters first. */
 170   if (h > 0)
 171     if (*d < de)
 172       *(*d)++ = '-';
 173
 174   n = 128;
 175   bias = 72;
 176   delta = 0;
 177   first = 1;
 178
 179   while (todo > 0) {
 180     m = -1;
 181     s = ss;
 182
 183     while (s < se) {
 184       c = uv__utf8_decode1(&s, se);
 185       assert(c != -1u);
 186
 187       if (c >= n)
 188         if (c < m)
 189           m = c;
 190     }
 191
 192     x = m - n;
 193     y = h + 1;
 194
 195     if (x > ~delta / y)
 196       return UV_E2BIG;  /* Overflow. */
 197
 198     delta += x * y;
 199     n = m;
 200
 201     s = ss;
 202     while (s < se) {
 203       c = uv__utf8_decode1(&s, se);
 204       assert(c != -1u);
 205
 206       if (c < n)
 207         if (++delta == 0)
 208           return UV_E2BIG;  /* Overflow. */
 209
 210       if (c != n)
 211         continue;
 212
 213       for (k = 36, q = delta; /* empty */; k += 36) {
 214         t = 1;
 215
 216         if (k > bias)
 217           t = k - bias;
 218
 219         if (t > 26)
 220           t = 26;
 221
 222         if (q < t)
 223           break;
 224
 225         /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
 226          * 10 <= y <= 35, we can optimize the long division
 227          * into a table-based reciprocal multiplication.
 228          */
 229         x = q - t;
 230         y = 36 - t;  /* 10 <= y <= 35 since 1 <= t <= 26. */
 231         q = x / y;
 232         t = t + x % y;  /* 1 <= t <= 35 because of y. */
 233
 234         if (*d < de)
 235           *(*d)++ = alphabet[t];
 236       }
 237
 238       if (*d < de)
 239         *(*d)++ = alphabet[q];
 240
 241       delta /= 2;
 242
 243       if (first) {
 244         delta /= 350;
 245         first = 0;
 246       }
 247
 248       /* No overflow check is needed because |delta| was just
 249        * divided by 2 and |delta+delta >= delta + delta/h|.
 250        */
 251       h++;
 252       delta += delta / h;
 253
 254       for (bias = 0; delta > 35 * 26 / 2; bias += 36)
 255         delta /= 35;
 256
 257       bias += 36 * delta / (delta + 38);
 258       delta = 0;
 259       todo--;
 260     }
 261
 262     delta++;
 263     n++;
 264   }
 265
 266   return 0;
 267 }
 268
 269 long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
 270   const char* si;
 271   const char* st;
 272   unsigned c;
 273   char* ds;
 274   int rc;
 275
 276   ds = d;
 277
 278   si = s;
 279   while (si < se) {
 280     st = si;
 281     c = uv__utf8_decode1(&si, se);
 282
 283     if (c == -1u)
 284       return UV_EINVAL;
 285
 286     if (c != '.')
 287       if (c != 0x3002)  /* 。 */
 288         if (c != 0xFF0E)  /* ． */
 289           if (c != 0xFF61)  /* ｡ */
 290             continue;
 291
 292     rc = uv__idna_toascii_label(s, st, &d, de);
 293
 294     if (rc < 0)
 295       return rc;
 296
 297     if (d < de)
 298       *d++ = '.';
 299
 300     s = si;
 301   }
 302
 303   if (s < se) {
 304     rc = uv__idna_toascii_label(s, se, &d, de);
 305
 306     if (rc < 0)
 307       return rc;
 308   }
 309
 310   if (d < de)
 311     *d++ = '\0';
 312
 313   return d - ds;  /* Number of bytes written. */
 314 }