url/url_canon_host.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/logging.h"
   6 #include "url/url_canon.h"
   7 #include "url/url_canon_internal.h"
   8
   9 namespace url {
  10
  11 namespace {
  12
  13 // For reference, here's what IE supports:
  14 // Key: 0 (disallowed: failure if present in the input)
  15 //      + (allowed either escaped or unescaped, and unmodified)
  16 //      U (allowed escaped or unescaped but always unescaped if present in
  17 //         escaped form)
  18 //      E (allowed escaped or unescaped but always escaped if present in
  19 //         unescaped form)
  20 //      % (only allowed escaped in the input, will be unmodified).
  21 //      Alphanumeric characters are left blank.
  22 //
  23 //    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
  24 //    -----------------------------------------------
  25 // 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
  26 // 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
  27 // 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
  28 // 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
  29 // 4   %
  30 // 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
  31 // 6   E                                               <-- That's  `
  32 // 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
  33 //
  34 // NOTE: I didn't actually test all the control characters. Some may be
  35 // disallowed in the input, but they are all accepted escaped except for 0.
  36 // I also didn't test if characters affecting HTML parsing are allowed
  37 // unescaped, e.g. (") or (#), which would indicate the beginning of the path.
  38 // Surprisingly, space is accepted in the input and always escaped.
  39
  40 // This table lists the canonical version of all characters we allow in the
  41 // input, with 0 indicating it is disallowed. We use the magic kEscapedHostChar
  42 // value to indicate that this character should be escaped. We are a little more
  43 // restrictive than IE, but less restrictive than Firefox.
  44 //
  45 // Note that we disallow the % character. We will allow it when part of an
  46 // escape sequence, of course, but this disallows "%25". Even though IE allows
  47 // it, allowing it would put us in a funny state. If there was an invalid
  48 // escape sequence like "%zz", we'll add "%25zz" to the output and fail.
  49 // Allowing percents means we'll succeed a second time, so validity would change
  50 // based on how many times you run the canonicalizer. We prefer to always report
  51 // the same vailidity, so reject this.
  52 const unsigned char kEsc = 0xff;
  53 const unsigned char kHostCharLookup[0x80] = {
  54 // 00-1f: all are invalid
  55      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  56      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  57 //  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
  58    kEsc,kEsc,kEsc,kEsc,kEsc,  0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',  0,
  59 //   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
  60     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',  0 ,kEsc,kEsc,kEsc,  0 ,
  61 //   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
  62    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  63 //   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
  64     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',  0 , ']',  0 , '_',
  65 //   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
  66    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  67 //   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
  68     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc,  0 ,  0 };
  69
  70 const int kTempHostBufferLen = 1024;
  71 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
  72 typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;
  73
  74 // Scans a host name and fills in the output flags according to what we find.
  75 // |has_non_ascii| will be true if there are any non-7-bit characters, and
  76 // |has_escaped| will be true if there is a percent sign.
  77 template<typename CHAR, typename UCHAR>
  78 void ScanHostname(const CHAR* spec,
  79                   const Component& host,
  80                   bool* has_non_ascii,
  81                   bool* has_escaped) {
  82   int end = host.end();
  83   *has_non_ascii = false;
  84   *has_escaped = false;
  85   for (int i = host.begin; i < end; i++) {
  86     if (static_cast<UCHAR>(spec[i]) >= 0x80)
  87       *has_non_ascii = true;
  88     else if (spec[i] == '%')
  89       *has_escaped = true;
  90   }
  91 }
  92
  93 // Canonicalizes a host name that is entirely 8-bit characters (even though
  94 // the type holding them may be 16 bits. Escaped characters will be unescaped.
  95 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
  96 //
  97 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
  98 // the output.
  99 //
 100 // This function is used in two situations:
 101 //
 102 //  * When the caller knows there is no non-ASCII or percent escaped
 103 //    characters. This is what DoHost does. The result will be a completely
 104 //    canonicalized host since we know nothing weird can happen (escaped
 105 //    characters could be unescaped to non-7-bit, so they have to be treated
 106 //    with suspicion at this point). It does not use the |has_non_ascii| flag.
 107 //
 108 //  * When the caller has an 8-bit string that may need unescaping.
 109 //    DoComplexHost calls us this situation to do unescaping and validation.
 110 //    After this, it may do other IDN operations depending on the value of the
 111 //    |*has_non_ascii| flag.
 112 //
 113 // The return value indicates if the output is a potentially valid host name.
 114 template<typename INCHAR, typename OUTCHAR>
 115 bool DoSimpleHost(const INCHAR* host,
 116                   int host_len,
 117                   CanonOutputT<OUTCHAR>* output,
 118                   bool* has_non_ascii) {
 119   *has_non_ascii = false;
 120
 121   bool success = true;
 122   for (int i = 0; i < host_len; ++i) {
 123     unsigned int source = host[i];
 124     if (source == '%') {
 125       // Unescape first, if possible.
 126       // Source will be used only if decode operation was successful.
 127       if (!DecodeEscaped(host, &i, host_len,
 128                          reinterpret_cast<unsigned char*>(&source))) {
 129         // Invalid escaped character. There is nothing that can make this
 130         // host valid. We append an escaped percent so the URL looks reasonable
 131         // and mark as failed.
 132         AppendEscapedChar('%', output);
 133         success = false;
 134         continue;
 135       }
 136     }
 137
 138     if (source < 0x80) {
 139       // We have ASCII input, we can use our lookup table.
 140       unsigned char replacement = kHostCharLookup[source];
 141       if (!replacement) {
 142         // Invalid character, add it as percent-escaped and mark as failed.
 143         AppendEscapedChar(source, output);
 144         success = false;
 145       } else if (replacement == kEsc) {
 146         // This character is valid but should be escaped.
 147         AppendEscapedChar(source, output);
 148       } else {
 149         // Common case, the given character is valid in a hostname, the lookup
 150         // table tells us the canonical representation of that character (lower
 151         // cased).
 152         output->push_back(replacement);
 153       }
 154     } else {
 155       // It's a non-ascii char. Just push it to the output.
 156       // In case where we have char16 input, and char output it's safe to
 157       // cast char16->char only if input string was converted to ASCII.
 158       output->push_back(static_cast<OUTCHAR>(source));
 159       *has_non_ascii = true;
 160     }
 161   }
 162
 163   return success;
 164 }
 165
 166 // Canonicalizes a host that requires IDN conversion. Returns true on success
 167 bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
 168   int original_output_len = output->length();  // So we can rewind below.
 169
 170   // We need to escape URL before doing IDN conversion, since punicode strings
 171   // cannot be escaped after they are created.
 172   RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
 173   bool has_non_ascii;
 174   DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
 175
 176   StackBufferW wide_output;
 177   if (!IDNToASCII(url_escaped_host.data(),
 178                   url_escaped_host.length(),
 179                   &wide_output)) {
 180     // Some error, give up. This will write some reasonable looking
 181     // representation of the string to the output.
 182     AppendInvalidNarrowString(src, 0, src_len, output);
 183     return false;
 184   }
 185
 186   // Now we check the ASCII output like a normal host. It will also handle
 187   // unescaping. Although we unescaped everything before this function call, if
 188   // somebody does %00 as fullwidth, ICU will convert this to ASCII.
 189   bool success = DoSimpleHost(wide_output.data(),
 190                               wide_output.length(),
 191                               output, &has_non_ascii);
 192   if (has_non_ascii) {
 193     // ICU generated something that DoSimpleHost didn't think looked like
 194     // ASCII. This is quite rare, but ICU might convert some characters to
 195     // percent signs which might generate new escape sequences which might in
 196     // turn be invalid. An example is U+FE6A "small percent" which ICU will
 197     // name prep into an ASCII percent and then we can interpret the following
 198     // characters as escaped characters.
 199     //
 200     // If DoSimpleHost didn't think the output was ASCII, just escape the
 201     // thing we gave ICU and give up. DoSimpleHost will have handled a further
 202     // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
 203     // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
 204     // do more (like handle escaped non-ASCII sequences). Handling the escaped
 205     // ASCII isn't strictly necessary, but DoSimpleHost handles this case
 206     // anyway so we handle it/
 207     output->set_length(original_output_len);
 208     AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
 209                               output);
 210     return false;
 211   }
 212   return success;
 213 }
 214
 215 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
 216 // UTF-16. The has_escaped flag should be set if the input string requires
 217 // unescaping.
 218 bool DoComplexHost(const char* host, int host_len,
 219                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
 220   // Save the current position in the output. We may write stuff and rewind it
 221   // below, so we need to know where to rewind to.
 222   int begin_length = output->length();
 223
 224   // Points to the UTF-8 data we want to convert. This will either be the
 225   // input or the unescaped version written to |*output| if necessary.
 226   const char* utf8_source;
 227   int utf8_source_len;
 228   if (has_escaped) {
 229     // Unescape before converting to UTF-16 for IDN. We write this into the
 230     // output because it most likely does not require IDNization, and we can
 231     // save another huge stack buffer. It will be replaced below if it requires
 232     // IDN. This will also update our non-ASCII flag so we know whether the
 233     // unescaped input requires IDN.
 234     if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
 235       // Error with some escape sequence. We'll call the current output
 236       // complete. DoSimpleHost will have written some "reasonable" output.
 237       return false;
 238     }
 239
 240     // Unescaping may have left us with ASCII input, in which case the
 241     // unescaped version we wrote to output is complete.
 242     if (!has_non_ascii) {
 243       return true;
 244     }
 245
 246     // Save the pointer into the data was just converted (it may be appended to
 247     // other data in the output buffer).
 248     utf8_source = &output->data()[begin_length];
 249     utf8_source_len = output->length() - begin_length;
 250   } else {
 251     // We don't need to unescape, use input for IDNization later. (We know the
 252     // input has non-ASCII, or the simple version would have been called
 253     // instead of us.)
 254     utf8_source = host;
 255     utf8_source_len = host_len;
 256   }
 257
 258   // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
 259   // Above, we may have used the output to write the unescaped values to, so
 260   // we have to rewind it to where we started after we convert it to UTF-16.
 261   StackBufferW utf16;
 262   if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
 263     // In this error case, the input may or may not be the output.
 264     StackBuffer utf8;
 265     for (int i = 0; i < utf8_source_len; i++)
 266       utf8.push_back(utf8_source[i]);
 267     output->set_length(begin_length);
 268     AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
 269     return false;
 270   }
 271   output->set_length(begin_length);
 272
 273   // This will call DoSimpleHost which will do normal ASCII canonicalization
 274   // and also check for IP addresses in the outpt.
 275   return DoIDNHost(utf16.data(), utf16.length(), output);
 276 }
 277
 278 // UTF-16 convert host to its ASCII version. The set up is already ready for
 279 // the backend, so we just pass through. The has_escaped flag should be set if
 280 // the input string requires unescaping.
 281 bool DoComplexHost(const base::char16* host, int host_len,
 282                    bool has_non_ascii, bool has_escaped, CanonOutput* output) {
 283   if (has_escaped) {
 284     // Yikes, we have escaped characters with wide input. The escaped
 285     // characters should be interpreted as UTF-8. To solve this problem,
 286     // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
 287     //
 288     // We don't bother to optimize the conversion in the ASCII case (which
 289     // *could* just be a copy) and use the UTF-8 path, because it should be
 290     // very rare that host names have escaped characters, and it is relatively
 291     // fast to do the conversion anyway.
 292     StackBuffer utf8;
 293     if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
 294       AppendInvalidNarrowString(host, 0, host_len, output);
 295       return false;
 296     }
 297
 298     // Once we convert to UTF-8, we can use the 8-bit version of the complex
 299     // host handling code above.
 300     return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
 301                          has_escaped, output);
 302   }
 303
 304   // No unescaping necessary, we can safely pass the input to ICU. This
 305   // function will only get called if we either have escaped or non-ascii
 306   // input, so it's safe to just use ICU now. Even if the input is ASCII,
 307   // this function will do the right thing (just slower than we could).
 308   return DoIDNHost(host, host_len, output);
 309 }
 310
 311 template <typename CHAR, typename UCHAR>
 312 bool DoHostSubstring(const CHAR* spec,
 313                      const Component& host,
 314                      CanonOutput* output) {
 315   bool has_non_ascii, has_escaped;
 316   ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
 317
 318   if (has_non_ascii || has_escaped) {
 319     return DoComplexHost(&spec[host.begin], host.len, has_non_ascii,
 320                          has_escaped, output);
 321   }
 322
 323   const bool success =
 324       DoSimpleHost(&spec[host.begin], host.len, output, &has_non_ascii);
 325   DCHECK(!has_non_ascii);
 326   return success;
 327 }
 328
 329 template <typename CHAR, typename UCHAR>
 330 void DoHost(const CHAR* spec,
 331             const Component& host,
 332             CanonOutput* output,
 333             CanonHostInfo* host_info) {
 334   if (host.len <= 0) {
 335     // Empty hosts don't need anything.
 336     host_info->family = CanonHostInfo::NEUTRAL;
 337     host_info->out_host = Component();
 338     return;
 339   }
 340
 341   // Keep track of output's initial length, so we can rewind later.
 342   const int output_begin = output->length();
 343
 344   if (DoHostSubstring<CHAR, UCHAR>(spec, host, output)) {
 345     // After all the other canonicalization, check if we ended up with an IP
 346     // address. IP addresses are small, so writing into this temporary buffer
 347     // should not cause an allocation.
 348     RawCanonOutput<64> canon_ip;
 349     CanonicalizeIPAddress(output->data(),
 350                           MakeRange(output_begin, output->length()),
 351                           &canon_ip, host_info);
 352
 353     // If we got an IPv4/IPv6 address, copy the canonical form back to the
 354     // real buffer. Otherwise, it's a hostname or broken IP, in which case
 355     // we just leave it in place.
 356     if (host_info->IsIPAddress()) {
 357       output->set_length(output_begin);
 358       output->Append(canon_ip.data(), canon_ip.length());
 359     }
 360   } else {
 361     // Canonicalization failed. Set BROKEN to notify the caller.
 362     host_info->family = CanonHostInfo::BROKEN;
 363   }
 364
 365   host_info->out_host = MakeRange(output_begin, output->length());
 366 }
 367
 368 }  // namespace
 369
 370 bool CanonicalizeHost(const char* spec,
 371                       const Component& host,
 372                       CanonOutput* output,
 373                       Component* out_host) {
 374   CanonHostInfo host_info;
 375   DoHost<char, unsigned char>(spec, host, output, &host_info);
 376   *out_host = host_info.out_host;
 377   return (host_info.family != CanonHostInfo::BROKEN);
 378 }
 379
 380 bool CanonicalizeHost(const base::char16* spec,
 381                       const Component& host,
 382                       CanonOutput* output,
 383                       Component* out_host) {
 384   CanonHostInfo host_info;
 385   DoHost<base::char16, base::char16>(spec, host, output, &host_info);
 386   *out_host = host_info.out_host;
 387   return (host_info.family != CanonHostInfo::BROKEN);
 388 }
 389
 390 void CanonicalizeHostVerbose(const char* spec,
 391                              const Component& host,
 392                              CanonOutput* output,
 393                              CanonHostInfo* host_info) {
 394   DoHost<char, unsigned char>(spec, host, output, host_info);
 395 }
 396
 397 void CanonicalizeHostVerbose(const base::char16* spec,
 398                              const Component& host,
 399                              CanonOutput* output,
 400                              CanonHostInfo* host_info) {
 401   DoHost<base::char16, base::char16>(spec, host, output, host_info);
 402 }
 403
 404 bool CanonicalizeHostSubstring(const char* spec,
 405                                const Component& host,
 406                                CanonOutput* output) {
 407   return DoHostSubstring<char, unsigned char>(spec, host, output);
 408 }
 409
 410 bool CanonicalizeHostSubstring(const base::char16* spec,
 411                                const Component& host,
 412                                CanonOutput* output) {
 413   return DoHostSubstring<base::char16, base::char16>(spec, host, output);
 414 }
 415
 416 }  // namespace url