url/url_canon_etc.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // Canonicalizers for random bits that aren't big enough for their own files.
   6
   7 #include <string.h>
   8
   9 #include "url/url_canon.h"
  10 #include "url/url_canon_internal.h"
  11
  12 namespace url {
  13
  14 namespace {
  15
  16 // Returns true if the given character should be removed from the middle of a
  17 // URL.
  18 inline bool IsRemovableURLWhitespace(int ch) {
  19   return ch == '\r' || ch == '\n' || ch == '\t';
  20 }
  21
  22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
  23 // It sucks that we have to do this, since this takes about 13% of the total URL
  24 // canonicalization time.
  25 template <typename CHAR>
  26 const CHAR* DoRemoveURLWhitespace(const CHAR* input,
  27                                   int input_len,
  28                                   CanonOutputT<CHAR>* buffer,
  29                                   int* output_len,
  30                                   bool* potentially_dangling_markup) {
  31   // Fast verification that there's nothing that needs removal. This is the 99%
  32   // case, so we want it to be fast and don't care about impacting the speed
  33   // when we do find whitespace.
  34   int found_whitespace = false;
  35   for (int i = 0; i < input_len; i++) {
  36     if (!IsRemovableURLWhitespace(input[i]))
  37       continue;
  38     found_whitespace = true;
  39     break;
  40   }
  41
  42   if (!found_whitespace) {
  43     // Didn't find any whitespace, we don't need to do anything. We can just
  44     // return the input as the output.
  45     *output_len = input_len;
  46     return input;
  47   }
  48
  49   // Skip whitespace removal for `data:` URLs.
  50   //
  51   // TODO(mkwst): Ideally, this would use something like `base::StartsWith`, but
  52   // that turns out to be difficult to do correctly given this function's
  53   // character type templating.
  54   if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' &&
  55       input[3] == 'a' && input[4] == ':') {
  56     *output_len = input_len;
  57     return input;
  58   }
  59
  60   // Remove the whitespace into the new buffer and return it.
  61   for (int i = 0; i < input_len; i++) {
  62     if (!IsRemovableURLWhitespace(input[i])) {
  63       if (potentially_dangling_markup && input[i] == 0x3C)
  64         *potentially_dangling_markup = true;
  65       buffer->push_back(input[i]);
  66     }
  67   }
  68   *output_len = buffer->length();
  69   return buffer->data();
  70 }
  71
  72 // Contains the canonical version of each possible input letter in the scheme
  73 // (basically, lower-cased). The corresponding entry will be 0 if the letter
  74 // is not allowed in a scheme.
  75 const char kSchemeCanonical[0x80] = {
  76 // 00-1f: all are invalid
  77      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  78      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  79 //  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
  80      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
  81 //   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
  82     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
  83 //   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
  84      0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  85 //   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
  86     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
  87 //   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
  88      0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  89 //   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
  90     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
  91
  92 // This could be a table lookup as well by setting the high bit for each
  93 // valid character, but it's only called once per URL, and it makes the lookup
  94 // table easier to read not having extra stuff in it.
  95 inline bool IsSchemeFirstChar(unsigned char c) {
  96   return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  97 }
  98
  99 template<typename CHAR, typename UCHAR>
 100 bool DoScheme(const CHAR* spec,
 101               const Component& scheme,
 102               CanonOutput* output,
 103               Component* out_scheme) {
 104   if (scheme.len <= 0) {
 105     // Scheme is unspecified or empty, convert to empty by appending a colon.
 106     *out_scheme = Component(output->length(), 0);
 107     output->push_back(':');
 108     return false;
 109   }
 110
 111   // The output scheme starts from the current position.
 112   out_scheme->begin = output->length();
 113
 114   // Danger: it's important that this code does not strip any characters;
 115   // it only emits the canonical version (be it valid or escaped) for each
 116   // of the input characters. Stripping would put it out of sync with
 117   // FindAndCompareScheme, which could cause some security checks on
 118   // schemes to be incorrect.
 119   bool success = true;
 120   int end = scheme.end();
 121   for (int i = scheme.begin; i < end; i++) {
 122     UCHAR ch = static_cast<UCHAR>(spec[i]);
 123     char replacement = 0;
 124     if (ch < 0x80) {
 125       if (i == scheme.begin) {
 126         // Need to do a special check for the first letter of the scheme.
 127         if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
 128           replacement = kSchemeCanonical[ch];
 129       } else {
 130         replacement = kSchemeCanonical[ch];
 131       }
 132     }
 133
 134     if (replacement) {
 135       output->push_back(replacement);
 136     } else if (ch == '%') {
 137       // Canonicalizing the scheme multiple times should lead to the same
 138       // result. Since invalid characters will be escaped, we need to preserve
 139       // the percent to avoid multiple escaping. The scheme will be invalid.
 140       success = false;
 141       output->push_back('%');
 142     } else {
 143       // Invalid character, store it but mark this scheme as invalid.
 144       success = false;
 145
 146       // This will escape the output and also handle encoding issues.
 147       // Ignore the return value since we already failed.
 148       AppendUTF8EscapedChar(spec, &i, end, output);
 149     }
 150   }
 151
 152   // The output scheme ends with the the current position, before appending
 153   // the colon.
 154   out_scheme->len = output->length() - out_scheme->begin;
 155   output->push_back(':');
 156   return success;
 157 }
 158
 159 // The username and password components reference ranges in the corresponding
 160 // *_spec strings. Typically, these specs will be the same (we're
 161 // canonicalizing a single source string), but may be different when
 162 // replacing components.
 163 template<typename CHAR, typename UCHAR>
 164 bool DoUserInfo(const CHAR* username_spec,
 165                 const Component& username,
 166                 const CHAR* password_spec,
 167                 const Component& password,
 168                 CanonOutput* output,
 169                 Component* out_username,
 170                 Component* out_password) {
 171   if (username.len <= 0 && password.len <= 0) {
 172     // Common case: no user info. We strip empty username/passwords.
 173     *out_username = Component();
 174     *out_password = Component();
 175     return true;
 176   }
 177
 178   // Write the username.
 179   out_username->begin = output->length();
 180   if (username.len > 0) {
 181     // This will escape characters not valid for the username.
 182     AppendStringOfType(&username_spec[username.begin], username.len,
 183                        CHAR_USERINFO, output);
 184   }
 185   out_username->len = output->length() - out_username->begin;
 186
 187   // When there is a password, we need the separator. Note that we strip
 188   // empty but specified passwords.
 189   if (password.len > 0) {
 190     output->push_back(':');
 191     out_password->begin = output->length();
 192     AppendStringOfType(&password_spec[password.begin], password.len,
 193                        CHAR_USERINFO, output);
 194     out_password->len = output->length() - out_password->begin;
 195   } else {
 196     *out_password = Component();
 197   }
 198
 199   output->push_back('@');
 200   return true;
 201 }
 202
 203 // Helper functions for converting port integers to strings.
 204 inline void WritePortInt(char* output, int output_len, int port) {
 205   _itoa_s(port, output, output_len, 10);
 206 }
 207
 208 // This function will prepend the colon if there will be a port.
 209 template<typename CHAR, typename UCHAR>
 210 bool DoPort(const CHAR* spec,
 211             const Component& port,
 212             int default_port_for_scheme,
 213             CanonOutput* output,
 214             Component* out_port) {
 215   int port_num = ParsePort(spec, port);
 216   if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
 217     *out_port = Component();
 218     return true;  // Leave port empty.
 219   }
 220
 221   if (port_num == PORT_INVALID) {
 222     // Invalid port: We'll copy the text from the input so the user can see
 223     // what the error was, and mark the URL as invalid by returning false.
 224     output->push_back(':');
 225     out_port->begin = output->length();
 226     AppendInvalidNarrowString(spec, port.begin, port.end(), output);
 227     out_port->len = output->length() - out_port->begin;
 228     return false;
 229   }
 230
 231   // Convert port number back to an integer. Max port value is 5 digits, and
 232   // the Parsed::ExtractPort will have made sure the integer is in range.
 233   const int buf_size = 6;
 234   char buf[buf_size];
 235   WritePortInt(buf, buf_size, port_num);
 236
 237   // Append the port number to the output, preceded by a colon.
 238   output->push_back(':');
 239   out_port->begin = output->length();
 240   for (int i = 0; i < buf_size && buf[i]; i++)
 241     output->push_back(buf[i]);
 242
 243   out_port->len = output->length() - out_port->begin;
 244   return true;
 245 }
 246
 247 // clang-format off
 248 //   Percent-escape all "C0 controls" (0x00-0x1F)
 249 //   https://infra.spec.whatwg.org/#c0-control along with the characters ' '
 250 //   (0x20), '"' (0x22), '<' (0x3C), '>' (0x3E), and '`' (0x60):
 251 const bool kShouldEscapeCharInRef[0x80] = {
 252 //  Control characters (0x00-0x1F)
 253     true,  true,  true,  true,  true,  true,  true,  true,
 254     true,  true,  true,  true,  true,  true,  true,  true,
 255     true,  true,  true,  true,  true,  true,  true,  true,
 256     true,  true,  true,  true,  true,  true,  true,  true,
 257 //  ' '    !      "      #      $      %      &      '
 258     true,  false, true,  false, false, false, false, false,
 259 //  (      )      *      +      ,      -      .      /
 260     false, false, false, false, false, false, false, false,
 261 //  0      1      2      3      4      5      6      7
 262     false, false, false, false, false, false, false, false,
 263 //  8      9      :      ;      <      =      >      ?
 264     false, false, false, false, true,  false, true,  false,
 265 //  @      A      B      C      D      E      F      G
 266     false, false, false, false, false, false, false, false,
 267 //  H      I      J      K      L      M      N      O
 268     false, false, false, false, false, false, false, false,
 269 //  P      Q      R      S      T      U      V      W
 270     false, false, false, false, false, false, false, false,
 271 //  X      Y      Z      [      \      ]      ^      _
 272     false, false, false, false, false, false, false, false,
 273 //  `      a      b      c      d      e      f      g
 274     true,  false, false, false, false, false, false, false,
 275 //  h      i      j      k      l      m      n      o
 276     false, false, false, false, false, false, false, false,
 277 //  p      q      r      s      t      u      v      w
 278     false, false, false, false, false, false, false, false,
 279 //  x      y      z      {      |      }      ~
 280     false, false, false, false, false, false, false
 281 };
 282 // clang-format on
 283
 284 template<typename CHAR, typename UCHAR>
 285 void DoCanonicalizeRef(const CHAR* spec,
 286                        const Component& ref,
 287                        CanonOutput* output,
 288                        Component* out_ref) {
 289   if (ref.len < 0) {
 290     // Common case of no ref.
 291     *out_ref = Component();
 292     return;
 293   }
 294
 295   // Append the ref separator. Note that we need to do this even when the ref
 296   // is empty but present.
 297   output->push_back('#');
 298   out_ref->begin = output->length();
 299
 300   // Now iterate through all the characters, converting to UTF-8 and validating.
 301   int end = ref.end();
 302   for (int i = ref.begin; i < end; i++) {
 303     if (spec[i] == 0) {
 304       // IE just strips NULLs, so we do too.
 305       continue;
 306     }
 307
 308     UCHAR current_char = static_cast<UCHAR>(spec[i]);
 309     if (current_char < 0x80) {
 310       if (kShouldEscapeCharInRef[current_char])
 311         AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
 312       else
 313         output->push_back(static_cast<char>(spec[i]));
 314     } else {
 315       AppendUTF8EscapedChar(spec, &i, end, output);
 316     }
 317   }
 318
 319   out_ref->len = output->length() - out_ref->begin;
 320 }
 321
 322 }  // namespace
 323
 324 const char* RemoveURLWhitespace(const char* input,
 325                                 int input_len,
 326                                 CanonOutputT<char>* buffer,
 327                                 int* output_len,
 328                                 bool* potentially_dangling_markup) {
 329   return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
 330                                potentially_dangling_markup);
 331 }
 332
 333 const base::char16* RemoveURLWhitespace(const base::char16* input,
 334                                         int input_len,
 335                                         CanonOutputT<base::char16>* buffer,
 336                                         int* output_len,
 337                                         bool* potentially_dangling_markup) {
 338   return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
 339                                potentially_dangling_markup);
 340 }
 341
 342 char CanonicalSchemeChar(base::char16 ch) {
 343   if (ch >= 0x80)
 344     return 0;  // Non-ASCII is not supported by schemes.
 345   return kSchemeCanonical[ch];
 346 }
 347
 348 bool CanonicalizeScheme(const char* spec,
 349                         const Component& scheme,
 350                         CanonOutput* output,
 351                         Component* out_scheme) {
 352   return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
 353 }
 354
 355 bool CanonicalizeScheme(const base::char16* spec,
 356                         const Component& scheme,
 357                         CanonOutput* output,
 358                         Component* out_scheme) {
 359   return DoScheme<base::char16, base::char16>(spec, scheme, output, out_scheme);
 360 }
 361
 362 bool CanonicalizeUserInfo(const char* username_source,
 363                           const Component& username,
 364                           const char* password_source,
 365                           const Component& password,
 366                           CanonOutput* output,
 367                           Component* out_username,
 368                           Component* out_password) {
 369   return DoUserInfo<char, unsigned char>(
 370       username_source, username, password_source, password,
 371       output, out_username, out_password);
 372 }
 373
 374 bool CanonicalizeUserInfo(const base::char16* username_source,
 375                           const Component& username,
 376                           const base::char16* password_source,
 377                           const Component& password,
 378                           CanonOutput* output,
 379                           Component* out_username,
 380                           Component* out_password) {
 381   return DoUserInfo<base::char16, base::char16>(
 382       username_source, username, password_source, password,
 383       output, out_username, out_password);
 384 }
 385
 386 bool CanonicalizePort(const char* spec,
 387                       const Component& port,
 388                       int default_port_for_scheme,
 389                       CanonOutput* output,
 390                       Component* out_port) {
 391   return DoPort<char, unsigned char>(spec, port,
 392                                      default_port_for_scheme,
 393                                      output, out_port);
 394 }
 395
 396 bool CanonicalizePort(const base::char16* spec,
 397                       const Component& port,
 398                       int default_port_for_scheme,
 399                       CanonOutput* output,
 400                       Component* out_port) {
 401   return DoPort<base::char16, base::char16>(spec, port, default_port_for_scheme,
 402                                             output, out_port);
 403 }
 404
 405 void CanonicalizeRef(const char* spec,
 406                      const Component& ref,
 407                      CanonOutput* output,
 408                      Component* out_ref) {
 409   DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
 410 }
 411
 412 void CanonicalizeRef(const base::char16* spec,
 413                      const Component& ref,
 414                      CanonOutput* output,
 415                      Component* out_ref) {
 416   DoCanonicalizeRef<base::char16, base::char16>(spec, ref, output, out_ref);
 417 }
 418
 419 }  // namespace url