url/url_canon_etc.cc

   1 // Copyright 2013 The Chromium Authors
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // Canonicalizers for random bits that aren't big enough for their own files.
   6
   7 #include <string.h>
   8
   9 #include "url/url_canon.h"
  10 #include "url/url_canon_internal.h"
  11
  12 namespace url {
  13
  14 namespace {
  15
  16 // Returns true if the given character should be removed from the middle of a
  17 // URL.
  18 inline bool IsRemovableURLWhitespace(int ch) {
  19   return ch == '\r' || ch == '\n' || ch == '\t';
  20 }
  21
  22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
  23 // It sucks that we have to do this, since this takes about 13% of the total URL
  24 // canonicalization time.
  25 template <typename CHAR>
  26 const CHAR* DoRemoveURLWhitespace(const CHAR* input,
  27                                   int input_len,
  28                                   CanonOutputT<CHAR>* buffer,
  29                                   int* output_len,
  30                                   bool* potentially_dangling_markup) {
  31   // Fast verification that there's nothing that needs removal. This is the 99%
  32   // case, so we want it to be fast and don't care about impacting the speed
  33   // when we do find whitespace.
  34   bool found_whitespace = false;
  35   if (sizeof(*input) == 1 && input_len >= kMinimumLengthForSIMD) {
  36     // For large strings, memchr is much faster than any scalar code we can
  37     // write, even if we need to run it three times. (If this turns out to still
  38     // be a bottleneck, we could write our own vector code, but given that
  39     // memchr is so fast, it's unlikely to be relevant.)
  40     found_whitespace = memchr(input, '\n', input_len) != nullptr ||
  41                        memchr(input, '\r', input_len) != nullptr ||
  42                        memchr(input, '\t', input_len) != nullptr;
  43   } else {
  44     for (int i = 0; i < input_len; i++) {
  45       if (!IsRemovableURLWhitespace(input[i]))
  46         continue;
  47       found_whitespace = true;
  48       break;
  49     }
  50   }
  51
  52   if (!found_whitespace) {
  53     // Didn't find any whitespace, we don't need to do anything. We can just
  54     // return the input as the output.
  55     *output_len = input_len;
  56     return input;
  57   }
  58
  59   // Skip whitespace removal for `data:` URLs.
  60   //
  61   // TODO(mkwst): Ideally, this would use something like `base::StartsWith`, but
  62   // that turns out to be difficult to do correctly given this function's
  63   // character type templating.
  64   if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' &&
  65       input[3] == 'a' && input[4] == ':') {
  66     *output_len = input_len;
  67     return input;
  68   }
  69
  70   // Remove the whitespace into the new buffer and return it.
  71   for (int i = 0; i < input_len; i++) {
  72     if (!IsRemovableURLWhitespace(input[i])) {
  73       if (potentially_dangling_markup && input[i] == 0x3C)
  74         *potentially_dangling_markup = true;
  75       buffer->push_back(input[i]);
  76     }
  77   }
  78   *output_len = buffer->length();
  79   return buffer->data();
  80 }
  81
  82 // Contains the canonical version of each possible input letter in the scheme
  83 // (basically, lower-cased). The corresponding entry will be 0 if the letter
  84 // is not allowed in a scheme.
  85 // clang-format off
  86 const char kSchemeCanonical[0x80] = {
  87 // 00-1f: all are invalid
  88      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  89      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  90 //  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
  91      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
  92 //   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
  93     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
  94 //   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
  95      0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  96 //   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
  97     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
  98 //   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
  99      0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
 100 //   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
 101     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
 102 // clang-format on
 103
 104 // This could be a table lookup as well by setting the high bit for each
 105 // valid character, but it's only called once per URL, and it makes the lookup
 106 // table easier to read not having extra stuff in it.
 107 inline bool IsSchemeFirstChar(unsigned char c) {
 108   return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
 109 }
 110
 111 template <typename CHAR, typename UCHAR>
 112 bool DoScheme(const CHAR* spec,
 113               const Component& scheme,
 114               CanonOutput* output,
 115               Component* out_scheme) {
 116   if (scheme.is_empty()) {
 117     // Scheme is unspecified or empty, convert to empty by appending a colon.
 118     *out_scheme = Component(output->length(), 0);
 119     output->push_back(':');
 120     return false;
 121   }
 122
 123   // The output scheme starts from the current position.
 124   out_scheme->begin = output->length();
 125
 126   // Danger: it's important that this code does not strip any characters;
 127   // it only emits the canonical version (be it valid or escaped) for each
 128   // of the input characters. Stripping would put it out of sync with
 129   // FindAndCompareScheme, which could cause some security checks on
 130   // schemes to be incorrect.
 131   bool success = true;
 132   size_t begin = static_cast<size_t>(scheme.begin);
 133   size_t end = static_cast<size_t>(scheme.end());
 134   for (size_t i = begin; i < end; i++) {
 135     UCHAR ch = static_cast<UCHAR>(spec[i]);
 136     char replacement = 0;
 137     if (ch < 0x80) {
 138       if (i == begin) {
 139         // Need to do a special check for the first letter of the scheme.
 140         if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
 141           replacement = kSchemeCanonical[ch];
 142       } else {
 143         replacement = kSchemeCanonical[ch];
 144       }
 145     }
 146
 147     if (replacement) {
 148       output->push_back(replacement);
 149     } else if (ch == '%') {
 150       // Canonicalizing the scheme multiple times should lead to the same
 151       // result. Since invalid characters will be escaped, we need to preserve
 152       // the percent to avoid multiple escaping. The scheme will be invalid.
 153       success = false;
 154       output->push_back('%');
 155     } else {
 156       // Invalid character, store it but mark this scheme as invalid.
 157       success = false;
 158
 159       // This will escape the output and also handle encoding issues.
 160       // Ignore the return value since we already failed.
 161       AppendUTF8EscapedChar(spec, &i, end, output);
 162     }
 163   }
 164
 165   // The output scheme ends with the the current position, before appending
 166   // the colon.
 167   out_scheme->len = output->length() - out_scheme->begin;
 168   output->push_back(':');
 169   return success;
 170 }
 171
 172 // The username and password components reference ranges in the corresponding
 173 // *_spec strings. Typically, these specs will be the same (we're
 174 // canonicalizing a single source string), but may be different when
 175 // replacing components.
 176 template <typename CHAR, typename UCHAR>
 177 bool DoUserInfo(const CHAR* username_spec,
 178                 const Component& username,
 179                 const CHAR* password_spec,
 180                 const Component& password,
 181                 CanonOutput* output,
 182                 Component* out_username,
 183                 Component* out_password) {
 184   if (username.is_empty() && password.is_empty()) {
 185     // Common case: no user info. We strip empty username/passwords.
 186     *out_username = Component();
 187     *out_password = Component();
 188     return true;
 189   }
 190
 191   // Write the username.
 192   out_username->begin = output->length();
 193   if (username.is_nonempty()) {
 194     // This will escape characters not valid for the username.
 195     AppendStringOfType(&username_spec[username.begin],
 196                        static_cast<size_t>(username.len), CHAR_USERINFO,
 197                        output);
 198   }
 199   out_username->len = output->length() - out_username->begin;
 200
 201   // When there is a password, we need the separator. Note that we strip
 202   // empty but specified passwords.
 203   if (password.is_nonempty()) {
 204     output->push_back(':');
 205     out_password->begin = output->length();
 206     AppendStringOfType(&password_spec[password.begin],
 207                        static_cast<size_t>(password.len), CHAR_USERINFO,
 208                        output);
 209     out_password->len = output->length() - out_password->begin;
 210   } else {
 211     *out_password = Component();
 212   }
 213
 214   output->push_back('@');
 215   return true;
 216 }
 217
 218 // Helper functions for converting port integers to strings.
 219 inline void WritePortInt(char* output, int output_len, int port) {
 220   _itoa_s(port, output, output_len, 10);
 221 }
 222
 223 // This function will prepend the colon if there will be a port.
 224 template <typename CHAR, typename UCHAR>
 225 bool DoPort(const CHAR* spec,
 226             const Component& port,
 227             int default_port_for_scheme,
 228             CanonOutput* output,
 229             Component* out_port) {
 230   int port_num = ParsePort(spec, port);
 231   if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
 232     *out_port = Component();
 233     return true;  // Leave port empty.
 234   }
 235
 236   if (port_num == PORT_INVALID) {
 237     // Invalid port: We'll copy the text from the input so the user can see
 238     // what the error was, and mark the URL as invalid by returning false.
 239     output->push_back(':');
 240     out_port->begin = output->length();
 241     AppendInvalidNarrowString(spec, static_cast<size_t>(port.begin),
 242                               static_cast<size_t>(port.end()), output);
 243     out_port->len = output->length() - out_port->begin;
 244     return false;
 245   }
 246
 247   // Convert port number back to an integer. Max port value is 5 digits, and
 248   // the Parsed::ExtractPort will have made sure the integer is in range.
 249   const int buf_size = 6;
 250   char buf[buf_size];
 251   WritePortInt(buf, buf_size, port_num);
 252
 253   // Append the port number to the output, preceded by a colon.
 254   output->push_back(':');
 255   out_port->begin = output->length();
 256   for (int i = 0; i < buf_size && buf[i]; i++)
 257     output->push_back(buf[i]);
 258
 259   out_port->len = output->length() - out_port->begin;
 260   return true;
 261 }
 262
 263 // clang-format off
 264 //   Percent-escape all characters from the fragment percent-encode set
 265 //   https://url.spec.whatwg.org/#fragment-percent-encode-set
 266 const bool kShouldEscapeCharInFragment[0x80] = {
 267 //  Control characters (0x00-0x1F)
 268     true,  true,  true,  true,  true,  true,  true,  true,
 269     true,  true,  true,  true,  true,  true,  true,  true,
 270     true,  true,  true,  true,  true,  true,  true,  true,
 271     true,  true,  true,  true,  true,  true,  true,  true,
 272 //  ' '    !      "      #      $      %      &      '
 273     true,  false, true,  false, false, false, false, false,
 274 //  (      )      *      +      ,      -      .      /
 275     false, false, false, false, false, false, false, false,
 276 //  0      1      2      3      4      5      6      7
 277     false, false, false, false, false, false, false, false,
 278 //  8      9      :      ;      <      =      >      ?
 279     false, false, false, false, true,  false, true,  false,
 280 //  @      A      B      C      D      E      F      G
 281     false, false, false, false, false, false, false, false,
 282 //  H      I      J      K      L      M      N      O
 283     false, false, false, false, false, false, false, false,
 284 //  P      Q      R      S      T      U      V      W
 285     false, false, false, false, false, false, false, false,
 286 //  X      Y      Z      [      \      ]      ^      _
 287     false, false, false, false, false, false, false, false,
 288 //  `      a      b      c      d      e      f      g
 289     true,  false, false, false, false, false, false, false,
 290 //  h      i      j      k      l      m      n      o
 291     false, false, false, false, false, false, false, false,
 292 //  p      q      r      s      t      u      v      w
 293     false, false, false, false, false, false, false, false,
 294 //  x      y      z      {      |      }      ~      DELETE
 295     false, false, false, false, false, false, false, true
 296 };
 297 // clang-format on
 298
 299 template <typename CHAR, typename UCHAR>
 300 void DoCanonicalizeRef(const CHAR* spec,
 301                        const Component& ref,
 302                        CanonOutput* output,
 303                        Component* out_ref) {
 304   if (!ref.is_valid()) {
 305     // Common case of no ref.
 306     *out_ref = Component();
 307     return;
 308   }
 309
 310   // Append the ref separator. Note that we need to do this even when the ref
 311   // is empty but present.
 312   output->push_back('#');
 313   out_ref->begin = output->length();
 314
 315   // Now iterate through all the characters, converting to UTF-8 and validating.
 316   size_t end = static_cast<size_t>(ref.end());
 317   for (size_t i = static_cast<size_t>(ref.begin); i < end; i++) {
 318     UCHAR current_char = static_cast<UCHAR>(spec[i]);
 319     if (current_char < 0x80) {
 320       if (kShouldEscapeCharInFragment[current_char])
 321         AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
 322       else
 323         output->push_back(static_cast<char>(spec[i]));
 324     } else {
 325       AppendUTF8EscapedChar(spec, &i, end, output);
 326     }
 327   }
 328
 329   out_ref->len = output->length() - out_ref->begin;
 330 }
 331
 332 }  // namespace
 333
 334 const char* RemoveURLWhitespace(const char* input,
 335                                 int input_len,
 336                                 CanonOutputT<char>* buffer,
 337                                 int* output_len,
 338                                 bool* potentially_dangling_markup) {
 339   return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
 340                                potentially_dangling_markup);
 341 }
 342
 343 const char16_t* RemoveURLWhitespace(const char16_t* input,
 344                                     int input_len,
 345                                     CanonOutputT<char16_t>* buffer,
 346                                     int* output_len,
 347                                     bool* potentially_dangling_markup) {
 348   return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
 349                                potentially_dangling_markup);
 350 }
 351
 352 char CanonicalSchemeChar(char16_t ch) {
 353   if (ch >= 0x80)
 354     return 0;  // Non-ASCII is not supported by schemes.
 355   return kSchemeCanonical[ch];
 356 }
 357
 358 bool CanonicalizeScheme(const char* spec,
 359                         const Component& scheme,
 360                         CanonOutput* output,
 361                         Component* out_scheme) {
 362   return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
 363 }
 364
 365 bool CanonicalizeScheme(const char16_t* spec,
 366                         const Component& scheme,
 367                         CanonOutput* output,
 368                         Component* out_scheme) {
 369   return DoScheme<char16_t, char16_t>(spec, scheme, output, out_scheme);
 370 }
 371
 372 bool CanonicalizeUserInfo(const char* username_source,
 373                           const Component& username,
 374                           const char* password_source,
 375                           const Component& password,
 376                           CanonOutput* output,
 377                           Component* out_username,
 378                           Component* out_password) {
 379   return DoUserInfo<char, unsigned char>(username_source, username,
 380                                          password_source, password, output,
 381                                          out_username, out_password);
 382 }
 383
 384 bool CanonicalizeUserInfo(const char16_t* username_source,
 385                           const Component& username,
 386                           const char16_t* password_source,
 387                           const Component& password,
 388                           CanonOutput* output,
 389                           Component* out_username,
 390                           Component* out_password) {
 391   return DoUserInfo<char16_t, char16_t>(username_source, username,
 392                                         password_source, password, output,
 393                                         out_username, out_password);
 394 }
 395
 396 bool CanonicalizePort(const char* spec,
 397                       const Component& port,
 398                       int default_port_for_scheme,
 399                       CanonOutput* output,
 400                       Component* out_port) {
 401   return DoPort<char, unsigned char>(spec, port, default_port_for_scheme,
 402                                      output, out_port);
 403 }
 404
 405 bool CanonicalizePort(const char16_t* spec,
 406                       const Component& port,
 407                       int default_port_for_scheme,
 408                       CanonOutput* output,
 409                       Component* out_port) {
 410   return DoPort<char16_t, char16_t>(spec, port, default_port_for_scheme, output,
 411                                     out_port);
 412 }
 413
 414 void CanonicalizeRef(const char* spec,
 415                      const Component& ref,
 416                      CanonOutput* output,
 417                      Component* out_ref) {
 418   DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
 419 }
 420
 421 void CanonicalizeRef(const char16_t* spec,
 422                      const Component& ref,
 423                      CanonOutput* output,
 424                      Component* out_ref) {
 425   DoCanonicalizeRef<char16_t, char16_t>(spec, ref, output, out_ref);
 426 }
 427
 428 }  // namespace url