1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Canonicalizers for random bits that aren't big enough for their own files.
9 #include "url/url_canon.h"
10 #include "url/url_canon_internal.h"
// Returns true if the given character should be removed from the middle of a
// URL. The removable characters are carriage return, line feed, and tab; the
// space character is deliberately not in this set.
inline bool IsRemovableURLWhitespace(int ch) {
  return ch == '\r' || ch == '\n' || ch == '\t';
}
22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
23 // It sucks that we have to do this, since this takes about 13% of the total URL
24 // canonicalization time.
25 template <typename CHAR>
26 const CHAR* DoRemoveURLWhitespace(const CHAR* input,
28 CanonOutputT<CHAR>* buffer,
30 bool* potentially_dangling_markup) {
31 // Fast verification that there's nothing that needs removal. This is the 99%
32 // case, so we want it to be fast and don't care about impacting the speed
33 // when we do find whitespace.
34 bool found_whitespace = false;
35 if (sizeof(*input) == 1 && input_len >= kMinimumLengthForSIMD) {
36 // For large strings, memchr is much faster than any scalar code we can
37 // write, even if we need to run it three times. (If this turns out to still
38 // be a bottleneck, we could write our own vector code, but given that
39 // memchr is so fast, it's unlikely to be relevant.)
40 found_whitespace = memchr(input, '\n', input_len) != nullptr ||
41 memchr(input, '\r', input_len) != nullptr ||
42 memchr(input, '\t', input_len) != nullptr;
44 for (int i = 0; i < input_len; i++) {
45 if (!IsRemovableURLWhitespace(input[i]))
47 found_whitespace = true;
52 if (!found_whitespace) {
53 // Didn't find any whitespace, we don't need to do anything. We can just
54 // return the input as the output.
55 *output_len = input_len;
59 // Skip whitespace removal for `data:` URLs.
61 // TODO(mkwst): Ideally, this would use something like `base::StartsWith`, but
62 // that turns out to be difficult to do correctly given this function's
63 // character type templating.
64 if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' &&
65 input[3] == 'a' && input[4] == ':') {
66 *output_len = input_len;
70 // Remove the whitespace into the new buffer and return it.
71 for (int i = 0; i < input_len; i++) {
72 if (!IsRemovableURLWhitespace(input[i])) {
73 if (potentially_dangling_markup && input[i] == 0x3C)
74 *potentially_dangling_markup = true;
75 buffer->push_back(input[i]);
78 *output_len = buffer->length();
79 return buffer->data();
// Maps every 7-bit ASCII code point to the character that represents it in a
// canonical scheme. Letters map to their lower-case form; digits, '+', '-' and
// '.' map to themselves. An entry of 0 means the character is not allowed in
// a scheme. Rows below cover eight code points each.
const char kSchemeCanonical[0x80] = {
    // 0x00-0x1f: control characters, all invalid.
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20: ' ' ! " # $ % & '
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x28: ( ) * + , - . /
    0, 0, 0, '+', 0, '-', '.', 0,
    // 0x30: 0 1 2 3 4 5 6 7
    '0', '1', '2', '3', '4', '5', '6', '7',
    // 0x38: 8 9 : ; < = > ?
    '8', '9', 0, 0, 0, 0, 0, 0,
    // 0x40: @ A B C D E F G
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x48: H I J K L M N O
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50: P Q R S T U V W
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x58: X Y Z [ backslash ] ^ _
    'x', 'y', 'z', 0, 0, 0, 0, 0,
    // 0x60: ` a b c d e f g
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x68: h i j k l m n o
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70: p q r s t u v w
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x78: x y z { | } ~ DEL
    'x', 'y', 'z', 0, 0, 0, 0, 0,
};
// Returns true if |c| may start a scheme, i.e. if it is an ASCII letter.
//
// This could be a table lookup as well by setting the high bit for each
// valid character, but it's only called once per URL, and it makes the lookup
// table easier to read not having extra stuff in it.
inline bool IsSchemeFirstChar(unsigned char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
111 template <typename CHAR, typename UCHAR>
112 bool DoScheme(const CHAR* spec,
113 const Component& scheme,
115 Component* out_scheme) {
116 if (scheme.is_empty()) {
117 // Scheme is unspecified or empty, convert to empty by appending a colon.
118 *out_scheme = Component(output->length(), 0);
119 output->push_back(':');
123 // The output scheme starts from the current position.
124 out_scheme->begin = output->length();
126 // Danger: it's important that this code does not strip any characters;
127 // it only emits the canonical version (be it valid or escaped) for each
128 // of the input characters. Stripping would put it out of sync with
129 // FindAndCompareScheme, which could cause some security checks on
130 // schemes to be incorrect.
132 size_t begin = static_cast<size_t>(scheme.begin);
133 size_t end = static_cast<size_t>(scheme.end());
134 for (size_t i = begin; i < end; i++) {
135 UCHAR ch = static_cast<UCHAR>(spec[i]);
136 char replacement = 0;
139 // Need to do a special check for the first letter of the scheme.
140 if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
141 replacement = kSchemeCanonical[ch];
143 replacement = kSchemeCanonical[ch];
148 output->push_back(replacement);
149 } else if (ch == '%') {
150 // Canonicalizing the scheme multiple times should lead to the same
151 // result. Since invalid characters will be escaped, we need to preserve
152 // the percent to avoid multiple escaping. The scheme will be invalid.
154 output->push_back('%');
156 // Invalid character, store it but mark this scheme as invalid.
159 // This will escape the output and also handle encoding issues.
160 // Ignore the return value since we already failed.
161 AppendUTF8EscapedChar(spec, &i, end, output);
165 // The output scheme ends with the the current position, before appending
167 out_scheme->len = output->length() - out_scheme->begin;
168 output->push_back(':');
172 // The username and password components reference ranges in the corresponding
173 // *_spec strings. Typically, these specs will be the same (we're
174 // canonicalizing a single source string), but may be different when
175 // replacing components.
176 template <typename CHAR, typename UCHAR>
177 bool DoUserInfo(const CHAR* username_spec,
178 const Component& username,
179 const CHAR* password_spec,
180 const Component& password,
182 Component* out_username,
183 Component* out_password) {
184 if (username.is_empty() && password.is_empty()) {
185 // Common case: no user info. We strip empty username/passwords.
186 *out_username = Component();
187 *out_password = Component();
191 // Write the username.
192 out_username->begin = output->length();
193 if (username.is_nonempty()) {
194 // This will escape characters not valid for the username.
195 AppendStringOfType(&username_spec[username.begin],
196 static_cast<size_t>(username.len), CHAR_USERINFO,
199 out_username->len = output->length() - out_username->begin;
201 // When there is a password, we need the separator. Note that we strip
202 // empty but specified passwords.
203 if (password.is_nonempty()) {
204 output->push_back(':');
205 out_password->begin = output->length();
206 AppendStringOfType(&password_spec[password.begin],
207 static_cast<size_t>(password.len), CHAR_USERINFO,
209 out_password->len = output->length() - out_password->begin;
211 *out_password = Component();
214 output->push_back('@');
218 // Helper functions for converting port integers to strings.
219 inline void WritePortInt(char* output, int output_len, int port) {
220 _itoa_s(port, output, output_len, 10);
223 // This function will prepend the colon if there will be a port.
224 template <typename CHAR, typename UCHAR>
225 bool DoPort(const CHAR* spec,
226 const Component& port,
227 int default_port_for_scheme,
229 Component* out_port) {
230 int port_num = ParsePort(spec, port);
231 if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
232 *out_port = Component();
233 return true; // Leave port empty.
236 if (port_num == PORT_INVALID) {
237 // Invalid port: We'll copy the text from the input so the user can see
238 // what the error was, and mark the URL as invalid by returning false.
239 output->push_back(':');
240 out_port->begin = output->length();
241 AppendInvalidNarrowString(spec, static_cast<size_t>(port.begin),
242 static_cast<size_t>(port.end()), output);
243 out_port->len = output->length() - out_port->begin;
247 // Convert port number back to an integer. Max port value is 5 digits, and
248 // the Parsed::ExtractPort will have made sure the integer is in range.
249 const int buf_size = 6;
251 WritePortInt(buf, buf_size, port_num);
253 // Append the port number to the output, preceded by a colon.
254 output->push_back(':');
255 out_port->begin = output->length();
256 for (int i = 0; i < buf_size && buf[i]; i++)
257 output->push_back(buf[i]);
259 out_port->len = output->length() - out_port->begin;
// Percent-escape all characters from the fragment percent-encode set
// https://url.spec.whatwg.org/#fragment-percent-encode-set
//
// Indexed by 7-bit ASCII code point; true means the character must be
// percent-escaped when it appears in a fragment. The escaped characters are
// the C0 controls, space, '"', '<', '>', '`' and DELETE.
const bool kShouldEscapeCharInFragment[0x80] = {
    // Control characters (0x00-0x1F)
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    // Space  !      "      #      $      %      &      '
    true,  false, true,  false, false, false, false, false,
    // (      )      *      +      ,      -      .      /
    false, false, false, false, false, false, false, false,
    // 0      1      2      3      4      5      6      7
    false, false, false, false, false, false, false, false,
    // 8      9      :      ;      <      =      >      ?
    false, false, false, false, true,  false, true,  false,
    // @      A      B      C      D      E      F      G
    false, false, false, false, false, false, false, false,
    // H      I      J      K      L      M      N      O
    false, false, false, false, false, false, false, false,
    // P      Q      R      S      T      U      V      W
    false, false, false, false, false, false, false, false,
    // X      Y      Z      [  backslash  ]      ^      _
    false, false, false, false, false, false, false, false,
    // `      a      b      c      d      e      f      g
    true,  false, false, false, false, false, false, false,
    // h      i      j      k      l      m      n      o
    false, false, false, false, false, false, false, false,
    // p      q      r      s      t      u      v      w
    false, false, false, false, false, false, false, false,
    // x      y      z      {      |      }      ~      DELETE
    false, false, false, false, false, false, false, true
};
299 template <typename CHAR, typename UCHAR>
300 void DoCanonicalizeRef(const CHAR* spec,
301 const Component& ref,
303 Component* out_ref) {
304 if (!ref.is_valid()) {
305 // Common case of no ref.
306 *out_ref = Component();
310 // Append the ref separator. Note that we need to do this even when the ref
311 // is empty but present.
312 output->push_back('#');
313 out_ref->begin = output->length();
315 // Now iterate through all the characters, converting to UTF-8 and validating.
316 size_t end = static_cast<size_t>(ref.end());
317 for (size_t i = static_cast<size_t>(ref.begin); i < end; i++) {
318 UCHAR current_char = static_cast<UCHAR>(spec[i]);
319 if (current_char < 0x80) {
320 if (kShouldEscapeCharInFragment[current_char])
321 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
323 output->push_back(static_cast<char>(spec[i]));
325 AppendUTF8EscapedChar(spec, &i, end, output);
329 out_ref->len = output->length() - out_ref->begin;
334 const char* RemoveURLWhitespace(const char* input,
336 CanonOutputT<char>* buffer,
338 bool* potentially_dangling_markup) {
339 return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
340 potentially_dangling_markup);
343 const char16_t* RemoveURLWhitespace(const char16_t* input,
345 CanonOutputT<char16_t>* buffer,
347 bool* potentially_dangling_markup) {
348 return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
349 potentially_dangling_markup);
352 char CanonicalSchemeChar(char16_t ch) {
354 return 0; // Non-ASCII is not supported by schemes.
355 return kSchemeCanonical[ch];
358 bool CanonicalizeScheme(const char* spec,
359 const Component& scheme,
361 Component* out_scheme) {
362 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
365 bool CanonicalizeScheme(const char16_t* spec,
366 const Component& scheme,
368 Component* out_scheme) {
369 return DoScheme<char16_t, char16_t>(spec, scheme, output, out_scheme);
372 bool CanonicalizeUserInfo(const char* username_source,
373 const Component& username,
374 const char* password_source,
375 const Component& password,
377 Component* out_username,
378 Component* out_password) {
379 return DoUserInfo<char, unsigned char>(username_source, username,
380 password_source, password, output,
381 out_username, out_password);
384 bool CanonicalizeUserInfo(const char16_t* username_source,
385 const Component& username,
386 const char16_t* password_source,
387 const Component& password,
389 Component* out_username,
390 Component* out_password) {
391 return DoUserInfo<char16_t, char16_t>(username_source, username,
392 password_source, password, output,
393 out_username, out_password);
396 bool CanonicalizePort(const char* spec,
397 const Component& port,
398 int default_port_for_scheme,
400 Component* out_port) {
401 return DoPort<char, unsigned char>(spec, port, default_port_for_scheme,
405 bool CanonicalizePort(const char16_t* spec,
406 const Component& port,
407 int default_port_for_scheme,
409 Component* out_port) {
410 return DoPort<char16_t, char16_t>(spec, port, default_port_for_scheme, output,
414 void CanonicalizeRef(const char* spec,
415 const Component& ref,
417 Component* out_ref) {
418 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
421 void CanonicalizeRef(const char16_t* spec,
422 const Component& ref,
424 Component* out_ref) {
425 DoCanonicalizeRef<char16_t, char16_t>(spec, ref, output, out_ref);