1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Canonicalizers for random bits that aren't big enough for their own files.
9 #include "url/url_canon.h"
10 #include "url/url_canon_internal.h"
// Returns true if the given character should be removed from the middle of a
// URL. Only carriage return, line feed and horizontal tab qualify; every other
// value (including the space character) yields false.
inline bool IsRemovableURLWhitespace(int ch) {
  return ch == '\r' || ch == '\n' || ch == '\t';
}
22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
23 // It sucks that we have to do this, since this takes about 13% of the total URL
24 // canonicalization time.
25 template <typename CHAR>
26 const CHAR* DoRemoveURLWhitespace(const CHAR* input,
28 CanonOutputT<CHAR>* buffer,
30 bool* potentially_dangling_markup) {
31 // Fast verification that there's nothing that needs removal. This is the 99%
32 // case, so we want it to be fast and don't care about impacting the speed
33 // when we do find whitespace.
34 int found_whitespace = false;
35 for (int i = 0; i < input_len; i++) {
36 if (!IsRemovableURLWhitespace(input[i]))
38 found_whitespace = true;
42 if (!found_whitespace) {
43 // Didn't find any whitespace, we don't need to do anything. We can just
44 // return the input as the output.
45 *output_len = input_len;
49 // Skip whitespace removal for `data:` URLs.
51 // TODO(mkwst): Ideally, this would use something like `base::StartsWith`, but
52 // that turns out to be difficult to do correctly given this function's
53 // character type templating.
54 if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' &&
55 input[3] == 'a' && input[4] == ':') {
56 *output_len = input_len;
60 // Remove the whitespace into the new buffer and return it.
61 for (int i = 0; i < input_len; i++) {
62 if (!IsRemovableURLWhitespace(input[i])) {
63 if (potentially_dangling_markup && input[i] == 0x3C)
64 *potentially_dangling_markup = true;
65 buffer->push_back(input[i]);
68 *output_len = buffer->length();
69 return buffer->data();
// Contains the canonical version of each possible input letter in the scheme
// (basically, lower-cased). The corresponding entry will be 0 if the letter
// is not allowed in a scheme.
//
// The table is indexed by the 7-bit ASCII code of the input character, so
// callers must check that the character is below 0x80 before indexing.
const char kSchemeCanonical[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
// Returns true if |c| may start a scheme, i.e. it is an ASCII letter of either
// case. Digits, '+', '-' and '.' are rejected here.
//
// This could be a table lookup as well by setting the high bit for each
// valid character, but it's only called once per URL, and it makes the lookup
// table easier to read not having extra stuff in it.
inline bool IsSchemeFirstChar(unsigned char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
99 template<typename CHAR, typename UCHAR>
100 bool DoScheme(const CHAR* spec,
101 const Component& scheme,
103 Component* out_scheme) {
104 if (scheme.len <= 0) {
105 // Scheme is unspecified or empty, convert to empty by appending a colon.
106 *out_scheme = Component(output->length(), 0);
107 output->push_back(':');
111 // The output scheme starts from the current position.
112 out_scheme->begin = output->length();
114 // Danger: it's important that this code does not strip any characters;
115 // it only emits the canonical version (be it valid or escaped) for each
116 // of the input characters. Stripping would put it out of sync with
117 // FindAndCompareScheme, which could cause some security checks on
118 // schemes to be incorrect.
120 int end = scheme.end();
121 for (int i = scheme.begin; i < end; i++) {
122 UCHAR ch = static_cast<UCHAR>(spec[i]);
123 char replacement = 0;
125 if (i == scheme.begin) {
126 // Need to do a special check for the first letter of the scheme.
127 if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
128 replacement = kSchemeCanonical[ch];
130 replacement = kSchemeCanonical[ch];
135 output->push_back(replacement);
136 } else if (ch == '%') {
137 // Canonicalizing the scheme multiple times should lead to the same
138 // result. Since invalid characters will be escaped, we need to preserve
139 // the percent to avoid multiple escaping. The scheme will be invalid.
141 output->push_back('%');
143 // Invalid character, store it but mark this scheme as invalid.
146 // This will escape the output and also handle encoding issues.
147 // Ignore the return value since we already failed.
148 AppendUTF8EscapedChar(spec, &i, end, output);
152 // The output scheme ends with the the current position, before appending
154 out_scheme->len = output->length() - out_scheme->begin;
155 output->push_back(':');
159 // The username and password components reference ranges in the corresponding
160 // *_spec strings. Typically, these specs will be the same (we're
161 // canonicalizing a single source string), but may be different when
162 // replacing components.
163 template<typename CHAR, typename UCHAR>
164 bool DoUserInfo(const CHAR* username_spec,
165 const Component& username,
166 const CHAR* password_spec,
167 const Component& password,
169 Component* out_username,
170 Component* out_password) {
171 if (username.len <= 0 && password.len <= 0) {
172 // Common case: no user info. We strip empty username/passwords.
173 *out_username = Component();
174 *out_password = Component();
178 // Write the username.
179 out_username->begin = output->length();
180 if (username.len > 0) {
181 // This will escape characters not valid for the username.
182 AppendStringOfType(&username_spec[username.begin], username.len,
183 CHAR_USERINFO, output);
185 out_username->len = output->length() - out_username->begin;
187 // When there is a password, we need the separator. Note that we strip
188 // empty but specified passwords.
189 if (password.len > 0) {
190 output->push_back(':');
191 out_password->begin = output->length();
192 AppendStringOfType(&password_spec[password.begin], password.len,
193 CHAR_USERINFO, output);
194 out_password->len = output->length() - out_password->begin;
196 *out_password = Component();
199 output->push_back('@');
203 // Helper functions for converting port integers to strings.
204 inline void WritePortInt(char* output, int output_len, int port) {
205 _itoa_s(port, output, output_len, 10);
208 // This function will prepend the colon if there will be a port.
209 template<typename CHAR, typename UCHAR>
210 bool DoPort(const CHAR* spec,
211 const Component& port,
212 int default_port_for_scheme,
214 Component* out_port) {
215 int port_num = ParsePort(spec, port);
216 if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
217 *out_port = Component();
218 return true; // Leave port empty.
221 if (port_num == PORT_INVALID) {
222 // Invalid port: We'll copy the text from the input so the user can see
223 // what the error was, and mark the URL as invalid by returning false.
224 output->push_back(':');
225 out_port->begin = output->length();
226 AppendInvalidNarrowString(spec, port.begin, port.end(), output);
227 out_port->len = output->length() - out_port->begin;
231 // Convert port number back to an integer. Max port value is 5 digits, and
232 // the Parsed::ExtractPort will have made sure the integer is in range.
233 const int buf_size = 6;
235 WritePortInt(buf, buf_size, port_num);
237 // Append the port number to the output, preceded by a colon.
238 output->push_back(':');
239 out_port->begin = output->length();
240 for (int i = 0; i < buf_size && buf[i]; i++)
241 output->push_back(buf[i]);
243 out_port->len = output->length() - out_port->begin;
// clang-format off
//   Lookup table, indexed by 7-bit ASCII code, telling whether a character
//   must be percent-escaped when it appears in a ref (fragment).
//
//   Percent-escape all "C0 controls" (0x00-0x1F)
//   https://infra.spec.whatwg.org/#c0-control along with the characters ' '
//   (0x20), '"' (0x22), '<' (0x3C), '>' (0x3E), and '`' (0x60). DEL (0x7F) is
//   escaped as well: the URL Standard's fragment percent-encode set builds on
//   the C0 control percent-encode set, which covers every code point above
//   '~' (0x7E). The entry used to be left out of the initializer and was
//   therefore implicitly false.
const bool kShouldEscapeCharInRef[0x80] = {
  //  Control characters (0x00-0x1F)
  true,   true,   true,   true,   true,   true,   true,   true,
  true,   true,   true,   true,   true,   true,   true,   true,
  true,   true,   true,   true,   true,   true,   true,   true,
  true,   true,   true,   true,   true,   true,   true,   true,

  //  ' '     !       "       #       $       %       &       '
  true,   false,  true,   false,  false,  false,  false,  false,
  //  (       )       *       +       ,       -       .       /
  false,  false,  false,  false,  false,  false,  false,  false,
  //  0       1       2       3       4       5       6       7
  false,  false,  false,  false,  false,  false,  false,  false,
  //  8       9       :       ;       <       =       >       ?
  false,  false,  false,  false,  true,   false,  true,   false,
  //  @       A       B       C       D       E       F       G
  false,  false,  false,  false,  false,  false,  false,  false,
  //  H       I       J       K       L       M       N       O
  false,  false,  false,  false,  false,  false,  false,  false,
  //  P       Q       R       S       T       U       V       W
  false,  false,  false,  false,  false,  false,  false,  false,
  //  X       Y       Z       [       \       ]       ^       _
  false,  false,  false,  false,  false,  false,  false,  false,
  //  `       a       b       c       d       e       f       g
  true,   false,  false,  false,  false,  false,  false,  false,
  //  h       i       j       k       l       m       n       o
  false,  false,  false,  false,  false,  false,  false,  false,
  //  p       q       r       s       t       u       v       w
  false,  false,  false,  false,  false,  false,  false,  false,
  //  x       y       z       {       |       }       ~       DEL
  false,  false,  false,  false,  false,  false,  false,  true
};
// clang-format on
284 template<typename CHAR, typename UCHAR>
285 void DoCanonicalizeRef(const CHAR* spec,
286 const Component& ref,
288 Component* out_ref) {
290 // Common case of no ref.
291 *out_ref = Component();
295 // Append the ref separator. Note that we need to do this even when the ref
296 // is empty but present.
297 output->push_back('#');
298 out_ref->begin = output->length();
300 // Now iterate through all the characters, converting to UTF-8 and validating.
302 for (int i = ref.begin; i < end; i++) {
304 // IE just strips NULLs, so we do too.
308 UCHAR current_char = static_cast<UCHAR>(spec[i]);
309 if (current_char < 0x80) {
310 if (kShouldEscapeCharInRef[current_char])
311 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
313 output->push_back(static_cast<char>(spec[i]));
315 AppendUTF8EscapedChar(spec, &i, end, output);
319 out_ref->len = output->length() - out_ref->begin;
324 const char* RemoveURLWhitespace(const char* input,
326 CanonOutputT<char>* buffer,
328 bool* potentially_dangling_markup) {
329 return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
330 potentially_dangling_markup);
333 const base::char16* RemoveURLWhitespace(const base::char16* input,
335 CanonOutputT<base::char16>* buffer,
337 bool* potentially_dangling_markup) {
338 return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
339 potentially_dangling_markup);
342 char CanonicalSchemeChar(base::char16 ch) {
344 return 0; // Non-ASCII is not supported by schemes.
345 return kSchemeCanonical[ch];
348 bool CanonicalizeScheme(const char* spec,
349 const Component& scheme,
351 Component* out_scheme) {
352 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
355 bool CanonicalizeScheme(const base::char16* spec,
356 const Component& scheme,
358 Component* out_scheme) {
359 return DoScheme<base::char16, base::char16>(spec, scheme, output, out_scheme);
362 bool CanonicalizeUserInfo(const char* username_source,
363 const Component& username,
364 const char* password_source,
365 const Component& password,
367 Component* out_username,
368 Component* out_password) {
369 return DoUserInfo<char, unsigned char>(
370 username_source, username, password_source, password,
371 output, out_username, out_password);
374 bool CanonicalizeUserInfo(const base::char16* username_source,
375 const Component& username,
376 const base::char16* password_source,
377 const Component& password,
379 Component* out_username,
380 Component* out_password) {
381 return DoUserInfo<base::char16, base::char16>(
382 username_source, username, password_source, password,
383 output, out_username, out_password);
386 bool CanonicalizePort(const char* spec,
387 const Component& port,
388 int default_port_for_scheme,
390 Component* out_port) {
391 return DoPort<char, unsigned char>(spec, port,
392 default_port_for_scheme,
396 bool CanonicalizePort(const base::char16* spec,
397 const Component& port,
398 int default_port_for_scheme,
400 Component* out_port) {
401 return DoPort<base::char16, base::char16>(spec, port, default_port_for_scheme,
405 void CanonicalizeRef(const char* spec,
406 const Component& ref,
408 Component* out_ref) {
409 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
412 void CanonicalizeRef(const base::char16* spec,
413 const Component& ref,
415 Component* out_ref) {
416 DoCanonicalizeRef<base::char16, base::char16>(spec, ref, output, out_ref);