1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/logging.h"
6 #include "url/url_canon.h"
7 #include "url/url_canon_internal.h"
13 // For reference, here's what IE supports:
14 // Key: 0 (disallowed: failure if present in the input)
15 // + (allowed either escaped or unescaped, and unmodified)
16 // U (allowed escaped or unescaped but always unescaped if present in
//        escaped form)
18 // E (allowed escaped or unescaped but always escaped if present in
//        unescaped form)
20 // % (only allowed escaped in the input, will be unmodified).
21 // I left blank alpha numeric characters.
23 //    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
24 //    -----------------------------------------------
25 // 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
26 // 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
27 // 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
28 // 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
30 // 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
32 // 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
34 // NOTE: I didn't actually test all the control characters. Some may be
35 // disallowed in the input, but they are all accepted escaped except for 0.
36 // I also didn't test if characters affecting HTML parsing are allowed
37 // unescaped, e.g. (") or (#), which would indicate the beginning of the path.
38 // Surprisingly, space is accepted in the input and always escaped.
40 // This table lists the canonical version of all characters we allow in the
41 // input, with 0 indicating it is disallowed. We use the magic kEsc value to
42 // indicate that this character should be escaped. We are a little more
43 // restrictive than IE, but less restrictive than Firefox.
45 // Note that we disallow the % character. We will allow it when part of an
46 // escape sequence, of course, but this disallows "%25". Even though IE allows
47 // it, allowing it would put us in a funny state. If there was an invalid
48 // escape sequence like "%zz", we'll add "%25zz" to the output and fail.
49 // Allowing percents means we'll succeed a second time, so validity would change
50 // based on how many times you run the canonicalizer. We prefer to always report
51 // the same validity, so reject this.
// Sentinel stored in kHostCharLookup for characters that are accepted in a
// host name but must be written to the output percent-escaped.
const unsigned char kEsc = 0xff;

// Canonical form of every 7-bit character that may appear in a host name,
// indexed by the character's code (0x00-0x7f). Each entry is one of:
//   0     - the character is not allowed in a host name,
//   kEsc  - the character is allowed but must be percent-escaped,
//   other - the character to emit; upper-case letters map to lower case.
// '%' itself maps to 0: it is only acceptable as the start of an escape
// sequence, never as a literal host character.
const unsigned char kHostCharLookup[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
   kEsc,kEsc,kEsc,kEsc,kEsc,   0,kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',   0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',   0,kEsc,kEsc,kEsc,   0,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',   0, ']',   0, '_',
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc,   0,   0 };
// Size argument shared by the temporary host buffers in this file: it is the
// second template argument of the two typedefs below and the template argument
// of the RawCanonOutputW that DoIDNHost declares.
70 const int kTempHostBufferLen = 1024;
// Temporary output buffer types: StackBuffer holds char and StackBufferW holds
// base::char16. NOTE(review): presumably kTempHostBufferLen is the inline
// capacity in elements -- confirm against RawCanonOutputT.
71 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
72 typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;
74 // Scans a host name and fills in the output flags according to what we find.
75 // |has_non_ascii| will be true if there are any non-7-bit characters, and
76 // |has_escaped| will be true if there is a percent sign.
//
// UCHAR is the type each character is cast to before it is compared against
// 0x80. Callers in this file pass unsigned char for char input and
// base::char16 for base::char16 input (presumably so that a signed char with
// the high bit set still compares as >= 0x80).
//
// NOTE(review): this listing appears to be missing lines of this function: the
// rest of the parameter list, the definition of |end|, and the statements that
// write |*has_escaped|. Confirm against the complete file.
77 template<typename CHAR, typename UCHAR>
78 void ScanHostname(const CHAR* spec,
79 const Component& host,
83 *has_non_ascii = false;
// Examine every character of the host component, starting at host.begin.
85 for (int i = host.begin; i < end; i++) {
86 if (static_cast<UCHAR>(spec[i]) >= 0x80)
87 *has_non_ascii = true;
88 else if (spec[i] == '%')
93 // Canonicalizes a host name that is entirely 8-bit characters (even though
94 // the type holding them may be 16 bits). Escaped characters will be unescaped.
95 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
//
97 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
// the output.
//
100 // This function is used in two situations:
//
102 // * When the caller knows there is no non-ASCII or percent escaped
103 // characters. This is what DoHost does (via DoHostSubstring). The result will
104 // be a completely canonicalized host since we know nothing weird can happen (escaped
105 // characters could be unescaped to non-7-bit, so they have to be treated
106 // with suspicion at this point). It does not use the |has_non_ascii| flag.
//
108 // * When the caller has an 8-bit string that may need unescaping.
109 // DoComplexHost calls us in this situation to do unescaping and validation.
110 // After this, it may do other IDN operations depending on the value of the
111 // |*has_non_ascii| flag.
//
113 // The return value indicates if the output is a potentially valid host name.
//
// NOTE(review): this listing appears to be missing lines of this function (the
// |host_len| parameter, the success flag, several branch conditions, closing
// braces and the return statement). Confirm against the complete file.
114 template<typename INCHAR, typename OUTCHAR>
115 bool DoSimpleHost(const INCHAR* host,
117 CanonOutputT<OUTCHAR>* output,
118 bool* has_non_ascii) {
119 *has_non_ascii = false;
122 for (int i = 0; i < host_len; ++i) {
123 unsigned int source = host[i];
// NOTE(review): |source| is an unsigned int but is handed to DecodeEscaped
// through an unsigned char*, so presumably only the byte at its lowest address
// receives the decoded value. Which byte of the integer that is depends on the
// platform's byte order -- confirm this is correct on big-endian targets.
125 // Unescape first, if possible.
126 // Source will be used only if decode operation was successful.
127 if (!DecodeEscaped(host, &i, host_len,
128 reinterpret_cast<unsigned char*>(&source))) {
129 // Invalid escaped character. There is nothing that can make this
130 // host valid. We append an escaped percent so the URL looks reasonable
131 // and mark as failed.
132 AppendEscapedChar('%', output);
139 // We have ASCII input, we can use our lookup table.
// kHostCharLookup yields 0 (invalid), kEsc (must be escaped) or the canonical
// character to emit.
140 unsigned char replacement = kHostCharLookup[source];
142 // Invalid character, add it as percent-escaped and mark as failed.
143 AppendEscapedChar(source, output);
145 } else if (replacement == kEsc) {
146 // This character is valid but should be escaped.
147 AppendEscapedChar(source, output);
149 // Common case, the given character is valid in a hostname, the lookup
150 // table tells us the canonical representation of that character (lower
// cased).
152 output->push_back(replacement);
155 // It's a non-ascii char. Just push it to the output.
156 // In case where we have char16 input, and char output it's safe to
157 // cast char16->char only if input string was converted to ASCII.
158 output->push_back(static_cast<OUTCHAR>(source));
159 *has_non_ascii = true;
166 // Canonicalizes a host that requires IDN conversion. Returns true on success.
//
// Visible steps: |src| is run through DoSimpleHost into a temporary wide
// buffer, that buffer is handed to IDNToASCII, and the converted text is run
// through DoSimpleHost again, this time writing into |output|. On the error
// paths an escaped rendering is appended via AppendInvalidNarrowString.
//
// NOTE(review): this listing appears to be missing lines of this function (the
// declaration of |has_non_ascii|, the remaining IDNToASCII argument(s), the
// condition guarding the final rewind, closing braces and the return
// statements). Confirm against the complete file.
167 bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
168 int original_output_len = output->length(); // So we can rewind below.
170 // We need to escape URL before doing IDN conversion, since punycode strings
171 // cannot be escaped after they are created.
172 RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
// The return value of this first pass is not used here.
174 DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
176 StackBufferW wide_output;
177 if (!IDNToASCII(url_escaped_host.data(),
178 url_escaped_host.length(),
180 // Some error, give up. This will write some reasonable looking
181 // representation of the string to the output.
182 AppendInvalidNarrowString(src, 0, src_len, output);
186 // Now we check the ASCII output like a normal host. It will also handle
187 // unescaping. Although we unescaped everything before this function call, if
188 // somebody does %00 as fullwidth, ICU will convert this to ASCII.
189 bool success = DoSimpleHost(wide_output.data(),
190 wide_output.length(),
191 output, &has_non_ascii);
193 // ICU generated something that DoSimpleHost didn't think looked like
194 // ASCII. This is quite rare, but ICU might convert some characters to
195 // percent signs which might generate new escape sequences which might in
196 // turn be invalid. An example is U+FE6A "small percent" which ICU will
197 // name prep into an ASCII percent and then we can interpret the following
198 // characters as escaped characters.
200 // If DoSimpleHost didn't think the output was ASCII, just escape the
201 // thing we gave ICU and give up. DoSimpleHost will have handled a further
202 // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
203 // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
204 // do more (like handle escaped non-ASCII sequences). Handling the escaped
205 // ASCII isn't strictly necessary, but DoSimpleHost handles this case
206 // anyway so we handle it.
// Discard whatever the second DoSimpleHost pass appended before writing the
// escaped fallback.
207 output->set_length(original_output_len);
208 AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
215 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
216 // UTF-16. The has_escaped flag should be set if the input string requires
// unescaping.
//
// Visible flow: optionally unescape |host| into |output| with DoSimpleHost,
// stop early if that fails or leaves only ASCII, otherwise convert the UTF-8
// text to UTF-16, rewind |output| to where it started and delegate to
// DoIDNHost.
//
// NOTE(review): this listing appears to be missing lines of this function (the
// declarations of |utf8_source_len|, |utf16| and |utf8|, the if/else on
// |has_escaped|, closing braces and several return statements). Confirm
// against the complete file.
218 bool DoComplexHost(const char* host, int host_len,
219 bool has_non_ascii, bool has_escaped, CanonOutput* output) {
220 // Save the current position in the output. We may write stuff and rewind it
221 // below, so we need to know where to rewind to.
222 int begin_length = output->length();
224 // Points to the UTF-8 data we want to convert. This will either be the
225 // input or the unescaped version written to |*output| if necessary.
226 const char* utf8_source;
229 // Unescape before converting to UTF-16 for IDN. We write this into the
230 // output because it most likely does not require IDNization, and we can
231 // save another huge stack buffer. It will be replaced below if it requires
232 // IDN. This will also update our non-ASCII flag so we know whether the
233 // unescaped input requires IDN.
234 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
235 // Error with some escape sequence. We'll call the current output
236 // complete. DoSimpleHost will have written some "reasonable" output.
240 // Unescaping may have left us with ASCII input, in which case the
241 // unescaped version we wrote to output is complete.
242 if (!has_non_ascii) {
246 // Save the pointer into the data that was just converted (it may be appended
247 // to other data in the output buffer).
248 utf8_source = &output->data()[begin_length];
249 utf8_source_len = output->length() - begin_length;
251 // We don't need to unescape, use input for IDNization later. (We know the
252 // input has non-ASCII, or the simple version would have been called
// instead of us.)
255 utf8_source_len = host_len;
258 // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
259 // Above, we may have used the output to write the unescaped values to, so
260 // we have to rewind it to where we started after we convert it to UTF-16.
262 if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
263 // In this error case, the input may or may not be the output.
// |utf8_source| may point into |output|'s own storage (see above), so the
// bytes are copied aside before |output| is rewound and appended to.
265 for (int i = 0; i < utf8_source_len; i++)
266 utf8.push_back(utf8_source[i]);
267 output->set_length(begin_length);
268 AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
// Conversion succeeded: drop any unescaped bytes written above so DoIDNHost
// appends starting at the original position.
271 output->set_length(begin_length);
273 // This will call DoSimpleHost which will do normal ASCII canonicalization
274 // and also check for IP addresses in the output.
275 return DoIDNHost(utf16.data(), utf16.length(), output);
278 // UTF-16 convert host to its ASCII version. The set up is already ready for
279 // the backend, so we just pass through. The has_escaped flag should be set if
280 // the input string requires unescaping.
//
// Visible flow: one path converts |host| to UTF-8 and forwards to the 8-bit
// DoComplexHost overload; the other path hands |host| straight to DoIDNHost.
//
// NOTE(review): this listing appears to be missing lines of this function (the
// test on |has_escaped| that selects between the two paths, the declaration
// of |utf8|, closing braces and the failure return). Confirm against the
// complete file.
281 bool DoComplexHost(const base::char16* host, int host_len,
282 bool has_non_ascii, bool has_escaped, CanonOutput* output) {
284 // Yikes, we have escaped characters with wide input. The escaped
285 // characters should be interpreted as UTF-8. To solve this problem,
286 // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
288 // We don't bother to optimize the conversion in the ASCII case (which
289 // *could* just be a copy) and use the UTF-8 path, because it should be
290 // very rare that host names have escaped characters, and it is relatively
291 // fast to do the conversion anyway.
293 if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
// The input could not be converted; write an escaped rendering of it instead.
294 AppendInvalidNarrowString(host, 0, host_len, output);
298 // Once we convert to UTF-8, we can use the 8-bit version of the complex
299 // host handling code above.
300 return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
301 has_escaped, output);
304 // No unescaping necessary, we can safely pass the input to ICU. This
305 // function will only get called if we either have escaped or non-ascii
306 // input, so it's safe to just use ICU now. Even if the input is ASCII,
307 // this function will do the right thing (just slower than we could).
308 return DoIDNHost(host, host_len, output);
// Canonicalizes the |host| component of |spec| into |output|.
//
// ScanHostname classifies the input first. If it contains non-ASCII characters
// or a percent sign, the work is delegated to DoComplexHost and its result is
// returned; otherwise DoSimpleHost is used, and it is asserted (DCHECK) that
// DoSimpleHost saw no non-ASCII characters.
//
// NOTE(review): this listing appears to be missing lines of this function (the
// variable that receives DoSimpleHost's result, closing braces and the final
// return). Confirm against the complete file.
311 template <typename CHAR, typename UCHAR>
312 bool DoHostSubstring(const CHAR* spec,
313 const Component& host,
314 CanonOutput* output) {
315 bool has_non_ascii, has_escaped;
316 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
318 if (has_non_ascii || has_escaped) {
319 return DoComplexHost(&spec[host.begin], host.len, has_non_ascii,
320 has_escaped, output);
324 DoSimpleHost(&spec[host.begin], host.len, output, &has_non_ascii);
325 DCHECK(!has_non_ascii);
// Canonicalizes the |host| component of |spec| into |output| and describes the
// result in |*host_info|.
//
// Visible flow: an empty host is reported as NEUTRAL with a default Component.
// Otherwise DoHostSubstring runs; on success its output is passed to
// CanonicalizeIPAddress and, if that reports an IP address, the text written
// so far is replaced by the canonical IP form. On failure the family is set
// to BROKEN. |host_info->out_host| ends up as the range from the initial
// output length to the final output length.
//
// NOTE(review): this listing appears to be missing lines of this function (the
// |output| parameter, the empty-host test and its return, the else that
// precedes the BROKEN assignment, and closing braces). Confirm against the
// complete file.
329 template <typename CHAR, typename UCHAR>
330 void DoHost(const CHAR* spec,
331 const Component& host,
333 CanonHostInfo* host_info) {
335 // Empty hosts don't need anything.
336 host_info->family = CanonHostInfo::NEUTRAL;
337 host_info->out_host = Component();
341 // Keep track of output's initial length, so we can rewind later.
342 const int output_begin = output->length();
344 if (DoHostSubstring<CHAR, UCHAR>(spec, host, output)) {
345 // After all the other canonicalization, check if we ended up with an IP
346 // address. IP addresses are small, so writing into this temporary buffer
347 // should not cause an allocation.
348 RawCanonOutput<64> canon_ip;
349 CanonicalizeIPAddress(output->data(),
350 MakeRange(output_begin, output->length()),
351 &canon_ip, host_info);
353 // If we got an IPv4/IPv6 address, copy the canonical form back to the
354 // real buffer. Otherwise, it's a hostname or broken IP, in which case
355 // we just leave it in place.
356 if (host_info->IsIPAddress()) {
// Rewind to where this host started, then append the canonical IP text.
357 output->set_length(output_begin);
358 output->Append(canon_ip.data(), canon_ip.length());
361 // Canonicalization failed. Set BROKEN to notify the caller.
362 host_info->family = CanonHostInfo::BROKEN;
365 host_info->out_host = MakeRange(output_begin, output->length());
// 8-bit overload. Runs DoHost<char, unsigned char> on the |host| component of
// |spec|, copies the resulting output range into |*out_host|, and returns
// false only when DoHost reported the host family as BROKEN.
// NOTE(review): the |output| parameter used below and the closing brace are
// not present in this listing -- confirm against the complete file.
370 bool CanonicalizeHost(const char* spec,
371 const Component& host,
373 Component* out_host) {
374 CanonHostInfo host_info;
375 DoHost<char, unsigned char>(spec, host, output, &host_info);
376 *out_host = host_info.out_host;
377 return (host_info.family != CanonHostInfo::BROKEN);
// UTF-16 overload. Runs DoHost<base::char16, base::char16> on the |host|
// component of |spec|, copies the resulting output range into |*out_host|,
// and returns false only when DoHost reported the host family as BROKEN.
// NOTE(review): the |output| parameter used below and the closing brace are
// not present in this listing -- confirm against the complete file.
380 bool CanonicalizeHost(const base::char16* spec,
381 const Component& host,
383 Component* out_host) {
384 CanonHostInfo host_info;
385 DoHost<base::char16, base::char16>(spec, host, output, &host_info);
386 *out_host = host_info.out_host;
387 return (host_info.family != CanonHostInfo::BROKEN);
// 8-bit overload. Forwards to DoHost<char, unsigned char>, which fills in the
// caller-supplied |*host_info| directly; nothing is returned.
// NOTE(review): the |output| parameter used below and the closing brace are
// not present in this listing -- confirm against the complete file.
390 void CanonicalizeHostVerbose(const char* spec,
391 const Component& host,
393 CanonHostInfo* host_info) {
394 DoHost<char, unsigned char>(spec, host, output, host_info);
// UTF-16 overload. Forwards to DoHost<base::char16, base::char16>, which fills
// in the caller-supplied |*host_info| directly; nothing is returned.
// NOTE(review): the |output| parameter used below and the closing brace are
// not present in this listing -- confirm against the complete file.
397 void CanonicalizeHostVerbose(const base::char16* spec,
398 const Component& host,
400 CanonHostInfo* host_info) {
401 DoHost<base::char16, base::char16>(spec, host, output, host_info);
// 8-bit overload. Forwards to DoHostSubstring<char, unsigned char> and returns
// its result unchanged. Unlike CanonicalizeHost, no CanonHostInfo is produced
// and the IP-address step performed by DoHost is not run.
// NOTE(review): the closing brace is not present in this listing.
404 bool CanonicalizeHostSubstring(const char* spec,
405 const Component& host,
406 CanonOutput* output) {
407 return DoHostSubstring<char, unsigned char>(spec, host, output);
// UTF-16 overload. Forwards to DoHostSubstring<base::char16, base::char16> and
// returns its result unchanged. Unlike CanonicalizeHost, no CanonHostInfo is
// produced and the IP-address step performed by DoHost is not run.
// NOTE(review): the closing brace is not present in this listing.
410 bool CanonicalizeHostSubstring(const base::char16* spec,
411 const Component& host,
412 CanonOutput* output) {
413 return DoHostSubstring<base::char16, base::char16>(spec, host, output);