1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "url/url_canon_ip.h"
12 #include "base/check.h"
13 #include "url/url_canon_internal.h"
14 #include "url/url_features.h"
20 // Converts one of the character types that represent a numerical base to the
21 // corresponding base.
22 int BaseForType(SharedCharTypes type) {
35 // Converts an IPv4 component to a 32-bit number, while checking for overflow.
37 // Possible return values:
38 // - IPV4 - The number was valid, and did not overflow.
39 // - BROKEN - The input was numeric, but too large for a 32-bit field.
40 // - NEUTRAL - Input was not numeric.
42 // The input is assumed to be ASCII. Empty components are reported as NEUTRAL.
43 template<typename CHAR>
44 CanonHostInfo::Family IPv4ComponentToNumber(const CHAR* spec,
45 const Component& component,
47 // Empty components are considered non-numeric.
48 if (component.is_empty())
49 return CanonHostInfo::NEUTRAL;
51 // Figure out the base
53 int base_prefix_len = 0; // Size of the prefix for this base.
54 if (spec[component.begin] == '0') {
55 // Either hex or octal, or a standalone zero.
56 if (component.len == 1) {
58 } else if (spec[component.begin + 1] == 'X' ||
59 spec[component.begin + 1] == 'x') {
70 // Extend the prefix to consume all leading zeros.
71 while (base_prefix_len < component.len &&
72 spec[component.begin + base_prefix_len] == '0')
75 // Put the component, minus any base prefix, into a NULL-terminated buffer so
76 // we can call the standard library. Because leading zeros have already been
77 // discarded, filling the entire buffer is guaranteed to trigger the 32-bit
79 const int kMaxComponentLen = 16;
80 char buf[kMaxComponentLen + 1]; // digits + '\0'
82 bool may_be_broken_octal = false;
83 for (int i = component.begin + base_prefix_len; i < component.end(); i++) {
85 return CanonHostInfo::NEUTRAL;
87 // We know the input is 7-bit, so convert to narrow (if this is the wide
88 // version of the template) by casting.
89 char input = static_cast<char>(spec[i]);
91 // Validate that this character is OK for the given base.
92 if (!IsCharOfType(input, base)) {
93 if (IsCharOfType(input, CHAR_DEC)) {
94 // Entirely numeric components with leading 0s that aren't octal are
96 may_be_broken_octal = true;
98 return CanonHostInfo::NEUTRAL;
102 // Fill the buffer, if there's space remaining. This check allows us to
103 // verify that all characters are numeric, even those that don't fit.
104 if (dest_i < kMaxComponentLen)
105 buf[dest_i++] = input;
108 if (may_be_broken_octal)
109 return CanonHostInfo::BROKEN;
113 // Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal
114 // number can overflow a 64-bit number in <= 16 characters).
115 uint64_t num = _strtoui64(buf, NULL, BaseForType(base));
117 // Check for 32-bit overflow.
118 if (num > std::numeric_limits<uint32_t>::max())
119 return CanonHostInfo::BROKEN;
121 // No overflow. Success!
122 *number = static_cast<uint32_t>(num);
123 return CanonHostInfo::IPV4;
126 // See declaration of IPv4AddressToNumber for documentation.
127 template <typename CHAR, typename UCHAR>
128 CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec,
130 unsigned char address[4],
131 int* num_ipv4_components) {
132 // Ignore terminal dot, if present.
133 if (host.is_nonempty() && spec[host.end() - 1] == '.')
136 // Do nothing if empty.
138 return CanonHostInfo::NEUTRAL;
140 // Read component values. The first `existing_components` of them are
141 // populated front to back, with the first one corresponding to the last
142 // component, which allows for early exit if the last component isn't a
144 uint32_t component_values[4];
145 int existing_components = 0;
147 int current_component_end = host.end();
148 int current_position = current_component_end;
150 // If this is not the first character of a component, go to the next
152 if (current_position != host.begin && spec[current_position - 1] != '.') {
157 CanonHostInfo::Family family = IPv4ComponentToNumber(
159 Component(current_position, current_component_end - current_position),
160 &component_values[existing_components]);
162 // If `family` is NEUTRAL and this is the last component, return NEUTRAL. If
163 // `family` is NEUTRAL but not the last component, this is considered a
164 // BROKEN IPv4 address, as opposed to a non-IPv4 hostname.
165 if (family == CanonHostInfo::NEUTRAL && existing_components == 0)
166 return CanonHostInfo::NEUTRAL;
168 if (family != CanonHostInfo::IPV4)
169 return CanonHostInfo::BROKEN;
171 ++existing_components;
173 // If this is the final component, nothing else to do.
174 if (current_position == host.begin)
177 // If there are more than 4 components, fail.
178 if (existing_components == 4)
179 return CanonHostInfo::BROKEN;
181 current_component_end = current_position - 1;
185 // Use `component_values` to fill out the 4-component IP address.
187 // First, process all components but the last, while making sure each fits
188 // within an 8-bit field.
189 for (int i = existing_components - 1; i > 0; i--) {
190 if (component_values[i] > std::numeric_limits<uint8_t>::max())
191 return CanonHostInfo::BROKEN;
192 address[existing_components - i - 1] =
193 static_cast<unsigned char>(component_values[i]);
196 uint32_t last_value = component_values[0];
197 for (int i = 3; i >= existing_components - 1; i--) {
198 address[i] = static_cast<unsigned char>(last_value);
202 // If the last component has residual bits, report overflow.
204 return CanonHostInfo::BROKEN;
206 // Tell the caller how many components we saw.
207 *num_ipv4_components = existing_components;
210 return CanonHostInfo::IPV4;
213 // Return true if we've made a final IPV4/BROKEN decision, false if the result
214 // is NEUTRAL, and we could use a second opinion.
215 template<typename CHAR, typename UCHAR>
216 bool DoCanonicalizeIPv4Address(const CHAR* spec,
217 const Component& host,
219 CanonHostInfo* host_info) {
220 host_info->family = IPv4AddressToNumber(
221 spec, host, host_info->address, &host_info->num_ipv4_components);
223 switch (host_info->family) {
224 case CanonHostInfo::IPV4:
225 // Definitely an IPv4 address.
226 host_info->out_host.begin = output->length();
227 AppendIPv4Address(host_info->address, output);
228 host_info->out_host.len = output->length() - host_info->out_host.begin;
230 case CanonHostInfo::BROKEN:
231 // Definitely broken.
234 // Could be IPv6 or a hostname.
239 // Helper class that describes the main components of an IPv6 input string.
240 // See the following examples to understand how it breaks up an input string:
242 // [Example 1]: input = "[::aa:bb]"
243 // ==> num_hex_components = 2
244 // ==> hex_components[0] = Component(3,2) "aa"
245 // ==> hex_components[1] = Component(6,2) "bb"
246 // ==> index_of_contraction = 0
247 // ==> ipv4_component = Component(0, -1)
249 // [Example 2]: input = "[1:2::3:4:5]"
250 // ==> num_hex_components = 5
251 // ==> hex_components[0] = Component(1,1) "1"
252 // ==> hex_components[1] = Component(3,1) "2"
253 // ==> hex_components[2] = Component(6,1) "3"
254 // ==> hex_components[3] = Component(8,1) "4"
255 // ==> hex_components[4] = Component(10,1) "5"
256 // ==> index_of_contraction = 2
257 // ==> ipv4_component = Component(0, -1)
259 // [Example 3]: input = "[::ffff:192.168.0.1]"
260 // ==> num_hex_components = 1
261 // ==> hex_components[0] = Component(3,4) "ffff"
262 // ==> index_of_contraction = 0
263 // ==> ipv4_component = Component(8, 11) "192.168.0.1"
265 // [Example 4]: input = "[1::]"
266 // ==> num_hex_components = 1
267 // ==> hex_components[0] = Component(1,1) "1"
268 // ==> index_of_contraction = 1
269 // ==> ipv4_component = Component(0, -1)
271 // [Example 5]: input = "[::192.168.0.1]"
272 // ==> num_hex_components = 0
273 // ==> index_of_contraction = 0
274 // ==> ipv4_component = Component(8, 11) "192.168.0.1"
277 // Zero-out the parse information.
279 num_hex_components = 0;
280 index_of_contraction = -1;
281 ipv4_component.reset();
284 // There can be up to 8 hex components (colon separated) in the literal.
285 Component hex_components[8];
287 // The count of hex components present. Ranges from [0,8].
288 int num_hex_components;
290 // The index of the hex component that the "::" contraction precedes, or
291 // -1 if there is no contraction.
292 int index_of_contraction;
294 // The range of characters which are an IPv4 literal.
295 Component ipv4_component;
298 // Parse the IPv6 input string. If parsing succeeded returns true and fills
299 // |parsed| with the information. If parsing failed (because the input is
300 // invalid) returns false.
301 template<typename CHAR, typename UCHAR>
302 bool DoParseIPv6(const CHAR* spec, const Component& host, IPv6Parsed* parsed) {
303 // Zero-out the info.
309 // The index for start and end of address range (no brackets).
310 int begin = host.begin;
311 int end = host.end();
313 int cur_component_begin = begin; // Start of the current component.
315 // Scan through the input, searching for hex components, "::" contractions,
316 // and IPv4 components.
317 for (int i = begin; /* i <= end */; i++) {
318 bool is_colon = spec[i] == ':';
319 bool is_contraction = is_colon && i < end - 1 && spec[i + 1] == ':';
321 // We reached the end of the current component if we encounter a colon
322 // (separator between hex components, or start of a contraction), or end of
324 if (is_colon || i == end) {
325 int component_len = i - cur_component_begin;
327 // A component should not have more than 4 hex digits.
328 if (component_len > 4)
331 // Don't allow empty components.
332 if (component_len == 0) {
333 // The exception is when contractions appear at beginning of the
334 // input or at the end of the input.
335 if (!((is_contraction && i == begin) || (i == end &&
336 parsed->index_of_contraction == parsed->num_hex_components)))
340 // Add the hex component we just found to running list.
341 if (component_len > 0) {
342 // Can't have more than 8 components!
343 if (parsed->num_hex_components >= 8)
346 parsed->hex_components[parsed->num_hex_components++] =
347 Component(cur_component_begin, component_len);
352 break; // Reached the end of the input, DONE.
354 // We found a "::" contraction.
355 if (is_contraction) {
356 // There can be at most one contraction in the literal.
357 if (parsed->index_of_contraction != -1)
359 parsed->index_of_contraction = parsed->num_hex_components;
360 ++i; // Consume the colon we peeked.
364 // Colons are separators between components, keep track of where the
365 // current component started (after this colon).
366 cur_component_begin = i + 1;
368 if (static_cast<UCHAR>(spec[i]) >= 0x80)
369 return false; // Not ASCII.
371 if (!IsHexChar(static_cast<unsigned char>(spec[i]))) {
372 // Regular components are hex numbers. It is also possible for
373 // a component to be an IPv4 address in dotted form.
374 if (IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
375 // Since IPv4 address can only appear at the end, assume the rest
376 // of the string is an IPv4 address. (We will parse this separately
378 parsed->ipv4_component =
379 Component(cur_component_begin, end - cur_component_begin);
382 // The character was neither a hex digit, nor an IPv4 character.
392 // Verifies the parsed IPv6 information, checking that the various components
393 // add up to the right number of bits (hex components are 16 bits, while
394 // embedded IPv4 formats are 32 bits, and contractions are placeholders for
395 // 16 or more bits). Returns true if sizes match up, false otherwise. On
396 // success writes the length of the contraction (if any) to
397 // |out_num_bytes_of_contraction|.
398 bool CheckIPv6ComponentsSize(const IPv6Parsed& parsed,
399 int* out_num_bytes_of_contraction) {
400 // Each group of four hex digits contributes 16 bits.
401 int num_bytes_without_contraction = parsed.num_hex_components * 2;
403 // If an IPv4 address was embedded at the end, it contributes 32 bits.
404 if (parsed.ipv4_component.is_valid())
405 num_bytes_without_contraction += 4;
407 // If there was a "::" contraction, its size is going to be:
408 // MAX(2 bytes, 16 bytes - num_bytes_without_contraction).
409 int num_bytes_of_contraction = 0;
410 if (parsed.index_of_contraction != -1) {
411 num_bytes_of_contraction = 16 - num_bytes_without_contraction;
412 if (num_bytes_of_contraction < 2)
413 num_bytes_of_contraction = 2;
416 // Check that the numbers add up.
417 if (num_bytes_without_contraction + num_bytes_of_contraction != 16)
420 *out_num_bytes_of_contraction = num_bytes_of_contraction;
424 // Converts a hex component into a number. This cannot fail since the caller has
425 // already verified that each character in the string was a hex digit, and
426 // that there were no more than 4 characters.
427 template <typename CHAR>
428 uint16_t IPv6HexComponentToNumber(const CHAR* spec,
429 const Component& component) {
430 DCHECK(component.len <= 4);
432 // Copy the hex string into a C-string.
434 for (int i = 0; i < component.len; ++i)
435 buf[i] = static_cast<char>(spec[component.begin + i]);
436 buf[component.len] = '\0';
438 // Convert it to a number (overflow is not possible, since with 4 hex
439 // characters we can at most have a 16 bit number).
440 return static_cast<uint16_t>(_strtoui64(buf, NULL, 16));
443 // Converts an IPv6 address to a 128-bit number (network byte order), returning
444 // true on success. False means that the input was not a valid IPv6 address.
445 template<typename CHAR, typename UCHAR>
446 bool DoIPv6AddressToNumber(const CHAR* spec,
447 const Component& host,
448 unsigned char address[16]) {
449 // Make sure the component is bounded by '[' and ']'.
450 int end = host.end();
451 if (host.is_empty() || spec[host.begin] != '[' || spec[end - 1] != ']')
454 // Exclude the square brackets.
455 Component ipv6_comp(host.begin + 1, host.len - 2);
457 // Parse the IPv6 address -- identify where all the colon separated hex
458 // components are, the "::" contraction, and the embedded IPv4 address.
459 IPv6Parsed ipv6_parsed;
460 if (!DoParseIPv6<CHAR, UCHAR>(spec, ipv6_comp, &ipv6_parsed))
463 // Do some basic size checks to make sure that the address doesn't
464 // specify more than 128 bits or fewer than 128 bits. This also resolves
465 // how many zero bytes the "::" contraction represents.
466 int num_bytes_of_contraction;
467 if (!CheckIPv6ComponentsSize(ipv6_parsed, &num_bytes_of_contraction))
470 int cur_index_in_address = 0;
472 // Loop through each hex component and the contraction, in order.
473 for (int i = 0; i <= ipv6_parsed.num_hex_components; ++i) {
474 // Append the contraction if it appears before this component.
475 if (i == ipv6_parsed.index_of_contraction) {
476 for (int j = 0; j < num_bytes_of_contraction; ++j)
477 address[cur_index_in_address++] = 0;
479 // Append the hex component's value.
480 if (i != ipv6_parsed.num_hex_components) {
481 // Get the 16-bit value for this hex component.
482 uint16_t number = IPv6HexComponentToNumber<CHAR>(
483 spec, ipv6_parsed.hex_components[i]);
484 // Append to |address|, in network byte order.
485 address[cur_index_in_address++] = (number & 0xFF00) >> 8;
486 address[cur_index_in_address++] = (number & 0x00FF);
490 // If there was an IPv4 section, convert it into a 32-bit number and append
492 if (ipv6_parsed.ipv4_component.is_valid()) {
493 // Append the 32-bit number to |address|.
494 int num_ipv4_components = 0;
495 // IPv4AddressToNumber will remove the trailing dot from the component.
496 bool trailing_dot = ipv6_parsed.ipv4_component.is_nonempty() &&
497 spec[ipv6_parsed.ipv4_component.end() - 1] == '.';
498 // The URL standard requires the embedded IPv4 address to be concisely
499 // composed of 4 parts and disallows terminal dots.
500 // See https://url.spec.whatwg.org/#concept-ipv6-parser
501 if (CanonHostInfo::IPV4 !=
502 IPv4AddressToNumber(spec, ipv6_parsed.ipv4_component,
503 &address[cur_index_in_address],
504 &num_ipv4_components)) {
507 if ((num_ipv4_components != 4 || trailing_dot)) {
515 // Searches for the longest sequence of zeros in |address|, and writes the
516 // range into |contraction_range|. The run of zeros must be longer than 16 bits,
517 // and if there is a tie the first is chosen.
518 void ChooseIPv6ContractionRange(const unsigned char address[16],
519 Component* contraction_range) {
520 // The longest run of zeros in |address| seen so far.
523 // The current run of zeros in |address| being iterated over.
526 for (int i = 0; i < 16; i += 2) {
527 // Test for 16 bits worth of zero.
528 bool is_zero = (address[i] == 0 && address[i + 1] == 0);
531 // Add the zero to the current range (or start a new one).
532 if (!cur_range.is_valid())
533 cur_range = Component(i, 0);
537 if (!is_zero || i == 14) {
538 // Just completed a run of zeros. If the run is greater than 16 bits,
539 // it is a candidate for the contraction.
540 if (cur_range.len > 2 && cur_range.len > max_range.len) {
541 max_range = cur_range;
546 *contraction_range = max_range;
549 // Return true if we've made a final IPV6/BROKEN decision, false if the result
550 // is NEUTRAL, and we could use a second opinion.
551 template<typename CHAR, typename UCHAR>
552 bool DoCanonicalizeIPv6Address(const CHAR* spec,
553 const Component& host,
555 CanonHostInfo* host_info) {
556 // Turn the IP address into a 128 bit number.
557 if (!IPv6AddressToNumber(spec, host, host_info->address)) {
558 // If it's not an IPv6 address, scan for characters that should *only*
559 // exist in an IPv6 address.
560 for (int i = host.begin; i < host.end(); i++) {
565 host_info->family = CanonHostInfo::BROKEN;
570 // No invalid characters. Could still be IPv4 or a hostname.
571 host_info->family = CanonHostInfo::NEUTRAL;
575 host_info->out_host.begin = output->length();
576 output->push_back('[');
577 AppendIPv6Address(host_info->address, output);
578 output->push_back(']');
579 host_info->out_host.len = output->length() - host_info->out_host.begin;
581 host_info->family = CanonHostInfo::IPV6;
587 void AppendIPv4Address(const unsigned char address[4], CanonOutput* output) {
588 for (int i = 0; i < 4; i++) {
590 _itoa_s(address[i], str, 10);
592 for (int ch = 0; str[ch] != 0; ch++)
593 output->push_back(str[ch]);
596 output->push_back('.');
600 void AppendIPv6Address(const unsigned char address[16], CanonOutput* output) {
601 // We will output the address according to the rules in:
602 // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4
604 // Start by finding where to place the "::" contraction (if any).
605 Component contraction_range;
606 ChooseIPv6ContractionRange(address, &contraction_range);
608 for (int i = 0; i <= 14;) {
609 // We check 2 bytes at a time, from bytes (0, 1) to (14, 15), inclusive.
611 if (i == contraction_range.begin && contraction_range.len > 0) {
612 // Jump over the contraction.
614 output->push_back(':');
615 output->push_back(':');
616 i = contraction_range.end();
618 // Consume the next 16 bits from |address|.
619 int x = address[i] << 8 | address[i + 1];
623 // Stringify the 16 bit number (at most requires 4 hex digits).
626 for (int ch = 0; str[ch] != 0; ++ch)
627 output->push_back(str[ch]);
629 // Put a colon after each number, except the last.
631 output->push_back(':');
636 void CanonicalizeIPAddress(const char* spec,
637 const Component& host,
639 CanonHostInfo* host_info) {
640 if (DoCanonicalizeIPv4Address<char, unsigned char>(
641 spec, host, output, host_info))
643 if (DoCanonicalizeIPv6Address<char, unsigned char>(
644 spec, host, output, host_info))
648 void CanonicalizeIPAddress(const char16_t* spec,
649 const Component& host,
651 CanonHostInfo* host_info) {
652 if (DoCanonicalizeIPv4Address<char16_t, char16_t>(spec, host, output,
655 if (DoCanonicalizeIPv6Address<char16_t, char16_t>(spec, host, output,
660 CanonHostInfo::Family IPv4AddressToNumber(const char* spec,
661 const Component& host,
662 unsigned char address[4],
663 int* num_ipv4_components) {
664 return DoIPv4AddressToNumber<char, unsigned char>(spec, host, address,
665 num_ipv4_components);
668 CanonHostInfo::Family IPv4AddressToNumber(const char16_t* spec,
669 const Component& host,
670 unsigned char address[4],
671 int* num_ipv4_components) {
672 return DoIPv4AddressToNumber<char16_t, char16_t>(spec, host, address,
673 num_ipv4_components);
676 bool IPv6AddressToNumber(const char* spec,
677 const Component& host,
678 unsigned char address[16]) {
679 return DoIPv6AddressToNumber<char, unsigned char>(spec, host, address);
682 bool IPv6AddressToNumber(const char16_t* spec,
683 const Component& host,
684 unsigned char address[16]) {
685 return DoIPv6AddressToNumber<char16_t, char16_t>(spec, host, address);