-// Copyright 2013 The Chromium Authors. All rights reserved.
+// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <stdlib.h>
#include <string.h>
+#include <string_view>
+
#include "base/component_export.h"
#include "base/export_template.h"
-#include "base/strings/string16.h"
+#include "base/memory/raw_ptr_exclusion.h"
+#include "base/numerics/clamped_math.h"
#include "url/third_party/mozilla/url_parse.h"
namespace url {
// resize function that is called when the existing buffer is not big enough.
// The derived class is then in charge of setting up our buffer which we will
// manage.
-template<typename T>
+template <typename T>
class CanonOutputT {
public:
- CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) {
- }
- virtual ~CanonOutputT() {
- }
+ CanonOutputT() = default;
+ virtual ~CanonOutputT() = default;
// Implemented to resize the buffer. This function should update the buffer
// pointer to point to the new buffer, and any old data up to |cur_len_| in
// the buffer must be copied over.
//
// The new size |sz| must be larger than buffer_len_.
- virtual void Resize(int sz) = 0;
+ virtual void Resize(size_t sz) = 0;
// Accessor for returning a character at a given position. The input offset
// must be in the valid range.
- inline T at(int offset) const {
- return buffer_[offset];
- }
+ inline T at(size_t offset) const { return buffer_[offset]; }
// Sets the character at the given position. The given position MUST be less
// than the length().
- inline void set(int offset, T ch) {
- buffer_[offset] = ch;
- }
+ inline void set(size_t offset, T ch) { buffer_[offset] = ch; }
// Returns the number of characters currently in the buffer.
- inline int length() const {
- return cur_len_;
- }
+ inline size_t length() const { return cur_len_; }
// Returns the current capacity of the buffer. The length() is the number of
// characters that have been declared to be written, but the capacity() is
// the number that can be written without reallocation. If the caller must
// write many characters at once, it can make sure there is enough capacity,
// write the data, then use set_size() to declare the new length().
- int capacity() const {
- return buffer_len_;
+ size_t capacity() const { return buffer_len_; }
+
+ // Returns the contents of the buffer as a string_view.
+ std::basic_string_view<T> view() const {
+ return std::basic_string_view<T>(data(), length());
}
// Called by the user of this class to get the output. The output will NOT
// be NULL-terminated. Call length() to get the
// length.
- const T* data() const {
- return buffer_;
- }
- T* data() {
- return buffer_;
- }
+ const T* data() const { return buffer_; }
+ T* data() { return buffer_; }
// Shortens the URL to the new length. Used for "backing up" when processing
// relative paths. This can also be used if an external function writes a lot
// to declare the new length.
//
// This MUST NOT be used to expand the size of the buffer beyond capacity().
- void set_length(int new_len) {
- cur_len_ = new_len;
- }
+ void set_length(size_t new_len) { cur_len_ = new_len; }
// This is the most performance critical function, since it is called for
// every character.
}
// Appends the given string to the output.
- void Append(const T* str, int str_len) {
- if (cur_len_ + str_len > buffer_len_) {
- if (!Grow(cur_len_ + str_len - buffer_len_))
+ void Append(const T* str, size_t str_len) {
+ if (str_len > buffer_len_ - cur_len_) {
+ if (!Grow(str_len - (buffer_len_ - cur_len_)))
return;
}
- for (int i = 0; i < str_len; i++)
- buffer_[cur_len_ + i] = str[i];
+ memcpy(buffer_ + cur_len_, str, str_len * sizeof(T));
cur_len_ += str_len;
}
- void ReserveSizeIfNeeded(int estimated_size) {
+ void Append(std::basic_string_view<T> str) { Append(str.data(), str.size()); }
+
+ void ReserveSizeIfNeeded(size_t estimated_size) {
// Reserve a bit extra to account for escaped chars.
if (estimated_size > buffer_len_)
- Resize(estimated_size + 8);
+ Resize((base::ClampedNumeric<size_t>(estimated_size) + 8).RawValue());
}
protected:
// Grows the given buffer so that it can fit at least |min_additional|
// characters. Returns true if the buffer could be resized, false on OOM.
- bool Grow(int min_additional) {
- static const int kMinBufferLen = 16;
- int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;
+ bool Grow(size_t min_additional) {
+ static const size_t kMinBufferLen = 16;
+ size_t new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;
do {
if (new_len >= (1 << 30)) // Prevent overflow below.
return false;
return true;
}
- T* buffer_;
- int buffer_len_;
+ // `buffer_` is not a raw_ptr<...> for performance reasons (based on analysis
+ // of sampling profiler data).
+ RAW_PTR_EXCLUSION T* buffer_ = nullptr;
+ size_t buffer_len_ = 0;
// Used characters in the buffer.
- int cur_len_;
+ size_t cur_len_ = 0;
};
// Simple implementation of the CanonOutput using new[]. This class
// also supports a static buffer so if it is allocated on the stack, most
// URLs can be canonicalized with no heap allocations.
-template<typename T, int fixed_capacity = 1024>
+template <typename T, int fixed_capacity = 1024>
class RawCanonOutputT : public CanonOutputT<T> {
public:
RawCanonOutputT() : CanonOutputT<T>() {
delete[] this->buffer_;
}
- void Resize(int sz) override {
+ void Resize(size_t sz) override {
T* new_buf = new T[sz];
memcpy(new_buf, this->buffer_,
sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz));
extern template class EXPORT_TEMPLATE_DECLARE(COMPONENT_EXPORT(URL))
CanonOutputT<char>;
extern template class EXPORT_TEMPLATE_DECLARE(COMPONENT_EXPORT(URL))
- CanonOutputT<base::char16>;
+ CanonOutputT<char16_t>;
// Normally, all canonicalization output is in narrow characters. We support
// the templates so it can also be used internally if a wide buffer is
// required.
typedef CanonOutputT<char> CanonOutput;
-typedef CanonOutputT<base::char16> CanonOutputW;
+typedef CanonOutputT<char16_t> CanonOutputW;
-template<int fixed_capacity>
+template <int fixed_capacity>
class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};
-template<int fixed_capacity>
-class RawCanonOutputW : public RawCanonOutputT<base::char16, fixed_capacity> {};
+template <int fixed_capacity>
+class RawCanonOutputW : public RawCanonOutputT<char16_t, fixed_capacity> {};
// Character set converter ----------------------------------------------------
//
// decimal, (such as "&#20320;") with escaping of the ampersand, number
// sign, and semicolon (in the previous example it would be
// "%26%2320320%3B"). This rule is based on what IE does in this situation.
- virtual void ConvertFromUTF16(const base::char16* input,
+ virtual void ConvertFromUTF16(const char16_t* input,
int input_len,
CanonOutput* output) = 0;
};
int* output_len,
bool* potentially_dangling_markup);
COMPONENT_EXPORT(URL)
-const base::char16* RemoveURLWhitespace(const base::char16* input,
- int input_len,
- CanonOutputT<base::char16>* buffer,
- int* output_len,
- bool* potentially_dangling_markup);
+const char16_t* RemoveURLWhitespace(const char16_t* input,
+ int input_len,
+ CanonOutputT<char16_t>* buffer,
+ int* output_len,
+ bool* potentially_dangling_markup);
// IDN ------------------------------------------------------------------------
//
// On error, returns false. The output in this case is undefined.
COMPONENT_EXPORT(URL)
-bool IDNToASCII(const base::char16* src, int src_len, CanonOutputW* output);
+bool IDNToASCII(std::u16string_view src, CanonOutputW* output);
// Piece-by-piece canonicalizers ----------------------------------------------
//
// These individual canonicalizers append the canonicalized versions of the
-// corresponding URL component to the given std::string. The spec and the
+// corresponding URL component to the given CanonOutput. The spec and the
// previously-identified range of that component are the input. The range of
// the canonicalized component will be written to the output component.
//
CanonOutput* output,
Component* out_scheme);
COMPONENT_EXPORT(URL)
-bool CanonicalizeScheme(const base::char16* spec,
+bool CanonicalizeScheme(const char16_t* spec,
const Component& scheme,
CanonOutput* output,
Component* out_scheme);
Component* out_username,
Component* out_password);
COMPONENT_EXPORT(URL)
-bool CanonicalizeUserInfo(const base::char16* username_source,
+bool CanonicalizeUserInfo(const char16_t* username_source,
const Component& username,
- const base::char16* password_source,
+ const char16_t* password_source,
const Component& password,
CanonOutput* output,
Component* out_username,
// This field summarizes how the input was classified by the canonicalizer.
enum Family {
- NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
- // canonicalizer is concerned, it should be treated as a
- // hostname.
- BROKEN, // - Almost an IP, but was not canonicalized. This could be an
- // IPv4 address where truncation occurred, or something
- // containing the special characters :[] which did not parse
- // as an IPv6 address. Never attempt to connect to this
- // address, because it might actually succeed!
- IPV4, // - Successfully canonicalized as an IPv4 address.
- IPV6, // - Successfully canonicalized as an IPv6 address.
+ NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
+ // canonicalizer is concerned, it should be treated as a
+ // hostname.
+ BROKEN, // - Almost an IP, but was not canonicalized. This could be an
+ // IPv4 address where truncation occurred, or something
+ // containing the special characters :[] which did not parse
+ // as an IPv6 address. Never attempt to connect to this
+ // address, because it might actually succeed!
+ IPV4, // - Successfully canonicalized as an IPv4 address.
+ IPV6, // - Successfully canonicalized as an IPv6 address.
};
Family family;
}
};
-
// Host.
//
// The 8-bit version requires UTF-8 encoding. Use this version when you only
CanonOutput* output,
Component* out_host);
COMPONENT_EXPORT(URL)
-bool CanonicalizeHost(const base::char16* spec,
+bool CanonicalizeHost(const char16_t* spec,
const Component& host,
CanonOutput* output,
Component* out_host);
CanonOutput* output,
CanonHostInfo* host_info);
COMPONENT_EXPORT(URL)
-void CanonicalizeHostVerbose(const base::char16* spec,
+void CanonicalizeHostVerbose(const char16_t* spec,
const Component& host,
CanonOutput* output,
CanonHostInfo* host_info);
const Component& host,
CanonOutput* output);
COMPONENT_EXPORT(URL)
-bool CanonicalizeHostSubstring(const base::char16* spec,
+bool CanonicalizeHostSubstring(const char16_t* spec,
const Component& host,
CanonOutput* output);
CanonOutput* output,
CanonHostInfo* host_info);
COMPONENT_EXPORT(URL)
-void CanonicalizeIPAddress(const base::char16* spec,
+void CanonicalizeIPAddress(const char16_t* spec,
const Component& host,
CanonOutput* output,
CanonHostInfo* host_info);
CanonOutput* output,
Component* out_port);
COMPONENT_EXPORT(URL)
-bool CanonicalizePort(const base::char16* spec,
+bool CanonicalizePort(const char16_t* spec,
const Component& port,
int default_port_for_scheme,
CanonOutput* output,
Component* out_port);
// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
-// if the scheme is unknown.
+// if the scheme is unknown. Based on https://url.spec.whatwg.org/#default-port
COMPONENT_EXPORT(URL)
int DefaultPortForScheme(const char* scheme, int scheme_len);
CanonOutput* output,
Component* out_path);
COMPONENT_EXPORT(URL)
-bool CanonicalizePath(const base::char16* spec,
+bool CanonicalizePath(const char16_t* spec,
const Component& path,
CanonOutput* output,
Component* out_path);
+// Like CanonicalizePath(), but does not assume that it is operating on the
+// entire path. It therefore does not prepend a slash, etc.
+COMPONENT_EXPORT(URL)
+bool CanonicalizePartialPath(const char* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+COMPONENT_EXPORT(URL)
+bool CanonicalizePartialPath(const char16_t* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+
// Canonicalizes the input as a file path. This is like CanonicalizePath except
// that it also handles Windows drive specs. For example, the path can begin
// with "c|\" and it will get properly canonicalized to "C:/".
CanonOutput* output,
Component* out_path);
COMPONENT_EXPORT(URL)
-bool FileCanonicalizePath(const base::char16* spec,
+bool FileCanonicalizePath(const char16_t* spec,
const Component& path,
CanonOutput* output,
Component* out_path);
CanonOutput* output,
Component* out_query);
COMPONENT_EXPORT(URL)
-void CanonicalizeQuery(const base::char16* spec,
+void CanonicalizeQuery(const char16_t* spec,
const Component& query,
CharsetConverter* converter,
CanonOutput* output,
CanonOutput* output,
Component* out_path);
COMPONENT_EXPORT(URL)
-void CanonicalizeRef(const base::char16* spec,
+void CanonicalizeRef(const char16_t* spec,
const Component& path,
CanonOutput* output,
Component* out_path);
CanonOutput* output,
Parsed* new_parsed);
COMPONENT_EXPORT(URL)
-bool CanonicalizeStandardURL(const base::char16* spec,
+bool CanonicalizeStandardURL(const char16_t* spec,
int spec_len,
const Parsed& parsed,
SchemeType scheme_type,
CanonOutput* output,
Parsed* new_parsed);
COMPONENT_EXPORT(URL)
-bool CanonicalizeFileURL(const base::char16* spec,
+bool CanonicalizeFileURL(const char16_t* spec,
int spec_len,
const Parsed& parsed,
CharsetConverter* query_converter,
CanonOutput* output,
Parsed* new_parsed);
COMPONENT_EXPORT(URL)
-bool CanonicalizeFileSystemURL(const base::char16* spec,
+bool CanonicalizeFileSystemURL(const char16_t* spec,
int spec_len,
const Parsed& parsed,
CharsetConverter* query_converter,
CanonOutput* output,
Parsed* new_parsed);
COMPONENT_EXPORT(URL)
-bool CanonicalizePathURL(const base::char16* spec,
+bool CanonicalizePathURL(const char16_t* spec,
int spec_len,
const Parsed& parsed,
CanonOutput* output,
Parsed* new_parsed);
+// Use to canonicalize just the path component of a "path" URL; e.g. the
+// path of a javascript URL.
+COMPONENT_EXPORT(URL)
+void CanonicalizePathURLPath(const char* source,
+ const Component& component,
+ CanonOutput* output,
+ Component* new_component);
+COMPONENT_EXPORT(URL)
+void CanonicalizePathURLPath(const char16_t* source,
+ const Component& component,
+ CanonOutput* output,
+ Component* new_component);
+
// Use for mailto URLs. This "canonicalizes" the URL into a path and query
// component. It does not attempt to merge "to" fields. It uses UTF-8 for
// the query encoding if there is a query. This is because a mailto URL is
CanonOutput* output,
Parsed* new_parsed);
COMPONENT_EXPORT(URL)
-bool CanonicalizeMailtoURL(const base::char16* spec,
+bool CanonicalizeMailtoURL(const char16_t* spec,
int spec_len,
const Parsed& parsed,
CanonOutput* output,
// This structure does not own any data. It is the caller's responsibility to
// ensure that the data the pointers point to stays in scope and is not
// modified.
-template<typename CHAR>
+template <typename CHAR>
struct URLComponentSource {
// Constructor normally used by callers wishing to replace components. This
// will make them all NULL, which is no replacement. The caller would then
// override the components they want to replace.
URLComponentSource()
- : scheme(NULL),
- username(NULL),
- password(NULL),
- host(NULL),
- port(NULL),
- path(NULL),
- query(NULL),
- ref(NULL) {
- }
+ : scheme(nullptr),
+ username(nullptr),
+ password(nullptr),
+ host(nullptr),
+ port(nullptr),
+ path(nullptr),
+ query(nullptr),
+ ref(nullptr) {}
// Constructor normally used internally to initialize all the components to
// point to the same spec.
port(default_value),
path(default_value),
query(default_value),
- ref(default_value) {
- }
-
- const CHAR* scheme;
- const CHAR* username;
- const CHAR* password;
- const CHAR* host;
- const CHAR* port;
- const CHAR* path;
- const CHAR* query;
- const CHAR* ref;
+ ref(default_value) {}
+
+ // This field is not a raw_ptr<> because it was filtered by the rewriter for:
+ // #addr-of
+ RAW_PTR_EXCLUSION const CHAR* scheme;
+ // This field is not a raw_ptr<> because it was filtered by the rewriter for:
+ // #addr-of
+ RAW_PTR_EXCLUSION const CHAR* username;
+ // This field is not a raw_ptr<> because it was filtered by the rewriter for:
+ // #addr-of
+ RAW_PTR_EXCLUSION const CHAR* password;
+ // This field is not a raw_ptr<> because it was filtered by the rewriter for:
+ // #addr-of
+ RAW_PTR_EXCLUSION const CHAR* host;
+ // This field is not a raw_ptr<> because it was filtered by the rewriter for:
+ // #addr-of
+ RAW_PTR_EXCLUSION const CHAR* port;
+ // This field is not a raw_ptr<> because it was filtered by the rewriter for:
+ // #addr-of
+ RAW_PTR_EXCLUSION const CHAR* path;
+ // This field is not a raw_ptr<> because it was filtered by the rewriter for:
+ // #addr-of
+ RAW_PTR_EXCLUSION const CHAR* query;
+ // This field is not a raw_ptr<> because it was filtered by the rewriter for:
+ // #addr-of
+ RAW_PTR_EXCLUSION const CHAR* ref;
};
// This structure encapsulates information on modifying a URL. Each component
// IN SCOPE BY THE CALLER for as long as this object exists!
//
// Prefer the 8-bit replacement version if possible since it is more efficient.
-template<typename CHAR>
+template <typename CHAR>
class Replacements {
public:
- Replacements() {
- }
+ Replacements() {}
// Scheme
void SetScheme(const CHAR* s, const Component& comp) {
COMPONENT_EXPORT(URL)
bool ReplaceStandardURL(const char* base,
const Parsed& base_parsed,
- const Replacements<base::char16>& replacements,
+ const Replacements<char16_t>& replacements,
SchemeType scheme_type,
CharsetConverter* query_converter,
CanonOutput* output,
COMPONENT_EXPORT(URL)
bool ReplaceFileSystemURL(const char* base,
const Parsed& base_parsed,
- const Replacements<base::char16>& replacements,
+ const Replacements<char16_t>& replacements,
CharsetConverter* query_converter,
CanonOutput* output,
Parsed* new_parsed);
COMPONENT_EXPORT(URL)
bool ReplaceFileURL(const char* base,
const Parsed& base_parsed,
- const Replacements<base::char16>& replacements,
+ const Replacements<char16_t>& replacements,
CharsetConverter* query_converter,
CanonOutput* output,
Parsed* new_parsed);
COMPONENT_EXPORT(URL)
bool ReplacePathURL(const char* base,
const Parsed& base_parsed,
- const Replacements<base::char16>& replacements,
+ const Replacements<char16_t>& replacements,
CanonOutput* output,
Parsed* new_parsed);
COMPONENT_EXPORT(URL)
bool ReplaceMailtoURL(const char* base,
const Parsed& base_parsed,
- const Replacements<base::char16>& replacements,
+ const Replacements<char16_t>& replacements,
CanonOutput* output,
Parsed* new_parsed);
COMPONENT_EXPORT(URL)
bool IsRelativeURL(const char* base,
const Parsed& base_parsed,
- const base::char16* fragment,
+ const char16_t* fragment,
int fragment_len,
bool is_base_hierarchical,
bool* is_relative,
bool ResolveRelativeURL(const char* base_url,
const Parsed& base_parsed,
bool base_is_file,
- const base::char16* relative_url,
+ const char16_t* relative_url,
const Component& relative_component,
CharsetConverter* query_converter,
CanonOutput* output,