src/extensions/common/url_pattern.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "extensions/common/url_pattern.h"
   6
   7 #include "base/strings/string_number_conversions.h"
   8 #include "base/strings/string_piece.h"
   9 #include "base/strings/string_split.h"
  10 #include "base/strings/string_util.h"
  11 #include "content/public/common/url_constants.h"
  12 #include "extensions/common/constants.h"
  13 #include "url/gurl.h"
  14 #include "url/url_util.h"
  15
// Special pattern string. Parse() recognizes it before any scheme/host/path
// parsing and turns on match-all mode; GetAsString() emits it again for a
// pattern that is in match-all mode.
const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
  17
  18 namespace {
  19
  20 // TODO(aa): What about more obscure schemes like data: and javascript: ?
  21 // Note: keep this array in sync with kValidSchemeMasks.
  22 const char* kValidSchemes[] = {
  23   content::kHttpScheme,
  24   content::kHttpsScheme,
  25   chrome::kFileScheme,
  26   chrome::kFtpScheme,
  27   chrome::kChromeUIScheme,
  28   extensions::kExtensionScheme,
  29   chrome::kFileSystemScheme,
  30 };
  31
  32 const int kValidSchemeMasks[] = {
  33   URLPattern::SCHEME_HTTP,
  34   URLPattern::SCHEME_HTTPS,
  35   URLPattern::SCHEME_FILE,
  36   URLPattern::SCHEME_FTP,
  37   URLPattern::SCHEME_CHROMEUI,
  38   URLPattern::SCHEME_EXTENSION,
  39   URLPattern::SCHEME_FILESYSTEM,
  40 };
  41
  42 COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
  43                must_keep_these_arrays_in_sync);
  44
  45 const char kParseSuccess[] = "Success.";
  46 const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
  47 const char kParseErrorInvalidScheme[] = "Invalid scheme.";
  48 const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
  49 const char kParseErrorEmptyHost[] = "Host can not be empty.";
  50 const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
  51 const char kParseErrorEmptyPath[] = "Empty path.";
  52 const char kParseErrorInvalidPort[] = "Invalid port.";
  53
  54 // Message explaining each URLPattern::ParseResult.
  55 const char* const kParseResultMessages[] = {
  56   kParseSuccess,
  57   kParseErrorMissingSchemeSeparator,
  58   kParseErrorInvalidScheme,
  59   kParseErrorWrongSchemeType,
  60   kParseErrorEmptyHost,
  61   kParseErrorInvalidHostWildcard,
  62   kParseErrorEmptyPath,
  63   kParseErrorInvalidPort,
  64 };
  65
  66 COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
  67                must_add_message_for_each_parse_result);
  68
  69 const char kPathSeparator[] = "/";
  70
  71 bool IsStandardScheme(const std::string& scheme) {
  72   // "*" gets the same treatment as a standard scheme.
  73   if (scheme == "*")
  74     return true;
  75
  76   return url_util::IsStandard(scheme.c_str(),
  77       url_parse::Component(0, static_cast<int>(scheme.length())));
  78 }
  79
  80 bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
  81   if (port == "*")
  82     return true;
  83
  84   // Only accept non-wildcard ports if the scheme uses ports.
  85   if (url_canon::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
  86       url_parse::PORT_UNSPECIFIED) {
  87     return false;
  88   }
  89
  90   int parsed_port = url_parse::PORT_UNSPECIFIED;
  91   if (!base::StringToInt(port, &parsed_port))
  92     return false;
  93   return (parsed_port >= 0) && (parsed_port < 65536);
  94 }
  95
  96 // Returns |path| with the trailing wildcard stripped if one existed.
  97 //
  98 // The functions that rely on this (OverlapsWith and Contains) are only
  99 // called for the patterns inside URLPatternSet. In those cases, we know that
 100 // the path will have only a single wildcard at the end. This makes figuring
 101 // out overlap much easier. It seems like there is probably a computer-sciency
 102 // way to solve the general case, but we don't need that yet.
 103 std::string StripTrailingWildcard(const std::string& path) {
 104   size_t wildcard_index = path.find('*');
 105   size_t path_last = path.size() - 1;
 106   DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last);
 107   return wildcard_index == path_last ? path.substr(0, path_last) : path;
 108 }
 109
 110 }  // namespace
 111
// Creates an unparsed pattern whose valid-scheme mask is SCHEME_NONE, with
// match-all and subdomain matching switched off and the port set to the
// wildcard "*".
URLPattern::URLPattern()
    : valid_schemes_(SCHEME_NONE),
      match_all_urls_(false),
      match_subdomains_(false),
      port_("*") {}
 117
// Creates an unparsed pattern restricted to the schemes in |valid_schemes|
// (a bitmask of SCHEME_* values), with match-all and subdomain matching
// switched off and the port set to the wildcard "*".
URLPattern::URLPattern(int valid_schemes)
    : valid_schemes_(valid_schemes),
      match_all_urls_(false),
      match_subdomains_(false),
      port_("*") {}
 123
 124 URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
 125     // Strict error checking is used, because this constructor is only
 126     // appropriate when we know |pattern| is valid.
 127     : valid_schemes_(valid_schemes),
 128       match_all_urls_(false),
 129       match_subdomains_(false),
 130       port_("*") {
 131   if (PARSE_SUCCESS != Parse(pattern))
 132     NOTREACHED() << "URLPattern is invalid: " << pattern;
 133 }
 134
// Nothing to release explicitly; the body is intentionally empty.
URLPattern::~URLPattern() {
}
 137
 138 bool URLPattern::operator<(const URLPattern& other) const {
 139   return GetAsString() < other.GetAsString();
 140 }
 141
 142 bool URLPattern::operator>(const URLPattern& other) const {
 143   return GetAsString() > other.GetAsString();
 144 }
 145
 146 bool URLPattern::operator==(const URLPattern& other) const {
 147   return GetAsString() == other.GetAsString();
 148 }
 149
 150 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
 151   spec_.clear();
 152   SetMatchAllURLs(false);
 153   SetMatchSubdomains(false);
 154   SetPort("*");
 155
 156   // Special case pattern to match every valid URL.
 157   if (pattern == kAllUrlsPattern) {
 158     SetMatchAllURLs(true);
 159     return PARSE_SUCCESS;
 160   }
 161
 162   // Parse out the scheme.
 163   size_t scheme_end_pos = pattern.find(content::kStandardSchemeSeparator);
 164   bool has_standard_scheme_separator = true;
 165
 166   // Some urls also use ':' alone as the scheme separator.
 167   if (scheme_end_pos == std::string::npos) {
 168     scheme_end_pos = pattern.find(':');
 169     has_standard_scheme_separator = false;
 170   }
 171
 172   if (scheme_end_pos == std::string::npos)
 173     return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
 174
 175   if (!SetScheme(pattern.substr(0, scheme_end_pos)))
 176     return PARSE_ERROR_INVALID_SCHEME;
 177
 178   bool standard_scheme = IsStandardScheme(scheme_);
 179   if (standard_scheme != has_standard_scheme_separator)
 180     return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
 181
 182   // Advance past the scheme separator.
 183   scheme_end_pos +=
 184       (standard_scheme ? strlen(content::kStandardSchemeSeparator) : 1);
 185   if (scheme_end_pos >= pattern.size())
 186     return PARSE_ERROR_EMPTY_HOST;
 187
 188   // Parse out the host and path.
 189   size_t host_start_pos = scheme_end_pos;
 190   size_t path_start_pos = 0;
 191
 192   if (!standard_scheme) {
 193     path_start_pos = host_start_pos;
 194   } else if (scheme_ == chrome::kFileScheme) {
 195     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
 196     if (host_end_pos == std::string::npos) {
 197       // Allow hostname omission.
 198       // e.g. file://* is interpreted as file:///*,
 199       // file://foo* is interpreted as file:///foo*.
 200       path_start_pos = host_start_pos - 1;
 201     } else {
 202       // Ignore hostname if scheme is file://.
 203       // e.g. file://localhost/foo is equal to file:///foo.
 204       path_start_pos = host_end_pos;
 205     }
 206   } else {
 207     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
 208
 209     // Host is required.
 210     if (host_start_pos == host_end_pos)
 211       return PARSE_ERROR_EMPTY_HOST;
 212
 213     if (host_end_pos == std::string::npos)
 214       return PARSE_ERROR_EMPTY_PATH;
 215
 216     host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
 217
 218     // The first component can optionally be '*' to match all subdomains.
 219     std::vector<std::string> host_components;
 220     base::SplitString(host_, '.', &host_components);
 221     if (host_components[0] == "*") {
 222       match_subdomains_ = true;
 223       host_components.erase(host_components.begin(),
 224                             host_components.begin() + 1);
 225     }
 226     host_ = JoinString(host_components, '.');
 227
 228     path_start_pos = host_end_pos;
 229   }
 230
 231   SetPath(pattern.substr(path_start_pos));
 232
 233   size_t port_pos = host_.find(':');
 234   if (port_pos != std::string::npos) {
 235     if (!SetPort(host_.substr(port_pos + 1)))
 236       return PARSE_ERROR_INVALID_PORT;
 237     host_ = host_.substr(0, port_pos);
 238   }
 239
 240   // No other '*' can occur in the host, though. This isn't necessary, but is
 241   // done as a convenience to developers who might otherwise be confused and
 242   // think '*' works as a glob in the host.
 243   if (host_.find('*') != std::string::npos)
 244     return PARSE_ERROR_INVALID_HOST_WILDCARD;
 245
 246   return PARSE_SUCCESS;
 247 }
 248
 249 void URLPattern::SetValidSchemes(int valid_schemes) {
 250   spec_.clear();
 251   valid_schemes_ = valid_schemes;
 252 }
 253
// Replaces the host this pattern matches against with |host|.
void URLPattern::SetHost(const std::string& host) {
  // Invalidate the cached string form so GetAsString() rebuilds it.
  spec_.clear();
  host_ = host;
}
 258
 259 void URLPattern::SetMatchAllURLs(bool val) {
 260   spec_.clear();
 261   match_all_urls_ = val;
 262
 263   if (val) {
 264     match_subdomains_ = true;
 265     scheme_ = "*";
 266     host_.clear();
 267     SetPath("/*");
 268   }
 269 }
 270
 271 void URLPattern::SetMatchSubdomains(bool val) {
 272   spec_.clear();
 273   match_subdomains_ = val;
 274 }
 275
 276 bool URLPattern::SetScheme(const std::string& scheme) {
 277   spec_.clear();
 278   scheme_ = scheme;
 279   if (scheme_ == "*") {
 280     valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
 281   } else if (!IsValidScheme(scheme_)) {
 282     return false;
 283   }
 284   return true;
 285 }
 286
 287 bool URLPattern::IsValidScheme(const std::string& scheme) const {
 288   if (valid_schemes_ == SCHEME_ALL)
 289     return true;
 290
 291   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
 292     if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
 293       return true;
 294   }
 295
 296   return false;
 297 }
 298
 299 void URLPattern::SetPath(const std::string& path) {
 300   spec_.clear();
 301   path_ = path;
 302   path_escaped_ = path_;
 303   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
 304   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
 305 }
 306
 307 bool URLPattern::SetPort(const std::string& port) {
 308   spec_.clear();
 309   if (IsValidPortForScheme(scheme_, port)) {
 310     port_ = port;
 311     return true;
 312   }
 313   return false;
 314 }
 315
 316 bool URLPattern::MatchesURL(const GURL& test) const {
 317   const GURL* test_url = &test;
 318   bool has_inner_url = test.inner_url() != NULL;
 319
 320   if (has_inner_url) {
 321     if (!test.SchemeIsFileSystem())
 322       return false;  // The only nested URLs we handle are filesystem URLs.
 323     test_url = test.inner_url();
 324   }
 325
 326   if (!MatchesScheme(test_url->scheme()))
 327     return false;
 328
 329   if (match_all_urls_)
 330     return true;
 331
 332   std::string path_for_request = test.PathForRequest();
 333   if (has_inner_url)
 334     path_for_request = test_url->path() + path_for_request;
 335
 336   return MatchesSecurityOriginHelper(*test_url) &&
 337          MatchesPath(path_for_request);
 338 }
 339
 340 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
 341   const GURL* test_url = &test;
 342   bool has_inner_url = test.inner_url() != NULL;
 343
 344   if (has_inner_url) {
 345     if (!test.SchemeIsFileSystem())
 346       return false;  // The only nested URLs we handle are filesystem URLs.
 347     test_url = test.inner_url();
 348   }
 349
 350   if (!MatchesScheme(test_url->scheme()))
 351     return false;
 352
 353   if (match_all_urls_)
 354     return true;
 355
 356   return MatchesSecurityOriginHelper(*test_url);
 357 }
 358
 359 bool URLPattern::MatchesScheme(const std::string& test) const {
 360   if (!IsValidScheme(test))
 361     return false;
 362
 363   return scheme_ == "*" || test == scheme_;
 364 }
 365
 366 bool URLPattern::MatchesHost(const std::string& host) const {
 367   std::string test(content::kHttpScheme);
 368   test += content::kStandardSchemeSeparator;
 369   test += host;
 370   test += "/";
 371   return MatchesHost(GURL(test));
 372 }
 373
 374 bool URLPattern::MatchesHost(const GURL& test) const {
 375   // If the hosts are exactly equal, we have a match.
 376   if (test.host() == host_)
 377     return true;
 378
 379   // If we're matching subdomains, and we have no host in the match pattern,
 380   // that means that we're matching all hosts, which means we have a match no
 381   // matter what the test host is.
 382   if (match_subdomains_ && host_.empty())
 383     return true;
 384
 385   // Otherwise, we can only match if our match pattern matches subdomains.
 386   if (!match_subdomains_)
 387     return false;
 388
 389   // We don't do subdomain matching against IP addresses, so we can give up now
 390   // if the test host is an IP address.
 391   if (test.HostIsIPAddress())
 392     return false;
 393
 394   // Check if the test host is a subdomain of our host.
 395   if (test.host().length() <= (host_.length() + 1))
 396     return false;
 397
 398   if (test.host().compare(test.host().length() - host_.length(),
 399                           host_.length(), host_) != 0)
 400     return false;
 401
 402   return test.host()[test.host().length() - host_.length() - 1] == '.';
 403 }
 404
 405 bool URLPattern::MatchesPath(const std::string& test) const {
 406   // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
 407   // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
 408   if (test + "/*" == path_escaped_)
 409     return true;
 410
 411   return MatchPattern(test, path_escaped_);
 412 }
 413
 414 const std::string& URLPattern::GetAsString() const {
 415   if (!spec_.empty())
 416     return spec_;
 417
 418   if (match_all_urls_) {
 419     spec_ = kAllUrlsPattern;
 420     return spec_;
 421   }
 422
 423   bool standard_scheme = IsStandardScheme(scheme_);
 424
 425   std::string spec = scheme_ +
 426       (standard_scheme ? content::kStandardSchemeSeparator : ":");
 427
 428   if (scheme_ != chrome::kFileScheme && standard_scheme) {
 429     if (match_subdomains_) {
 430       spec += "*";
 431       if (!host_.empty())
 432         spec += ".";
 433     }
 434
 435     if (!host_.empty())
 436       spec += host_;
 437
 438     if (port_ != "*") {
 439       spec += ":";
 440       spec += port_;
 441     }
 442   }
 443
 444   if (!path_.empty())
 445     spec += path_;
 446
 447   spec_ = spec;
 448   return spec_;
 449 }
 450
 451 bool URLPattern::OverlapsWith(const URLPattern& other) const {
 452   if (match_all_urls() || other.match_all_urls())
 453     return true;
 454   return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
 455           other.MatchesAnyScheme(GetExplicitSchemes()))
 456       && (MatchesHost(other.host()) || other.MatchesHost(host()))
 457       && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
 458       && (MatchesPath(StripTrailingWildcard(other.path())) ||
 459           other.MatchesPath(StripTrailingWildcard(path())));
 460 }
 461
 462 bool URLPattern::Contains(const URLPattern& other) const {
 463   if (match_all_urls())
 464     return true;
 465   return MatchesAllSchemes(other.GetExplicitSchemes())
 466       && MatchesHost(other.host())
 467       && MatchesPortPattern(other.port())
 468       && MatchesPath(StripTrailingWildcard(other.path()));
 469 }
 470
 471 bool URLPattern::MatchesAnyScheme(
 472     const std::vector<std::string>& schemes) const {
 473   for (std::vector<std::string>::const_iterator i = schemes.begin();
 474        i != schemes.end(); ++i) {
 475     if (MatchesScheme(*i))
 476       return true;
 477   }
 478
 479   return false;
 480 }
 481
 482 bool URLPattern::MatchesAllSchemes(
 483     const std::vector<std::string>& schemes) const {
 484   for (std::vector<std::string>::const_iterator i = schemes.begin();
 485        i != schemes.end(); ++i) {
 486     if (!MatchesScheme(*i))
 487       return false;
 488   }
 489
 490   return true;
 491 }
 492
 493 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
 494   // Ignore hostname if scheme is file://.
 495   if (scheme_ != chrome::kFileScheme && !MatchesHost(test))
 496     return false;
 497
 498   if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
 499     return false;
 500
 501   return true;
 502 }
 503
 504 bool URLPattern::MatchesPortPattern(const std::string& port) const {
 505   return port_ == "*" || port_ == port;
 506 }
 507
 508 std::vector<std::string> URLPattern::GetExplicitSchemes() const {
 509   std::vector<std::string> result;
 510
 511   if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
 512     result.push_back(scheme_);
 513     return result;
 514   }
 515
 516   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
 517     if (MatchesScheme(kValidSchemes[i])) {
 518       result.push_back(kValidSchemes[i]);
 519     }
 520   }
 521
 522   return result;
 523 }
 524
 525 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
 526   std::vector<std::string> explicit_schemes = GetExplicitSchemes();
 527   std::vector<URLPattern> result;
 528
 529   for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
 530        i != explicit_schemes.end(); ++i) {
 531     URLPattern temp = *this;
 532     temp.SetScheme(*i);
 533     temp.SetMatchAllURLs(false);
 534     result.push_back(temp);
 535   }
 536
 537   return result;
 538 }
 539
 540 // static
 541 const char* URLPattern::GetParseResultString(
 542     URLPattern::ParseResult parse_result) {
 543   return kParseResultMessages[parse_result];
 544 }