1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/url_matcher/url_matcher.h"
11 #include "base/logging.h"
12 #include "base/memory/ptr_util.h"
13 #include "base/stl_util.h"
15 #include "url/url_canon.h"
17 namespace url_matcher {
19 // This set of classes implement a mapping of URL Component Patterns, such as
20 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
21 // for use in substring comparisons.
23 // The idea of this mapping is to reduce the problem of comparing many
24 // URL Component Patterns against one URL to the problem of searching many
25 // substrings in one string:
27 // ---------------------- -----------------
28 // | URL Query operator | ----translate----> | StringPattern |
29 // ---------------------- -----------------
35 // ---------------------- -----------------
36 // | URL to compare | | |
37 // | to all URL Query | ----translate----> | String |
39 // ---------------------- -----------------
41 // The reason for this problem reduction is that there are efficient algorithms
42 // for searching many substrings in one string (see Aho-Corasick algorithm).
44 // Additionally, some of the same pieces are reused to implement regular
45 // expression comparisons. The FilteredRE2 implementation for matching many
46 // regular expressions against one string uses prefiltering, in which a set
47 // of substrings (derived from the regexes) are first searched for, to reduce
48 // the number of regular expressions to test; the prefiltering step also
51 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
52 // ==========================================================
54 // For searches in this class, we normalize URLs as follows:
57 // Remove scheme, port and segment from URL:
58 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
59 // www.example.com/index.html?search=foo
61 // We remove the scheme and port number because they can be checked later
62 // in a secondary filter step. We remove the segment (the #... part) because
63 // this is not guaranteed to be ASCII-7 encoded.
66 // Translate URL to String and add the following position markers:
67 // - BU = Beginning of URL
68 // - ED = End of Domain
71 // Furthermore, the hostname is canonicalized to start with a ".".
73 // Position markers are represented as characters >127, which are therefore
74 // guaranteed not to be part of the ASCII-7 encoded URL character set.
76 // -> www.example.com/index.html?search=foo becomes
77 // BU .www.example.com ED /index.html EP ?search=foo EU
79 // -> www.example.com/index.html becomes
80 // BU .www.example.com ED /index.html EP EU
83 // Translate URL Component Patterns as follows:
85 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
86 // -> host_prefix("www.example") = BU .www.example
88 // host_suffix(suffix) = suffix ED
89 // -> host_suffix("example.com") = example.com ED
90 // -> host_suffix(".example.com") = .example.com ED
92 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
93 // -> host_equals("www.example.com") = BU .www.example.com ED
95 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
97 // With this, we can search the StringPatterns in the normalized URL.
100 // Case 2: url_{prefix,suffix,equals,contains} searches.
101 // =====================================================
103 // Step 1: as above, except that
104 // - the scheme is not removed
105 // - the port is not removed if it is specified and does not match the default
106 // port for the given scheme.
109 // Translate URL to String and add the following position markers:
110 // - BU = Beginning of URL
113 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
114 // BU http://www.example.com:8080/index.html?search=foo EU
115 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
116 // BU http://www.example.com/index.html?search=foo EU
118 // url_prefix(prefix) = BU prefix
119 // -> url_prefix("http://www.example") = BU http://www.example
121 // url_contains(substring) = substring
122 // -> url_contains("index") = index
125 // Case 3: {host,path,query}_contains searches.
126 // ============================================
128 // These kinds of searches are not supported directly but can be derived
129 // by a combination of a url_contains() query followed by an explicit test:
131 // host_contains(str) = url_contains(str) followed by test whether str occurs
132 // in host component of original URL.
133 // -> host_contains("example.co") = example.co
134 // followed by gurl.host().find("example.co");
136 // [similarly for path_contains and query_contains].
139 // Regular expression matching (url_matches searches)
140 // ==================================================
142 // This class also supports matching regular expressions (RE2 syntax)
143 // against full URLs, which are transformed as in case 2.
147 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
148 return criterion == URLMatcherCondition::URL_MATCHES;
151 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
152 return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
158 // URLMatcherCondition
161 URLMatcherCondition::URLMatcherCondition()
162 : criterion_(HOST_PREFIX), string_pattern_(nullptr) {}
164 URLMatcherCondition::~URLMatcherCondition() {}
166 URLMatcherCondition::URLMatcherCondition(
168 const StringPattern* string_pattern)
169 : criterion_(criterion),
170 string_pattern_(string_pattern) {}
172 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
173 : criterion_(rhs.criterion_),
174 string_pattern_(rhs.string_pattern_) {}
176 URLMatcherCondition& URLMatcherCondition::operator=(
177 const URLMatcherCondition& rhs) {
178 criterion_ = rhs.criterion_;
179 string_pattern_ = rhs.string_pattern_;
183 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
184 if (criterion_ < rhs.criterion_) return true;
185 if (criterion_ > rhs.criterion_) return false;
186 if (string_pattern_ != nullptr && rhs.string_pattern_ != nullptr)
187 return *string_pattern_ < *rhs.string_pattern_;
188 if (string_pattern_ == nullptr && rhs.string_pattern_ != nullptr)
190 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
195 bool URLMatcherCondition::IsFullURLCondition() const {
196 // For these criteria the SubstringMatcher needs to be executed on the
197 // GURL that is canonicalized with
198 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
199 switch (criterion_) {
214 bool URLMatcherCondition::IsRegexCondition() const {
215 return IsRegexCriterion(criterion_);
218 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
219 return IsOriginAndPathRegexCriterion(criterion_);
222 bool URLMatcherCondition::IsMatch(
223 const std::set<StringPattern::ID>& matching_patterns,
224 const GURL& url) const {
225 DCHECK(string_pattern_);
226 if (!base::ContainsKey(matching_patterns, string_pattern_->id()))
228 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
229 // a substring match on the raw URL. In case of a match, we need to verify
230 // that the match was found in the correct component of the URL.
231 switch (criterion_) {
233 return url.host().find(string_pattern_->pattern()) !=
236 return url.path().find(string_pattern_->pattern()) !=
239 return url.query().find(string_pattern_->pattern()) !=
248 // URLMatcherConditionFactory
252 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
253 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
254 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
255 const char kEndOfPath[] = {static_cast<char>(-3), 0};
256 const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0};
257 const char kEndOfURL[] = {static_cast<char>(-5), 0};
259 // The delimiter for query parameters
260 const char kQuerySeparator = '&';
263 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
265 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
268 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
269 const GURL& url) const {
270 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
271 url.path() + kEndOfPath +
272 (url.has_query() ? CanonicalizeQuery(url.query(), true, true)
277 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
278 const std::string& prefix) {
279 return CreateCondition(URLMatcherCondition::HOST_PREFIX,
280 kBeginningOfURL + CanonicalizeHostPrefix(prefix));
283 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
284 const std::string& suffix) {
285 return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
286 CanonicalizeHostSuffix(suffix) + kEndOfDomain);
289 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
290 const std::string& str) {
291 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
294 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
295 const std::string& str) {
296 return CreateCondition(URLMatcherCondition::HOST_EQUALS,
297 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
300 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
301 const std::string& prefix) {
302 return CreateCondition(URLMatcherCondition::PATH_PREFIX,
303 kEndOfDomain + prefix);
306 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
307 const std::string& suffix) {
308 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
311 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
312 const std::string& str) {
313 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
316 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
317 const std::string& str) {
318 return CreateCondition(URLMatcherCondition::PATH_EQUALS,
319 kEndOfDomain + str + kEndOfPath);
322 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
323 const std::string& prefix) {
325 if (!prefix.empty() && prefix[0] == '?')
326 pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false);
328 pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false);
330 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
333 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
334 const std::string& suffix) {
335 if (!suffix.empty() && suffix[0] == '?') {
336 return CreateQueryEqualsCondition(suffix);
338 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
339 CanonicalizeQuery(suffix, false, true) + kEndOfURL);
343 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
344 const std::string& str) {
345 if (!str.empty() && str[0] == '?')
346 return CreateQueryPrefixCondition(str);
348 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
351 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
352 const std::string& str) {
354 if (!str.empty() && str[0] == '?')
356 kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL;
358 pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL;
360 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
364 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
365 const std::string& host_suffix,
366 const std::string& path_prefix) {
367 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
368 CanonicalizeHostSuffix(host_suffix) + kEndOfDomain + path_prefix);
372 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
373 const std::string& host,
374 const std::string& path_prefix) {
375 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
376 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
380 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
381 const GURL& url) const {
382 GURL::Replacements replacements;
383 replacements.ClearPassword();
384 replacements.ClearUsername();
385 replacements.ClearRef();
386 // Clear port if it is implicit from scheme.
387 if (url.has_port()) {
388 const std::string& port = url.scheme();
389 if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
390 url.EffectiveIntPort()) {
391 replacements.ClearPort();
394 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
398 static std::string CanonicalizeURLForRegexSearchesHelper(
401 GURL::Replacements replacements;
402 replacements.ClearPassword();
403 replacements.ClearUsername();
404 replacements.ClearRef();
406 replacements.ClearQuery();
407 // Clear port if it is implicit from scheme.
408 if (url.has_port()) {
409 const std::string& port = url.scheme();
410 if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
411 url.EffectiveIntPort()) {
412 replacements.ClearPort();
415 return url.ReplaceComponents(replacements).spec();
418 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
419 const GURL& url) const {
420 return CanonicalizeURLForRegexSearchesHelper(url, false);
424 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
425 const GURL& url) const {
426 return CanonicalizeURLForRegexSearchesHelper(url, true);
429 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
430 const std::string& prefix) {
431 return CreateCondition(URLMatcherCondition::URL_PREFIX,
432 kBeginningOfURL + prefix);
435 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
436 const std::string& suffix) {
437 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
440 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
441 const std::string& str) {
442 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
445 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
446 const std::string& str) {
447 return CreateCondition(URLMatcherCondition::URL_EQUALS,
448 kBeginningOfURL + str + kEndOfURL);
451 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
452 const std::string& regex) {
453 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
457 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
458 const std::string& regex) {
459 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
462 void URLMatcherConditionFactory::ForgetUnusedPatterns(
463 const std::set<StringPattern::ID>& used_patterns) {
464 auto i = substring_pattern_singletons_.begin();
465 while (i != substring_pattern_singletons_.end()) {
466 if (base::ContainsKey(used_patterns, i->first->id()))
469 substring_pattern_singletons_.erase(i++);
472 i = regex_pattern_singletons_.begin();
473 while (i != regex_pattern_singletons_.end()) {
474 if (base::ContainsKey(used_patterns, i->first->id()))
477 regex_pattern_singletons_.erase(i++);
480 i = origin_and_path_regex_pattern_singletons_.begin();
481 while (i != origin_and_path_regex_pattern_singletons_.end()) {
482 if (base::ContainsKey(used_patterns, i->first->id()))
485 origin_and_path_regex_pattern_singletons_.erase(i++);
489 bool URLMatcherConditionFactory::IsEmpty() const {
490 return substring_pattern_singletons_.empty() &&
491 regex_pattern_singletons_.empty() &&
492 origin_and_path_regex_pattern_singletons_.empty();
495 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
496 URLMatcherCondition::Criterion criterion,
497 const std::string& pattern) {
498 StringPattern search_pattern(pattern, 0);
499 PatternSingletons* pattern_singletons = nullptr;
500 if (IsRegexCriterion(criterion))
501 pattern_singletons = ®ex_pattern_singletons_;
502 else if (IsOriginAndPathRegexCriterion(criterion))
503 pattern_singletons = &origin_and_path_regex_pattern_singletons_;
505 pattern_singletons = &substring_pattern_singletons_;
507 auto iter = pattern_singletons->find(&search_pattern);
509 if (iter != pattern_singletons->end())
510 return URLMatcherCondition(criterion, iter->first);
512 StringPattern* new_pattern = new StringPattern(pattern, id_counter_++);
513 (*pattern_singletons)[new_pattern] = base::WrapUnique(new_pattern);
514 return URLMatcherCondition(criterion, new_pattern);
517 std::string URLMatcherConditionFactory::CanonicalizeHostSuffix(
518 const std::string& suffix) const {
521 return suffix.back() == '.' ? suffix : suffix + ".";
524 std::string URLMatcherConditionFactory::CanonicalizeHostPrefix(
525 const std::string& prefix) const {
528 return prefix[0] == '.' ? prefix : "." + prefix;
531 std::string URLMatcherConditionFactory::CanonicalizeHostname(
532 const std::string& hostname) const {
533 return CanonicalizeHostPrefix(CanonicalizeHostSuffix(hostname));
536 // This function prepares the query string by replacing query separator with a
537 // magic value (|kQueryComponentDelimiter|). When the boolean
538 // |prepend_beginning_of_query_component| is true the function prepends the
539 // query with the same magic. This is done to locate the start of a key value
540 // pair in the query string. The parameter |query| is passed by value
541 // intentionally, since it is locally modified.
542 std::string URLMatcherConditionFactory::CanonicalizeQuery(
544 bool prepend_beginning_of_query_component,
545 bool append_end_of_query_component) const {
546 for (std::string::iterator it = query.begin(); it != query.end(); ++it) {
547 if (*it == kQuerySeparator)
548 *it = kQueryComponentDelimiter[0];
550 if (prepend_beginning_of_query_component)
551 query = kQueryComponentDelimiter + query;
552 if (append_end_of_query_component)
553 query += kQueryComponentDelimiter;
557 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
559 StringPattern* rhs) const {
560 if (lhs == nullptr && rhs != nullptr)
562 if (lhs != nullptr && rhs != nullptr)
563 return lhs->pattern() < rhs->pattern();
564 // Either both are NULL or only rhs is NULL.
569 // URLQueryElementMatcherCondition
572 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
573 const std::string& key,
574 const std::string& value,
575 QueryValueMatchType query_value_match_type,
576 QueryElementType query_element_type,
578 URLMatcherConditionFactory* factory) {
579 match_type_ = match_type;
581 if (query_element_type == ELEMENT_TYPE_KEY_VALUE) {
582 key_ = kQueryComponentDelimiter + key + "=";
585 key_ = kQueryComponentDelimiter + key;
586 value_ = std::string();
589 if (query_value_match_type == QUERY_VALUE_MATCH_EXACT)
590 value_ += kQueryComponentDelimiter;
592 // If |value_| is empty no need to find the |key_| and verify if the value
593 // matches. Simply checking the presence of key is sufficient, which is done
596 match_type_ = MATCH_ANY;
598 URLMatcherCondition condition;
599 // If |match_type_| is MATCH_ANY, then we could simply look for the
600 // combination of |key_| + |value_|, which can be efficiently done by
602 if (match_type_ == MATCH_ANY)
603 condition = factory->CreateQueryContainsCondition(key_ + value_);
605 condition = factory->CreateQueryContainsCondition(key_);
606 string_pattern_ = condition.string_pattern();
608 key_length_ = key_.length();
609 value_length_ = value_.length();
612 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
613 const URLQueryElementMatcherCondition& other) = default;
615 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {}
617 bool URLQueryElementMatcherCondition::operator<(
618 const URLQueryElementMatcherCondition& rhs) const {
619 if (match_type_ != rhs.match_type_)
620 return match_type_ < rhs.match_type_;
621 if (string_pattern_ != nullptr && rhs.string_pattern_ != nullptr)
622 return *string_pattern_ < *rhs.string_pattern_;
623 if (string_pattern_ == nullptr && rhs.string_pattern_ != nullptr)
625 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
630 bool URLQueryElementMatcherCondition::IsMatch(
631 const std::string& url_for_component_searches) const {
632 switch (match_type_) {
634 // For MATCH_ANY, no additional verification step is needed. We can trust
635 // the SubstringMatcher to do the verification.
642 while ((offset = url_for_component_searches.find(key_, start)) !=
644 if (url_for_component_searches.compare(
645 offset + key_length_, value_length_, value_) != 0) {
650 start = offset + key_length_ + value_length_ - 1;
655 size_t offset = url_for_component_searches.find(key_);
656 return url_for_component_searches.compare(
657 offset + key_length_, value_length_, value_) == 0;
660 size_t offset = url_for_component_searches.rfind(key_);
661 return url_for_component_searches.compare(
662 offset + key_length_, value_length_, value_) == 0;
670 // URLMatcherSchemeFilter
673 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
675 filters_.push_back(filter);
678 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
679 const std::vector<std::string>& filters)
680 : filters_(filters) {}
682 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
684 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
685 return base::ContainsValue(filters_, url.scheme());
689 // URLMatcherPortFilter
692 URLMatcherPortFilter::URLMatcherPortFilter(
693 const std::vector<URLMatcherPortFilter::Range>& ranges)
696 URLMatcherPortFilter::~URLMatcherPortFilter() {}
698 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
699 int port = url.EffectiveIntPort();
700 for (std::vector<Range>::const_iterator i = ranges_.begin();
701 i != ranges_.end(); ++i) {
702 if (i->first <= port && port <= i->second)
709 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
711 return Range(from, to);
715 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
716 return Range(port, port);
720 // URLMatcherConditionSet
723 URLMatcherConditionSet::~URLMatcherConditionSet() {}
725 URLMatcherConditionSet::URLMatcherConditionSet(
727 const Conditions& conditions)
729 conditions_(conditions) {}
731 URLMatcherConditionSet::URLMatcherConditionSet(
733 const Conditions& conditions,
734 std::unique_ptr<URLMatcherSchemeFilter> scheme_filter,
735 std::unique_ptr<URLMatcherPortFilter> port_filter)
737 conditions_(conditions),
738 scheme_filter_(std::move(scheme_filter)),
739 port_filter_(std::move(port_filter)) {}
741 URLMatcherConditionSet::URLMatcherConditionSet(
743 const Conditions& conditions,
744 const QueryConditions& query_conditions,
745 std::unique_ptr<URLMatcherSchemeFilter> scheme_filter,
746 std::unique_ptr<URLMatcherPortFilter> port_filter)
748 conditions_(conditions),
749 query_conditions_(query_conditions),
750 scheme_filter_(std::move(scheme_filter)),
751 port_filter_(std::move(port_filter)) {}
753 bool URLMatcherConditionSet::IsMatch(
754 const std::set<StringPattern::ID>& matching_patterns,
755 const GURL& url) const {
756 return IsMatch(matching_patterns, url, std::string());
759 bool URLMatcherConditionSet::IsMatch(
760 const std::set<StringPattern::ID>& matching_patterns,
762 const std::string& url_for_component_searches) const {
763 for (Conditions::const_iterator i = conditions_.begin();
764 i != conditions_.end(); ++i) {
765 if (!i->IsMatch(matching_patterns, url))
768 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
770 if (port_filter_.get() && !port_filter_->IsMatch(url))
772 if (query_conditions_.empty())
774 // The loop is duplicated below for performance reasons. If not all query
775 // elements are found, no need to verify match that is expected to take more
777 for (QueryConditions::const_iterator i = query_conditions_.begin();
778 i != query_conditions_.end();
780 if (!base::ContainsKey(matching_patterns, i->string_pattern()->id()))
783 for (QueryConditions::const_iterator i = query_conditions_.begin();
784 i != query_conditions_.end();
786 if (!i->IsMatch(url_for_component_searches))
796 URLMatcher::URLMatcher() {}
798 URLMatcher::~URLMatcher() {}
800 void URLMatcher::AddConditionSets(
801 const URLMatcherConditionSet::Vector& condition_sets) {
802 for (URLMatcherConditionSet::Vector::const_iterator i =
803 condition_sets.begin(); i != condition_sets.end(); ++i) {
804 DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
805 url_matcher_condition_sets_.end());
806 url_matcher_condition_sets_[(*i)->id()] = *i;
808 UpdateInternalDatastructures();
811 void URLMatcher::RemoveConditionSets(
812 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
813 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
814 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
815 DCHECK(url_matcher_condition_sets_.find(*i) !=
816 url_matcher_condition_sets_.end());
817 url_matcher_condition_sets_.erase(*i);
819 UpdateInternalDatastructures();
822 void URLMatcher::ClearUnusedConditionSets() {
823 UpdateConditionFactory();
826 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
827 const GURL& url) const {
828 // Find all IDs of StringPatterns that match |url|.
829 // See URLMatcherConditionFactory for the canonicalization of URLs and the
830 // distinction between full url searches and url component searches.
831 std::set<StringPattern::ID> matches;
832 std::string url_for_component_searches;
834 if (!full_url_matcher_.IsEmpty()) {
835 full_url_matcher_.Match(
836 condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
838 if (!url_component_matcher_.IsEmpty()) {
839 url_for_component_searches =
840 condition_factory_.CanonicalizeURLForComponentSearches(url);
841 url_component_matcher_.Match(url_for_component_searches, &matches);
843 if (!regex_set_matcher_.IsEmpty()) {
844 regex_set_matcher_.Match(
845 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
847 if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
848 origin_and_path_regex_set_matcher_.Match(
849 condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
853 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
855 std::set<URLMatcherConditionSet::ID> result;
856 for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
857 i != matches.end(); ++i) {
858 // For each URLMatcherConditionSet there is exactly one condition
859 // registered in substring_match_triggers_. This means that the following
860 // logic tests each URLMatcherConditionSet exactly once if it can be
861 // completely fulfilled.
862 StringPatternTriggers::const_iterator triggered_condition_sets_iter =
863 substring_match_triggers_.find(*i);
864 if (triggered_condition_sets_iter == substring_match_triggers_.end())
865 continue; // Not all substring matches are triggers for a condition set.
866 const std::set<URLMatcherConditionSet::ID>& condition_sets =
867 triggered_condition_sets_iter->second;
868 for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
869 condition_sets.begin(); j != condition_sets.end(); ++j) {
870 URLMatcherConditionSets::const_iterator condition_set_iter =
871 url_matcher_condition_sets_.find(*j);
872 DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
873 if (condition_set_iter->second->IsMatch(
874 matches, url, url_for_component_searches))
882 bool URLMatcher::IsEmpty() const {
883 return condition_factory_.IsEmpty() &&
884 url_matcher_condition_sets_.empty() &&
885 substring_match_triggers_.empty() &&
886 full_url_matcher_.IsEmpty() &&
887 url_component_matcher_.IsEmpty() &&
888 regex_set_matcher_.IsEmpty() &&
889 origin_and_path_regex_set_matcher_.IsEmpty() &&
890 registered_full_url_patterns_.empty() &&
891 registered_url_component_patterns_.empty();
894 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
895 // The purpose of |full_url_conditions| is just that we need to execute
896 // the same logic once for Full URL searches and once for URL Component
897 // searches (see URLMatcherConditionFactory).
899 // Determine which patterns need to be registered when this function
901 std::set<const StringPattern*> new_patterns;
902 for (URLMatcherConditionSets::const_iterator condition_set_iter =
903 url_matcher_condition_sets_.begin();
904 condition_set_iter != url_matcher_condition_sets_.end();
905 ++condition_set_iter) {
906 const URLMatcherConditionSet::Conditions& conditions =
907 condition_set_iter->second->conditions();
908 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
909 conditions.begin(); condition_iter != conditions.end();
911 // If we are called to process Full URL searches, ignore others, and
912 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
913 if (!condition_iter->IsRegexCondition() &&
914 !condition_iter->IsOriginAndPathRegexCondition() &&
915 full_url_conditions == condition_iter->IsFullURLCondition())
916 new_patterns.insert(condition_iter->string_pattern());
919 if (full_url_conditions)
922 const URLMatcherConditionSet::QueryConditions& query_conditions =
923 condition_set_iter->second->query_conditions();
924 for (URLMatcherConditionSet::QueryConditions::const_iterator
925 query_condition_iter = query_conditions.begin();
926 query_condition_iter != query_conditions.end();
927 ++query_condition_iter) {
928 new_patterns.insert(query_condition_iter->string_pattern());
932 // This is the set of patterns that were registered before this function
934 std::set<const StringPattern*>& registered_patterns =
935 full_url_conditions ? registered_full_url_patterns_
936 : registered_url_component_patterns_;
938 // Add all patterns that are in new_patterns but not in registered_patterns.
939 std::vector<const StringPattern*> patterns_to_register =
940 base::STLSetDifference<std::vector<const StringPattern*> >(
941 new_patterns, registered_patterns);
943 // Remove all patterns that are in registered_patterns but not in
945 std::vector<const StringPattern*> patterns_to_unregister =
946 base::STLSetDifference<std::vector<const StringPattern*> >(
947 registered_patterns, new_patterns);
949 // Update the SubstringSetMatcher.
950 SubstringSetMatcher& url_matcher =
951 full_url_conditions ? full_url_matcher_ : url_component_matcher_;
952 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
953 patterns_to_unregister);
955 // Update the set of registered_patterns for the next time this function
957 registered_patterns.swap(new_patterns);
960 void URLMatcher::UpdateRegexSetMatcher() {
961 std::vector<const StringPattern*> new_patterns;
962 std::vector<const StringPattern*> new_origin_and_path_patterns;
964 for (URLMatcherConditionSets::const_iterator condition_set_iter =
965 url_matcher_condition_sets_.begin();
966 condition_set_iter != url_matcher_condition_sets_.end();
967 ++condition_set_iter) {
968 const URLMatcherConditionSet::Conditions& conditions =
969 condition_set_iter->second->conditions();
970 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
971 conditions.begin(); condition_iter != conditions.end();
973 if (condition_iter->IsRegexCondition()) {
974 new_patterns.push_back(condition_iter->string_pattern());
975 } else if (condition_iter->IsOriginAndPathRegexCondition()) {
976 new_origin_and_path_patterns.push_back(
977 condition_iter->string_pattern());
982 // Start over from scratch. We can't really do better than this, since the
983 // FilteredRE2 backend doesn't support incremental updates.
984 regex_set_matcher_.ClearPatterns();
985 regex_set_matcher_.AddPatterns(new_patterns);
986 origin_and_path_regex_set_matcher_.ClearPatterns();
987 origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
990 void URLMatcher::UpdateTriggers() {
991 // Count substring pattern frequencies.
992 std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
993 for (URLMatcherConditionSets::const_iterator condition_set_iter =
994 url_matcher_condition_sets_.begin();
995 condition_set_iter != url_matcher_condition_sets_.end();
996 ++condition_set_iter) {
997 const URLMatcherConditionSet::Conditions& conditions =
998 condition_set_iter->second->conditions();
999 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1000 conditions.begin(); condition_iter != conditions.end();
1002 const StringPattern* pattern = condition_iter->string_pattern();
1003 substring_pattern_frequencies[pattern->id()]++;
1006 const URLMatcherConditionSet::QueryConditions& query_conditions =
1007 condition_set_iter->second->query_conditions();
1008 for (URLMatcherConditionSet::QueryConditions::const_iterator
1009 query_condition_iter = query_conditions.begin();
1010 query_condition_iter != query_conditions.end();
1011 ++query_condition_iter) {
1012 const StringPattern* pattern = query_condition_iter->string_pattern();
1013 substring_pattern_frequencies[pattern->id()]++;
1017 // Update trigger conditions: Determine for each URLMatcherConditionSet which
1018 // URLMatcherCondition contains a StringPattern that occurs least
1019 // frequently in this URLMatcher. We assume that this condition is very
1020 // specific and occurs rarely in URLs. If a match occurs for this
1021 // URLMatcherCondition, we want to test all other URLMatcherCondition in the
1022 // respective URLMatcherConditionSet as well to see whether the entire
1023 // URLMatcherConditionSet is considered matching.
1024 substring_match_triggers_.clear();
1025 for (URLMatcherConditionSets::const_iterator condition_set_iter =
1026 url_matcher_condition_sets_.begin();
1027 condition_set_iter != url_matcher_condition_sets_.end();
1028 ++condition_set_iter) {
1029 const URLMatcherConditionSet::Conditions& conditions =
1030 condition_set_iter->second->conditions();
1031 if (conditions.empty())
1033 URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1035 StringPattern::ID trigger = condition_iter->string_pattern()->id();
1036 // We skip the first element in the following loop.
1038 for (; condition_iter != conditions.end(); ++condition_iter) {
1039 StringPattern::ID current_id =
1040 condition_iter->string_pattern()->id();
1041 if (substring_pattern_frequencies[trigger] >
1042 substring_pattern_frequencies[current_id]) {
1043 trigger = current_id;
1047 const URLMatcherConditionSet::QueryConditions& query_conditions =
1048 condition_set_iter->second->query_conditions();
1049 for (URLMatcherConditionSet::QueryConditions::const_iterator
1050 query_condition_iter = query_conditions.begin();
1051 query_condition_iter != query_conditions.end();
1052 ++query_condition_iter) {
1053 StringPattern::ID current_id =
1054 query_condition_iter->string_pattern()->id();
1055 if (substring_pattern_frequencies[trigger] >
1056 substring_pattern_frequencies[current_id]) {
1057 trigger = current_id;
1061 substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
1065 void URLMatcher::UpdateConditionFactory() {
1066 std::set<StringPattern::ID> used_patterns;
1067 for (URLMatcherConditionSets::const_iterator condition_set_iter =
1068 url_matcher_condition_sets_.begin();
1069 condition_set_iter != url_matcher_condition_sets_.end();
1070 ++condition_set_iter) {
1071 const URLMatcherConditionSet::Conditions& conditions =
1072 condition_set_iter->second->conditions();
1073 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1074 conditions.begin(); condition_iter != conditions.end();
1076 used_patterns.insert(condition_iter->string_pattern()->id());
1078 const URLMatcherConditionSet::QueryConditions& query_conditions =
1079 condition_set_iter->second->query_conditions();
1080 for (URLMatcherConditionSet::QueryConditions::const_iterator
1081 query_condition_iter = query_conditions.begin();
1082 query_condition_iter != query_conditions.end();
1083 ++query_condition_iter) {
1084 used_patterns.insert(query_condition_iter->string_pattern()->id());
1087 condition_factory_.ForgetUnusedPatterns(used_patterns);
1090 void URLMatcher::UpdateInternalDatastructures() {
1091 UpdateSubstringSetMatcher(false);
1092 UpdateSubstringSetMatcher(true);
1093 UpdateRegexSetMatcher();
1095 UpdateConditionFactory();
1098 } // namespace url_matcher