Upload upstream chromium 69.0.3497
[platform/framework/web/chromium-efl.git] / components / url_matcher / url_matcher.cc
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/url_matcher/url_matcher.h"
6
7 #include <algorithm>
8 #include <iterator>
9 #include <utility>
10
11 #include "base/logging.h"
12 #include "base/memory/ptr_util.h"
13 #include "base/stl_util.h"
14 #include "url/gurl.h"
15 #include "url/url_canon.h"
16
17 namespace url_matcher {
18
19 // This set of classes implement a mapping of URL Component Patterns, such as
20 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
21 // for use in substring comparisons.
22 //
23 // The idea of this mapping is to reduce the problem of comparing many
24 // URL Component Patterns against one URL to the problem of searching many
25 // substrings in one string:
26 //
27 // ----------------------                    -----------------
28 // | URL Query operator | ----translate----> | StringPattern |
29 // ----------------------                    -----------------
30 //                                                   ^
31 //                                                   |
32 //                                                compare
33 //                                                   |
34 //                                                   v
35 // ----------------------                    -----------------
36 // | URL to compare     |                    |               |
37 // | to all URL Query   | ----translate----> | String        |
38 // | operators          |                    |               |
39 // ----------------------                    -----------------
40 //
41 // The reason for this problem reduction is that there are efficient algorithms
42 // for searching many substrings in one string (see Aho-Corasick algorithm).
43 //
44 // Additionally, some of the same pieces are reused to implement regular
45 // expression comparisons. The FilteredRE2 implementation for matching many
46 // regular expressions against one string uses prefiltering, in which a set
47 // of substrings (derived from the regexes) are first searched for, to reduce
48 // the number of regular expressions to test; the prefiltering step also
49 // uses Aho-Corasick.
50 //
51 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
52 // ==========================================================
53 //
54 // For searches in this class, we normalize URLs as follows:
55 //
56 // Step 1:
57 // Remove scheme, port and segment from URL:
58 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
59 //    www.example.com/index.html?search=foo
60 //
61 // We remove the scheme and port number because they can be checked later
62 // in a secondary filter step. We remove the segment (the #... part) because
63 // this is not guaranteed to be ASCII-7 encoded.
64 //
65 // Step 2:
66 // Translate URL to String and add the following position markers:
67 // - BU = Beginning of URL
68 // - ED = End of Domain
69 // - EP = End of Path
70 // - EU = End of URL
71 // Furthermore, the hostname is canonicalized to start with a ".".
72 //
73 // Position markers are represented as characters >127, which are therefore
74 // guaranteed not to be part of the ASCII-7 encoded URL character set.
75 //
76 // -> www.example.com/index.html?search=foo becomes
77 // BU .www.example.com ED /index.html EP ?search=foo EU
78 //
79 // -> www.example.com/index.html becomes
80 // BU .www.example.com ED /index.html EP EU
81 //
82 // Step 3:
83 // Translate URL Component Patterns as follows:
84 //
85 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
86 // -> host_prefix("www.example") = BU .www.example
87 //
88 // host_suffix(suffix) = suffix ED
89 // -> host_suffix("example.com") = example.com ED
90 // -> host_suffix(".example.com") = .example.com ED
91 //
92 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
93 // -> host_equals("www.example.com") = BU .www.example.com ED
94 //
95 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
96 //
97 // With this, we can search the StringPatterns in the normalized URL.
98 //
99 //
100 // Case 2: url_{prefix,suffix,equals,contains} searches.
101 // =====================================================
102 //
103 // Step 1: as above, except that
104 // - the scheme is not removed
105 // - the port is not removed if it is specified and does not match the default
106 //   port for the given scheme.
107 //
108 // Step 2:
109 // Translate URL to String and add the following position markers:
110 // - BU = Beginning of URL
111 // - EU = End of URL
112 //
113 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
114 // BU http://www.example.com:8080/index.html?search=foo EU
115 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
116 // BU http://www.example.com/index.html?search=foo EU
117 //
118 // url_prefix(prefix) = BU prefix
119 // -> url_prefix("http://www.example") = BU http://www.example
120 //
121 // url_contains(substring) = substring
122 // -> url_contains("index") = index
123 //
124 //
125 // Case 3: {host,path,query}_contains searches.
126 // ============================================
127 //
128 // These kinds of searches are not supported directly but can be derived
129 // by a combination of a url_contains() query followed by an explicit test:
130 //
131 // host_contains(str) = url_contains(str) followed by test whether str occurs
132 //   in host component of original URL.
133 // -> host_contains("example.co") = example.co
134 //    followed by gurl.host().find("example.co");
135 //
136 // [similarly for path_contains and query_contains].
137 //
138 //
139 // Regular expression matching (url_matches searches)
140 // ==================================================
141 //
142 // This class also supports matching regular expressions (RE2 syntax)
143 // against full URLs, which are transformed as in case 2.
144
145 namespace {
146
147 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
148   return criterion == URLMatcherCondition::URL_MATCHES;
149 }
150
151 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
152   return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
153 }
154
155 }  // namespace
156
157 //
158 // URLMatcherCondition
159 //
160
161 URLMatcherCondition::URLMatcherCondition()
162     : criterion_(HOST_PREFIX), string_pattern_(nullptr) {}
163
164 URLMatcherCondition::~URLMatcherCondition() {}
165
166 URLMatcherCondition::URLMatcherCondition(
167     Criterion criterion,
168     const StringPattern* string_pattern)
169     : criterion_(criterion),
170       string_pattern_(string_pattern) {}
171
172 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
173     : criterion_(rhs.criterion_),
174       string_pattern_(rhs.string_pattern_) {}
175
176 URLMatcherCondition& URLMatcherCondition::operator=(
177     const URLMatcherCondition& rhs) {
178   criterion_ = rhs.criterion_;
179   string_pattern_ = rhs.string_pattern_;
180   return *this;
181 }
182
183 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
184   if (criterion_ < rhs.criterion_) return true;
185   if (criterion_ > rhs.criterion_) return false;
186   if (string_pattern_ != nullptr && rhs.string_pattern_ != nullptr)
187     return *string_pattern_ < *rhs.string_pattern_;
188   if (string_pattern_ == nullptr && rhs.string_pattern_ != nullptr)
189     return true;
190   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
191   // or both are NULL.
192   return false;
193 }
194
195 bool URLMatcherCondition::IsFullURLCondition() const {
196   // For these criteria the SubstringMatcher needs to be executed on the
197   // GURL that is canonicalized with
198   // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
199   switch (criterion_) {
200     case HOST_CONTAINS:
201     case PATH_CONTAINS:
202     case QUERY_CONTAINS:
203     case URL_PREFIX:
204     case URL_SUFFIX:
205     case URL_CONTAINS:
206     case URL_EQUALS:
207       return true;
208     default:
209       break;
210   }
211   return false;
212 }
213
214 bool URLMatcherCondition::IsRegexCondition() const {
215   return IsRegexCriterion(criterion_);
216 }
217
218 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
219   return IsOriginAndPathRegexCriterion(criterion_);
220 }
221
222 bool URLMatcherCondition::IsMatch(
223     const std::set<StringPattern::ID>& matching_patterns,
224     const GURL& url) const {
225   DCHECK(string_pattern_);
226   if (!base::ContainsKey(matching_patterns, string_pattern_->id()))
227     return false;
228   // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
229   // a substring match on the raw URL. In case of a match, we need to verify
230   // that the match was found in the correct component of the URL.
231   switch (criterion_) {
232     case HOST_CONTAINS:
233       return url.host().find(string_pattern_->pattern()) !=
234           std::string::npos;
235     case PATH_CONTAINS:
236       return url.path().find(string_pattern_->pattern()) !=
237           std::string::npos;
238     case QUERY_CONTAINS:
239       return url.query().find(string_pattern_->pattern()) !=
240           std::string::npos;
241     default:
242       break;
243   }
244   return true;
245 }
246
247 //
248 // URLMatcherConditionFactory
249 //
250
251 namespace {
252 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
253 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
254 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
255 const char kEndOfPath[] = {static_cast<char>(-3), 0};
256 const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0};
257 const char kEndOfURL[] = {static_cast<char>(-5), 0};
258
259 // The delimiter for query parameters
260 const char kQuerySeparator = '&';
261 }  // namespace
262
263 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
264
265 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
266 }
267
268 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
269     const GURL& url) const {
270   return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
271          url.path() + kEndOfPath +
272          (url.has_query() ? CanonicalizeQuery(url.query(), true, true)
273                           : std::string()) +
274          kEndOfURL;
275 }
276
277 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
278     const std::string& prefix) {
279   return CreateCondition(URLMatcherCondition::HOST_PREFIX,
280       kBeginningOfURL + CanonicalizeHostPrefix(prefix));
281 }
282
283 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
284     const std::string& suffix) {
285   return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
286       CanonicalizeHostSuffix(suffix) + kEndOfDomain);
287 }
288
289 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
290     const std::string& str) {
291   return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
292 }
293
294 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
295     const std::string& str) {
296   return CreateCondition(URLMatcherCondition::HOST_EQUALS,
297       kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
298 }
299
300 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
301     const std::string& prefix) {
302   return CreateCondition(URLMatcherCondition::PATH_PREFIX,
303       kEndOfDomain + prefix);
304 }
305
306 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
307     const std::string& suffix) {
308   return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
309 }
310
311 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
312     const std::string& str) {
313   return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
314 }
315
316 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
317     const std::string& str) {
318   return CreateCondition(URLMatcherCondition::PATH_EQUALS,
319       kEndOfDomain + str + kEndOfPath);
320 }
321
322 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
323     const std::string& prefix) {
324   std::string pattern;
325   if (!prefix.empty() && prefix[0] == '?')
326     pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false);
327   else
328     pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false);
329
330   return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
331 }
332
333 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
334     const std::string& suffix) {
335   if (!suffix.empty() && suffix[0] == '?') {
336     return CreateQueryEqualsCondition(suffix);
337   } else {
338     return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
339                            CanonicalizeQuery(suffix, false, true) + kEndOfURL);
340   }
341 }
342
343 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
344     const std::string& str) {
345   if (!str.empty() && str[0] == '?')
346     return CreateQueryPrefixCondition(str);
347   else
348     return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
349 }
350
351 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
352     const std::string& str) {
353   std::string pattern;
354   if (!str.empty() && str[0] == '?')
355     pattern =
356         kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL;
357   else
358     pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL;
359
360   return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
361 }
362
363 URLMatcherCondition
364     URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
365     const std::string& host_suffix,
366     const std::string& path_prefix) {
367   return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
368       CanonicalizeHostSuffix(host_suffix) + kEndOfDomain + path_prefix);
369 }
370
371 URLMatcherCondition
372 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
373     const std::string& host,
374     const std::string& path_prefix) {
375   return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
376       kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
377       path_prefix);
378 }
379
380 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
381     const GURL& url) const {
382   GURL::Replacements replacements;
383   replacements.ClearPassword();
384   replacements.ClearUsername();
385   replacements.ClearRef();
386   // Clear port if it is implicit from scheme.
387   if (url.has_port()) {
388     const std::string& port = url.scheme();
389     if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
390         url.EffectiveIntPort()) {
391       replacements.ClearPort();
392     }
393   }
394   return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
395       kEndOfURL;
396 }
397
398 static std::string CanonicalizeURLForRegexSearchesHelper(
399     const GURL& url,
400     bool clear_query) {
401   GURL::Replacements replacements;
402   replacements.ClearPassword();
403   replacements.ClearUsername();
404   replacements.ClearRef();
405   if (clear_query)
406     replacements.ClearQuery();
407   // Clear port if it is implicit from scheme.
408   if (url.has_port()) {
409     const std::string& port = url.scheme();
410     if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
411         url.EffectiveIntPort()) {
412       replacements.ClearPort();
413     }
414   }
415   return url.ReplaceComponents(replacements).spec();
416 }
417
418 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
419     const GURL& url) const {
420   return CanonicalizeURLForRegexSearchesHelper(url, false);
421 }
422
423 std::string
424 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
425     const GURL& url) const {
426   return CanonicalizeURLForRegexSearchesHelper(url, true);
427 }
428
429 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
430     const std::string& prefix) {
431   return CreateCondition(URLMatcherCondition::URL_PREFIX,
432       kBeginningOfURL + prefix);
433 }
434
435 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
436     const std::string& suffix) {
437   return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
438 }
439
440 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
441     const std::string& str) {
442   return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
443 }
444
445 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
446     const std::string& str) {
447   return CreateCondition(URLMatcherCondition::URL_EQUALS,
448       kBeginningOfURL + str + kEndOfURL);
449 }
450
451 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
452     const std::string& regex) {
453   return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
454 }
455
456 URLMatcherCondition
457 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
458     const std::string& regex) {
459   return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
460 }
461
462 void URLMatcherConditionFactory::ForgetUnusedPatterns(
463       const std::set<StringPattern::ID>& used_patterns) {
464   auto i = substring_pattern_singletons_.begin();
465   while (i != substring_pattern_singletons_.end()) {
466     if (base::ContainsKey(used_patterns, i->first->id()))
467       ++i;
468     else
469       substring_pattern_singletons_.erase(i++);
470   }
471
472   i = regex_pattern_singletons_.begin();
473   while (i != regex_pattern_singletons_.end()) {
474     if (base::ContainsKey(used_patterns, i->first->id()))
475       ++i;
476     else
477       regex_pattern_singletons_.erase(i++);
478   }
479
480   i = origin_and_path_regex_pattern_singletons_.begin();
481   while (i != origin_and_path_regex_pattern_singletons_.end()) {
482     if (base::ContainsKey(used_patterns, i->first->id()))
483       ++i;
484     else
485       origin_and_path_regex_pattern_singletons_.erase(i++);
486   }
487 }
488
489 bool URLMatcherConditionFactory::IsEmpty() const {
490   return substring_pattern_singletons_.empty() &&
491       regex_pattern_singletons_.empty() &&
492       origin_and_path_regex_pattern_singletons_.empty();
493 }
494
495 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
496     URLMatcherCondition::Criterion criterion,
497     const std::string& pattern) {
498   StringPattern search_pattern(pattern, 0);
499   PatternSingletons* pattern_singletons = nullptr;
500   if (IsRegexCriterion(criterion))
501     pattern_singletons = &regex_pattern_singletons_;
502   else if (IsOriginAndPathRegexCriterion(criterion))
503     pattern_singletons = &origin_and_path_regex_pattern_singletons_;
504   else
505     pattern_singletons = &substring_pattern_singletons_;
506
507   auto iter = pattern_singletons->find(&search_pattern);
508
509   if (iter != pattern_singletons->end())
510     return URLMatcherCondition(criterion, iter->first);
511
512   StringPattern* new_pattern = new StringPattern(pattern, id_counter_++);
513   (*pattern_singletons)[new_pattern] = base::WrapUnique(new_pattern);
514   return URLMatcherCondition(criterion, new_pattern);
515 }
516
517 std::string URLMatcherConditionFactory::CanonicalizeHostSuffix(
518     const std::string& suffix) const {
519   if (suffix.empty())
520     return ".";
521   return suffix.back() == '.' ? suffix : suffix + ".";
522 }
523
524 std::string URLMatcherConditionFactory::CanonicalizeHostPrefix(
525     const std::string& prefix) const {
526   if (prefix.empty())
527     return ".";
528   return prefix[0] == '.' ? prefix : "." + prefix;
529 }
530
531 std::string URLMatcherConditionFactory::CanonicalizeHostname(
532     const std::string& hostname) const {
533   return CanonicalizeHostPrefix(CanonicalizeHostSuffix(hostname));
534 }
535
536 // This function prepares the query string by replacing query separator with a
537 // magic value (|kQueryComponentDelimiter|). When the boolean
538 // |prepend_beginning_of_query_component| is true the function prepends the
539 // query with the same magic. This is done to locate the start of a key value
540 // pair in the query string. The parameter |query| is passed by value
541 // intentionally, since it is locally modified.
542 std::string URLMatcherConditionFactory::CanonicalizeQuery(
543     std::string query,
544     bool prepend_beginning_of_query_component,
545     bool append_end_of_query_component) const {
546   for (std::string::iterator it = query.begin(); it != query.end(); ++it) {
547     if (*it == kQuerySeparator)
548       *it = kQueryComponentDelimiter[0];
549   }
550   if (prepend_beginning_of_query_component)
551     query = kQueryComponentDelimiter + query;
552   if (append_end_of_query_component)
553     query += kQueryComponentDelimiter;
554   return query;
555 }
556
557 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
558     StringPattern* lhs,
559     StringPattern* rhs) const {
560   if (lhs == nullptr && rhs != nullptr)
561     return true;
562   if (lhs != nullptr && rhs != nullptr)
563     return lhs->pattern() < rhs->pattern();
564   // Either both are NULL or only rhs is NULL.
565   return false;
566 }
567
568 //
569 // URLQueryElementMatcherCondition
570 //
571
572 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
573     const std::string& key,
574     const std::string& value,
575     QueryValueMatchType query_value_match_type,
576     QueryElementType query_element_type,
577     Type match_type,
578     URLMatcherConditionFactory* factory) {
579   match_type_ = match_type;
580
581   if (query_element_type == ELEMENT_TYPE_KEY_VALUE) {
582     key_ = kQueryComponentDelimiter + key + "=";
583     value_ = value;
584   } else {
585     key_ = kQueryComponentDelimiter + key;
586     value_ = std::string();
587   }
588
589   if (query_value_match_type == QUERY_VALUE_MATCH_EXACT)
590     value_ += kQueryComponentDelimiter;
591
592   // If |value_| is empty no need to find the |key_| and verify if the value
593   // matches. Simply checking the presence of key is sufficient, which is done
594   // by MATCH_ANY
595   if (value_.empty())
596     match_type_ = MATCH_ANY;
597
598   URLMatcherCondition condition;
599   // If |match_type_| is MATCH_ANY, then we could simply look for the
600   // combination of |key_| + |value_|, which can be efficiently done by
601   // SubstringMatcher
602   if (match_type_ == MATCH_ANY)
603     condition = factory->CreateQueryContainsCondition(key_ + value_);
604   else
605     condition = factory->CreateQueryContainsCondition(key_);
606   string_pattern_ = condition.string_pattern();
607
608   key_length_ = key_.length();
609   value_length_ = value_.length();
610 }
611
612 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
613     const URLQueryElementMatcherCondition& other) = default;
614
615 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {}
616
617 bool URLQueryElementMatcherCondition::operator<(
618     const URLQueryElementMatcherCondition& rhs) const {
619   if (match_type_ != rhs.match_type_)
620     return match_type_ < rhs.match_type_;
621   if (string_pattern_ != nullptr && rhs.string_pattern_ != nullptr)
622     return *string_pattern_ < *rhs.string_pattern_;
623   if (string_pattern_ == nullptr && rhs.string_pattern_ != nullptr)
624     return true;
625   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
626   // or both are NULL.
627   return false;
628 }
629
630 bool URLQueryElementMatcherCondition::IsMatch(
631     const std::string& url_for_component_searches) const {
632   switch (match_type_) {
633     case MATCH_ANY: {
634       // For MATCH_ANY, no additional verification step is needed. We can trust
635       // the SubstringMatcher to do the verification.
636       return true;
637     }
638     case MATCH_ALL: {
639       size_t start = 0;
640       int found = 0;
641       size_t offset;
642       while ((offset = url_for_component_searches.find(key_, start)) !=
643              std::string::npos) {
644         if (url_for_component_searches.compare(
645                 offset + key_length_, value_length_, value_) != 0) {
646           return false;
647         } else {
648           ++found;
649         }
650         start = offset + key_length_ + value_length_ - 1;
651       }
652       return !!found;
653     }
654     case MATCH_FIRST: {
655       size_t offset = url_for_component_searches.find(key_);
656       return url_for_component_searches.compare(
657                  offset + key_length_, value_length_, value_) == 0;
658     }
659     case MATCH_LAST: {
660       size_t offset = url_for_component_searches.rfind(key_);
661       return url_for_component_searches.compare(
662                  offset + key_length_, value_length_, value_) == 0;
663     }
664   }
665   NOTREACHED();
666   return false;
667 }
668
669 //
670 // URLMatcherSchemeFilter
671 //
672
673 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
674     : filters_(1) {
675   filters_.push_back(filter);
676 }
677
678 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
679     const std::vector<std::string>& filters)
680     : filters_(filters) {}
681
682 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
683
684 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
685   return base::ContainsValue(filters_, url.scheme());
686 }
687
688 //
689 // URLMatcherPortFilter
690 //
691
692 URLMatcherPortFilter::URLMatcherPortFilter(
693     const std::vector<URLMatcherPortFilter::Range>& ranges)
694     : ranges_(ranges) {}
695
696 URLMatcherPortFilter::~URLMatcherPortFilter() {}
697
698 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
699   int port = url.EffectiveIntPort();
700   for (std::vector<Range>::const_iterator i = ranges_.begin();
701        i != ranges_.end(); ++i) {
702     if (i->first <= port && port <= i->second)
703       return true;
704   }
705   return false;
706 }
707
708 // static
709 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
710                                                               int to) {
711   return Range(from, to);
712 }
713
714 // static
715 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
716   return Range(port, port);
717 }
718
719 //
720 // URLMatcherConditionSet
721 //
722
723 URLMatcherConditionSet::~URLMatcherConditionSet() {}
724
725 URLMatcherConditionSet::URLMatcherConditionSet(
726     ID id,
727     const Conditions& conditions)
728     : id_(id),
729       conditions_(conditions) {}
730
731 URLMatcherConditionSet::URLMatcherConditionSet(
732     ID id,
733     const Conditions& conditions,
734     std::unique_ptr<URLMatcherSchemeFilter> scheme_filter,
735     std::unique_ptr<URLMatcherPortFilter> port_filter)
736     : id_(id),
737       conditions_(conditions),
738       scheme_filter_(std::move(scheme_filter)),
739       port_filter_(std::move(port_filter)) {}
740
741 URLMatcherConditionSet::URLMatcherConditionSet(
742     ID id,
743     const Conditions& conditions,
744     const QueryConditions& query_conditions,
745     std::unique_ptr<URLMatcherSchemeFilter> scheme_filter,
746     std::unique_ptr<URLMatcherPortFilter> port_filter)
747     : id_(id),
748       conditions_(conditions),
749       query_conditions_(query_conditions),
750       scheme_filter_(std::move(scheme_filter)),
751       port_filter_(std::move(port_filter)) {}
752
753 bool URLMatcherConditionSet::IsMatch(
754     const std::set<StringPattern::ID>& matching_patterns,
755     const GURL& url) const {
756   return IsMatch(matching_patterns, url, std::string());
757 }
758
759 bool URLMatcherConditionSet::IsMatch(
760     const std::set<StringPattern::ID>& matching_patterns,
761     const GURL& url,
762     const std::string& url_for_component_searches) const {
763   for (Conditions::const_iterator i = conditions_.begin();
764        i != conditions_.end(); ++i) {
765     if (!i->IsMatch(matching_patterns, url))
766       return false;
767   }
768   if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
769     return false;
770   if (port_filter_.get() && !port_filter_->IsMatch(url))
771     return false;
772   if (query_conditions_.empty())
773     return true;
774   // The loop is duplicated below for performance reasons. If not all query
775   // elements are found, no need to verify match that is expected to take more
776   // cycles.
777   for (QueryConditions::const_iterator i = query_conditions_.begin();
778        i != query_conditions_.end();
779        ++i) {
780     if (!base::ContainsKey(matching_patterns, i->string_pattern()->id()))
781       return false;
782   }
783   for (QueryConditions::const_iterator i = query_conditions_.begin();
784        i != query_conditions_.end();
785        ++i) {
786     if (!i->IsMatch(url_for_component_searches))
787       return false;
788   }
789   return true;
790 }
791
792 //
793 // URLMatcher
794 //
795
796 URLMatcher::URLMatcher() {}
797
798 URLMatcher::~URLMatcher() {}
799
800 void URLMatcher::AddConditionSets(
801     const URLMatcherConditionSet::Vector& condition_sets) {
802   for (URLMatcherConditionSet::Vector::const_iterator i =
803        condition_sets.begin(); i != condition_sets.end(); ++i) {
804     DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
805         url_matcher_condition_sets_.end());
806     url_matcher_condition_sets_[(*i)->id()] = *i;
807   }
808   UpdateInternalDatastructures();
809 }
810
811 void URLMatcher::RemoveConditionSets(
812     const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
813   for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
814        condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
815     DCHECK(url_matcher_condition_sets_.find(*i) !=
816         url_matcher_condition_sets_.end());
817     url_matcher_condition_sets_.erase(*i);
818   }
819   UpdateInternalDatastructures();
820 }
821
822 void URLMatcher::ClearUnusedConditionSets() {
823   UpdateConditionFactory();
824 }
825
826 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
827     const GURL& url) const {
828   // Find all IDs of StringPatterns that match |url|.
829   // See URLMatcherConditionFactory for the canonicalization of URLs and the
830   // distinction between full url searches and url component searches.
831   std::set<StringPattern::ID> matches;
832   std::string url_for_component_searches;
833
834   if (!full_url_matcher_.IsEmpty()) {
835     full_url_matcher_.Match(
836         condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
837   }
838   if (!url_component_matcher_.IsEmpty()) {
839     url_for_component_searches =
840         condition_factory_.CanonicalizeURLForComponentSearches(url);
841     url_component_matcher_.Match(url_for_component_searches, &matches);
842   }
843   if (!regex_set_matcher_.IsEmpty()) {
844     regex_set_matcher_.Match(
845         condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
846   }
847   if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
848     origin_and_path_regex_set_matcher_.Match(
849         condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
850         &matches);
851   }
852
853   // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
854   // were fulfilled.
855   std::set<URLMatcherConditionSet::ID> result;
856   for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
857        i != matches.end(); ++i) {
858     // For each URLMatcherConditionSet there is exactly one condition
859     // registered in substring_match_triggers_. This means that the following
860     // logic tests each URLMatcherConditionSet exactly once if it can be
861     // completely fulfilled.
862     StringPatternTriggers::const_iterator triggered_condition_sets_iter =
863         substring_match_triggers_.find(*i);
864     if (triggered_condition_sets_iter == substring_match_triggers_.end())
865       continue;  // Not all substring matches are triggers for a condition set.
866     const std::set<URLMatcherConditionSet::ID>& condition_sets =
867         triggered_condition_sets_iter->second;
868     for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
869          condition_sets.begin(); j != condition_sets.end(); ++j) {
870       URLMatcherConditionSets::const_iterator condition_set_iter =
871           url_matcher_condition_sets_.find(*j);
872       DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
873       if (condition_set_iter->second->IsMatch(
874               matches, url, url_for_component_searches))
875         result.insert(*j);
876     }
877   }
878
879   return result;
880 }
881
882 bool URLMatcher::IsEmpty() const {
883   return condition_factory_.IsEmpty() &&
884       url_matcher_condition_sets_.empty() &&
885       substring_match_triggers_.empty() &&
886       full_url_matcher_.IsEmpty() &&
887       url_component_matcher_.IsEmpty() &&
888       regex_set_matcher_.IsEmpty() &&
889       origin_and_path_regex_set_matcher_.IsEmpty() &&
890       registered_full_url_patterns_.empty() &&
891       registered_url_component_patterns_.empty();
892 }
893
894 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
895   // The purpose of |full_url_conditions| is just that we need to execute
896   // the same logic once for Full URL searches and once for URL Component
897   // searches (see URLMatcherConditionFactory).
898
899   // Determine which patterns need to be registered when this function
900   // terminates.
901   std::set<const StringPattern*> new_patterns;
902   for (URLMatcherConditionSets::const_iterator condition_set_iter =
903       url_matcher_condition_sets_.begin();
904       condition_set_iter != url_matcher_condition_sets_.end();
905       ++condition_set_iter) {
906     const URLMatcherConditionSet::Conditions& conditions =
907         condition_set_iter->second->conditions();
908     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
909          conditions.begin(); condition_iter != conditions.end();
910          ++condition_iter) {
911       // If we are called to process Full URL searches, ignore others, and
912       // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
913       if (!condition_iter->IsRegexCondition() &&
914           !condition_iter->IsOriginAndPathRegexCondition() &&
915           full_url_conditions == condition_iter->IsFullURLCondition())
916         new_patterns.insert(condition_iter->string_pattern());
917     }
918
919     if (full_url_conditions)
920       continue;
921
922     const URLMatcherConditionSet::QueryConditions& query_conditions =
923         condition_set_iter->second->query_conditions();
924     for (URLMatcherConditionSet::QueryConditions::const_iterator
925              query_condition_iter = query_conditions.begin();
926          query_condition_iter != query_conditions.end();
927          ++query_condition_iter) {
928       new_patterns.insert(query_condition_iter->string_pattern());
929     }
930   }
931
932   // This is the set of patterns that were registered before this function
933   // is called.
934   std::set<const StringPattern*>& registered_patterns =
935       full_url_conditions ? registered_full_url_patterns_
936                           : registered_url_component_patterns_;
937
938   // Add all patterns that are in new_patterns but not in registered_patterns.
939   std::vector<const StringPattern*> patterns_to_register =
940       base::STLSetDifference<std::vector<const StringPattern*> >(
941           new_patterns, registered_patterns);
942
943   // Remove all patterns that are in registered_patterns but not in
944   // new_patterns.
945   std::vector<const StringPattern*> patterns_to_unregister =
946       base::STLSetDifference<std::vector<const StringPattern*> >(
947            registered_patterns, new_patterns);
948
949   // Update the SubstringSetMatcher.
950   SubstringSetMatcher& url_matcher =
951       full_url_conditions ? full_url_matcher_ : url_component_matcher_;
952   url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
953                                             patterns_to_unregister);
954
955   // Update the set of registered_patterns for the next time this function
956   // is being called.
957   registered_patterns.swap(new_patterns);
958 }
959
960 void URLMatcher::UpdateRegexSetMatcher() {
961   std::vector<const StringPattern*> new_patterns;
962   std::vector<const StringPattern*> new_origin_and_path_patterns;
963
964   for (URLMatcherConditionSets::const_iterator condition_set_iter =
965       url_matcher_condition_sets_.begin();
966       condition_set_iter != url_matcher_condition_sets_.end();
967       ++condition_set_iter) {
968     const URLMatcherConditionSet::Conditions& conditions =
969         condition_set_iter->second->conditions();
970     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
971          conditions.begin(); condition_iter != conditions.end();
972          ++condition_iter) {
973       if (condition_iter->IsRegexCondition()) {
974         new_patterns.push_back(condition_iter->string_pattern());
975       } else if (condition_iter->IsOriginAndPathRegexCondition()) {
976         new_origin_and_path_patterns.push_back(
977             condition_iter->string_pattern());
978       }
979     }
980   }
981
982   // Start over from scratch. We can't really do better than this, since the
983   // FilteredRE2 backend doesn't support incremental updates.
984   regex_set_matcher_.ClearPatterns();
985   regex_set_matcher_.AddPatterns(new_patterns);
986   origin_and_path_regex_set_matcher_.ClearPatterns();
987   origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
988 }
989
990 void URLMatcher::UpdateTriggers() {
991   // Count substring pattern frequencies.
992   std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
993   for (URLMatcherConditionSets::const_iterator condition_set_iter =
994       url_matcher_condition_sets_.begin();
995       condition_set_iter != url_matcher_condition_sets_.end();
996       ++condition_set_iter) {
997     const URLMatcherConditionSet::Conditions& conditions =
998         condition_set_iter->second->conditions();
999     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1000          conditions.begin(); condition_iter != conditions.end();
1001          ++condition_iter) {
1002       const StringPattern* pattern = condition_iter->string_pattern();
1003       substring_pattern_frequencies[pattern->id()]++;
1004     }
1005
1006     const URLMatcherConditionSet::QueryConditions& query_conditions =
1007         condition_set_iter->second->query_conditions();
1008     for (URLMatcherConditionSet::QueryConditions::const_iterator
1009              query_condition_iter = query_conditions.begin();
1010          query_condition_iter != query_conditions.end();
1011          ++query_condition_iter) {
1012       const StringPattern* pattern = query_condition_iter->string_pattern();
1013       substring_pattern_frequencies[pattern->id()]++;
1014     }
1015   }
1016
1017   // Update trigger conditions: Determine for each URLMatcherConditionSet which
1018   // URLMatcherCondition contains a StringPattern that occurs least
1019   // frequently in this URLMatcher. We assume that this condition is very
1020   // specific and occurs rarely in URLs. If a match occurs for this
1021   // URLMatcherCondition, we want to test all other URLMatcherCondition in the
1022   // respective URLMatcherConditionSet as well to see whether the entire
1023   // URLMatcherConditionSet is considered matching.
1024   substring_match_triggers_.clear();
1025   for (URLMatcherConditionSets::const_iterator condition_set_iter =
1026       url_matcher_condition_sets_.begin();
1027       condition_set_iter != url_matcher_condition_sets_.end();
1028       ++condition_set_iter) {
1029     const URLMatcherConditionSet::Conditions& conditions =
1030         condition_set_iter->second->conditions();
1031     if (conditions.empty())
1032       continue;
1033     URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1034         conditions.begin();
1035     StringPattern::ID trigger = condition_iter->string_pattern()->id();
1036     // We skip the first element in the following loop.
1037     ++condition_iter;
1038     for (; condition_iter != conditions.end(); ++condition_iter) {
1039       StringPattern::ID current_id =
1040           condition_iter->string_pattern()->id();
1041       if (substring_pattern_frequencies[trigger] >
1042           substring_pattern_frequencies[current_id]) {
1043         trigger = current_id;
1044       }
1045     }
1046
1047     const URLMatcherConditionSet::QueryConditions& query_conditions =
1048         condition_set_iter->second->query_conditions();
1049     for (URLMatcherConditionSet::QueryConditions::const_iterator
1050              query_condition_iter = query_conditions.begin();
1051          query_condition_iter != query_conditions.end();
1052          ++query_condition_iter) {
1053       StringPattern::ID current_id =
1054           query_condition_iter->string_pattern()->id();
1055       if (substring_pattern_frequencies[trigger] >
1056           substring_pattern_frequencies[current_id]) {
1057         trigger = current_id;
1058       }
1059     }
1060
1061     substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
1062   }
1063 }
1064
1065 void URLMatcher::UpdateConditionFactory() {
1066   std::set<StringPattern::ID> used_patterns;
1067   for (URLMatcherConditionSets::const_iterator condition_set_iter =
1068       url_matcher_condition_sets_.begin();
1069       condition_set_iter != url_matcher_condition_sets_.end();
1070       ++condition_set_iter) {
1071     const URLMatcherConditionSet::Conditions& conditions =
1072         condition_set_iter->second->conditions();
1073     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1074          conditions.begin(); condition_iter != conditions.end();
1075          ++condition_iter) {
1076       used_patterns.insert(condition_iter->string_pattern()->id());
1077     }
1078     const URLMatcherConditionSet::QueryConditions& query_conditions =
1079         condition_set_iter->second->query_conditions();
1080     for (URLMatcherConditionSet::QueryConditions::const_iterator
1081              query_condition_iter = query_conditions.begin();
1082          query_condition_iter != query_conditions.end();
1083          ++query_condition_iter) {
1084       used_patterns.insert(query_condition_iter->string_pattern()->id());
1085     }
1086   }
1087   condition_factory_.ForgetUnusedPatterns(used_patterns);
1088 }
1089
1090 void URLMatcher::UpdateInternalDatastructures() {
1091   UpdateSubstringSetMatcher(false);
1092   UpdateSubstringSetMatcher(true);
1093   UpdateRegexSetMatcher();
1094   UpdateTriggers();
1095   UpdateConditionFactory();
1096 }
1097
1098 }  // namespace url_matcher