Upstream version 7.36.149.0
[platform/framework/web/crosswalk.git] / src / components / url_matcher / url_matcher.cc
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/url_matcher/url_matcher.h"
6
7 #include <algorithm>
8 #include <iterator>
9
10 #include "base/logging.h"
11 #include "url/gurl.h"
12 #include "url/url_canon.h"
13
14 namespace url_matcher {
15
16 // This set of classes implement a mapping of URL Component Patterns, such as
17 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
18 // for use in substring comparisons.
19 //
20 // The idea of this mapping is to reduce the problem of comparing many
21 // URL Component Patterns against one URL to the problem of searching many
22 // substrings in one string:
23 //
24 // ----------------------                    -----------------
25 // | URL Query operator | ----translate----> | StringPattern |
26 // ----------------------                    -----------------
27 //                                                   ^
28 //                                                   |
29 //                                                compare
30 //                                                   |
31 //                                                   v
32 // ----------------------                    -----------------
33 // | URL to compare     |                    |               |
34 // | to all URL Query   | ----translate----> | String        |
35 // | operators          |                    |               |
36 // ----------------------                    -----------------
37 //
38 // The reason for this problem reduction is that there are efficient algorithms
39 // for searching many substrings in one string (see Aho-Corasick algorithm).
40 //
41 // Additionally, some of the same pieces are reused to implement regular
42 // expression comparisons. The FilteredRE2 implementation for matching many
43 // regular expressions against one string uses prefiltering, in which a set
44 // of substrings (derived from the regexes) are first searched for, to reduce
45 // the number of regular expressions to test; the prefiltering step also
46 // uses Aho-Corasick.
47 //
48 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
49 // ==========================================================
50 //
51 // For searches in this class, we normalize URLs as follows:
52 //
53 // Step 1:
54 // Remove scheme, port and segment from URL:
55 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
56 //    www.example.com/index.html?search=foo
57 //
58 // We remove the scheme and port number because they can be checked later
59 // in a secondary filter step. We remove the segment (the #... part) because
60 // this is not guaranteed to be ASCII-7 encoded.
61 //
62 // Step 2:
63 // Translate URL to String and add the following position markers:
64 // - BU = Beginning of URL
65 // - ED = End of Domain
66 // - EP = End of Path
67 // - EU = End of URL
68 // Furthermore, the hostname is canonicalized to start with a ".".
69 //
70 // Position markers are represented as characters >127, which are therefore
71 // guaranteed not to be part of the ASCII-7 encoded URL character set.
72 //
73 // -> www.example.com/index.html?search=foo becomes
74 // BU .www.example.com ED /index.html EP ?search=foo EU
75 //
76 // -> www.example.com/index.html becomes
77 // BU .www.example.com ED /index.html EP EU
78 //
79 // Step 3:
80 // Translate URL Component Patterns as follows:
81 //
82 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
83 // -> host_prefix("www.example") = BU .www.example
84 //
85 // host_suffix(suffix) = suffix ED
86 // -> host_suffix("example.com") = example.com ED
87 // -> host_suffix(".example.com") = .example.com ED
88 //
89 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
90 // -> host_equals("www.example.com") = BU .www.example.com ED
91 //
92 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
93 //
94 // With this, we can search the StringPatterns in the normalized URL.
95 //
96 //
97 // Case 2: url_{prefix,suffix,equals,contains} searches.
98 // =====================================================
99 //
100 // Step 1: as above, except that
101 // - the scheme is not removed
102 // - the port is not removed if it is specified and does not match the default
103 //   port for the given scheme.
104 //
105 // Step 2:
106 // Translate URL to String and add the following position markers:
107 // - BU = Beginning of URL
108 // - EU = End of URL
109 //
110 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
111 // BU http://www.example.com:8080/index.html?search=foo EU
112 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
113 // BU http://www.example.com/index.html?search=foo EU
114 //
115 // url_prefix(prefix) = BU prefix
116 // -> url_prefix("http://www.example") = BU http://www.example
117 //
118 // url_contains(substring) = substring
119 // -> url_contains("index") = index
120 //
121 //
122 // Case 3: {host,path,query}_contains searches.
123 // ============================================
124 //
125 // These kinds of searches are not supported directly but can be derived
126 // by a combination of a url_contains() query followed by an explicit test:
127 //
128 // host_contains(str) = url_contains(str) followed by test whether str occurs
129 //   in host component of original URL.
130 // -> host_contains("example.co") = example.co
131 //    followed by gurl.host().find("example.co");
132 //
133 // [similarly for path_contains and query_contains].
134 //
135 //
136 // Regular expression matching (url_matches searches)
137 // ==================================================
138 //
139 // This class also supports matching regular expressions (RE2 syntax)
140 // against full URLs, which are transformed as in case 2.
141
142 namespace {
143
144 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
145   return criterion == URLMatcherCondition::URL_MATCHES;
146 }
147
148 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
149   return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
150 }
151
152 }  // namespace
153
154 //
155 // URLMatcherCondition
156 //
157
158 URLMatcherCondition::URLMatcherCondition()
159     : criterion_(HOST_PREFIX),
160       string_pattern_(NULL) {}
161
162 URLMatcherCondition::~URLMatcherCondition() {}
163
164 URLMatcherCondition::URLMatcherCondition(
165     Criterion criterion,
166     const StringPattern* string_pattern)
167     : criterion_(criterion),
168       string_pattern_(string_pattern) {}
169
170 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
171     : criterion_(rhs.criterion_),
172       string_pattern_(rhs.string_pattern_) {}
173
174 URLMatcherCondition& URLMatcherCondition::operator=(
175     const URLMatcherCondition& rhs) {
176   criterion_ = rhs.criterion_;
177   string_pattern_ = rhs.string_pattern_;
178   return *this;
179 }
180
181 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
182   if (criterion_ < rhs.criterion_) return true;
183   if (criterion_ > rhs.criterion_) return false;
184   if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
185     return *string_pattern_ < *rhs.string_pattern_;
186   if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
187   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
188   // or both are NULL.
189   return false;
190 }
191
192 bool URLMatcherCondition::IsFullURLCondition() const {
193   // For these criteria the SubstringMatcher needs to be executed on the
194   // GURL that is canonicalized with
195   // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
196   switch (criterion_) {
197     case HOST_CONTAINS:
198     case PATH_CONTAINS:
199     case QUERY_CONTAINS:
200     case URL_PREFIX:
201     case URL_SUFFIX:
202     case URL_CONTAINS:
203     case URL_EQUALS:
204       return true;
205     default:
206       break;
207   }
208   return false;
209 }
210
211 bool URLMatcherCondition::IsRegexCondition() const {
212   return IsRegexCriterion(criterion_);
213 }
214
215 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
216   return IsOriginAndPathRegexCriterion(criterion_);
217 }
218
219 bool URLMatcherCondition::IsMatch(
220     const std::set<StringPattern::ID>& matching_patterns,
221     const GURL& url) const {
222   DCHECK(string_pattern_);
223   if (!ContainsKey(matching_patterns, string_pattern_->id()))
224     return false;
225   // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
226   // a substring match on the raw URL. In case of a match, we need to verify
227   // that the match was found in the correct component of the URL.
228   switch (criterion_) {
229     case HOST_CONTAINS:
230       return url.host().find(string_pattern_->pattern()) !=
231           std::string::npos;
232     case PATH_CONTAINS:
233       return url.path().find(string_pattern_->pattern()) !=
234           std::string::npos;
235     case QUERY_CONTAINS:
236       return url.query().find(string_pattern_->pattern()) !=
237           std::string::npos;
238     default:
239       break;
240   }
241   return true;
242 }
243
244 //
245 // URLMatcherConditionFactory
246 //
247
248 namespace {
249 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
250 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
251 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
252 const char kEndOfPath[] = {static_cast<char>(-3), 0};
253 const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0};
254 const char kEndOfURL[] = {static_cast<char>(-5), 0};
255
256 // The delimiter for query parameters
257 const char kQuerySeparator = '&';
258 }  // namespace
259
260 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
261
262 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
263   STLDeleteElements(&substring_pattern_singletons_);
264   STLDeleteElements(&regex_pattern_singletons_);
265   STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
266 }
267
268 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
269     const GURL& url) const {
270   return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
271          url.path() + kEndOfPath +
272          (url.has_query() ? CanonicalizeQuery(url.query(), true, true)
273                           : std::string()) +
274          kEndOfURL;
275 }
276
277 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
278     const std::string& prefix) {
279   return CreateCondition(URLMatcherCondition::HOST_PREFIX,
280       kBeginningOfURL + CanonicalizeHostname(prefix));
281 }
282
283 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
284     const std::string& suffix) {
285   return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
286       suffix + kEndOfDomain);
287 }
288
289 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
290     const std::string& str) {
291   return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
292 }
293
294 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
295     const std::string& str) {
296   return CreateCondition(URLMatcherCondition::HOST_EQUALS,
297       kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
298 }
299
300 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
301     const std::string& prefix) {
302   return CreateCondition(URLMatcherCondition::PATH_PREFIX,
303       kEndOfDomain + prefix);
304 }
305
306 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
307     const std::string& suffix) {
308   return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
309 }
310
311 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
312     const std::string& str) {
313   return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
314 }
315
316 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
317     const std::string& str) {
318   return CreateCondition(URLMatcherCondition::PATH_EQUALS,
319       kEndOfDomain + str + kEndOfPath);
320 }
321
322 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
323     const std::string& prefix) {
324   std::string pattern;
325   if (!prefix.empty() && prefix[0] == '?')
326     pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false);
327   else
328     pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false);
329
330   return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
331 }
332
333 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
334     const std::string& suffix) {
335   if (!suffix.empty() && suffix[0] == '?') {
336     return CreateQueryEqualsCondition(suffix);
337   } else {
338     return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
339                            CanonicalizeQuery(suffix, false, true) + kEndOfURL);
340   }
341 }
342
343 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
344     const std::string& str) {
345   if (!str.empty() && str[0] == '?')
346     return CreateQueryPrefixCondition(str);
347   else
348     return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
349 }
350
351 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
352     const std::string& str) {
353   std::string pattern;
354   if (!str.empty() && str[0] == '?')
355     pattern =
356         kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL;
357   else
358     pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL;
359
360   return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
361 }
362
363 URLMatcherCondition
364     URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
365     const std::string& host_suffix,
366     const std::string& path_prefix) {
367   return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
368       host_suffix + kEndOfDomain + path_prefix);
369 }
370
371 URLMatcherCondition
372 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
373     const std::string& host,
374     const std::string& path_prefix) {
375   return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
376       kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
377       path_prefix);
378 }
379
380 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
381     const GURL& url) const {
382   GURL::Replacements replacements;
383   replacements.ClearPassword();
384   replacements.ClearUsername();
385   replacements.ClearRef();
386   // Clear port if it is implicit from scheme.
387   if (url.has_port()) {
388     const std::string& port = url.scheme();
389     if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
390         url.EffectiveIntPort()) {
391       replacements.ClearPort();
392     }
393   }
394   return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
395       kEndOfURL;
396 }
397
398 static std::string CanonicalizeURLForRegexSearchesHelper(
399     const GURL& url,
400     bool clear_query) {
401   GURL::Replacements replacements;
402   replacements.ClearPassword();
403   replacements.ClearUsername();
404   replacements.ClearRef();
405   if (clear_query)
406     replacements.ClearQuery();
407   // Clear port if it is implicit from scheme.
408   if (url.has_port()) {
409     const std::string& port = url.scheme();
410     if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
411         url.EffectiveIntPort()) {
412       replacements.ClearPort();
413     }
414   }
415   return url.ReplaceComponents(replacements).spec();
416 }
417
418 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
419     const GURL& url) const {
420   return CanonicalizeURLForRegexSearchesHelper(url, false);
421 }
422
423 std::string
424 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
425     const GURL& url) const {
426   return CanonicalizeURLForRegexSearchesHelper(url, true);
427 }
428
429 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
430     const std::string& prefix) {
431   return CreateCondition(URLMatcherCondition::URL_PREFIX,
432       kBeginningOfURL + prefix);
433 }
434
435 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
436     const std::string& suffix) {
437   return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
438 }
439
440 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
441     const std::string& str) {
442   return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
443 }
444
445 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
446     const std::string& str) {
447   return CreateCondition(URLMatcherCondition::URL_EQUALS,
448       kBeginningOfURL + str + kEndOfURL);
449 }
450
451 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
452     const std::string& regex) {
453   return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
454 }
455
456 URLMatcherCondition
457 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
458     const std::string& regex) {
459   return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
460 }
461
462 void URLMatcherConditionFactory::ForgetUnusedPatterns(
463       const std::set<StringPattern::ID>& used_patterns) {
464   PatternSingletons::iterator i = substring_pattern_singletons_.begin();
465   while (i != substring_pattern_singletons_.end()) {
466     if (ContainsKey(used_patterns, (*i)->id())) {
467       ++i;
468     } else {
469       delete *i;
470       substring_pattern_singletons_.erase(i++);
471     }
472   }
473   i = regex_pattern_singletons_.begin();
474   while (i != regex_pattern_singletons_.end()) {
475     if (ContainsKey(used_patterns, (*i)->id())) {
476       ++i;
477     } else {
478       delete *i;
479       regex_pattern_singletons_.erase(i++);
480     }
481   }
482   i = origin_and_path_regex_pattern_singletons_.begin();
483   while (i != origin_and_path_regex_pattern_singletons_.end()) {
484     if (ContainsKey(used_patterns, (*i)->id())) {
485       ++i;
486     } else {
487       delete *i;
488       origin_and_path_regex_pattern_singletons_.erase(i++);
489     }
490   }
491 }
492
493 bool URLMatcherConditionFactory::IsEmpty() const {
494   return substring_pattern_singletons_.empty() &&
495       regex_pattern_singletons_.empty() &&
496       origin_and_path_regex_pattern_singletons_.empty();
497 }
498
499 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
500     URLMatcherCondition::Criterion criterion,
501     const std::string& pattern) {
502   StringPattern search_pattern(pattern, 0);
503   PatternSingletons* pattern_singletons = NULL;
504   if (IsRegexCriterion(criterion))
505     pattern_singletons = &regex_pattern_singletons_;
506   else if (IsOriginAndPathRegexCriterion(criterion))
507     pattern_singletons = &origin_and_path_regex_pattern_singletons_;
508   else
509     pattern_singletons = &substring_pattern_singletons_;
510
511   PatternSingletons::const_iterator iter =
512       pattern_singletons->find(&search_pattern);
513
514   if (iter != pattern_singletons->end()) {
515     return URLMatcherCondition(criterion, *iter);
516   } else {
517     StringPattern* new_pattern =
518         new StringPattern(pattern, id_counter_++);
519     pattern_singletons->insert(new_pattern);
520     return URLMatcherCondition(criterion, new_pattern);
521   }
522 }
523
524 std::string URLMatcherConditionFactory::CanonicalizeHostname(
525     const std::string& hostname) const {
526   if (!hostname.empty() && hostname[0] == '.')
527     return hostname;
528   else
529     return "." + hostname;
530 }
531
532 // This function prepares the query string by replacing query separator with a
533 // magic value (|kQueryComponentDelimiter|). When the boolean
534 // |prepend_beginning_of_query_component| is true the function prepends the
535 // query with the same magic. This is done to locate the start of a key value
536 // pair in the query string. The parameter |query| is passed by value
537 // intentionally, since it is locally modified.
538 std::string URLMatcherConditionFactory::CanonicalizeQuery(
539     std::string query,
540     bool prepend_beginning_of_query_component,
541     bool append_end_of_query_component) const {
542   for (std::string::iterator it = query.begin(); it != query.end(); ++it) {
543     if (*it == kQuerySeparator)
544       *it = kQueryComponentDelimiter[0];
545   }
546   if (prepend_beginning_of_query_component)
547     query = kQueryComponentDelimiter + query;
548   if (append_end_of_query_component)
549     query += kQueryComponentDelimiter;
550   return query;
551 }
552
553 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
554     StringPattern* lhs,
555     StringPattern* rhs) const {
556   if (lhs == NULL && rhs != NULL) return true;
557   if (lhs != NULL && rhs != NULL)
558     return lhs->pattern() < rhs->pattern();
559   // Either both are NULL or only rhs is NULL.
560   return false;
561 }
562
563 //
564 // URLQueryElementMatcherCondition
565 //
566
567 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
568     const std::string& key,
569     const std::string& value,
570     QueryValueMatchType query_value_match_type,
571     QueryElementType query_element_type,
572     Type match_type,
573     URLMatcherConditionFactory* factory) {
574   match_type_ = match_type;
575
576   if (query_element_type == ELEMENT_TYPE_KEY_VALUE) {
577     key_ = kQueryComponentDelimiter + key + "=";
578     value_ = value;
579   } else {
580     key_ = kQueryComponentDelimiter + key;
581     value_ = std::string();
582   }
583
584   if (query_value_match_type == QUERY_VALUE_MATCH_EXACT)
585     value_ += kQueryComponentDelimiter;
586
587   // If |value_| is empty no need to find the |key_| and verify if the value
588   // matches. Simply checking the presence of key is sufficient, which is done
589   // by MATCH_ANY
590   if (value_.empty())
591     match_type_ = MATCH_ANY;
592
593   URLMatcherCondition condition;
594   // If |match_type_| is MATCH_ANY, then we could simply look for the
595   // combination of |key_| + |value_|, which can be efficiently done by
596   // SubstringMatcher
597   if (match_type_ == MATCH_ANY)
598     condition = factory->CreateQueryContainsCondition(key_ + value_);
599   else
600     condition = factory->CreateQueryContainsCondition(key_);
601   string_pattern_ = condition.string_pattern();
602
603   key_length_ = key_.length();
604   value_length_ = value_.length();
605 }
606
607 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {}
608
609 bool URLQueryElementMatcherCondition::operator<(
610     const URLQueryElementMatcherCondition& rhs) const {
611   if (match_type_ != rhs.match_type_)
612     return match_type_ < rhs.match_type_;
613   if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
614     return *string_pattern_ < *rhs.string_pattern_;
615   if (string_pattern_ == NULL && rhs.string_pattern_ != NULL)
616     return true;
617   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
618   // or both are NULL.
619   return false;
620 }
621
622 bool URLQueryElementMatcherCondition::IsMatch(
623     const std::string& url_for_component_searches) const {
624   switch (match_type_) {
625     case MATCH_ANY: {
626       // For MATCH_ANY, no additional verification step is needed. We can trust
627       // the SubstringMatcher to do the verification.
628       return true;
629     }
630     case MATCH_ALL: {
631       size_t start = 0;
632       int found = 0;
633       size_t offset;
634       while ((offset = url_for_component_searches.find(key_, start)) !=
635              std::string::npos) {
636         if (url_for_component_searches.compare(
637                 offset + key_length_, value_length_, value_) != 0) {
638           return false;
639         } else {
640           ++found;
641         }
642         start = offset + key_length_ + value_length_ - 1;
643       }
644       return !!found;
645     }
646     case MATCH_FIRST: {
647       size_t offset = url_for_component_searches.find(key_);
648       return url_for_component_searches.compare(
649                  offset + key_length_, value_length_, value_) == 0;
650     }
651     case MATCH_LAST: {
652       size_t offset = url_for_component_searches.rfind(key_);
653       return url_for_component_searches.compare(
654                  offset + key_length_, value_length_, value_) == 0;
655     }
656   }
657   NOTREACHED();
658   return false;
659 }
660
661 //
662 // URLMatcherSchemeFilter
663 //
664
665 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
666     : filters_(1) {
667   filters_.push_back(filter);
668 }
669
670 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
671     const std::vector<std::string>& filters)
672     : filters_(filters) {}
673
674 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
675
676 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
677   return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
678       filters_.end();
679 }
680
681 //
682 // URLMatcherPortFilter
683 //
684
685 URLMatcherPortFilter::URLMatcherPortFilter(
686     const std::vector<URLMatcherPortFilter::Range>& ranges)
687     : ranges_(ranges) {}
688
689 URLMatcherPortFilter::~URLMatcherPortFilter() {}
690
691 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
692   int port = url.EffectiveIntPort();
693   for (std::vector<Range>::const_iterator i = ranges_.begin();
694        i != ranges_.end(); ++i) {
695     if (i->first <= port && port <= i->second)
696       return true;
697   }
698   return false;
699 }
700
701 // static
702 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
703                                                               int to) {
704   return Range(from, to);
705 }
706
707 // static
708 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
709   return Range(port, port);
710 }
711
712 //
713 // URLMatcherConditionSet
714 //
715
716 URLMatcherConditionSet::~URLMatcherConditionSet() {}
717
718 URLMatcherConditionSet::URLMatcherConditionSet(
719     ID id,
720     const Conditions& conditions)
721     : id_(id),
722       conditions_(conditions) {}
723
724 URLMatcherConditionSet::URLMatcherConditionSet(
725     ID id,
726     const Conditions& conditions,
727     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
728     scoped_ptr<URLMatcherPortFilter> port_filter)
729     : id_(id),
730       conditions_(conditions),
731       scheme_filter_(scheme_filter.Pass()),
732       port_filter_(port_filter.Pass()) {}
733
734 URLMatcherConditionSet::URLMatcherConditionSet(
735     ID id,
736     const Conditions& conditions,
737     const QueryConditions& query_conditions,
738     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
739     scoped_ptr<URLMatcherPortFilter> port_filter)
740     : id_(id),
741       conditions_(conditions),
742       query_conditions_(query_conditions),
743       scheme_filter_(scheme_filter.Pass()),
744       port_filter_(port_filter.Pass()) {}
745
746 bool URLMatcherConditionSet::IsMatch(
747     const std::set<StringPattern::ID>& matching_patterns,
748     const GURL& url) const {
749   return IsMatch(matching_patterns, url, std::string());
750 }
751
752 bool URLMatcherConditionSet::IsMatch(
753     const std::set<StringPattern::ID>& matching_patterns,
754     const GURL& url,
755     const std::string& url_for_component_searches) const {
756   for (Conditions::const_iterator i = conditions_.begin();
757        i != conditions_.end(); ++i) {
758     if (!i->IsMatch(matching_patterns, url))
759       return false;
760   }
761   if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
762     return false;
763   if (port_filter_.get() && !port_filter_->IsMatch(url))
764     return false;
765   if (query_conditions_.empty())
766     return true;
767   // The loop is duplicated below for performance reasons. If not all query
768   // elements are found, no need to verify match that is expected to take more
769   // cycles.
770   for (QueryConditions::const_iterator i = query_conditions_.begin();
771        i != query_conditions_.end();
772        ++i) {
773     if (!ContainsKey(matching_patterns, i->string_pattern()->id()))
774       return false;
775   }
776   for (QueryConditions::const_iterator i = query_conditions_.begin();
777        i != query_conditions_.end();
778        ++i) {
779     if (!i->IsMatch(url_for_component_searches))
780       return false;
781   }
782   return true;
783 }
784
785 //
786 // URLMatcher
787 //
788
789 URLMatcher::URLMatcher() {}
790
791 URLMatcher::~URLMatcher() {}
792
793 void URLMatcher::AddConditionSets(
794     const URLMatcherConditionSet::Vector& condition_sets) {
795   for (URLMatcherConditionSet::Vector::const_iterator i =
796        condition_sets.begin(); i != condition_sets.end(); ++i) {
797     DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
798         url_matcher_condition_sets_.end());
799     url_matcher_condition_sets_[(*i)->id()] = *i;
800   }
801   UpdateInternalDatastructures();
802 }
803
804 void URLMatcher::RemoveConditionSets(
805     const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
806   for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
807        condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
808     DCHECK(url_matcher_condition_sets_.find(*i) !=
809         url_matcher_condition_sets_.end());
810     url_matcher_condition_sets_.erase(*i);
811   }
812   UpdateInternalDatastructures();
813 }
814
815 void URLMatcher::ClearUnusedConditionSets() {
816   UpdateConditionFactory();
817 }
818
819 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
820     const GURL& url) const {
821   // Find all IDs of StringPatterns that match |url|.
822   // See URLMatcherConditionFactory for the canonicalization of URLs and the
823   // distinction between full url searches and url component searches.
824   std::set<StringPattern::ID> matches;
825   std::string url_for_component_searches;
826
827   if (!full_url_matcher_.IsEmpty()) {
828     full_url_matcher_.Match(
829         condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
830   }
831   if (!url_component_matcher_.IsEmpty()) {
832     url_for_component_searches =
833         condition_factory_.CanonicalizeURLForComponentSearches(url);
834     url_component_matcher_.Match(url_for_component_searches, &matches);
835   }
836   if (!regex_set_matcher_.IsEmpty()) {
837     regex_set_matcher_.Match(
838         condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
839   }
840   if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
841     origin_and_path_regex_set_matcher_.Match(
842         condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
843         &matches);
844   }
845
846   // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
847   // were fulfilled.
848   std::set<URLMatcherConditionSet::ID> result;
849   for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
850        i != matches.end(); ++i) {
851     // For each URLMatcherConditionSet there is exactly one condition
852     // registered in substring_match_triggers_. This means that the following
853     // logic tests each URLMatcherConditionSet exactly once if it can be
854     // completely fulfilled.
855     StringPatternTriggers::const_iterator triggered_condition_sets_iter =
856         substring_match_triggers_.find(*i);
857     if (triggered_condition_sets_iter == substring_match_triggers_.end())
858       continue;  // Not all substring matches are triggers for a condition set.
859     const std::set<URLMatcherConditionSet::ID>& condition_sets =
860         triggered_condition_sets_iter->second;
861     for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
862          condition_sets.begin(); j != condition_sets.end(); ++j) {
863       URLMatcherConditionSets::const_iterator condition_set_iter =
864           url_matcher_condition_sets_.find(*j);
865       DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
866       if (condition_set_iter->second->IsMatch(
867               matches, url, url_for_component_searches))
868         result.insert(*j);
869     }
870   }
871
872   return result;
873 }
874
875 bool URLMatcher::IsEmpty() const {
876   return condition_factory_.IsEmpty() &&
877       url_matcher_condition_sets_.empty() &&
878       substring_match_triggers_.empty() &&
879       full_url_matcher_.IsEmpty() &&
880       url_component_matcher_.IsEmpty() &&
881       regex_set_matcher_.IsEmpty() &&
882       origin_and_path_regex_set_matcher_.IsEmpty() &&
883       registered_full_url_patterns_.empty() &&
884       registered_url_component_patterns_.empty();
885 }
886
887 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
888   // The purpose of |full_url_conditions| is just that we need to execute
889   // the same logic once for Full URL searches and once for URL Component
890   // searches (see URLMatcherConditionFactory).
891
892   // Determine which patterns need to be registered when this function
893   // terminates.
894   std::set<const StringPattern*> new_patterns;
895   for (URLMatcherConditionSets::const_iterator condition_set_iter =
896       url_matcher_condition_sets_.begin();
897       condition_set_iter != url_matcher_condition_sets_.end();
898       ++condition_set_iter) {
899     const URLMatcherConditionSet::Conditions& conditions =
900         condition_set_iter->second->conditions();
901     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
902          conditions.begin(); condition_iter != conditions.end();
903          ++condition_iter) {
904       // If we are called to process Full URL searches, ignore others, and
905       // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
906       if (!condition_iter->IsRegexCondition() &&
907           !condition_iter->IsOriginAndPathRegexCondition() &&
908           full_url_conditions == condition_iter->IsFullURLCondition())
909         new_patterns.insert(condition_iter->string_pattern());
910     }
911
912     if (full_url_conditions)
913       continue;
914
915     const URLMatcherConditionSet::QueryConditions& query_conditions =
916         condition_set_iter->second->query_conditions();
917     for (URLMatcherConditionSet::QueryConditions::const_iterator
918              query_condition_iter = query_conditions.begin();
919          query_condition_iter != query_conditions.end();
920          ++query_condition_iter) {
921       new_patterns.insert(query_condition_iter->string_pattern());
922     }
923   }
924
925   // This is the set of patterns that were registered before this function
926   // is called.
927   std::set<const StringPattern*>& registered_patterns =
928       full_url_conditions ? registered_full_url_patterns_
929                           : registered_url_component_patterns_;
930
931   // Add all patterns that are in new_patterns but not in registered_patterns.
932   std::vector<const StringPattern*> patterns_to_register =
933       base::STLSetDifference<std::vector<const StringPattern*> >(
934           new_patterns, registered_patterns);
935
936   // Remove all patterns that are in registered_patterns but not in
937   // new_patterns.
938   std::vector<const StringPattern*> patterns_to_unregister =
939       base::STLSetDifference<std::vector<const StringPattern*> >(
940            registered_patterns, new_patterns);
941
942   // Update the SubstringSetMatcher.
943   SubstringSetMatcher& url_matcher =
944       full_url_conditions ? full_url_matcher_ : url_component_matcher_;
945   url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
946                                             patterns_to_unregister);
947
948   // Update the set of registered_patterns for the next time this function
949   // is being called.
950   registered_patterns.swap(new_patterns);
951 }
952
953 void URLMatcher::UpdateRegexSetMatcher() {
954   std::vector<const StringPattern*> new_patterns;
955   std::vector<const StringPattern*> new_origin_and_path_patterns;
956
957   for (URLMatcherConditionSets::const_iterator condition_set_iter =
958       url_matcher_condition_sets_.begin();
959       condition_set_iter != url_matcher_condition_sets_.end();
960       ++condition_set_iter) {
961     const URLMatcherConditionSet::Conditions& conditions =
962         condition_set_iter->second->conditions();
963     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
964          conditions.begin(); condition_iter != conditions.end();
965          ++condition_iter) {
966       if (condition_iter->IsRegexCondition()) {
967         new_patterns.push_back(condition_iter->string_pattern());
968       } else if (condition_iter->IsOriginAndPathRegexCondition()) {
969         new_origin_and_path_patterns.push_back(
970             condition_iter->string_pattern());
971       }
972     }
973   }
974
975   // Start over from scratch. We can't really do better than this, since the
976   // FilteredRE2 backend doesn't support incremental updates.
977   regex_set_matcher_.ClearPatterns();
978   regex_set_matcher_.AddPatterns(new_patterns);
979   origin_and_path_regex_set_matcher_.ClearPatterns();
980   origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
981 }
982
983 void URLMatcher::UpdateTriggers() {
984   // Count substring pattern frequencies.
985   std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
986   for (URLMatcherConditionSets::const_iterator condition_set_iter =
987       url_matcher_condition_sets_.begin();
988       condition_set_iter != url_matcher_condition_sets_.end();
989       ++condition_set_iter) {
990     const URLMatcherConditionSet::Conditions& conditions =
991         condition_set_iter->second->conditions();
992     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
993          conditions.begin(); condition_iter != conditions.end();
994          ++condition_iter) {
995       const StringPattern* pattern = condition_iter->string_pattern();
996       substring_pattern_frequencies[pattern->id()]++;
997     }
998
999     const URLMatcherConditionSet::QueryConditions& query_conditions =
1000         condition_set_iter->second->query_conditions();
1001     for (URLMatcherConditionSet::QueryConditions::const_iterator
1002              query_condition_iter = query_conditions.begin();
1003          query_condition_iter != query_conditions.end();
1004          ++query_condition_iter) {
1005       const StringPattern* pattern = query_condition_iter->string_pattern();
1006       substring_pattern_frequencies[pattern->id()]++;
1007     }
1008   }
1009
1010   // Update trigger conditions: Determine for each URLMatcherConditionSet which
1011   // URLMatcherCondition contains a StringPattern that occurs least
1012   // frequently in this URLMatcher. We assume that this condition is very
1013   // specific and occurs rarely in URLs. If a match occurs for this
1014   // URLMatcherCondition, we want to test all other URLMatcherCondition in the
1015   // respective URLMatcherConditionSet as well to see whether the entire
1016   // URLMatcherConditionSet is considered matching.
1017   substring_match_triggers_.clear();
1018   for (URLMatcherConditionSets::const_iterator condition_set_iter =
1019       url_matcher_condition_sets_.begin();
1020       condition_set_iter != url_matcher_condition_sets_.end();
1021       ++condition_set_iter) {
1022     const URLMatcherConditionSet::Conditions& conditions =
1023         condition_set_iter->second->conditions();
1024     if (conditions.empty())
1025       continue;
1026     URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1027         conditions.begin();
1028     StringPattern::ID trigger = condition_iter->string_pattern()->id();
1029     // We skip the first element in the following loop.
1030     ++condition_iter;
1031     for (; condition_iter != conditions.end(); ++condition_iter) {
1032       StringPattern::ID current_id =
1033           condition_iter->string_pattern()->id();
1034       if (substring_pattern_frequencies[trigger] >
1035           substring_pattern_frequencies[current_id]) {
1036         trigger = current_id;
1037       }
1038     }
1039
1040     const URLMatcherConditionSet::QueryConditions& query_conditions =
1041         condition_set_iter->second->query_conditions();
1042     for (URLMatcherConditionSet::QueryConditions::const_iterator
1043              query_condition_iter = query_conditions.begin();
1044          query_condition_iter != query_conditions.end();
1045          ++query_condition_iter) {
1046       StringPattern::ID current_id =
1047           query_condition_iter->string_pattern()->id();
1048       if (substring_pattern_frequencies[trigger] >
1049           substring_pattern_frequencies[current_id]) {
1050         trigger = current_id;
1051       }
1052     }
1053
1054     substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
1055   }
1056 }
1057
1058 void URLMatcher::UpdateConditionFactory() {
1059   std::set<StringPattern::ID> used_patterns;
1060   for (URLMatcherConditionSets::const_iterator condition_set_iter =
1061       url_matcher_condition_sets_.begin();
1062       condition_set_iter != url_matcher_condition_sets_.end();
1063       ++condition_set_iter) {
1064     const URLMatcherConditionSet::Conditions& conditions =
1065         condition_set_iter->second->conditions();
1066     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1067          conditions.begin(); condition_iter != conditions.end();
1068          ++condition_iter) {
1069       used_patterns.insert(condition_iter->string_pattern()->id());
1070     }
1071     const URLMatcherConditionSet::QueryConditions& query_conditions =
1072         condition_set_iter->second->query_conditions();
1073     for (URLMatcherConditionSet::QueryConditions::const_iterator
1074              query_condition_iter = query_conditions.begin();
1075          query_condition_iter != query_conditions.end();
1076          ++query_condition_iter) {
1077       used_patterns.insert(query_condition_iter->string_pattern()->id());
1078     }
1079   }
1080   condition_factory_.ForgetUnusedPatterns(used_patterns);
1081 }
1082
1083 void URLMatcher::UpdateInternalDatastructures() {
1084   UpdateSubstringSetMatcher(false);
1085   UpdateSubstringSetMatcher(true);
1086   UpdateRegexSetMatcher();
1087   UpdateTriggers();
1088   UpdateConditionFactory();
1089 }
1090
1091 }  // namespace url_matcher