- add sources.
[platform/framework/web/crosswalk.git] / src / extensions / common / matcher / url_matcher.cc
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "extensions/common/matcher/url_matcher.h"
6
7 #include <algorithm>
8 #include <iterator>
9
10 #include "base/logging.h"
11 #include "content/public/common/url_constants.h"
12 #include "url/gurl.h"
13 #include "url/url_canon.h"
14
15 namespace extensions {
16
17 // This set of classes implement a mapping of URL Component Patterns, such as
18 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
19 // for use in substring comparisons.
20 //
21 // The idea of this mapping is to reduce the problem of comparing many
22 // URL Component Patterns against one URL to the problem of searching many
23 // substrings in one string:
24 //
25 // ----------------------                    -----------------
26 // | URL Query operator | ----translate----> | StringPattern |
27 // ----------------------                    -----------------
28 //                                                   ^
29 //                                                   |
30 //                                                compare
31 //                                                   |
32 //                                                   v
33 // ----------------------                    -----------------
34 // | URL to compare     |                    |               |
35 // | to all URL Query   | ----translate----> | String        |
36 // | operators          |                    |               |
37 // ----------------------                    -----------------
38 //
39 // The reason for this problem reduction is that there are efficient algorithms
40 // for searching many substrings in one string (see Aho-Corasick algorithm).
41 //
42 // Additionally, some of the same pieces are reused to implement regular
43 // expression comparisons. The FilteredRE2 implementation for matching many
44 // regular expressions against one string uses prefiltering, in which a set
45 // of substrings (derived from the regexes) are first searched for, to reduce
46 // the number of regular expressions to test; the prefiltering step also
47 // uses Aho-Corasick.
48 //
49 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
50 // ==========================================================
51 //
52 // For searches in this class, we normalize URLs as follows:
53 //
54 // Step 1:
55 // Remove scheme, port and segment from URL:
56 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
57 //    www.example.com/index.html?search=foo
58 //
59 // We remove the scheme and port number because they can be checked later
60 // in a secondary filter step. We remove the segment (the #... part) because
61 // this is not guaranteed to be ASCII-7 encoded.
62 //
63 // Step 2:
64 // Translate URL to String and add the following position markers:
65 // - BU = Beginning of URL
66 // - ED = End of Domain
67 // - EP = End of Path
68 // - EU = End of URL
69 // Furthermore, the hostname is canonicalized to start with a ".".
70 //
71 // Position markers are represented as characters >127, which are therefore
72 // guaranteed not to be part of the ASCII-7 encoded URL character set.
73 //
74 // -> www.example.com/index.html?search=foo becomes
75 // BU .www.example.com ED /index.html EP ?search=foo EU
76 //
77 // -> www.example.com/index.html becomes
78 // BU .www.example.com ED /index.html EP EU
79 //
80 // Step 3:
81 // Translate URL Component Patterns as follows:
82 //
83 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
84 // -> host_prefix("www.example") = BU .www.example
85 //
86 // host_suffix(suffix) = suffix ED
87 // -> host_suffix("example.com") = example.com ED
88 // -> host_suffix(".example.com") = .example.com ED
89 //
90 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
91 // -> host_equals("www.example.com") = BU .www.example.com ED
92 //
93 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
94 //
95 // With this, we can search the StringPatterns in the normalized URL.
96 //
97 //
98 // Case 2: url_{prefix,suffix,equals,contains} searches.
99 // =====================================================
100 //
101 // Step 1: as above, except that
102 // - the scheme is not removed
103 // - the port is not removed if it is specified and does not match the default
104 //   port for the given scheme.
105 //
106 // Step 2:
107 // Translate URL to String and add the following position markers:
108 // - BU = Beginning of URL
109 // - EU = End of URL
110 //
111 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
112 // BU http://www.example.com:8080/index.html?search=foo EU
113 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
114 // BU http://www.example.com/index.html?search=foo EU
115 //
116 // url_prefix(prefix) = BU prefix
117 // -> url_prefix("http://www.example") = BU http://www.example
118 //
119 // url_contains(substring) = substring
120 // -> url_contains("index") = index
121 //
122 //
123 // Case 3: {host,path,query}_contains searches.
124 // ============================================
125 //
126 // These kinds of searches are not supported directly but can be derived
127 // by a combination of a url_contains() query followed by an explicit test:
128 //
129 // host_contains(str) = url_contains(str) followed by test whether str occurs
130 //   in host component of original URL.
131 // -> host_contains("example.co") = example.co
132 //    followed by gurl.host().find("example.co");
133 //
134 // [similarly for path_contains and query_contains].
135 //
136 //
137 // Regular expression matching (url_matches searches)
138 // ==================================================
139 //
140 // This class also supports matching regular expressions (RE2 syntax)
141 // against full URLs, which are transformed as in case 2.
142
143 namespace {
144
145 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
146   return criterion == URLMatcherCondition::URL_MATCHES;
147 }
148
149 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
150   return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
151 }
152
153 }  // namespace
154
155 //
156 // URLMatcherCondition
157 //
158
159 URLMatcherCondition::URLMatcherCondition()
160     : criterion_(HOST_PREFIX),
161       string_pattern_(NULL) {}
162
163 URLMatcherCondition::~URLMatcherCondition() {}
164
165 URLMatcherCondition::URLMatcherCondition(
166     Criterion criterion,
167     const StringPattern* string_pattern)
168     : criterion_(criterion),
169       string_pattern_(string_pattern) {}
170
171 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
172     : criterion_(rhs.criterion_),
173       string_pattern_(rhs.string_pattern_) {}
174
175 URLMatcherCondition& URLMatcherCondition::operator=(
176     const URLMatcherCondition& rhs) {
177   criterion_ = rhs.criterion_;
178   string_pattern_ = rhs.string_pattern_;
179   return *this;
180 }
181
182 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
183   if (criterion_ < rhs.criterion_) return true;
184   if (criterion_ > rhs.criterion_) return false;
185   if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
186     return *string_pattern_ < *rhs.string_pattern_;
187   if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
188   // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
189   // or both are NULL.
190   return false;
191 }
192
193 bool URLMatcherCondition::IsFullURLCondition() const {
194   // For these criteria the SubstringMatcher needs to be executed on the
195   // GURL that is canonicalized with
196   // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
197   switch (criterion_) {
198     case HOST_CONTAINS:
199     case PATH_CONTAINS:
200     case QUERY_CONTAINS:
201     case URL_PREFIX:
202     case URL_SUFFIX:
203     case URL_CONTAINS:
204     case URL_EQUALS:
205       return true;
206     default:
207       break;
208   }
209   return false;
210 }
211
212 bool URLMatcherCondition::IsRegexCondition() const {
213   return IsRegexCriterion(criterion_);
214 }
215
216 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
217   return IsOriginAndPathRegexCriterion(criterion_);
218 }
219
220 bool URLMatcherCondition::IsMatch(
221     const std::set<StringPattern::ID>& matching_patterns,
222     const GURL& url) const {
223   DCHECK(string_pattern_);
224   if (!ContainsKey(matching_patterns, string_pattern_->id()))
225     return false;
226   // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
227   // a substring match on the raw URL. In case of a match, we need to verify
228   // that the match was found in the correct component of the URL.
229   switch (criterion_) {
230     case HOST_CONTAINS:
231       return url.host().find(string_pattern_->pattern()) !=
232           std::string::npos;
233     case PATH_CONTAINS:
234       return url.path().find(string_pattern_->pattern()) !=
235           std::string::npos;
236     case QUERY_CONTAINS:
237       return url.query().find(string_pattern_->pattern()) !=
238           std::string::npos;
239     default:
240       break;
241   }
242   return true;
243 }
244
245 //
246 // URLMatcherConditionFactory
247 //
248
249 namespace {
250 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
251 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
252 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
253 const char kEndOfPath[] = {static_cast<char>(-3), 0};
254 const char kEndOfURL[] = {static_cast<char>(-4), 0};
255 }  // namespace
256
257 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
258
259 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
260   STLDeleteElements(&substring_pattern_singletons_);
261   STLDeleteElements(&regex_pattern_singletons_);
262   STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
263 }
264
265 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
266     const GURL& url) const {
267   return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
268          url.path() + kEndOfPath +
269          (url.has_query() ? "?" + url.query() : std::string()) + kEndOfURL;
270 }
271
272 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
273     const std::string& prefix) {
274   return CreateCondition(URLMatcherCondition::HOST_PREFIX,
275       kBeginningOfURL + CanonicalizeHostname(prefix));
276 }
277
278 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
279     const std::string& suffix) {
280   return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
281       suffix + kEndOfDomain);
282 }
283
284 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
285     const std::string& str) {
286   return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
287 }
288
289 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
290     const std::string& str) {
291   return CreateCondition(URLMatcherCondition::HOST_EQUALS,
292       kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
293 }
294
295 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
296     const std::string& prefix) {
297   return CreateCondition(URLMatcherCondition::PATH_PREFIX,
298       kEndOfDomain + prefix);
299 }
300
301 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
302     const std::string& suffix) {
303   return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
304 }
305
306 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
307     const std::string& str) {
308   return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
309 }
310
311 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
312     const std::string& str) {
313   return CreateCondition(URLMatcherCondition::PATH_EQUALS,
314       kEndOfDomain + str + kEndOfPath);
315 }
316
317 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
318     const std::string& prefix) {
319   std::string pattern;
320   if (!prefix.empty() && prefix[0] == '?')
321     pattern = kEndOfPath + prefix;
322   else
323     pattern = kEndOfPath + ('?' + prefix);
324
325   return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
326 }
327
328 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
329     const std::string& suffix) {
330   if (!suffix.empty() && suffix[0] == '?') {
331     return CreateQueryEqualsCondition(suffix);
332   } else {
333     return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
334                            suffix + kEndOfURL);
335   }
336 }
337
338 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
339     const std::string& str) {
340   if (!str.empty() && str[0] == '?')
341     return CreateQueryPrefixCondition(str);
342   else
343     return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
344 }
345
346 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
347     const std::string& str) {
348   std::string pattern;
349   if (!str.empty() && str[0] == '?')
350     pattern = kEndOfPath + str + kEndOfURL;
351   else
352     pattern = kEndOfPath + ('?' + str) + kEndOfURL;
353
354   return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
355 }
356
357 URLMatcherCondition
358     URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
359     const std::string& host_suffix,
360     const std::string& path_prefix) {
361   return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
362       host_suffix + kEndOfDomain + path_prefix);
363 }
364
365 URLMatcherCondition
366 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
367     const std::string& host,
368     const std::string& path_prefix) {
369   return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
370       kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
371       path_prefix);
372 }
373
374 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
375     const GURL& url) const {
376   GURL::Replacements replacements;
377   replacements.ClearPassword();
378   replacements.ClearUsername();
379   replacements.ClearRef();
380   // Clear port if it is implicit from scheme.
381   if (url.has_port()) {
382     const std::string& port = url.scheme();
383     if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
384             url.EffectiveIntPort()) {
385       replacements.ClearPort();
386     }
387   }
388   return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
389       kEndOfURL;
390 }
391
392 static std::string CanonicalizeURLForRegexSearchesHelper(
393     const GURL& url,
394     bool clear_query) {
395   GURL::Replacements replacements;
396   replacements.ClearPassword();
397   replacements.ClearUsername();
398   replacements.ClearRef();
399   if (clear_query)
400     replacements.ClearQuery();
401   // Clear port if it is implicit from scheme.
402   if (url.has_port()) {
403     const std::string& port = url.scheme();
404     if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
405             url.EffectiveIntPort()) {
406       replacements.ClearPort();
407     }
408   }
409   return url.ReplaceComponents(replacements).spec();
410 }
411
412 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
413     const GURL& url) const {
414   return CanonicalizeURLForRegexSearchesHelper(url, false);
415 }
416
417 std::string
418 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
419     const GURL& url) const {
420   return CanonicalizeURLForRegexSearchesHelper(url, true);
421 }
422
423 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
424     const std::string& prefix) {
425   return CreateCondition(URLMatcherCondition::URL_PREFIX,
426       kBeginningOfURL + prefix);
427 }
428
429 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
430     const std::string& suffix) {
431   return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
432 }
433
434 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
435     const std::string& str) {
436   return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
437 }
438
439 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
440     const std::string& str) {
441   return CreateCondition(URLMatcherCondition::URL_EQUALS,
442       kBeginningOfURL + str + kEndOfURL);
443 }
444
445 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
446     const std::string& regex) {
447   return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
448 }
449
450 URLMatcherCondition
451 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
452     const std::string& regex) {
453   return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
454 }
455
456 void URLMatcherConditionFactory::ForgetUnusedPatterns(
457       const std::set<StringPattern::ID>& used_patterns) {
458   PatternSingletons::iterator i = substring_pattern_singletons_.begin();
459   while (i != substring_pattern_singletons_.end()) {
460     if (ContainsKey(used_patterns, (*i)->id())) {
461       ++i;
462     } else {
463       delete *i;
464       substring_pattern_singletons_.erase(i++);
465     }
466   }
467   i = regex_pattern_singletons_.begin();
468   while (i != regex_pattern_singletons_.end()) {
469     if (ContainsKey(used_patterns, (*i)->id())) {
470       ++i;
471     } else {
472       delete *i;
473       regex_pattern_singletons_.erase(i++);
474     }
475   }
476   i = origin_and_path_regex_pattern_singletons_.begin();
477   while (i != origin_and_path_regex_pattern_singletons_.end()) {
478     if (ContainsKey(used_patterns, (*i)->id())) {
479       ++i;
480     } else {
481       delete *i;
482       origin_and_path_regex_pattern_singletons_.erase(i++);
483     }
484   }
485 }
486
487 bool URLMatcherConditionFactory::IsEmpty() const {
488   return substring_pattern_singletons_.empty() &&
489       regex_pattern_singletons_.empty() &&
490       origin_and_path_regex_pattern_singletons_.empty();
491 }
492
493 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
494     URLMatcherCondition::Criterion criterion,
495     const std::string& pattern) {
496   StringPattern search_pattern(pattern, 0);
497   PatternSingletons* pattern_singletons = NULL;
498   if (IsRegexCriterion(criterion))
499     pattern_singletons = &regex_pattern_singletons_;
500   else if (IsOriginAndPathRegexCriterion(criterion))
501     pattern_singletons = &origin_and_path_regex_pattern_singletons_;
502   else
503     pattern_singletons = &substring_pattern_singletons_;
504
505   PatternSingletons::const_iterator iter =
506       pattern_singletons->find(&search_pattern);
507
508   if (iter != pattern_singletons->end()) {
509     return URLMatcherCondition(criterion, *iter);
510   } else {
511     StringPattern* new_pattern =
512         new StringPattern(pattern, id_counter_++);
513     pattern_singletons->insert(new_pattern);
514     return URLMatcherCondition(criterion, new_pattern);
515   }
516 }
517
518 std::string URLMatcherConditionFactory::CanonicalizeHostname(
519     const std::string& hostname) const {
520   if (!hostname.empty() && hostname[0] == '.')
521     return hostname;
522   else
523     return "." + hostname;
524 }
525
526 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
527     StringPattern* lhs,
528     StringPattern* rhs) const {
529   if (lhs == NULL && rhs != NULL) return true;
530   if (lhs != NULL && rhs != NULL)
531     return lhs->pattern() < rhs->pattern();
532   // Either both are NULL or only rhs is NULL.
533   return false;
534 }
535
536 //
537 // URLMatcherSchemeFilter
538 //
539
540 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
541     : filters_(1) {
542   filters_.push_back(filter);
543 }
544
545 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
546     const std::vector<std::string>& filters)
547     : filters_(filters) {}
548
549 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
550
551 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
552   return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
553       filters_.end();
554 }
555
556 //
557 // URLMatcherPortFilter
558 //
559
560 URLMatcherPortFilter::URLMatcherPortFilter(
561     const std::vector<URLMatcherPortFilter::Range>& ranges)
562     : ranges_(ranges) {}
563
564 URLMatcherPortFilter::~URLMatcherPortFilter() {}
565
566 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
567   int port = url.EffectiveIntPort();
568   for (std::vector<Range>::const_iterator i = ranges_.begin();
569        i != ranges_.end(); ++i) {
570     if (i->first <= port && port <= i->second)
571       return true;
572   }
573   return false;
574 }
575
576 // static
577 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
578                                                               int to) {
579   return Range(from, to);
580 }
581
582 // static
583 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
584   return Range(port, port);
585 }
586
587 //
588 // URLMatcherConditionSet
589 //
590
591 URLMatcherConditionSet::~URLMatcherConditionSet() {}
592
593 URLMatcherConditionSet::URLMatcherConditionSet(
594     ID id,
595     const Conditions& conditions)
596     : id_(id),
597       conditions_(conditions) {}
598
599 URLMatcherConditionSet::URLMatcherConditionSet(
600     ID id,
601     const Conditions& conditions,
602     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
603     scoped_ptr<URLMatcherPortFilter> port_filter)
604     : id_(id),
605       conditions_(conditions),
606       scheme_filter_(scheme_filter.Pass()),
607       port_filter_(port_filter.Pass()) {}
608
609 bool URLMatcherConditionSet::IsMatch(
610     const std::set<StringPattern::ID>& matching_patterns,
611     const GURL& url) const {
612   for (Conditions::const_iterator i = conditions_.begin();
613        i != conditions_.end(); ++i) {
614     if (!i->IsMatch(matching_patterns, url))
615       return false;
616   }
617   if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
618     return false;
619   if (port_filter_.get() && !port_filter_->IsMatch(url))
620     return false;
621   return true;
622 }
623
624 //
625 // URLMatcher
626 //
627
628 URLMatcher::URLMatcher() {}
629
630 URLMatcher::~URLMatcher() {}
631
632 void URLMatcher::AddConditionSets(
633     const URLMatcherConditionSet::Vector& condition_sets) {
634   for (URLMatcherConditionSet::Vector::const_iterator i =
635        condition_sets.begin(); i != condition_sets.end(); ++i) {
636     DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
637         url_matcher_condition_sets_.end());
638     url_matcher_condition_sets_[(*i)->id()] = *i;
639   }
640   UpdateInternalDatastructures();
641 }
642
643 void URLMatcher::RemoveConditionSets(
644     const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
645   for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
646        condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
647     DCHECK(url_matcher_condition_sets_.find(*i) !=
648         url_matcher_condition_sets_.end());
649     url_matcher_condition_sets_.erase(*i);
650   }
651   UpdateInternalDatastructures();
652 }
653
654 void URLMatcher::ClearUnusedConditionSets() {
655   UpdateConditionFactory();
656 }
657
658 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
659     const GURL& url) const {
660   // Find all IDs of StringPatterns that match |url|.
661   // See URLMatcherConditionFactory for the canonicalization of URLs and the
662   // distinction between full url searches and url component searches.
663   std::set<StringPattern::ID> matches;
664   if (!full_url_matcher_.IsEmpty()) {
665     full_url_matcher_.Match(
666         condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
667   }
668   if (!url_component_matcher_.IsEmpty()) {
669     url_component_matcher_.Match(
670         condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
671   }
672   if (!regex_set_matcher_.IsEmpty()) {
673     regex_set_matcher_.Match(
674         condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
675   }
676   if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
677     origin_and_path_regex_set_matcher_.Match(
678         condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
679         &matches);
680   }
681
682   // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
683   // were fulfilled.
684   std::set<URLMatcherConditionSet::ID> result;
685   for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
686        i != matches.end(); ++i) {
687     // For each URLMatcherConditionSet there is exactly one condition
688     // registered in substring_match_triggers_. This means that the following
689     // logic tests each URLMatcherConditionSet exactly once if it can be
690     // completely fulfilled.
691     StringPatternTriggers::const_iterator triggered_condition_sets_iter =
692         substring_match_triggers_.find(*i);
693     if (triggered_condition_sets_iter == substring_match_triggers_.end())
694       continue;  // Not all substring matches are triggers for a condition set.
695     const std::set<URLMatcherConditionSet::ID>& condition_sets =
696         triggered_condition_sets_iter->second;
697     for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
698          condition_sets.begin(); j != condition_sets.end(); ++j) {
699       URLMatcherConditionSets::const_iterator condition_set_iter =
700           url_matcher_condition_sets_.find(*j);
701       DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
702       if (condition_set_iter->second->IsMatch(matches, url))
703         result.insert(*j);
704     }
705   }
706
707   return result;
708 }
709
710 bool URLMatcher::IsEmpty() const {
711   return condition_factory_.IsEmpty() &&
712       url_matcher_condition_sets_.empty() &&
713       substring_match_triggers_.empty() &&
714       full_url_matcher_.IsEmpty() &&
715       url_component_matcher_.IsEmpty() &&
716       regex_set_matcher_.IsEmpty() &&
717       origin_and_path_regex_set_matcher_.IsEmpty() &&
718       registered_full_url_patterns_.empty() &&
719       registered_url_component_patterns_.empty();
720 }
721
722 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
723   // The purpose of |full_url_conditions| is just that we need to execute
724   // the same logic once for Full URL searches and once for URL Component
725   // searches (see URLMatcherConditionFactory).
726
727   // Determine which patterns need to be registered when this function
728   // terminates.
729   std::set<const StringPattern*> new_patterns;
730   for (URLMatcherConditionSets::const_iterator condition_set_iter =
731       url_matcher_condition_sets_.begin();
732       condition_set_iter != url_matcher_condition_sets_.end();
733       ++condition_set_iter) {
734     const URLMatcherConditionSet::Conditions& conditions =
735         condition_set_iter->second->conditions();
736     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
737          conditions.begin(); condition_iter != conditions.end();
738          ++condition_iter) {
739       // If we are called to process Full URL searches, ignore others, and
740       // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
741       if (!condition_iter->IsRegexCondition() &&
742           !condition_iter->IsOriginAndPathRegexCondition() &&
743           full_url_conditions == condition_iter->IsFullURLCondition())
744         new_patterns.insert(condition_iter->string_pattern());
745     }
746   }
747
748   // This is the set of patterns that were registered before this function
749   // is called.
750   std::set<const StringPattern*>& registered_patterns =
751       full_url_conditions ? registered_full_url_patterns_
752                           : registered_url_component_patterns_;
753
754   // Add all patterns that are in new_patterns but not in registered_patterns.
755   std::vector<const StringPattern*> patterns_to_register =
756       base::STLSetDifference<std::vector<const StringPattern*> >(
757           new_patterns, registered_patterns);
758
759   // Remove all patterns that are in registered_patterns but not in
760   // new_patterns.
761   std::vector<const StringPattern*> patterns_to_unregister =
762       base::STLSetDifference<std::vector<const StringPattern*> >(
763            registered_patterns, new_patterns);
764
765   // Update the SubstringSetMatcher.
766   SubstringSetMatcher& url_matcher =
767       full_url_conditions ? full_url_matcher_ : url_component_matcher_;
768   url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
769                                             patterns_to_unregister);
770
771   // Update the set of registered_patterns for the next time this function
772   // is being called.
773   registered_patterns.swap(new_patterns);
774 }
775
776 void URLMatcher::UpdateRegexSetMatcher() {
777   std::vector<const StringPattern*> new_patterns;
778   std::vector<const StringPattern*> new_origin_and_path_patterns;
779
780   for (URLMatcherConditionSets::const_iterator condition_set_iter =
781       url_matcher_condition_sets_.begin();
782       condition_set_iter != url_matcher_condition_sets_.end();
783       ++condition_set_iter) {
784     const URLMatcherConditionSet::Conditions& conditions =
785         condition_set_iter->second->conditions();
786     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
787          conditions.begin(); condition_iter != conditions.end();
788          ++condition_iter) {
789       if (condition_iter->IsRegexCondition()) {
790         new_patterns.push_back(condition_iter->string_pattern());
791       } else if (condition_iter->IsOriginAndPathRegexCondition()) {
792         new_origin_and_path_patterns.push_back(
793             condition_iter->string_pattern());
794       }
795     }
796   }
797
798   // Start over from scratch. We can't really do better than this, since the
799   // FilteredRE2 backend doesn't support incremental updates.
800   regex_set_matcher_.ClearPatterns();
801   regex_set_matcher_.AddPatterns(new_patterns);
802   origin_and_path_regex_set_matcher_.ClearPatterns();
803   origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
804 }
805
806 void URLMatcher::UpdateTriggers() {
807   // Count substring pattern frequencies.
808   std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
809   for (URLMatcherConditionSets::const_iterator condition_set_iter =
810       url_matcher_condition_sets_.begin();
811       condition_set_iter != url_matcher_condition_sets_.end();
812       ++condition_set_iter) {
813     const URLMatcherConditionSet::Conditions& conditions =
814         condition_set_iter->second->conditions();
815     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
816          conditions.begin(); condition_iter != conditions.end();
817          ++condition_iter) {
818       const StringPattern* pattern = condition_iter->string_pattern();
819       substring_pattern_frequencies[pattern->id()]++;
820     }
821   }
822
823   // Update trigger conditions: Determine for each URLMatcherConditionSet which
824   // URLMatcherCondition contains a StringPattern that occurs least
825   // frequently in this URLMatcher. We assume that this condition is very
826   // specific and occurs rarely in URLs. If a match occurs for this
827   // URLMatcherCondition, we want to test all other URLMatcherCondition in the
828   // respective URLMatcherConditionSet as well to see whether the entire
829   // URLMatcherConditionSet is considered matching.
830   substring_match_triggers_.clear();
831   for (URLMatcherConditionSets::const_iterator condition_set_iter =
832       url_matcher_condition_sets_.begin();
833       condition_set_iter != url_matcher_condition_sets_.end();
834       ++condition_set_iter) {
835     const URLMatcherConditionSet::Conditions& conditions =
836         condition_set_iter->second->conditions();
837     if (conditions.empty())
838       continue;
839     URLMatcherConditionSet::Conditions::const_iterator condition_iter =
840         conditions.begin();
841     StringPattern::ID trigger = condition_iter->string_pattern()->id();
842     // We skip the first element in the following loop.
843     ++condition_iter;
844     for (; condition_iter != conditions.end(); ++condition_iter) {
845       StringPattern::ID current_id =
846           condition_iter->string_pattern()->id();
847       if (substring_pattern_frequencies[trigger] >
848           substring_pattern_frequencies[current_id]) {
849         trigger = current_id;
850       }
851     }
852     substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
853   }
854 }
855
856 void URLMatcher::UpdateConditionFactory() {
857   std::set<StringPattern::ID> used_patterns;
858   for (URLMatcherConditionSets::const_iterator condition_set_iter =
859       url_matcher_condition_sets_.begin();
860       condition_set_iter != url_matcher_condition_sets_.end();
861       ++condition_set_iter) {
862     const URLMatcherConditionSet::Conditions& conditions =
863         condition_set_iter->second->conditions();
864     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
865          conditions.begin(); condition_iter != conditions.end();
866          ++condition_iter) {
867       used_patterns.insert(condition_iter->string_pattern()->id());
868     }
869   }
870   condition_factory_.ForgetUnusedPatterns(used_patterns);
871 }
872
873 void URLMatcher::UpdateInternalDatastructures() {
874   UpdateSubstringSetMatcher(false);
875   UpdateSubstringSetMatcher(true);
876   UpdateRegexSetMatcher();
877   UpdateTriggers();
878   UpdateConditionFactory();
879 }
880
881 }  // namespace extensions