1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/managed_mode/managed_mode_url_filter.h"
7 #include "base/containers/hash_tables.h"
8 #include "base/files/file_path.h"
9 #include "base/json/json_file_value_serializer.h"
10 #include "base/metrics/histogram.h"
11 #include "base/sha1.h"
12 #include "base/strings/string_number_conversions.h"
13 #include "base/strings/string_util.h"
14 #include "base/task_runner_util.h"
15 #include "base/threading/sequenced_worker_pool.h"
16 #include "chrome/common/net/url_fixer_upper.h"
17 #include "components/policy/core/browser/url_blacklist_manager.h"
18 #include "components/url_matcher/url_matcher.h"
19 #include "content/public/browser/browser_thread.h"
20 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
23 using content::BrowserThread;
24 using net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES;
25 using net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES;
26 using net::registry_controlled_domains::GetRegistryLength;
27 using policy::URLBlacklist;
28 using url_matcher::URLMatcher;
29 using url_matcher::URLMatcherConditionSet;
// Holds everything the filter consults when matching a URL against site lists.
// NOTE(review): the closing "};" of this struct is not visible in this view.
31 struct ManagedModeURLFilter::Contents {
// Matcher that the collected URL-pattern condition sets are added to.
32 URLMatcher url_matcher;
// Maps the ID of a condition set to the site ID it was registered with.
33 std::map<URLMatcherConditionSet::ID, int> matcher_site_map;
// Maps a hostname hash to site IDs; a multimap, so one hash may map to
// several sites.
34 base::hash_multimap<std::string, int> hash_site_map;
// Sites copied from the site lists. GetSites() indexes this vector with the
// values stored in the two maps above.
35 std::vector<ManagedModeSiteList::Site> sites;
// Schemes that ManagedModeURLFilter::HasStandardScheme() compares a URL's
// scheme against.
// NOTE(review): the array entries and closing "};" are not visible in this
// view.
40 const char* kStandardSchemes[] = {
51 // This class encapsulates all the state that is required during construction of
52 // a new ManagedModeURLFilter::Contents.
// NOTE(review): the class header and the constructor/destructor declarations
// are not visible in this view.
58 // Adds a single URL pattern for the site identified by |site_id|.
// NOTE(review): the meaning of the bool result is not visible here; presumably
// it reports whether |pattern| could be parsed - confirm.
59 bool AddPattern(const std::string& pattern, int site_id);
61 // Adds a single hostname SHA1 hash for the site identified by |site_id|.
62 void AddHostnameHash(const std::string& hash, int site_id);
64 // Adds all the sites in |site_list|, with URL patterns and hostname hashes.
65 void AddSiteList(ManagedModeSiteList* site_list);
67 // Finalizes construction of the ManagedModeURLFilter::Contents and returns
68 // them. This method should be called before this object is destroyed.
69 scoped_ptr<ManagedModeURLFilter::Contents> Build();
// The Contents under construction; handed to the caller by Build().
72 scoped_ptr<ManagedModeURLFilter::Contents> contents_;
// Condition sets collected by AddPattern(); added to the matcher in Build().
73 URLMatcherConditionSet::Vector all_conditions_;
// ID given to the most recently created condition set; AddPattern()
// pre-increments it for every new condition set.
74 URLMatcherConditionSet::ID matcher_id_;
// Creates a builder that owns a newly allocated Contents.
// NOTE(review): the rest of the initializer list and the constructor body are
// not visible in this view.
77 FilterBuilder::FilterBuilder()
78 : contents_(new ManagedModeURLFilter::Contents()),
// Checks that the builder no longer holds its Contents, i.e. that Build() has
// already passed them on to a caller.
81 FilterBuilder::~FilterBuilder() {
82 DCHECK(!contents_.get());
// Splits |pattern| into URL components and creates a matcher condition set for
// it, remembering that the new condition set belongs to |site_id|.
// Expected to run on the blocking pool (see the DCHECK).
// NOTE(review): the declarations of |scheme|, |host|, |port| and |path| and
// both return statements are not visible in this view.
85 bool FilterBuilder::AddPattern(const std::string& pattern, int site_id) {
86 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
// Initial value only; FilterToComponents() receives a pointer to it and may
// change it.
91 bool match_subdomains = true;
92 URLBlacklist::SegmentURLCallback callback =
93 static_cast<URLBlacklist::SegmentURLCallback>(URLFixerUpper::SegmentURL);
// A pattern that cannot be split into components is logged as invalid.
94 if (!URLBlacklist::FilterToComponents(
95 callback, pattern, &scheme, &host, &match_subdomains, &port, &path)) {
96 LOG(ERROR) << "Invalid pattern " << pattern;
// Each condition set gets a fresh ID by pre-incrementing |matcher_id_|.
100 scoped_refptr<URLMatcherConditionSet> condition_set =
101 URLBlacklist::CreateConditionSet(
102 &contents_->url_matcher, ++matcher_id_,
103 scheme, host, match_subdomains, port, path);
// The condition set is only collected here; Build() adds the whole batch to
// the matcher.
104 all_conditions_.push_back(condition_set);
// Remember which site the new condition-set ID belongs to.
105 contents_->matcher_site_map[matcher_id_] = site_id;
// Inserts |hash|, converted to upper-case ASCII, as a key into the
// hash-to-site multimap.
// NOTE(review): the second element of the pair is cut off in this view;
// presumably it is |site_id| - confirm. The upper-casing presumably matches the
// casing produced by GetHostnameHash() - confirm.
109 void FilterBuilder::AddHostnameHash(const std::string& hash, int site_id) {
110 contents_->hash_site_map.insert(std::make_pair(StringToUpperASCII(hash),
// Copies every site of |site_list| into contents_->sites and registers each
// site's URL patterns and hostname hashes.
// NOTE(review): the lines that close the loops are not visible in this view,
// so it cannot be determined from here where (or whether) |site_id| is
// advanced between sites.
114 void FilterBuilder::AddSiteList(ManagedModeSiteList* site_list) {
115 std::vector<ManagedModeSiteList::Site> sites;
116 site_list->GetSites(&sites);
// Starts at the current size of contents_->sites, i.e. the index the first
// site appended below will get.
117 int site_id = contents_->sites.size();
118 for (std::vector<ManagedModeSiteList::Site>::const_iterator it =
119 sites.begin(); it != sites.end(); ++it) {
120 const ManagedModeSiteList::Site& site = *it;
121 contents_->sites.push_back(site);
// Register all URL patterns of this site. The bool result of AddPattern() is
// ignored.
123 for (std::vector<std::string>::const_iterator pattern_it =
124 site.patterns.begin();
125 pattern_it != site.patterns.end(); ++pattern_it) {
126 AddPattern(*pattern_it, site_id);
// Register all hostname hashes of this site.
129 for (std::vector<std::string>::const_iterator hash_it =
130 site.hostname_hashes.begin();
131 hash_it != site.hostname_hashes.end(); ++hash_it) {
132 AddHostnameHash(*hash_it, site_id);
// Adds all condition sets collected by AddPattern() to the URL matcher and
// hands the finished Contents over to the caller. Expected to run on the
// blocking pool (see the DCHECK).
139 scoped_ptr<ManagedModeURLFilter::Contents> FilterBuilder::Build() {
140 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
141 contents_->url_matcher.AddConditionSets(all_conditions_);
// Passing |contents_| gives up ownership, which is what the destructor's
// DCHECK expects.
142 return contents_.Pass();
// Builds filter Contents from a plain list of URL patterns. Expected to run on
// the blocking pool (see the DCHECK).
145 scoped_ptr<ManagedModeURLFilter::Contents> CreateWhitelistFromPatterns(
146 const std::vector<std::string>& patterns) {
147 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
149 FilterBuilder builder;
150 for (std::vector<std::string>::const_iterator it = patterns.begin();
151 it != patterns.end(); ++it) {
152 // TODO(bauerb): We should create a fake site for the whitelist.
// Until then every pattern is registered with site ID -1; nothing is added to
// Contents::sites here. The bool result of AddPattern() is ignored.
153 builder.AddPattern(*it, -1);
156 return builder.Build();
// Builds filter Contents from all site lists in |site_lists|. Expected to run
// on the blocking pool (see the DCHECK). |site_lists| is taken by value.
159 scoped_ptr<ManagedModeURLFilter::Contents> LoadWhitelistsOnBlockingPoolThread(
160 ScopedVector<ManagedModeSiteList> site_lists) {
161 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
163 FilterBuilder builder;
164 for (ScopedVector<ManagedModeSiteList>::iterator it = site_lists.begin();
165 it != site_lists.end(); ++it) {
166 builder.AddSiteList(*it);
169 return builder.Build();
// Creates a filter whose default behavior is ALLOW and which owns a newly
// allocated Contents.
// NOTE(review): the statement the comment below refers to is not visible in
// this view.
174 ManagedModeURLFilter::ManagedModeURLFilter()
175 : default_behavior_(ALLOW),
176 contents_(new Contents()) {
177 // Detach from the current thread so we can be constructed on a different
178 // thread than the one where we're used.
// Checks that the filter is destroyed on the thread it is bound to.
182 ManagedModeURLFilter::~ManagedModeURLFilter() {
183 DCHECK(CalledOnValidThread());
// Converts |behavior_value| to a FilteringBehavior with a plain static_cast.
// The DCHECKs require the value to lie between ALLOW and BLOCK inclusive; no
// other validation is done.
187 ManagedModeURLFilter::FilteringBehavior
188 ManagedModeURLFilter::BehaviorFromInt(int behavior_value) {
189 DCHECK_GE(behavior_value, ALLOW);
190 DCHECK_LE(behavior_value, BLOCK);
191 return static_cast<FilteringBehavior>(behavior_value);
// Returns a copy of |url| with the username, password, query and ref
// components removed.
195 GURL ManagedModeURLFilter::Normalize(const GURL& url) {
// NOTE(review): |normalized_url| is not used in any visible line; the result
// is computed from |url| directly below.
196 GURL normalized_url = url;
197 GURL::Replacements replacements;
198 // Strip username, password, query, and ref.
199 replacements.ClearUsername();
200 replacements.ClearPassword();
201 replacements.ClearQuery();
202 replacements.ClearRef();
203 return url.ReplaceComponents(replacements);
// Compares the scheme of |url| against each entry of kStandardSchemes.
// NOTE(review): the return statements are not visible in this view; presumably
// true on the first match and false otherwise - confirm.
207 bool ManagedModeURLFilter::HasStandardScheme(const GURL& url) {
208 for (size_t i = 0; i < arraysize(kStandardSchemes); ++i) {
209 if (url.scheme() == kStandardSchemes[i])
// Returns the hex-encoded SHA-1 hash of the host of |url|. The result is used
// as the lookup key into Contents::hash_site_map.
215 std::string GetHostnameHash(const GURL& url) {
216 std::string hash = base::SHA1HashString(url.host());
217 return base::HexEncode(hash.data(), hash.length());
// Tests |host| against a host |pattern|. The pattern may end in ".*" and/or
// start with "*."; both markers are stripped before the remaining text is
// compared with the (possibly trimmed) host. Without a leading "*." the
// trimmed host and trimmed pattern must be equal.
// NOTE(review): several lines (the statements guarded by the failure checks
// and some closing braces) are not visible in this view; presumably the
// failure checks return false - confirm.
221 bool ManagedModeURLFilter::HostMatchesPattern(const std::string& host,
222 const std::string& pattern) {
223 std::string trimmed_pattern = pattern;
224 std::string trimmed_host = host;
// Trailing ".*": drop it from the pattern and drop the registry part from the
// host, so that only the text before the registry is compared.
225 if (EndsWith(pattern, ".*", true)) {
226 size_t registry_length = GetRegistryLength(
227 trimmed_host, EXCLUDE_UNKNOWN_REGISTRIES, EXCLUDE_PRIVATE_REGISTRIES);
228 // A host without a known registry part does not match.
229 if (registry_length == 0)
232 trimmed_pattern.erase(trimmed_pattern.length() - 2);
// Removes the registry plus the one character in front of it (presumably the
// separating dot).
233 trimmed_host.erase(trimmed_host.length() - (registry_length + 1));
// Leading "*.": drop it and accept the host itself or any subdomain of it.
236 if (StartsWithASCII(trimmed_pattern, "*.", true)) {
237 trimmed_pattern.erase(0, 2);
239 // The remaining pattern should be non-empty, and it should not contain
240 // further stars. Also the trimmed host needs to end with the trimmed
// pattern.
242 if (trimmed_pattern.empty() ||
243 trimmed_pattern.find('*') != std::string::npos ||
244 !EndsWith(trimmed_host, trimmed_pattern, true)) {
248 // The trimmed host needs to have a dot separating the subdomain from the
249 // matched pattern piece, unless there is no subdomain.
// |pos| is the index in |trimmed_host| at which the matched pattern starts.
250 int pos = trimmed_host.length() - trimmed_pattern.length();
252 return (pos == 0) || (trimmed_host[pos - 1] == '.');
// No leading "*.": only an exact match of the trimmed strings counts.
255 return trimmed_host == trimmed_pattern;
// Determines the filtering behavior for |url|. The checks run in this order:
// scheme, manual override for the exact (normalized) URL, manual override for
// the exact hostname, manual host patterns, default behavior of ALLOW, URL
// patterns from the site lists, hostname hashes, and finally the default
// behavior.
// NOTE(review): the statements guarded by several of the checks below are not
// visible in this view; presumably they return ALLOW - confirm.
258 ManagedModeURLFilter::FilteringBehavior
259 ManagedModeURLFilter::GetFilteringBehaviorForURL(const GURL& url) const {
260 DCHECK(CalledOnValidThread());
262 // URLs with a non-standard scheme (e.g. chrome://) are always allowed.
263 if (!HasStandardScheme(url))
266 // Check manual overrides for the exact URL.
// The lookup key is the normalized URL (see Normalize()).
267 std::map<GURL, bool>::const_iterator url_it = url_map_.find(Normalize(url));
268 if (url_it != url_map_.end())
269 return url_it->second ? ALLOW : BLOCK;
271 // Check manual overrides for the hostname.
272 std::string host = url.host();
273 std::map<std::string, bool>::const_iterator host_it = host_map_.find(host);
274 if (host_it != host_map_.end())
275 return host_it->second ? ALLOW : BLOCK;
277 // Look for patterns matching the hostname, with a value that is different
278 // from the default (a value of true in the map meaning allowed).
// With a default of BLOCK only entries mapped to true are considered, and
// with any other default only entries mapped to false.
279 for (std::map<std::string, bool>::const_iterator host_it =
280 host_map_.begin(); host_it != host_map_.end(); ++host_it) {
281 if ((host_it->second == (default_behavior_ == BLOCK)) &&
282 HostMatchesPattern(host, host_it->first)) {
283 return host_it->second ? ALLOW : BLOCK;
287 // If the default behavior is to allow, we don't need to check anything else.
288 if (default_behavior_ == ALLOW)
291 // Check the list of URL patterns.
292 std::set<URLMatcherConditionSet::ID> matching_ids =
293 contents_->url_matcher.MatchURL(url);
294 if (!matching_ids.empty())
297 // Check the list of hostname hashes.
298 if (contents_->hash_site_map.count(GetHostnameHash(url)))
301 // Fall back to the default behavior.
302 return default_behavior_;
// Appends to |sites| pointers to the sites that match |url|, first through the
// URL patterns and then through the hostname hash. The pointers refer to
// elements of contents_->sites.
// NOTE(review): the parameter line declaring |url| is not visible in this
// view.
305 void ManagedModeURLFilter::GetSites(
307 std::vector<ManagedModeSiteList::Site*>* sites) const {
308 std::set<URLMatcherConditionSet::ID> matching_ids =
309 contents_->url_matcher.MatchURL(url);
310 for (std::set<URLMatcherConditionSet::ID>::const_iterator it =
311 matching_ids.begin(); it != matching_ids.end(); ++it) {
312 std::map<URLMatcherConditionSet::ID, int>::const_iterator entry =
313 contents_->matcher_site_map.find(*it);
// NOTE(review): the handling of a condition-set ID that is missing from the
// map is not visible in this view.
314 if (entry == contents_->matcher_site_map.end()) {
// NOTE(review): CreateWhitelistFromPatterns() registers patterns with site ID
// -1, which would not be a valid index here - confirm this path cannot be
// reached with such Contents.
318 sites->push_back(&contents_->sites[entry->second]);
// Second pass: every site registered under the hash of the URL's hostname.
321 typedef base::hash_multimap<std::string, int>::const_iterator
322 hash_site_map_iterator;
323 std::pair<hash_site_map_iterator, hash_site_map_iterator> bounds =
324 contents_->hash_site_map.equal_range(GetHostnameHash(url));
325 for (hash_site_map_iterator hash_it = bounds.first;
326 hash_it != bounds.second; hash_it++) {
327 sites->push_back(&contents_->sites[hash_it->second]);
// Sets the behavior that GetFilteringBehaviorForURL() falls back to when no
// override or site-list entry decides the result.
331 void ManagedModeURLFilter::SetDefaultFilteringBehavior(
332 FilteringBehavior behavior) {
333 DCHECK(CalledOnValidThread());
334 default_behavior_ = behavior;
// Builds new filter Contents from |site_lists| on the blocking pool and
// installs the result through SetContents() as the reply.
// NOTE(review): one argument line of the PostTaskAndReplyWithResult() call is
// not visible in this view.
337 void ManagedModeURLFilter::LoadWhitelists(
338 ScopedVector<ManagedModeSiteList> site_lists) {
339 DCHECK(CalledOnValidThread());
341 base::PostTaskAndReplyWithResult(
342 BrowserThread::GetBlockingPool(),
// Ownership of |site_lists| moves into the task via base::Passed().
344 base::Bind(&LoadWhitelistsOnBlockingPoolThread,
345 base::Passed(&site_lists)),
// NOTE(review): the reply binds |this|; how the object is kept alive until the
// reply runs is not visible here - confirm.
346 base::Bind(&ManagedModeURLFilter::SetContents, this));
// Builds new filter Contents from |patterns| on the blocking pool and installs
// the result through SetContents() as the reply.
// NOTE(review): one argument line of the PostTaskAndReplyWithResult() call is
// not visible in this view.
349 void ManagedModeURLFilter::SetFromPatterns(
350 const std::vector<std::string>& patterns) {
351 DCHECK(CalledOnValidThread());
353 base::PostTaskAndReplyWithResult(
354 BrowserThread::GetBlockingPool(),
356 base::Bind(&CreateWhitelistFromPatterns, patterns),
// NOTE(review): the reply binds |this|; how the object is kept alive until the
// reply runs is not visible here - confirm.
357 base::Bind(&ManagedModeURLFilter::SetContents, this));
// Replaces the manual per-host overrides with a copy of |*host_map| and
// records the number of entries in a histogram. |host_map| is dereferenced
// without a null check.
360 void ManagedModeURLFilter::SetManualHosts(
361 const std::map<std::string, bool>* host_map) {
362 DCHECK(CalledOnValidThread());
363 host_map_ = *host_map;
364 UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualHostsEntries",
365 host_map->size(), 1, 1000, 50);
// Records the number of entries of |*url_map| in a histogram.
// NOTE(review): a line between the DCHECK and the histogram call is not
// visible in this view; presumably it copies |*url_map| into |url_map_|, as
// SetManualHosts() does for hosts - confirm.
368 void ManagedModeURLFilter::SetManualURLs(
369 const std::map<GURL, bool>* url_map) {
370 DCHECK(CalledOnValidThread());
372 UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualURLsEntries",
373 url_map->size(), 1, 1000, 50);
// Registers |observer|; SetContents() calls OnSiteListUpdated() on every
// registered observer.
376 void ManagedModeURLFilter::AddObserver(Observer* observer) {
377 observers_.AddObserver(observer);
// Unregisters |observer| from the observer list.
380 void ManagedModeURLFilter::RemoveObserver(Observer* observer) {
381 observers_.RemoveObserver(observer);
// Takes ownership of |contents|, replacing the Contents used so far, and then
// notifies all observers through OnSiteListUpdated(). Bound as the reply
// callback by LoadWhitelists() and SetFromPatterns().
384 void ManagedModeURLFilter::SetContents(scoped_ptr<Contents> contents) {
385 DCHECK(CalledOnValidThread());
386 contents_ = contents.Pass();
387 FOR_EACH_OBSERVER(Observer, observers_, OnSiteListUpdated());