1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/managed_mode/managed_mode_url_filter.h"
7 #include "base/containers/hash_tables.h"
8 #include "base/files/file_path.h"
9 #include "base/json/json_file_value_serializer.h"
10 #include "base/metrics/histogram.h"
11 #include "base/sha1.h"
12 #include "base/strings/string_number_conversions.h"
13 #include "base/strings/string_util.h"
14 #include "base/task_runner_util.h"
15 #include "base/threading/sequenced_worker_pool.h"
16 #include "chrome/common/net/url_fixer_upper.h"
17 #include "components/policy/core/browser/url_blacklist_manager.h"
18 #include "components/url_matcher/url_matcher.h"
19 #include "content/public/browser/browser_thread.h"
20 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
23 using content::BrowserThread;
24 using net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES;
25 using net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES;
26 using net::registry_controlled_domains::GetRegistryLength;
27 using policy::URLBlacklist;
28 using url_matcher::URLMatcher;
29 using url_matcher::URLMatcherConditionSet;
// State the filter consults for site-list lookups. FilterBuilder fills one in
// and SetContents() installs it.
31 struct ManagedModeURLFilter::Contents {
// Matcher that receives the condition sets created from URL patterns.
32 URLMatcher url_matcher;
// Maps a condition set ID in |url_matcher| to the site ID given to
// AddPattern().
33 std::map<URLMatcherConditionSet::ID, int> matcher_site_map;
// Maps an upper-cased hostname hash to site IDs.
34 base::hash_multimap<std::string, int> hash_site_map;
// Copies of the loaded sites; GetSites() indexes this vector by site ID.
35 std::vector<ManagedModeSiteList::Site> sites;
// Schemes subject to filtering; HasFilteredScheme() compares a URL's scheme
// against each entry of this array.
40 // URL schemes not in this list (e.g., file:// and chrome://) will always be
42 const char* kFilteredSchemes[] = {
52 // This class encapsulates all the state that is required during construction of
53 // a new ManagedModeURLFilter::Contents.
59 // Adds a single URL pattern for the site identified by |site_id|.
// An error is logged when URLBlacklist::FilterToComponents() rejects
// |pattern|.
60 bool AddPattern(const std::string& pattern, int site_id);
62 // Adds a single hostname SHA1 hash for the site identified by |site_id|.
// The hash is upper-cased before it is stored.
63 void AddHostnameHash(const std::string& hash, int site_id);
65 // Adds all the sites in |site_list|, with URL patterns and hostname hashes.
66 void AddSiteList(ManagedModeSiteList* site_list);
68 // Finalizes construction of the ManagedModeURLFilter::Contents and returns
69 // them. This method should be called before this object is destroyed.
// The destructor DCHECKs that |contents_| is no longer held.
70 scoped_ptr<ManagedModeURLFilter::Contents> Build();
// The Contents under construction; handed to the caller by Build().
73 scoped_ptr<ManagedModeURLFilter::Contents> contents_;
// Condition sets created by AddPattern(); Build() adds them to the matcher.
74 URLMatcherConditionSet::Vector all_conditions_;
// Incremented before each use so that every condition set gets its own ID.
75 URLMatcherConditionSet::ID matcher_id_;
// Allocates the default-constructed Contents that the Add*() methods populate.
78 FilterBuilder::FilterBuilder()
79 : contents_(new ManagedModeURLFilter::Contents()),
82 FilterBuilder::~FilterBuilder() {
83 DCHECK(!contents_.get());
// Splits |pattern| into URL components and registers a matcher condition set
// for it under a newly assigned ID that is mapped to |site_id|. Must run on
// the blocking pool.
86 bool FilterBuilder::AddPattern(const std::string& pattern, int site_id) {
87 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
// Passed by pointer to FilterToComponents(), which may overwrite it.
92 bool match_subdomains = true;
// NOTE(review): the static_cast presumably selects one overload of
// URLFixerUpper::SegmentURL -- confirm against its declaration.
93 URLBlacklist::SegmentURLCallback callback =
94 static_cast<URLBlacklist::SegmentURLCallback>(URLFixerUpper::SegmentURL);
95 if (!URLBlacklist::FilterToComponents(
96 callback, pattern, &scheme, &host, &match_subdomains, &port, &path)) {
97 LOG(ERROR) << "Invalid pattern " << pattern;
// |matcher_id_| is pre-incremented so each condition set gets a fresh ID; the
// same ID is the key into |matcher_site_map| below.
101 scoped_refptr<URLMatcherConditionSet> condition_set =
102 URLBlacklist::CreateConditionSet(
103 &contents_->url_matcher, ++matcher_id_,
104 scheme, host, match_subdomains, port, path);
// The condition set is only collected here; Build() adds the whole batch to
// the matcher.
105 all_conditions_.push_back(condition_set);
106 contents_->matcher_site_map[matcher_id_] = site_id;
// Inserts an entry for |hash| into |hash_site_map|, keyed by the upper-cased
// hash. NOTE(review): upper-casing presumably matches the output of
// base::HexEncode() in GetHostnameHash() -- confirm.
110 void FilterBuilder::AddHostnameHash(const std::string& hash, int site_id) {
111 contents_->hash_site_map.insert(std::make_pair(StringToUpperASCII(hash),
// Copies every site from |site_list| into |contents_->sites| and registers
// each site's URL patterns and hostname hashes.
115 void FilterBuilder::AddSiteList(ManagedModeSiteList* site_list) {
116 std::vector<ManagedModeSiteList::Site> sites;
117 site_list->GetSites(&sites);
// The first site ID is the number of sites already stored.
118 int site_id = contents_->sites.size();
119 for (std::vector<ManagedModeSiteList::Site>::const_iterator it =
120 sites.begin(); it != sites.end(); ++it) {
121 const ManagedModeSiteList::Site& site = *it;
122 contents_->sites.push_back(site);
124 for (std::vector<std::string>::const_iterator pattern_it =
125 site.patterns.begin();
126 pattern_it != site.patterns.end(); ++pattern_it) {
// The return value of AddPattern() is ignored here.
127 AddPattern(*pattern_it, site_id);
130 for (std::vector<std::string>::const_iterator hash_it =
131 site.hostname_hashes.begin();
132 hash_it != site.hostname_hashes.end(); ++hash_it) {
133 AddHostnameHash(*hash_it, site_id);
140 scoped_ptr<ManagedModeURLFilter::Contents> FilterBuilder::Build() {
141 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
142 contents_->url_matcher.AddConditionSets(all_conditions_);
143 return contents_.Pass();
146 scoped_ptr<ManagedModeURLFilter::Contents> CreateWhitelistFromPatterns(
147 const std::vector<std::string>& patterns) {
148 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
150 FilterBuilder builder;
151 for (std::vector<std::string>::const_iterator it = patterns.begin();
152 it != patterns.end(); ++it) {
153 // TODO(bauerb): We should create a fake site for the whitelist.
154 builder.AddPattern(*it, -1);
157 return builder.Build();
160 scoped_ptr<ManagedModeURLFilter::Contents> LoadWhitelistsOnBlockingPoolThread(
161 ScopedVector<ManagedModeSiteList> site_lists) {
162 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
164 FilterBuilder builder;
165 for (ScopedVector<ManagedModeSiteList>::iterator it = site_lists.begin();
166 it != site_lists.end(); ++it) {
167 builder.AddSiteList(*it);
170 return builder.Build();
// The default filtering behavior starts as ALLOW and |contents_| starts as a
// default-constructed Contents.
175 ManagedModeURLFilter::ManagedModeURLFilter()
176 : default_behavior_(ALLOW),
177 contents_(new Contents()) {
178 // Detach from the current thread so we can be constructed on a different
179 // thread than the one where we're used.
// Destruction is expected on the thread the filter is used on; this is only
// checked by the DCHECK.
183 ManagedModeURLFilter::~ManagedModeURLFilter() {
184 DCHECK(CalledOnValidThread());
188 ManagedModeURLFilter::FilteringBehavior
189 ManagedModeURLFilter::BehaviorFromInt(int behavior_value) {
190 DCHECK_GE(behavior_value, ALLOW);
191 DCHECK_LE(behavior_value, BLOCK);
192 return static_cast<FilteringBehavior>(behavior_value);
196 GURL ManagedModeURLFilter::Normalize(const GURL& url) {
197 GURL normalized_url = url;
198 GURL::Replacements replacements;
199 // Strip username, password, query, and ref.
200 replacements.ClearUsername();
201 replacements.ClearPassword();
202 replacements.ClearQuery();
203 replacements.ClearRef();
204 return url.ReplaceComponents(replacements);
// Compares |url|'s scheme against each entry of kFilteredSchemes.
208 bool ManagedModeURLFilter::HasFilteredScheme(const GURL& url) {
209 for (size_t i = 0; i < arraysize(kFilteredSchemes); ++i) {
210 if (url.scheme() == kFilteredSchemes[i])
216 std::string GetHostnameHash(const GURL& url) {
217 std::string hash = base::SHA1HashString(url.host());
218 return base::HexEncode(hash.data(), hash.length());
// Tests |host| against |pattern|. A trailing ".*" in the pattern stands for
// the host's registry part as reported by GetRegistryLength(); a leading "*."
// additionally accepts subdomains. Without a leading "*." the (trimmed) host
// and pattern must be equal.
222 bool ManagedModeURLFilter::HostMatchesPattern(const std::string& host,
223 const std::string& pattern) {
224 std::string trimmed_pattern = pattern;
225 std::string trimmed_host = host;
226 if (EndsWith(pattern, ".*", true)) {
227 size_t registry_length = GetRegistryLength(
228 trimmed_host, EXCLUDE_UNKNOWN_REGISTRIES, EXCLUDE_PRIVATE_REGISTRIES);
229 // A host without a known registry part does not match.
230 if (registry_length == 0)
// Drop the trailing ".*" from the pattern, and the registry part plus the dot
// in front of it from the host.
233 trimmed_pattern.erase(trimmed_pattern.length() - 2);
234 trimmed_host.erase(trimmed_host.length() - (registry_length + 1));
237 if (StartsWithASCII(trimmed_pattern, "*.", true)) {
// Drop the leading "*." so only the literal part of the pattern remains.
238 trimmed_pattern.erase(0, 2);
240 // The remaining pattern should be non-empty, and it should not contain
241 // further stars. Also the trimmed host needs to end with the trimmed
243 if (trimmed_pattern.empty() ||
244 trimmed_pattern.find('*') != std::string::npos ||
245 !EndsWith(trimmed_host, trimmed_pattern, true)) {
249 // The trimmed host needs to have a dot separating the subdomain from the
250 // matched pattern piece, unless there is no subdomain.
// |pos| is the index in |trimmed_host| at which the pattern piece begins.
251 int pos = trimmed_host.length() - trimmed_pattern.length();
253 return (pos == 0) || (trimmed_host[pos - 1] == '.');
256 return trimmed_host == trimmed_pattern;
// Decides how |url| is treated. The checks run in this order: scheme filter,
// manual override for the normalized URL, manual override for the exact host,
// manual host patterns whose value differs from the default, then the URL
// patterns and hostname hashes in |contents_|, and finally the default.
259 ManagedModeURLFilter::FilteringBehavior
260 ManagedModeURLFilter::GetFilteringBehaviorForURL(const GURL& url) const {
261 DCHECK(CalledOnValidThread());
263 // URLs with a non-standard scheme (e.g. chrome://) are always allowed.
264 if (!HasFilteredScheme(url))
267 // Check manual overrides for the exact URL.
// The lookup key is Normalize(url), i.e. |url| without username, password,
// query and ref.
268 std::map<GURL, bool>::const_iterator url_it = url_map_.find(Normalize(url));
269 if (url_it != url_map_.end())
270 return url_it->second ? ALLOW : BLOCK;
272 // Check manual overrides for the hostname.
273 std::string host = url.host();
274 std::map<std::string, bool>::const_iterator host_it = host_map_.find(host);
275 if (host_it != host_map_.end())
276 return host_it->second ? ALLOW : BLOCK;
278 // Look for patterns matching the hostname, with a value that is different
279 // from the default (a value of true in the map meaning allowed).
// Only allowed entries are considered when the default is BLOCK, and only
// blocked entries otherwise. This |host_it| shadows the iterator declared
// above.
280 for (std::map<std::string, bool>::const_iterator host_it =
281 host_map_.begin(); host_it != host_map_.end(); ++host_it) {
282 if ((host_it->second == (default_behavior_ == BLOCK)) &&
283 HostMatchesPattern(host, host_it->first)) {
284 return host_it->second ? ALLOW : BLOCK;
288 // If the default behavior is to allow, we don't need to check anything else.
289 if (default_behavior_ == ALLOW)
292 // Check the list of URL patterns.
// The full |url| is matched here, not the normalized one.
293 std::set<URLMatcherConditionSet::ID> matching_ids =
294 contents_->url_matcher.MatchURL(url);
295 if (!matching_ids.empty())
298 // Check the list of hostname hashes.
299 if (contents_->hash_site_map.count(GetHostnameHash(url)))
302 // Fall back to the default behavior.
303 return default_behavior_;
306 void ManagedModeURLFilter::GetSites(
308 std::vector<ManagedModeSiteList::Site*>* sites) const {
309 std::set<URLMatcherConditionSet::ID> matching_ids =
310 contents_->url_matcher.MatchURL(url);
311 for (std::set<URLMatcherConditionSet::ID>::const_iterator it =
312 matching_ids.begin(); it != matching_ids.end(); ++it) {
313 std::map<URLMatcherConditionSet::ID, int>::const_iterator entry =
314 contents_->matcher_site_map.find(*it);
315 if (entry == contents_->matcher_site_map.end()) {
319 sites->push_back(&contents_->sites[entry->second]);
322 typedef base::hash_multimap<std::string, int>::const_iterator
323 hash_site_map_iterator;
324 std::pair<hash_site_map_iterator, hash_site_map_iterator> bounds =
325 contents_->hash_site_map.equal_range(GetHostnameHash(url));
326 for (hash_site_map_iterator hash_it = bounds.first;
327 hash_it != bounds.second; hash_it++) {
328 sites->push_back(&contents_->sites[hash_it->second]);
// Sets the behavior that GetFilteringBehaviorForURL() falls back to when no
// more specific rule decides the URL.
332 void ManagedModeURLFilter::SetDefaultFilteringBehavior(
333 FilteringBehavior behavior) {
334 DCHECK(CalledOnValidThread());
335 default_behavior_ = behavior;
// Posts LoadWhitelistsOnBlockingPoolThread() to the blocking pool and passes
// its result to SetContents() as the reply.
338 void ManagedModeURLFilter::LoadWhitelists(
339 ScopedVector<ManagedModeSiteList> site_lists) {
340 DCHECK(CalledOnValidThread());
342 base::PostTaskAndReplyWithResult(
343 BrowserThread::GetBlockingPool(),
// base::Passed() hands |site_lists| over to the posted task.
345 base::Bind(&LoadWhitelistsOnBlockingPoolThread,
346 base::Passed(&site_lists)),
347 base::Bind(&ManagedModeURLFilter::SetContents, this));
// Posts CreateWhitelistFromPatterns() with |patterns| to the blocking pool and
// passes its result to SetContents() as the reply.
350 void ManagedModeURLFilter::SetFromPatterns(
351 const std::vector<std::string>& patterns) {
352 DCHECK(CalledOnValidThread());
354 base::PostTaskAndReplyWithResult(
355 BrowserThread::GetBlockingPool(),
357 base::Bind(&CreateWhitelistFromPatterns, patterns),
358 base::Bind(&ManagedModeURLFilter::SetContents, this));
361 void ManagedModeURLFilter::SetManualHosts(
362 const std::map<std::string, bool>* host_map) {
363 DCHECK(CalledOnValidThread());
364 host_map_ = *host_map;
365 UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualHostsEntries",
366 host_map->size(), 1, 1000, 50);
// Records the number of entries in |*url_map| in the
// "ManagedMode.ManualURLsEntries" histogram.
// NOTE(review): presumably also stores the map in |url_map_|, which
// GetFilteringBehaviorForURL() reads -- confirm.
369 void ManagedModeURLFilter::SetManualURLs(
370 const std::map<GURL, bool>* url_map) {
371 DCHECK(CalledOnValidThread());
373 UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualURLsEntries",
374 url_map->size(), 1, 1000, 50);
// Registers |observer| in |observers_|; SetContents() calls
// OnSiteListUpdated() on the observers in that list.
377 void ManagedModeURLFilter::AddObserver(Observer* observer) {
378 observers_.AddObserver(observer);
// Removes |observer| from |observers_|.
381 void ManagedModeURLFilter::RemoveObserver(Observer* observer) {
382 observers_.RemoveObserver(observer);
385 void ManagedModeURLFilter::SetContents(scoped_ptr<Contents> contents) {
386 DCHECK(CalledOnValidThread());
387 contents_ = contents.Pass();
388 FOR_EACH_OBSERVER(Observer, observers_, OnSiteListUpdated());