1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/managed_mode/managed_mode_url_filter.h"
7 #include "base/containers/hash_tables.h"
8 #include "base/files/file_path.h"
9 #include "base/json/json_file_value_serializer.h"
10 #include "base/metrics/histogram.h"
11 #include "base/sha1.h"
12 #include "base/strings/string_number_conversions.h"
13 #include "base/strings/string_util.h"
14 #include "base/task_runner_util.h"
15 #include "base/threading/sequenced_worker_pool.h"
16 #include "chrome/browser/policy/url_blacklist_manager.h"
17 #include "content/public/browser/browser_thread.h"
18 #include "extensions/common/matcher/url_matcher.h"
19 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
22 using content::BrowserThread;
23 using extensions::URLMatcher;
24 using extensions::URLMatcherConditionSet;
25 using net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES;
26 using net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES;
27 using net::registry_controlled_domains::GetRegistryLength;
// Bundles the lookup structures of one filter state. A ManagedModeURLFilter
// holds one instance of this and replaces it as a whole in SetContents().
29 struct ManagedModeURLFilter::Contents {
// Matcher that the URL-pattern condition sets are registered with.
30 URLMatcher url_matcher;
// Maps the ID of a condition set to the site ID it was registered for.
31 std::map<URLMatcherConditionSet::ID, int> matcher_site_map;
// Maps an upper-cased hostname hash string to site IDs. This is a multimap,
// so one hash may be associated with several site IDs.
32 base::hash_multimap<std::string, int> hash_site_map;
// Copies of the sites taken from the loaded site lists. Site IDs stored in
// the two maps above are used as indices into this vector.
33 std::vector<ManagedModeSiteList::Site> sites;
// Scheme names that HasStandardScheme() compares a URL's scheme against.
38 const char* kStandardSchemes[] = {
49 // This class encapsulates all the state that is required during construction of
50 // a new ManagedModeURLFilter::Contents.
56 // Adds a single URL pattern for the site identified by |site_id|.
// An error is logged for a pattern that cannot be split into URL components.
57 bool AddPattern(const std::string& pattern, int site_id);
59 // Adds a single hostname SHA1 hash for the site identified by |site_id|.
// The hash string is upper-cased before it is stored.
60 void AddHostnameHash(const std::string& hash, int site_id);
62 // Adds all the sites in |site_list|, with URL patterns and hostname hashes.
63 void AddSiteList(ManagedModeSiteList* site_list);
65 // Finalizes construction of the ManagedModeURLFilter::Contents and returns
66 // them. This method should be called before this object is destroyed.
67 scoped_ptr<ManagedModeURLFilter::Contents> Build();
// Contents under construction; ownership is passed to the caller by Build().
70 scoped_ptr<ManagedModeURLFilter::Contents> contents_;
// Condition sets collected by AddPattern(); they are handed to the matcher
// in a single call from Build().
71 URLMatcherConditionSet::Vector all_conditions_;
// ID of the most recently created condition set; pre-incremented for every
// new pattern.
72 URLMatcherConditionSet::ID matcher_id_;
// Allocates the Contents object that the Add*() methods fill in.
75 FilterBuilder::FilterBuilder()
76 : contents_(new ManagedModeURLFilter::Contents()),
// Build() passes |contents_| on to its caller, so the pointer is expected to
// be null here; a builder destroyed without Build() trips the DCHECK.
79 FilterBuilder::~FilterBuilder() {
80 DCHECK(!contents_.get());
// Splits |pattern| into URL components, creates a matcher condition set from
// them and records that the new condition set belongs to |site_id|.
83 bool FilterBuilder::AddPattern(const std::string& pattern, int site_id) {
// Debug-checks that the caller is running on a blocking-pool thread.
84 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
89 bool match_subdomains = true;
// FilterToComponents() fills the component out-parameters; a pattern it
// rejects is logged as invalid.
90 if (!policy::URLBlacklist::FilterToComponents(
91 pattern, &scheme, &host, &match_subdomains, &port, &path)) {
92 LOG(ERROR) << "Invalid pattern " << pattern;
// Every condition set gets the next ID. The set is only queued here; Build()
// adds all queued sets to the matcher.
96 scoped_refptr<extensions::URLMatcherConditionSet> condition_set =
97 policy::URLBlacklist::CreateConditionSet(
98 &contents_->url_matcher, ++matcher_id_,
99 scheme, host, match_subdomains, port, path);
100 all_conditions_.push_back(condition_set);
// Remember which site this condition set ID stands for.
101 contents_->matcher_site_map[matcher_id_] = site_id;
// Stores a hostname hash in |hash_site_map|. The key is upper-cased before
// insertion, so lookups have to use an upper-case hash string as well.
105 void FilterBuilder::AddHostnameHash(const std::string& hash, int site_id) {
106 contents_->hash_site_map.insert(std::make_pair(StringToUpperASCII(hash),
// Copies every site of |site_list| into the contents and registers the site's
// URL patterns and hostname hashes under a site ID.
110 void FilterBuilder::AddSiteList(ManagedModeSiteList* site_list) {
111 std::vector<ManagedModeSiteList::Site> sites;
112 site_list->GetSites(&sites);
// Site IDs start at the current size of |contents_->sites|, which is the
// index at which the first site of this list is stored below.
113 int site_id = contents_->sites.size();
114 for (std::vector<ManagedModeSiteList::Site>::const_iterator it =
115 sites.begin(); it != sites.end(); ++it) {
116 const ManagedModeSiteList::Site& site = *it;
117 contents_->sites.push_back(site);
// Register all URL patterns of the site. The bool result of AddPattern() is
// not checked here.
119 for (std::vector<std::string>::const_iterator pattern_it =
120 site.patterns.begin();
121 pattern_it != site.patterns.end(); ++pattern_it) {
122 AddPattern(*pattern_it, site_id);
// Register all hostname hashes of the site.
125 for (std::vector<std::string>::const_iterator hash_it =
126 site.hostname_hashes.begin();
127 hash_it != site.hostname_hashes.end(); ++hash_it) {
128 AddHostnameHash(*hash_it, site_id);
// Adds all condition sets queued by AddPattern() to the matcher in one call
// and passes ownership of the finished contents to the caller.
135 scoped_ptr<ManagedModeURLFilter::Contents> FilterBuilder::Build() {
// Debug-checks that the caller is running on a blocking-pool thread.
136 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
137 contents_->url_matcher.AddConditionSets(all_conditions_);
138 return contents_.Pass();
// Builds filter contents from a plain list of URL patterns. No site is
// associated with these patterns: each one is registered with the placeholder
// site ID -1, and no entry is added to Contents::sites.
// NOTE(review): -1 is not a valid index into Contents::sites, so any code
// that maps a matched pattern back to a site has to handle this value.
141 scoped_ptr<ManagedModeURLFilter::Contents> CreateWhitelistFromPatterns(
142 const std::vector<std::string>& patterns) {
// Debug-checks that the caller is running on a blocking-pool thread.
143 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
145 FilterBuilder builder;
146 for (std::vector<std::string>::const_iterator it = patterns.begin();
147 it != patterns.end(); ++it) {
148 // TODO(bauerb): We should create a fake site for the whitelist.
149 builder.AddPattern(*it, -1);
152 return builder.Build();
// Builds filter contents from all site lists in |site_lists|, which is taken
// by value, and returns the result.
155 scoped_ptr<ManagedModeURLFilter::Contents> LoadWhitelistsOnBlockingPoolThread(
156 ScopedVector<ManagedModeSiteList> site_lists) {
// Debug-checks that the caller is running on a blocking-pool thread.
157 DCHECK(BrowserThread::GetBlockingPool()->RunsTasksOnCurrentThread());
159 FilterBuilder builder;
160 for (ScopedVector<ManagedModeSiteList>::iterator it = site_lists.begin();
161 it != site_lists.end(); ++it) {
162 builder.AddSiteList(*it);
165 return builder.Build();
// Creates a filter whose default behavior is ALLOW and which owns a newly
// allocated Contents object.
170 ManagedModeURLFilter::ManagedModeURLFilter()
171 : default_behavior_(ALLOW),
172 contents_(new Contents()) {
173 // Detach from the current thread so we can be constructed on a different
174 // thread than the one where we're used.
// Debug-checks that the filter is destroyed on its valid thread.
178 ManagedModeURLFilter::~ManagedModeURLFilter() {
179 DCHECK(CalledOnValidThread());
// Converts |behavior_value| to a FilteringBehavior. The ALLOW..BLOCK range is
// only enforced by the DCHECKs; the cast itself is unconditional.
183 ManagedModeURLFilter::FilteringBehavior
184 ManagedModeURLFilter::BehaviorFromInt(int behavior_value) {
185 DCHECK_GE(behavior_value, ALLOW);
186 DCHECK_LE(behavior_value, BLOCK);
187 return static_cast<FilteringBehavior>(behavior_value);
191 GURL ManagedModeURLFilter::Normalize(const GURL& url) {
192 GURL normalized_url = url;
193 GURL::Replacements replacements;
194 // Strip username, password, query, and ref.
195 replacements.ClearUsername();
196 replacements.ClearPassword();
197 replacements.ClearQuery();
198 replacements.ClearRef();
199 return url.ReplaceComponents(replacements);
// Compares the scheme of |url| against each entry of kStandardSchemes.
203 bool ManagedModeURLFilter::HasStandardScheme(const GURL& url) {
204 for (size_t i = 0; i < arraysize(kStandardSchemes); ++i) {
205 if (url.scheme() == kStandardSchemes[i])
// Matches |host| against a host |pattern|. The pattern may end in ".*"
// (wildcard for the registry part of the host) and/or start with "*."
// (wildcard for subdomains); without either, the comparison is exact.
212 bool ManagedModeURLFilter::HostMatchesPattern(const std::string& host,
213 const std::string& pattern) {
// Work on copies so that both strings can be trimmed.
214 std::string trimmed_pattern = pattern;
215 std::string trimmed_host = host;
// Trailing ".*": drop the ".*" from the pattern and the last
// |registry_length| + 1 characters from the host (the registry part and,
// presumably, the dot in front of it).
216 if (EndsWith(pattern, ".*", true)) {
217 size_t registry_length = GetRegistryLength(
218 trimmed_host, EXCLUDE_UNKNOWN_REGISTRIES, EXCLUDE_PRIVATE_REGISTRIES);
219 // A host without a known registry part does not match.
220 if (registry_length == 0)
223 trimmed_pattern.erase(trimmed_pattern.length() - 2);
224 trimmed_host.erase(trimmed_host.length() - (registry_length + 1));
// Leading "*.": the host may be the remaining pattern itself or end with it
// after a dot.
227 if (StartsWithASCII(trimmed_pattern, "*.", true)) {
228 trimmed_pattern.erase(0, 2);
230 // The remaining pattern should be non-empty, and it should not contain
231 // further stars. Also the trimmed host needs to end with the trimmed
233 if (trimmed_pattern.empty() ||
234 trimmed_pattern.find('*') != std::string::npos ||
235 !EndsWith(trimmed_host, trimmed_pattern, true)) {
239 // The trimmed host needs to have a dot separating the subdomain from the
240 // matched pattern piece, unless there is no subdomain.
// |pos| is the index in the host at which the matched pattern piece starts.
241 int pos = trimmed_host.length() - trimmed_pattern.length();
243 return (pos == 0) || (trimmed_host[pos - 1] == '.');
// No leading "*.": the trimmed host has to equal the trimmed pattern.
246 return trimmed_host == trimmed_pattern;
// Decides how |url| is to be filtered. The checks run in this order: scheme,
// manual override for the exact URL, manual override for the exact hostname,
// manual host patterns, default behavior, URL patterns, hostname hashes.
249 ManagedModeURLFilter::FilteringBehavior
250 ManagedModeURLFilter::GetFilteringBehaviorForURL(const GURL& url) const {
251 DCHECK(CalledOnValidThread());
253 // URLs with a non-standard scheme (e.g. chrome://) are always allowed.
254 if (!HasStandardScheme(url))
257 // Check manual overrides for the exact URL.
// The lookup key is the normalized form of |url|, see Normalize().
258 std::map<GURL, bool>::const_iterator url_it = url_map_.find(Normalize(url));
259 if (url_it != url_map_.end())
260 return url_it->second ? ALLOW : BLOCK;
262 // Check manual overrides for the hostname.
263 std::string host = url.host();
264 std::map<std::string, bool>::const_iterator host_it = host_map_.find(host);
265 if (host_it != host_map_.end())
266 return host_it->second ? ALLOW : BLOCK;
268 // Look for patterns matching the hostname, with a value that is different
269 // from the default (a value of true in the map meaning allowed).
// With a BLOCK default only entries mapped to true are tried, otherwise only
// entries mapped to false. Every key of |host_map_| is treated as a pattern
// here, and the first matching entry wins.
270 for (std::map<std::string, bool>::const_iterator host_it =
271 host_map_.begin(); host_it != host_map_.end(); ++host_it) {
272 if ((host_it->second == (default_behavior_ == BLOCK)) &&
273 HostMatchesPattern(host, host_it->first)) {
274 return host_it->second ? ALLOW : BLOCK;
278 // If the default behavior is to allow, we don't need to check anything else.
279 if (default_behavior_ == ALLOW)
282 // Check the list of URL patterns.
283 std::set<URLMatcherConditionSet::ID> matching_ids =
284 contents_->url_matcher.MatchURL(url);
285 if (!matching_ids.empty())
288 // Check the list of hostname hashes.
// The lookup key is the hex-encoded SHA-1 hash of the hostname.
289 std::string hash = base::SHA1HashString(url.host());
290 std::string hash_hex = base::HexEncode(hash.data(), hash.length());
291 if (contents_->hash_site_map.count(hash_hex))
294 // Fall back to the default behavior.
295 return default_behavior_;
298 void ManagedModeURLFilter::GetSites(
300 std::vector<ManagedModeSiteList::Site*>* sites) const {
301 std::set<URLMatcherConditionSet::ID> matching_ids =
302 contents_->url_matcher.MatchURL(url);
303 for (std::set<URLMatcherConditionSet::ID>::const_iterator it =
304 matching_ids.begin(); it != matching_ids.end(); ++it) {
305 std::map<URLMatcherConditionSet::ID, int>::const_iterator entry =
306 contents_->matcher_site_map.find(*it);
307 if (entry == contents_->matcher_site_map.end()) {
311 sites->push_back(&contents_->sites[entry->second]);
314 typedef base::hash_map<std::string, int>::const_iterator
315 hash_site_map_iterator;
316 std::pair<hash_site_map_iterator, hash_site_map_iterator> bounds =
317 contents_->hash_site_map.equal_range(url.host());
318 for (hash_site_map_iterator hash_it = bounds.first;
319 hash_it != bounds.second; hash_it++) {
320 sites->push_back(&contents_->sites[hash_it->second]);
// Sets the behavior that GetFilteringBehaviorForURL() falls back to when no
// more specific rule applies.
324 void ManagedModeURLFilter::SetDefaultFilteringBehavior(
325 FilteringBehavior behavior) {
326 DCHECK(CalledOnValidThread());
327 default_behavior_ = behavior;
// Builds new filter contents from |site_lists| on the blocking pool and hands
// the result to SetContents() as the reply. Ownership of |site_lists| moves
// into the posted task.
// NOTE(review): the reply callback binds |this|; confirm that the filter's
// lifetime covers the round trip to the blocking pool.
330 void ManagedModeURLFilter::LoadWhitelists(
331 ScopedVector<ManagedModeSiteList> site_lists) {
332 DCHECK(CalledOnValidThread());
334 base::PostTaskAndReplyWithResult(
335 BrowserThread::GetBlockingPool(),
337 base::Bind(&LoadWhitelistsOnBlockingPoolThread,
338 base::Passed(&site_lists)),
339 base::Bind(&ManagedModeURLFilter::SetContents, this));
// Builds new filter contents from |patterns| on the blocking pool and hands
// the result to SetContents() as the reply. |patterns| is bound into the task.
// NOTE(review): the reply callback binds |this|; confirm that the filter's
// lifetime covers the round trip to the blocking pool.
342 void ManagedModeURLFilter::SetFromPatterns(
343 const std::vector<std::string>& patterns) {
344 DCHECK(CalledOnValidThread());
346 base::PostTaskAndReplyWithResult(
347 BrowserThread::GetBlockingPool(),
349 base::Bind(&CreateWhitelistFromPatterns, patterns),
350 base::Bind(&ManagedModeURLFilter::SetContents, this));
// Replaces the manual host overrides with a copy of |*host_map| and reports
// the number of entries to a histogram. |host_map| is dereferenced without a
// null check.
353 void ManagedModeURLFilter::SetManualHosts(
354 const std::map<std::string, bool>* host_map) {
355 DCHECK(CalledOnValidThread());
356 host_map_ = *host_map;
357 UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualHostsEntries",
358 host_map->size(), 1, 1000, 50);
// Manual-URL counterpart of SetManualHosts(). Reports the number of entries
// in |*url_map| to a histogram; |url_map| is dereferenced without a null
// check.
361 void ManagedModeURLFilter::SetManualURLs(
362 const std::map<GURL, bool>* url_map) {
363 DCHECK(CalledOnValidThread());
365 UMA_HISTOGRAM_CUSTOM_COUNTS("ManagedMode.ManualURLsEntries",
366 url_map->size(), 1, 1000, 50);
// Adds |observer| to |observers_|; observers in that list are notified with
// OnSiteListUpdated() from SetContents().
369 void ManagedModeURLFilter::AddObserver(Observer* observer) {
370 observers_.AddObserver(observer);
// Removes |observer| from |observers_|.
373 void ManagedModeURLFilter::RemoveObserver(Observer* observer) {
374 observers_.RemoveObserver(observer);
// Takes ownership of |contents|, replacing the contents used so far, and then
// notifies all observers through OnSiteListUpdated().
377 void ManagedModeURLFilter::SetContents(scoped_ptr<Contents> contents) {
378 DCHECK(CalledOnValidThread());
379 contents_ = contents.Pass();
380 FOR_EACH_OBSERVER(Observer, observers_, OnSiteListUpdated());