1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "base/strings/stringprintf.h"
10 #include "chrome/browser/google/google_util.h"
11 #include "crypto/sha2.h"
12 #include "net/base/escape.h"
14 #include "url/url_util.h"
17 #include "chrome/installer/util/browser_distribution.h"
// printf-style template for the query string that GeneratePhishingReportUrl()
// appends to the report page URL via base::StringPrintf(). The "url" value is
// the query-escaped URL being reported.
// NOTE(review): the "tpl" value is presumably the client name -- confirm
// against the StringPrintf() call site.
20 static const char kReportParams[] = "?tpl=%s&url=%s";
22 // SBChunk ---------------------------------------------------------------------
// Out-of-line destructor; the body is empty, so no explicit cleanup is
// performed here.
30 SBChunk::~SBChunk() {}
32 // SBChunkList -----------------------------------------------------------------
// Default constructor; the body is empty.
34 SBChunkList::SBChunkList() {}
// Destructor.
// NOTE(review): the body is not visible in this view; presumably it releases
// the stored chunks (e.g. by calling clear()) -- confirm.
36 SBChunkList::~SBChunkList() {
// Walks every SBChunk in |chunks_| and, within each chunk, every SBChunkHost
// in its |hosts| deque, calling Destroy() on the host's |entry|.
// NOTE(review): any null check on |entry| and the final emptying of |chunks_|
// are not visible in this view -- confirm before relying on either.
40 void SBChunkList::clear() {
41 for (std::vector<SBChunk>::iterator citer = chunks_.begin();
42 citer != chunks_.end(); ++citer) {
43 for (std::deque<SBChunkHost>::iterator hiter = citer->hosts.begin();
44 hiter != citer->hosts.end(); ++hiter) {
// Entries are released through SBEntry::Destroy().
46 hiter->entry->Destroy();
54 // SBListChunkRanges -----------------------------------------------------------
// Constructs the record for the list called |n|; the name is copied into the
// |name| member.
56 SBListChunkRanges::SBListChunkRanges(const std::string& n) : name(n) {}
58 // SBChunkDelete ---------------------------------------------------------------
// Default constructor; |is_sub_del| starts out false.
60 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {}
// Out-of-line destructor; the body is empty.
62 SBChunkDelete::~SBChunkDelete() {}
64 // SBEntry ---------------------------------------------------------------------
// Allocates storage for an SBEntry able to hold |prefix_count| prefixes of the
// given |type| and records the prefix count on it. The byte size comes from
// Size(type, prefix_count) and the storage is obtained with malloc(), so it
// must not be released with operator delete.
// NOTE(review): in the lines visible here the malloc() result is used without
// a null check. Initialisation of the remaining fields and the return
// statement are not visible; presumably the new entry is returned and later
// released via Destroy() -- confirm.
67 SBEntry* SBEntry::Create(Type type, int prefix_count) {
68 int size = Size(type, prefix_count);
69 SBEntry *rv = static_cast<SBEntry*>(malloc(size));
72 rv->set_prefix_count(prefix_count);
// Releases this entry.
// NOTE(review): the body is not visible in this view; presumably it is the
// counterpart of Create(), which allocates with malloc() -- confirm.
76 void SBEntry::Destroy() {
// Returns the size in bytes of a single stored prefix record for |type|. The
// four results shown are sizeof(SBPrefix), sizeof(SBFullHash),
// sizeof(SBSubPrefix) and sizeof(SBSubFullHash).
// NOTE(review): the labels selecting each return are not visible in this
// view; presumably there is one per SBEntry::Type value -- confirm.
81 int SBEntry::PrefixSize(Type type) {
84 return sizeof(SBPrefix);
86 return sizeof(SBFullHash);
88 return sizeof(SBSubPrefix);
90 return sizeof(SBSubFullHash);
// Returns the byte size of this entry, computed from its own type() and
// prefix_count() via the static Size(Type, int) overload.
97 int SBEntry::Size() const {
98 return Size(type(), prefix_count());
// Returns the number of bytes an entry of |type| with |prefix_count| prefixes
// occupies: sizeof(Data) plus |prefix_count| records of PrefixSize(type) bytes
// each. The arithmetic is done in int.
102 int SBEntry::Size(Type type, int prefix_count) {
103 return sizeof(Data) + prefix_count * PrefixSize(type);
// Returns the add-chunk id that goes with the prefix at |index|. For
// SUB_PREFIX and SUB_FULL_HASH entries this is the per-prefix |add_chunk|
// field; for every other type it is the entry-wide chunk_id(). |index| is not
// range-checked in the lines shown.
106 int SBEntry::ChunkIdAtPrefix(int index) const {
107 if (type() == SUB_PREFIX)
108 return sub_prefixes_[index].add_chunk;
109 return (type() == SUB_FULL_HASH) ?
110 sub_full_hashes_[index].add_chunk : chunk_id();
// Stores |chunk_id| as the |add_chunk| of the sub record at |index|: in
// sub_prefixes_ when the entry type is SUB_PREFIX, and in sub_full_hashes_ in
// the other store shown.
// NOTE(review): the statement separating the two stores (presumably an
// "else") and any precondition check are not visible in this view -- confirm.
113 void SBEntry::SetChunkIdAtPrefix(int index, int chunk_id) {
116 if (type() == SUB_PREFIX)
117 sub_prefixes_[index].add_chunk = chunk_id;
119 sub_full_hashes_[index].add_chunk = chunk_id;
// Returns a reference to the prefix at |index|: add_prefixes_[index] when
// IsAdd() is true, otherwise the |prefix| field of sub_prefixes_[index].
122 const SBPrefix& SBEntry::PrefixAt(int index) const {
125 return IsAdd() ? add_prefixes_[index] : sub_prefixes_[index].prefix;
// Returns a reference to the full hash at |index|: add_full_hashes_[index]
// when IsAdd() is true, otherwise the |prefix| field of
// sub_full_hashes_[index].
128 const SBFullHash& SBEntry::FullHashAt(int index) const {
131 return IsAdd() ? add_full_hashes_[index] : sub_full_hashes_[index].prefix;
// Stores |prefix| at |index|, either directly in add_prefixes_ or in the
// |prefix| field of sub_prefixes_.
// NOTE(review): the condition choosing between the two stores is not visible
// in this view; presumably it is IsAdd(), mirroring PrefixAt() -- confirm.
134 void SBEntry::SetPrefixAt(int index, const SBPrefix& prefix) {
138 add_prefixes_[index] = prefix;
140 sub_prefixes_[index].prefix = prefix;
// Stores |full_hash| at |index|, either directly in add_full_hashes_ or in the
// |prefix| field of sub_full_hashes_.
// NOTE(review): the condition choosing between the two stores is not visible
// in this view; presumably it is IsAdd(), mirroring FullHashAt() -- confirm.
143 void SBEntry::SetFullHashAt(int index, const SBFullHash& full_hash) {
147 add_full_hashes_[index] = full_hash;
149 sub_full_hashes_[index].prefix = full_hash;
153 // Utility functions -----------------------------------------------------------
155 namespace safe_browsing_util {
157 // List names that the browser can process. GetListId() and GetListName()
// translate between these strings and ListType ids.
158 const char kMalwareList[] = "goog-malware-shavar";
159 const char kPhishingList[] = "goog-phish-shavar";
160 const char kBinUrlList[] = "goog-badbinurl-shavar";
161 // We don't use the bad binary digest list anymore. Use a fake listname to be
162 // sure we don't request it accidentally.
163 const char kBinHashList[] = "goog-badbin-digestvar-disabled";
164 const char kCsdWhiteList[] = "goog-csdwhite-sha256";
165 const char kDownloadWhiteList[] = "goog-downloadwhite-digest256";
166 const char kExtensionBlacklist[] = "goog-badcrxids-digestvar";
167 const char kSideEffectFreeWhitelist[] = "goog-sideeffectfree-shavar";
168 const char kIPBlacklist[] = "goog-badip-digest256";
// Maps a Safe Browsing list name to its ListType id using exact
// (case-sensitive) comparison against the list-name constants of this
// namespace.
// NOTE(review): most of the id assignments and the value produced for an
// unrecognised name are not visible in this view -- confirm before relying on
// either.
170 ListType GetListId(const std::string& name) {
172 if (name == safe_browsing_util::kMalwareList) {
174 } else if (name == safe_browsing_util::kPhishingList) {
176 } else if (name == safe_browsing_util::kBinUrlList) {
178 } else if (name == safe_browsing_util::kBinHashList) {
180 } else if (name == safe_browsing_util::kCsdWhiteList) {
182 } else if (name == safe_browsing_util::kDownloadWhiteList) {
183 id = DOWNLOADWHITELIST;
184 } else if (name == safe_browsing_util::kExtensionBlacklist) {
185 id = EXTENSIONBLACKLIST;
186 } else if (name == safe_browsing_util::kSideEffectFreeWhitelist) {
187 id = SIDEEFFECTFREEWHITELIST;
188 } else if (name == safe_browsing_util::kIPBlacklist) {
// Counterpart of GetListId(): stores the list name that corresponds to
// |list_id| in |*list|. |list| is dereferenced unconditionally in every
// branch shown, so it must not be NULL.
// NOTE(review): the function returns bool, presumably reporting whether
// |list_id| was recognised; the return statements and several case labels are
// not visible in this view -- confirm.
196 bool GetListName(ListType list_id, std::string* list) {
199 *list = safe_browsing_util::kMalwareList;
202 *list = safe_browsing_util::kPhishingList;
205 *list = safe_browsing_util::kBinUrlList;
208 *list = safe_browsing_util::kBinHashList;
211 *list = safe_browsing_util::kCsdWhiteList;
213 case DOWNLOADWHITELIST:
214 *list = safe_browsing_util::kDownloadWhiteList;
216 case EXTENSIONBLACKLIST:
217 *list = safe_browsing_util::kExtensionBlacklist;
219 case SIDEEFFECTFREEWHITELIST:
220 *list = safe_browsing_util::kSideEffectFreeWhitelist;
223 *list = safe_browsing_util::kIPBlacklist;
// Repeatedly unescapes |url| with net::UnescapeURLComponent() (control
// characters, spaces and URL-special characters) until a pass leaves the
// string unchanged, so input that was escaped several times over ends up
// fully unescaped. Returns the resulting string.
// The loop is also bounded by an iteration counter so it cannot spin forever.
// NOTE(review): the counter's declaration and the right-hand side of the
// bound are not visible in this view; presumably the limit is
// kMaxLoopIterations (1024) -- confirm.
231 std::string Unescape(const std::string& url) {
232 std::string unescaped_str(url);
233 std::string old_unescaped_str;
234 const int kMaxLoopIterations = 1024;
// Keep the previous value so the loop condition can detect a fixed point.
237 old_unescaped_str = unescaped_str;
238 unescaped_str = net::UnescapeURLComponent(old_unescaped_str,
239 net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::SPACES |
240 net::UnescapeRule::URL_SPECIAL_CHARS);
241 } while (unescaped_str != old_unescaped_str && ++loop_var <=
244 return unescaped_str;
// Percent-escapes |url| for Safe Browsing canonicalization and returns the
// result. Every byte that is <= ' ' (ASCII 32), > '~' (i.e. >= 127), '#' or
// '%' is replaced by "%XX", where XX is the byte value as two uppercase hex
// digits; every other byte is copied through unchanged. An empty input yields
// an empty string.
std::string Escape(const std::string& url) {
  static const char kHexString[] = "0123456789ABCDEF";

  std::string escaped_str;
  // The output is never shorter than the input, so reserving the input length
  // avoids the first few reallocations.
  escaped_str.reserve(url.length());

  for (size_t i = 0; i < url.length(); ++i) {
    // Work on the unsigned value so bytes >= 0x80 compare and shift correctly
    // regardless of whether plain char is signed.
    const unsigned char c = static_cast<unsigned char>(url[i]);
    if (c <= ' ' || c > '~' || c == '#' || c == '%') {
      escaped_str.push_back('%');
      escaped_str.push_back(kHexString[c >> 4]);
      escaped_str.push_back(kHexString[c & 0xf]);
    } else {
      escaped_str.push_back(static_cast<char>(c));
    }
  }

  return escaped_str;
}
// Returns a copy of |str| in which every run of two or more consecutive
// occurrences of |c| is collapsed to a single |c|. All other characters are
// copied through unchanged, so an input without such runs (including the
// empty string) is returned as-is.
//
// Done in a single pass: a character is dropped only when it equals |c| and
// the previously seen character was |c| as well. This avoids repeatedly
// erasing from the middle of the string, which is quadratic for long runs.
std::string RemoveConsecutiveChars(const std::string& str, const char c) {
  std::string output;
  output.reserve(str.size());

  bool previous_was_c = false;
  for (std::string::size_type i = 0; i < str.size(); ++i) {
    const bool current_is_c = (str[i] == c);
    if (!(current_is_c && previous_was_c))
      output.push_back(str[i]);
    previous_was_c = current_is_c;
  }

  return output;
}
276 // Canonicalizes url as per Google Safe Browsing Specification.
277 // See section 6.1 in
278 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
//
// |url| must be valid (DCHECKed). Each of |canonicalized_hostname|,
// |canonicalized_path| and |canonicalized_query| may be NULL when the caller
// is not interested in that part. A non-NULL output is assigned only when the
// corresponding component of the final canonical URL has non-zero length.
279 void CanonicalizeUrl(const GURL& url,
280 std::string* canonicalized_hostname,
281 std::string* canonicalized_path,
282 std::string* canonicalized_query) {
283 DCHECK(url.is_valid());
285 // We only canonicalize "normal" URLs.
// NOTE(review): the statement executed for a non-standard URL is not visible
// in this view; presumably an early return -- confirm.
286 if (!url.IsStandard())
289 // The following canonicalization steps are skipped because URL parsing
// already takes care of them:
291 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
292 // (Exclude escaped version of these chars).
293 // 2. Normalize hostname to 4 dot-separated decimal values.
294 // 3. Lowercase hostname.
295 // 4. Resolve path sequences "/../" and "/./".
297 // That leaves us with the following :-
298 // 1. Remove fragment in URL.
// Username and password are cleared here as well as the ref (fragment).
299 GURL url_without_fragment;
300 GURL::Replacements f_replacements;
301 f_replacements.ClearRef();
302 f_replacements.ClearUsername();
303 f_replacements.ClearPassword();
304 url_without_fragment = url.ReplaceComponents(f_replacements);
306 // 2. Do URL unescaping until no more hex encoded characters exist.
307 std::string url_unescaped_str(Unescape(url_without_fragment.spec()));
// Parse the unescaped string to locate the host and path inside it.
308 url_parse::Parsed parsed;
309 url_parse::ParseStandardURL(url_unescaped_str.data(),
310 url_unescaped_str.length(), &parsed);
312 // 3. In hostname, remove all leading and trailing dots.
313 const std::string host =
314 (parsed.host.len > 0)
315 ? url_unescaped_str.substr(parsed.host.begin, parsed.host.len)
317 const char kCharsToTrim[] = ".";
318 std::string host_without_end_dots;
319 TrimString(host, kCharsToTrim, &host_without_end_dots);
321 // 4. In hostname, replace consecutive dots with a single dot.
322 std::string host_without_consecutive_dots(RemoveConsecutiveChars(
323 host_without_end_dots, '.'));
325 // 5. In path, replace runs of consecutive slashes with a single slash.
327 (parsed.path.len > 0)
328 ? url_unescaped_str.substr(parsed.path.begin, parsed.path.len)
330 std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/'));
// Substitute the cleaned-up host and path back into the unescaped URL. The
// rebuilt URL is written to |url_unescaped_with_can_hostpath| through
// |output|.
332 url_canon::Replacements<char> hp_replacements;
333 hp_replacements.SetHost(host_without_consecutive_dots.data(),
334 url_parse::Component(0, host_without_consecutive_dots.length()));
335 hp_replacements.SetPath(path_without_consecutive_slash.data(),
336 url_parse::Component(0, path_without_consecutive_slash.length()));
338 std::string url_unescaped_with_can_hostpath;
339 url_canon::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);
340 url_parse::Parsed temp_parsed;
341 url_util::ReplaceComponents(url_unescaped_str.data(),
342 url_unescaped_str.length(), parsed,
343 hp_replacements, NULL, &output, &temp_parsed);
346 // 6. Step needed to revert escaping done in url_util::ReplaceComponents.
347 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);
349 // 7. After performing all above steps, percent-escape all chars in url which
350 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
351 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));
// Parse the final string once more and copy out the requested components.
352 url_parse::Parsed final_parsed;
353 url_parse::ParseStandardURL(escaped_canon_url_str.data(),
354 escaped_canon_url_str.length(), &final_parsed);
356 if (canonicalized_hostname && final_parsed.host.len > 0) {
357 *canonicalized_hostname =
358 escaped_canon_url_str.substr(final_parsed.host.begin,
359 final_parsed.host.len);
361 if (canonicalized_path && final_parsed.path.len > 0) {
362 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,
363 final_parsed.path.len);
365 if (canonicalized_query && final_parsed.query.len > 0) {
366 *canonicalized_query = escaped_canon_url_str.substr(
367 final_parsed.query.begin, final_parsed.query.len);
// Appends to |*hosts| the host strings that should be looked up for |url|.
// The host is first canonicalized with CanonicalizeUrl(). The canonical host
// is then scanned from its last character backwards, appending suffixes of it
// (from the scan position to the end) while |*hosts| holds fewer than
// kMaxHostsToCheck entries. Finally the full canonical host is appended.
// NOTE(review): the test that decides where a suffix starts (presumably a '.'
// separator) and any early exits (e.g. for an empty host) are not visible in
// this view -- confirm.
371 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
374 std::string canon_host;
375 CanonicalizeUrl(url, &canon_host, NULL, NULL);
377 const std::string host = canon_host; // const sidesteps GCC bugs below!
381 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
382 // hostnames formed by starting with the last 5 components and successively
383 // removing the leading component. The last component isn't examined alone,
384 // since it's the TLD or a subcomponent thereof.
386 // Note that we don't need to be clever about stopping at the "real" eTLD --
387 // the data on the server side has been filtered to ensure it will not
388 // blacklist a whole TLD, and it's not significantly slower on our side to
389 // just check too much.
391 // Also note that because we have a simple blacklist, not some sort of complex
392 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
// these in.
394 const size_t kMaxHostsToCheck = 4;
395 bool skipped_last_component = false;
396 for (std::string::const_reverse_iterator i(host.rbegin());
397 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
// i.base() points just past the character |i| refers to, so the string built
// below is the part of |host| that follows that character.
399 if (skipped_last_component)
400 hosts->push_back(std::string(i.base(), host.end()));
402 skipped_last_component = true;
// The complete host is always tried as well.
405 hosts->push_back(host);
// Appends to |*paths| the path strings that should be looked up for |url|.
// Path and query are first canonicalized with CanonicalizeUrl(). Prefixes of
// the canonical path (from its start up to and including the scan position)
// are appended while |*paths| holds fewer than kMaxPathsToCheck entries. The
// full path is then appended unless it is already the last entry, followed by
// the "path?query" form.
// NOTE(review): the test that decides where a prefix ends (presumably a '/'
// separator), the condition guarding the "path?query" entry (presumably a
// non-empty query) and any early exits are not visible in this view --
// confirm.
408 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
411 std::string canon_path;
412 std::string canon_query;
413 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
415 const std::string path = canon_path; // const sidesteps GCC bugs below!
416 const std::string query = canon_query;
420 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
421 // the query parameters, and also up to 4 paths formed by starting at the root
422 // and adding more path components.
424 // As with the hosts above, it doesn't matter what order we check these in.
425 const size_t kMaxPathsToCheck = 4;
426 for (std::string::const_iterator i(path.begin());
427 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
// |i + 1| makes the prefix include the character at |i|.
429 paths->push_back(std::string(path.begin(), i + 1));
// Add the exact path unless the loop above already ended on it.
432 if (!paths->empty() && paths->back() != path)
433 paths->push_back(path);
436 paths->push_back(path + "?" + query);
// Appends to |*urls| every host/path combination to look up for |url|: each
// host from GenerateHostsToCheck() concatenated with each path from
// GeneratePathsToCheck(), hosts in the outer loop and paths in the inner one.
// Nothing is appended when either list is empty.
439 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {
440 std::vector<std::string> hosts, paths;
441 GenerateHostsToCheck(url, &hosts);
442 GeneratePathsToCheck(url, &paths);
443 for (size_t h = 0; h < hosts.size(); ++h) {
444 for (size_t p = 0; p < paths.size(); ++p) {
445 urls->push_back(hosts[h] + paths[p]);
// Linearly searches |full_hashes| and returns the index of the first element
// whose |hash| equals |hash|.
// NOTE(review): the value returned when nothing matches is not visible in
// this view; presumably a negative sentinel such as -1 -- confirm.
450 int GetHashIndex(const SBFullHash& hash,
451 const std::vector<SBFullHashResult>& full_hashes) {
452 for (size_t i = 0; i < full_hashes.size(); ++i) {
453 if (hash == full_hashes[i].hash)
454 return static_cast<int>(i);
// Looks for |url| in |full_hashes|: generates every host/path pattern for the
// URL with GeneratePatternsToCheck(), computes the SHA-256 of each pattern and
// looks that hash up with GetHashIndex().
// NOTE(review): the return statements (for an empty |full_hashes|, for a hit
// and for no hit) are not visible in this view; presumably the index of the
// first hit is returned, or a negative sentinel otherwise -- confirm.
459 int GetUrlHashIndex(const GURL& url,
460 const std::vector<SBFullHashResult>& full_hashes) {
461 if (full_hashes.empty())
464 std::vector<std::string> patterns;
465 GeneratePatternsToCheck(url, &patterns);
467 for (size_t i = 0; i < patterns.size(); ++i) {
// The digest is written directly into the key's |full_hash| buffer.
469 crypto::SHA256HashString(patterns[i], key.full_hash, sizeof(SBFullHash));
470 int index = GetHashIndex(key, full_hashes);
// Returns true iff |list_name| is exactly kPhishingList (case-sensitive).
477 bool IsPhishingList(const std::string& list_name) {
478 return list_name.compare(kPhishingList) == 0;
// Returns true iff |list_name| is exactly kMalwareList (case-sensitive).
481 bool IsMalwareList(const std::string& list_name) {
482 return list_name.compare(kMalwareList) == 0;
// Returns true iff |list_name| is exactly kBinUrlList (case-sensitive).
485 bool IsBadbinurlList(const std::string& list_name) {
486 return list_name.compare(kBinUrlList) == 0;
// Returns true iff |list_name| is exactly kBinHashList (case-sensitive). Note
// that kBinHashList is deliberately set to a fake, disabled list name.
489 bool IsBadbinhashList(const std::string& list_name) {
490 return list_name.compare(kBinHashList) == 0;
// Returns true iff |list_name| is exactly kExtensionBlacklist
// (case-sensitive).
493 bool IsExtensionList(const std::string& list_name) {
494 return list_name.compare(kExtensionBlacklist) == 0;
// Builds the URL of the phishing report page for |url_to_report|.
// |report_page| is the base URL; the kReportParams query string is appended
// to it with |url_to_report| escaped as a query parameter value. When
// |is_client_side_detection| is true, "_csd" is appended to the client name.
// The result is passed through google_util::AppendGoogleLocaleParam() before
// being returned.
// NOTE(review): two alternative definitions of |client_name| appear below
// (one taken from BrowserDistribution, one hard-coded "googlechrome");
// presumably they are selected by preprocessor conditions that are not
// visible in this view. The first StringPrintf() argument is likewise not
// visible and is presumably |client_name| -- confirm both.
497 GURL GeneratePhishingReportUrl(const std::string& report_page,
498 const std::string& url_to_report,
499 bool is_client_side_detection) {
500 const std::string current_esc = net::EscapeQueryParamValue(url_to_report,
504 BrowserDistribution* dist = BrowserDistribution::GetDistribution();
505 std::string client_name(dist->GetSafeBrowsingName());
507 std::string client_name("googlechrome");
509 if (is_client_side_detection)
510 client_name.append("_csd");
512 GURL report_url(report_page + base::StringPrintf(kReportParams,
514 current_esc.c_str()));
515 return google_util::AppendGoogleLocaleParam(report_url);
// Converts the raw hash bytes in |hash_in| to an SBFullHash by copying
// crypto::kSHA256Length bytes into its |full_hash| buffer.
// |hash_in| must be exactly crypto::kSHA256Length bytes long. This is only
// asserted with DCHECK_EQ, and the memcpy() always reads kSHA256Length bytes,
// so a shorter string would be read past its end.
// NOTE(review): the declaration and return of |hash_out| are not visible in
// this view -- confirm.
518 SBFullHash StringToSBFullHash(const std::string& hash_in) {
519 DCHECK_EQ(crypto::kSHA256Length, hash_in.size());
521 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length);
// Returns the raw bytes of |hash.full_hash| as a std::string of
// sizeof(hash.full_hash) bytes. The bytes are copied verbatim (no hex
// encoding); the pointer-plus-length constructor is used, so embedded NUL
// bytes are preserved. The DCHECK asserts that the buffer is
// crypto::kSHA256Length bytes.
525 std::string SBFullHashToString(const SBFullHash& hash) {
526 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash));
527 return std::string(hash.full_hash, sizeof(hash.full_hash));
530 } // namespace safe_browsing_util