src/chrome/browser/safe_browsing/safe_browsing_store_file.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/browser/safe_browsing/safe_browsing_store_file.h"
   6
   7 #include "base/file_util.h"
   8 #include "base/files/scoped_file.h"
   9 #include "base/md5.h"
  10 #include "base/metrics/histogram.h"
  11 #include "base/metrics/sparse_histogram.h"
  12
  13 namespace {
  14
  15 // NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
  16 // that byte-order changes force corruption.
  17 const int32 kFileMagic = 0x600D71FE;
  18
  19 // Version history:
  20 // Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
  21 // Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
  22 // Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
  23 const int32 kFileVersion = 8;
  24
  25 // ReadAndVerifyHeader() returns this in case of error.
  26 const int32 kInvalidVersion = -1;
  27
  28 // Starting with version 8, the storage is sorted and can be sharded to allow
  29 // updates to be done with lower memory requirements.  Newly written files will
  30 // be sharded to need less than this amount of memory during update.  Larger
  31 // values are preferred to minimize looping overhead during processing.
  32 const int64 kUpdateStorageBytes = 100 * 1024;
  33
  34 // Prevent excessive sharding by setting a lower limit on the shard stride.
  35 // Smaller values should work fine, but very small values will probably lead to
  36 // poor performance.  Shard stride is indirectly related to
  37 // |kUpdateStorageBytes|, setting that very small will bump against this.
  38 const uint32 kMinShardStride = 1 << 24;
  39
  40 // Strides over the entire SBPrefix space.
  41 const uint64 kMaxShardStride = 1ULL << 32;
  42
  43 // Maximum SBPrefix value.
  44 const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;
  45
  46 // Header at the front of the main database file.
  47 struct FileHeader {
  48   int32 magic, version;
  49   uint32 add_chunk_count, sub_chunk_count;
  50   uint32 shard_stride;
  51   // TODO(shess): Is this where 64-bit will bite me?  Perhaps write a
  52   // specialized read/write?
  53 };
  54
  55 // Header for each chunk in the chunk-accumulation file.
  56 struct ChunkHeader {
  57   uint32 add_prefix_count, sub_prefix_count;
  58   uint32 add_hash_count, sub_hash_count;
  59 };
  60
  61 // Header for each shard of data in the main database file.
  62 struct ShardHeader {
  63   uint32 add_prefix_count, sub_prefix_count;
  64   uint32 add_hash_count, sub_hash_count;
  65 };
  66
  67 // Enumerate different format-change events for histogramming
  68 // purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
  69 enum FormatEventType {
  70   // Corruption detected, broken down by file format.
  71   FORMAT_EVENT_FILE_CORRUPT,
  72   FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete
  73
  74   // The type of format found in the file.  The expected case (new
  75   // file format) is intentionally not covered.
  76   FORMAT_EVENT_FOUND_SQLITE,  // Obsolete
  77   FORMAT_EVENT_FOUND_UNKNOWN,  // magic does not match.
  78
  79   // The number of SQLite-format files deleted should be the same as
  80   // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
  81   // or if a failure prevents the update from succeeding.
  82   FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  83   FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete
  84
  85   // Found and deleted (or failed to delete) the ancient "Safe
  86   // Browsing" file.
  87   FORMAT_EVENT_DELETED_ORIGINAL,  // Obsolete
  88   FORMAT_EVENT_DELETED_ORIGINAL_FAILED,  // Obsolete
  89
  90   // The checksum did not check out in CheckValidity() or in
  91   // FinishUpdate().  This most likely indicates that the machine
  92   // crashed before the file was fully sync'ed to disk.
  93   FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  94   FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,
  95
  96   // The header checksum was incorrect in ReadAndVerifyHeader().  Likely
  97   // indicates that the system crashed while writing an update.
  98   FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,
  99
 100   FORMAT_EVENT_FOUND_DEPRECATED,  // version too old.
 101
 102   // Memory space for histograms is determined by the max.  ALWAYS
 103   // ADD NEW VALUES BEFORE THIS ONE.
 104   FORMAT_EVENT_MAX
 105 };
 106
 107 void RecordFormatEvent(FormatEventType event_type) {
 108   UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
 109 }
 110
 111 // Rewind the file.  Using fseek(2) because rewind(3) errors are
 112 // weird.
 113 bool FileRewind(FILE* fp) {
 114   int rv = fseek(fp, 0, SEEK_SET);
 115   DCHECK_EQ(rv, 0);
 116   return rv == 0;
 117 }
 118
 119 // Read from |fp| into |item|, and fold the input data into the
 120 // checksum in |context|, if non-NULL.  Return true on success.
 121 template <class T>
 122 bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
 123   const size_t ret = fread(item, sizeof(T), 1, fp);
 124   if (ret != 1)
 125     return false;
 126
 127   if (context) {
 128     base::MD5Update(context,
 129                     base::StringPiece(reinterpret_cast<char*>(item),
 130                                       sizeof(T)));
 131   }
 132   return true;
 133 }
 134
 135 // Write |item| to |fp|, and fold the output data into the checksum in
 136 // |context|, if non-NULL.  Return true on success.
 137 template <class T>
 138 bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
 139   const size_t ret = fwrite(&item, sizeof(T), 1, fp);
 140   if (ret != 1)
 141     return false;
 142
 143   if (context) {
 144     base::MD5Update(context,
 145                     base::StringPiece(reinterpret_cast<const char*>(&item),
 146                                       sizeof(T)));
 147   }
 148
 149   return true;
 150 }
 151
 152 // Read |count| items into |values| from |fp|, and fold them into the
 153 // checksum in |context|.  Returns true on success.
 154 template <typename CT>
 155 bool ReadToContainer(CT* values, size_t count, FILE* fp,
 156                      base::MD5Context* context) {
 157   if (!count)
 158     return true;
 159
 160   for (size_t i = 0; i < count; ++i) {
 161     typename CT::value_type value;
 162     if (!ReadItem(&value, fp, context))
 163       return false;
 164
 165     // push_back() is more obvious, but coded this way std::set can
 166     // also be read.
 167     values->insert(values->end(), value);
 168   }
 169
 170   return true;
 171 }
 172
 173 // Write values between |beg| and |end| to |fp|, and fold the data into the
 174 // checksum in |context|, if non-NULL.  Returns true if all items successful.
 175 template <typename CTI>
 176 bool WriteRange(const CTI& beg, const CTI& end,
 177                 FILE* fp, base::MD5Context* context) {
 178   for (CTI iter = beg; iter != end; ++iter) {
 179     if (!WriteItem(*iter, fp, context))
 180       return false;
 181   }
 182   return true;
 183 }
 184
 185 // Write all of |values| to |fp|, and fold the data into the checksum
 186 // in |context|, if non-NULL.  Returns true if all items successful.
 187 template <typename CT>
 188 bool WriteContainer(const CT& values, FILE* fp,
 189                     base::MD5Context* context) {
 190   return WriteRange(values.begin(), values.end(), fp, context);
 191 }
 192
 193 // Delete the chunks in |deleted| from |chunks|.
 194 void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
 195                          std::set<int32>* chunks) {
 196   for (std::set<int32>::iterator iter = chunks->begin();
 197        iter != chunks->end();) {
 198     std::set<int32>::iterator prev = iter++;
 199     if (deleted.count(*prev) > 0)
 200       chunks->erase(prev);
 201   }
 202 }
 203
 204 bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
 205   base::MD5Digest calculated_digest;
 206   base::MD5IntermediateFinal(&calculated_digest, context);
 207
 208   base::MD5Digest file_digest;
 209   if (!ReadItem(&file_digest, fp, context))
 210     return false;
 211
 212   return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
 213 }
 214
 215 // Helper function to read the file header and chunk TOC.  Rewinds |fp| and
 216 // initializes |context|.  The header is left in |header|, with the version
 217 // returned.  kInvalidVersion is returned for sanity check or checksum failure.
 218 int ReadAndVerifyHeader(const base::FilePath& filename,
 219                         FileHeader* header,
 220                         std::set<int32>* add_chunks,
 221                         std::set<int32>* sub_chunks,
 222                         FILE* fp,
 223                         base::MD5Context* context) {
 224   DCHECK(header);
 225   DCHECK(add_chunks);
 226   DCHECK(sub_chunks);
 227   DCHECK(fp);
 228   DCHECK(context);
 229
 230   base::MD5Init(context);
 231   if (!FileRewind(fp))
 232     return kInvalidVersion;
 233   if (!ReadItem(header, fp, context))
 234     return kInvalidVersion;
 235   if (header->magic != kFileMagic)
 236     return kInvalidVersion;
 237
 238   // Track version read to inform removal of support for older versions.
 239   UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);
 240
 241   if (header->version != kFileVersion)
 242     return kInvalidVersion;
 243
 244   if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
 245       !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
 246     return kInvalidVersion;
 247   }
 248
 249   // Verify that the data read thus far is valid.
 250   if (!ReadAndVerifyChecksum(fp, context)) {
 251     RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
 252     return kInvalidVersion;
 253   }
 254
 255   return kFileVersion;
 256 }
 257
 258 // Helper function to write out the initial header and chunks-contained data.
 259 // Rewinds |fp|, initializes |context|, then writes a file header and
 260 // |add_chunks| and |sub_chunks|.
 261 bool WriteHeader(uint32 out_stride,
 262                  const std::set<int32>& add_chunks,
 263                  const std::set<int32>& sub_chunks,
 264                  FILE* fp,
 265                  base::MD5Context* context) {
 266   if (!FileRewind(fp))
 267     return false;
 268
 269   base::MD5Init(context);
 270   FileHeader header;
 271   header.magic = kFileMagic;
 272   header.version = kFileVersion;
 273   header.add_chunk_count = add_chunks.size();
 274   header.sub_chunk_count = sub_chunks.size();
 275   header.shard_stride = out_stride;
 276   if (!WriteItem(header, fp, context))
 277     return false;
 278
 279   if (!WriteContainer(add_chunks, fp, context) ||
 280       !WriteContainer(sub_chunks, fp, context))
 281     return false;
 282
 283   // Write out the header digest.
 284   base::MD5Digest header_digest;
 285   base::MD5IntermediateFinal(&header_digest, context);
 286   if (!WriteItem(header_digest, fp, context))
 287     return false;
 288
 289   return true;
 290 }
 291
 292 // Return |true| if the range is sorted by the given comparator.
 293 template <typename CTI, typename LESS>
 294 bool sorted(CTI beg, CTI end, LESS less) {
 295   while ((end - beg) > 2) {
 296     CTI n = beg++;
 297     DCHECK(!less(*beg, *n));
 298     if (less(*beg, *n))
 299       return false;
 300   }
 301   return true;
 302 }
 303
 304 // Merge |beg|..|end| into |container|.  Both should be sorted by the given
 305 // comparator, and the range iterators should not be derived from |container|.
 306 // Differs from std::inplace_merge() in that additional memory is not required
 307 // for linear performance.
 308 template <typename CT, typename CTI, typename COMP>
 309 void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
 310   DCHECK(sorted(container->begin(), container->end(), less));
 311   DCHECK(sorted(beg, end, less));
 312
 313   // Size the container to fit the results.
 314   const size_t c_size = container->size();
 315   container->resize(c_size + (end - beg));
 316
 317   // |c_end| points to the original endpoint, while |c_out| points to the
 318   // endpoint that will scan from end to beginning while merging.
 319   typename CT::iterator c_end = container->begin() + c_size;
 320   typename CT::iterator c_out = container->end();
 321
 322   // While both inputs have data, move the greater to |c_out|.
 323   while (c_end != container->begin() && end != beg) {
 324     if (less(*(c_end - 1), *(end - 1))) {
 325       *(--c_out) = *(--end);
 326     } else {
 327       *(--c_out) = *(--c_end);
 328     }
 329   }
 330
 331   // Copy any data remaining in the new range.
 332   if (end != beg) {
 333     // The original container data has been fully shifted.
 334     DCHECK(c_end == container->begin());
 335
 336     // There is exactly the correct amount of space left.
 337     DCHECK_EQ(c_out - c_end, end - beg);
 338
 339     std::copy(beg, end, container->begin());
 340   }
 341
 342   DCHECK(sorted(container->begin(), container->end(), less));
 343 }
 344
 345 // Collection of iterators used while stepping through StateInternal (see
 346 // below).
 347 class StateInternalPos {
 348  public:
 349   StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
 350                    SBSubPrefixes::iterator sub_prefixes_iter,
 351                    std::vector<SBAddFullHash>::iterator add_hashes_iter,
 352                    std::vector<SBSubFullHash>::iterator sub_hashes_iter)
 353       : add_prefixes_iter_(add_prefixes_iter),
 354         sub_prefixes_iter_(sub_prefixes_iter),
 355         add_hashes_iter_(add_hashes_iter),
 356         sub_hashes_iter_(sub_hashes_iter) {
 357   }
 358
 359   SBAddPrefixes::iterator add_prefixes_iter_;
 360   SBSubPrefixes::iterator sub_prefixes_iter_;
 361   std::vector<SBAddFullHash>::iterator add_hashes_iter_;
 362   std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
 363 };
 364
 365 // Helper to find the next shard boundary.
 366 template <class T>
 367 bool prefix_bounder(SBPrefix val, const T& elt) {
 368   return val < elt.GetAddPrefix();
 369 }
 370
 371 // Container for partial database state.  Includes add/sub prefixes/hashes, plus
 372 // aggregate operations on same.
 373 class StateInternal {
 374  public:
 375   // Append indicated amount of data from |fp|.
 376   bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
 377                   size_t add_hash_count, size_t sub_hash_count,
 378                   FILE* fp, base::MD5Context* context) {
 379     return
 380         ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
 381         ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
 382         ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
 383         ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
 384   }
 385
 386   void ClearData() {
 387     add_prefixes_.clear();
 388     sub_prefixes_.clear();
 389     add_full_hashes_.clear();
 390     sub_full_hashes_.clear();
 391   }
 392
 393   // Merge data from |beg|..|end| into receiver's state, then process the state.
 394   // The current state and the range given should corrospond to the same sorted
 395   // shard of data from different sources.  |add_del_cache| and |sub_del_cache|
 396   // indicate the chunk ids which should be deleted during processing (see
 397   // SBProcessSubs).
 398   void MergeDataAndProcess(const StateInternalPos& beg,
 399                            const StateInternalPos& end,
 400                            const base::hash_set<int32>& add_del_cache,
 401                            const base::hash_set<int32>& sub_del_cache) {
 402     container_merge(&add_prefixes_,
 403                     beg.add_prefixes_iter_,
 404                     end.add_prefixes_iter_,
 405                     SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
 406
 407     container_merge(&sub_prefixes_,
 408                     beg.sub_prefixes_iter_,
 409                     end.sub_prefixes_iter_,
 410                     SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
 411
 412     container_merge(&add_full_hashes_,
 413                     beg.add_hashes_iter_,
 414                     end.add_hashes_iter_,
 415                     SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
 416
 417     container_merge(&sub_full_hashes_,
 418                     beg.sub_hashes_iter_,
 419                     end.sub_hashes_iter_,
 420                     SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
 421
 422     SBProcessSubs(&add_prefixes_, &sub_prefixes_,
 423                   &add_full_hashes_, &sub_full_hashes_,
 424                   add_del_cache, sub_del_cache);
 425   }
 426
 427   // Sort the data appropriately for the sharding, merging, and processing
 428   // operations.
 429   void SortData() {
 430     std::sort(add_prefixes_.begin(), add_prefixes_.end(),
 431               SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
 432     std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
 433               SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
 434     std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
 435               SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
 436     std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
 437               SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
 438   }
 439
 440   // Iterator from the beginning of the state's data.
 441   StateInternalPos StateBegin() {
 442     return StateInternalPos(add_prefixes_.begin(),
 443                             sub_prefixes_.begin(),
 444                             add_full_hashes_.begin(),
 445                             sub_full_hashes_.begin());
 446   }
 447
 448   // An iterator pointing just after the last possible element of the shard
 449   // indicated by |shard_max|.  Used to step through the state by shard.
 450   // TODO(shess): Verify whether binary search really improves over linear.
 451   // Merging or writing will immediately touch all of these elements.
 452   StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
 453     return StateInternalPos(
 454         std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
 455                          shard_max, prefix_bounder<SBAddPrefix>),
 456         std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
 457                          shard_max, prefix_bounder<SBSubPrefix>),
 458         std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
 459                          shard_max, prefix_bounder<SBAddFullHash>),
 460         std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
 461                          shard_max, prefix_bounder<SBSubFullHash>));
 462   }
 463
 464   // Write a shard header and data for the shard starting at |beg| and ending at
 465   // the element before |end|.
 466   bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
 467                   FILE* fp, base::MD5Context* context) {
 468     ShardHeader shard_header;
 469     shard_header.add_prefix_count =
 470         end.add_prefixes_iter_ - beg.add_prefixes_iter_;
 471     shard_header.sub_prefix_count =
 472         end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
 473     shard_header.add_hash_count =
 474         end.add_hashes_iter_ - beg.add_hashes_iter_;
 475     shard_header.sub_hash_count =
 476         end.sub_hashes_iter_ - beg.sub_hashes_iter_;
 477
 478     return
 479         WriteItem(shard_header, fp, context) &&
 480         WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
 481                    fp, context) &&
 482         WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
 483                    fp, context) &&
 484         WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
 485                    fp, context) &&
 486         WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
 487                    fp, context);
 488   }
 489
 490   SBAddPrefixes add_prefixes_;
 491   SBSubPrefixes sub_prefixes_;
 492   std::vector<SBAddFullHash> add_full_hashes_;
 493   std::vector<SBSubFullHash> sub_full_hashes_;
 494 };
 495
 496 // True if |val| is an even power of two.
 497 template <typename T>
 498 bool IsPowerOfTwo(const T& val) {
 499   return val && (val & (val - 1)) == 0;
 500 }
 501
 502 // Helper to read the entire database state, used by GetAddPrefixes() and
 503 // GetAddFullHashes().  Those functions are generally used only for smaller
 504 // files.  Returns false in case of errors reading the data.
 505 bool ReadDbStateHelper(const base::FilePath& filename,
 506                        StateInternal* db_state) {
 507   base::ScopedFILE file(base::OpenFile(filename, "rb"));
 508   if (file.get() == NULL)
 509     return false;
 510
 511   std::set<int32> add_chunks;
 512   std::set<int32> sub_chunks;
 513
 514   base::MD5Context context;
 515   FileHeader header;
 516   const int version =
 517       ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
 518                           file.get(), &context);
 519   if (version == kInvalidVersion)
 520     return false;
 521
 522   uint64 in_min = 0;
 523   uint64 in_stride = header.shard_stride;
 524   if (!in_stride)
 525     in_stride = kMaxShardStride;
 526   if (!IsPowerOfTwo(in_stride))
 527     return false;
 528
 529   do {
 530     ShardHeader shard_header;
 531     if (!ReadItem(&shard_header, file.get(), &context))
 532       return false;
 533
 534     if (!db_state->AppendData(shard_header.add_prefix_count,
 535                               shard_header.sub_prefix_count,
 536                               shard_header.add_hash_count,
 537                               shard_header.sub_hash_count,
 538                               file.get(), &context)) {
 539       return false;
 540     }
 541
 542     in_min += in_stride;
 543   } while (in_min <= kMaxSBPrefix);
 544
 545   if (!ReadAndVerifyChecksum(file.get(), &context))
 546     return false;
 547
 548   int64 size = 0;
 549   if (!base::GetFileSize(filename, &size))
 550     return false;
 551
 552   return static_cast<int64>(ftell(file.get())) == size;
 553 }
 554
 555 }  // namespace
 556
 557 SafeBrowsingStoreFile::SafeBrowsingStoreFile()
 558     : chunks_written_(0), empty_(false), corruption_seen_(false) {}
 559
 560 SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
 561   Close();
 562 }
 563
 564 bool SafeBrowsingStoreFile::Delete() {
 565   // The database should not be open at this point.  But, just in
 566   // case, close everything before deleting.
 567   if (!Close()) {
 568     NOTREACHED();
 569     return false;
 570   }
 571
 572   return DeleteStore(filename_);
 573 }
 574
 575 bool SafeBrowsingStoreFile::CheckValidity() {
 576   // The file was either empty or never opened.  The empty case is
 577   // presumed not to be invalid.  The never-opened case can happen if
 578   // BeginUpdate() fails for any databases, and should already have
 579   // caused the corruption callback to fire.
 580   if (!file_.get())
 581     return true;
 582
 583   if (!FileRewind(file_.get()))
 584     return OnCorruptDatabase();
 585
 586   int64 size = 0;
 587   if (!base::GetFileSize(filename_, &size))
 588     return OnCorruptDatabase();
 589
 590   base::MD5Context context;
 591   base::MD5Init(&context);
 592
 593   // Read everything except the final digest.
 594   size_t bytes_left = static_cast<size_t>(size);
 595   CHECK(size == static_cast<int64>(bytes_left));
 596   if (bytes_left < sizeof(base::MD5Digest))
 597     return OnCorruptDatabase();
 598   bytes_left -= sizeof(base::MD5Digest);
 599
 600   // Fold the contents of the file into the checksum.
 601   while (bytes_left > 0) {
 602     char buf[4096];
 603     const size_t c = std::min(sizeof(buf), bytes_left);
 604     const size_t ret = fread(buf, 1, c, file_.get());
 605
 606     // The file's size changed while reading, give up.
 607     if (ret != c)
 608       return OnCorruptDatabase();
 609     base::MD5Update(&context, base::StringPiece(buf, c));
 610     bytes_left -= c;
 611   }
 612
 613   if (!ReadAndVerifyChecksum(file_.get(), &context)) {
 614     RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
 615     return OnCorruptDatabase();
 616   }
 617
 618   return true;
 619 }
 620
 621 void SafeBrowsingStoreFile::Init(
 622     const base::FilePath& filename,
 623     const base::Closure& corruption_callback
 624 ) {
 625   filename_ = filename;
 626   corruption_callback_ = corruption_callback;
 627 }
 628
 629 bool SafeBrowsingStoreFile::BeginChunk() {
 630   return ClearChunkBuffers();
 631 }
 632
 633 bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
 634   add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
 635   return true;
 636 }
 637
 638 bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
 639   add_prefixes->clear();
 640   if (!base::PathExists(filename_))
 641     return true;
 642
 643   StateInternal db_state;
 644   if (!ReadDbStateHelper(filename_, &db_state))
 645     return OnCorruptDatabase();
 646
 647   add_prefixes->swap(db_state.add_prefixes_);
 648   return true;
 649 }
 650
 651 bool SafeBrowsingStoreFile::GetAddFullHashes(
 652     std::vector<SBAddFullHash>* add_full_hashes) {
 653   add_full_hashes->clear();
 654   if (!base::PathExists(filename_))
 655     return true;
 656
 657   StateInternal db_state;
 658   if (!ReadDbStateHelper(filename_, &db_state))
 659     return OnCorruptDatabase();
 660
 661   add_full_hashes->swap(db_state.add_full_hashes_);
 662   return true;
 663 }
 664
 665 bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
 666                                          const SBFullHash& full_hash) {
 667   add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
 668   return true;
 669 }
 670
 671 bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
 672                                            int32 add_chunk_id,
 673                                            SBPrefix prefix) {
 674   sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
 675   return true;
 676 }
 677
 678 bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
 679                                          const SBFullHash& full_hash) {
 680   sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
 681   return true;
 682 }
 683
 684 bool SafeBrowsingStoreFile::OnCorruptDatabase() {
 685   if (!corruption_seen_)
 686     RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
 687   corruption_seen_ = true;
 688
 689   corruption_callback_.Run();
 690
 691   // Return false as a convenience to callers.
 692   return false;
 693 }
 694
 695 bool SafeBrowsingStoreFile::Close() {
 696   ClearUpdateBuffers();
 697
 698   // Make sure the files are closed.
 699   file_.reset();
 700   new_file_.reset();
 701   return true;
 702 }
 703
 704 bool SafeBrowsingStoreFile::BeginUpdate() {
 705   DCHECK(!file_.get() && !new_file_.get());
 706
 707   // Structures should all be clear unless something bad happened.
 708   DCHECK(add_chunks_cache_.empty());
 709   DCHECK(sub_chunks_cache_.empty());
 710   DCHECK(add_del_cache_.empty());
 711   DCHECK(sub_del_cache_.empty());
 712   DCHECK(add_prefixes_.empty());
 713   DCHECK(sub_prefixes_.empty());
 714   DCHECK(add_hashes_.empty());
 715   DCHECK(sub_hashes_.empty());
 716   DCHECK_EQ(chunks_written_, 0);
 717
 718   corruption_seen_ = false;
 719
 720   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
 721   base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
 722   if (new_file.get() == NULL)
 723     return false;
 724
 725   base::ScopedFILE file(base::OpenFile(filename_, "rb"));
 726   empty_ = (file.get() == NULL);
 727   if (empty_) {
 728     // If the file exists but cannot be opened, try to delete it (not
 729     // deleting directly, the bloom filter needs to be deleted, too).
 730     if (base::PathExists(filename_))
 731       return OnCorruptDatabase();
 732
 733     new_file_.swap(new_file);
 734     return true;
 735   }
 736
 737   base::MD5Context context;
 738   FileHeader header;
 739   const int version =
 740       ReadAndVerifyHeader(filename_, &header,
 741                           &add_chunks_cache_, &sub_chunks_cache_,
 742                           file.get(), &context);
 743   if (version == kInvalidVersion) {
 744     FileHeader retry_header;
 745     if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
 746       if (retry_header.magic == kFileMagic &&
 747           retry_header.version < kFileVersion) {
 748         RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
 749       } else {
 750         RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
 751       }
 752     }
 753
 754     // Close the file so that it can be deleted.
 755     file.reset();
 756
 757     return OnCorruptDatabase();
 758   }
 759
 760   file_.swap(file);
 761   new_file_.swap(new_file);
 762   return true;
 763 }
 764
 765 bool SafeBrowsingStoreFile::FinishChunk() {
 766   if (!add_prefixes_.size() && !sub_prefixes_.size() &&
 767       !add_hashes_.size() && !sub_hashes_.size())
 768     return true;
 769
 770   ChunkHeader header;
 771   header.add_prefix_count = add_prefixes_.size();
 772   header.sub_prefix_count = sub_prefixes_.size();
 773   header.add_hash_count = add_hashes_.size();
 774   header.sub_hash_count = sub_hashes_.size();
 775   if (!WriteItem(header, new_file_.get(), NULL))
 776     return false;
 777
 778   if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
 779       !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
 780       !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
 781       !WriteContainer(sub_hashes_, new_file_.get(), NULL))
 782     return false;
 783
 784   ++chunks_written_;
 785
 786   // Clear everything to save memory.
 787   return ClearChunkBuffers();
 788 }
 789
 790 bool SafeBrowsingStoreFile::DoUpdate(
 791     safe_browsing::PrefixSetBuilder* builder,
 792     std::vector<SBAddFullHash>* add_full_hashes_result) {
 793   DCHECK(file_.get() || empty_);
 794   DCHECK(new_file_.get());
 795   CHECK(builder);
 796   CHECK(add_full_hashes_result);
 797
 798   // Rewind the temporary storage.
 799   if (!FileRewind(new_file_.get()))
 800     return false;
 801
 802   // Get chunk file's size for validating counts.
 803   int64 update_size = 0;
 804   if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
 805     return OnCorruptDatabase();
 806
 807   // Track update size to answer questions at http://crbug.com/72216 .
 808   // Log small updates as 1k so that the 0 (underflow) bucket can be
 809   // used for "empty" in SafeBrowsingDatabase.
 810   UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
 811                        std::max(static_cast<int>(update_size / 1024), 1));
 812
 813   // Chunk updates to integrate.
 814   StateInternal new_state;
 815
 816   // Read update chunks.
 817   for (int i = 0; i < chunks_written_; ++i) {
 818     ChunkHeader header;
 819
 820     int64 ofs = ftell(new_file_.get());
 821     if (ofs == -1)
 822       return false;
 823
 824     if (!ReadItem(&header, new_file_.get(), NULL))
 825       return false;
 826
 827     // As a safety measure, make sure that the header describes a sane
 828     // chunk, given the remaining file size.
 829     int64 expected_size = ofs + sizeof(ChunkHeader);
 830     expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
 831     expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
 832     expected_size += header.add_hash_count * sizeof(SBAddFullHash);
 833     expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
 834     if (expected_size > update_size)
 835       return false;
 836
 837     if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
 838                               header.add_hash_count, header.sub_hash_count,
 839                               new_file_.get(), NULL)) {
 840       return false;
 841     }
 842   }
 843
 844   // The state was accumulated by chunk, sort by prefix.
 845   new_state.SortData();
 846
 847   // These strides control how much data is loaded into memory per pass.
 848   // Strides must be an even power of two.  |in_stride| will be derived from the
 849   // input file.  |out_stride| will be derived from an estimate of the resulting
 850   // file's size.  |process_stride| will be the max of both.
 851   uint64 in_stride = kMaxShardStride;
 852   uint64 out_stride = kMaxShardStride;
 853   uint64 process_stride = 0;
 854
 855   // Used to verify the input's checksum if |!empty_|.
 856   base::MD5Context in_context;
 857
 858   if (!empty_) {
 859     DCHECK(file_.get());
 860
 861     FileHeader header = {0};
 862     int version = ReadAndVerifyHeader(filename_, &header,
 863                                       &add_chunks_cache_, &sub_chunks_cache_,
 864                                       file_.get(), &in_context);
 865     if (version == kInvalidVersion)
 866       return OnCorruptDatabase();
 867
 868     if (header.shard_stride)
 869       in_stride = header.shard_stride;
 870
 871     // The header checksum should have prevented this case, but the code will be
 872     // broken if this is not correct.
 873     if (!IsPowerOfTwo(in_stride))
 874       return OnCorruptDatabase();
 875   }
 876
 877   // We no longer need to track deleted chunks.
 878   DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
 879   DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);
 880
 881   // Calculate |out_stride| to break the file down into reasonable shards.
 882   {
 883     int64 original_size = 0;
 884     if (!empty_ && !base::GetFileSize(filename_, &original_size))
 885       return OnCorruptDatabase();
 886
 887     // Approximate the final size as everything.  Subs and deletes will reduce
 888     // the size, but modest over-sharding won't hurt much.
 889     int64 shard_size = original_size + update_size;
 890
 891     // Keep splitting until a single stride of data fits the target.
 892     size_t shifts = 0;
 893     while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
 894       out_stride >>= 1;
 895       shard_size >>= 1;
 896       ++shifts;
 897     }
 898     UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);
 899
 900     DCHECK(IsPowerOfTwo(out_stride));
 901   }
 902
 903   // Outer loop strides by the max of the input stride (to read integral shards)
 904   // and the output stride (to write integral shards).
 905   process_stride = std::max(in_stride, out_stride);
 906   DCHECK(IsPowerOfTwo(process_stride));
 907   DCHECK_EQ(0u, process_stride % in_stride);
 908   DCHECK_EQ(0u, process_stride % out_stride);
 909
 910   // Start writing the new data to |new_file_|.
 911   base::MD5Context out_context;
 912   if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
 913                    new_file_.get(), &out_context)) {
 914     return false;
 915   }
 916
 917   // Start at the beginning of the SBPrefix space.
 918   uint64 in_min = 0;
 919   uint64 out_min = 0;
 920   uint64 process_min = 0;
 921
 922   // Start at the beginning of the updates.
 923   StateInternalPos new_pos = new_state.StateBegin();
 924
 925   // Re-usable container for shard processing.
 926   StateInternal db_state;
 927
 928   // Track aggregate counts for histograms.
 929   size_t add_prefix_count = 0;
 930   size_t sub_prefix_count = 0;
 931
 932   do {
 933     // Maximum element in the current shard.
 934     SBPrefix process_max =
 935         static_cast<SBPrefix>(process_min + process_stride - 1);
 936     DCHECK_GT(process_max, process_min);
 937
 938     // Drop the data from previous pass.
 939     db_state.ClearData();
 940
 941     // Fill the processing shard with one or more input shards.
 942     if (!empty_) {
 943       do {
 944         ShardHeader shard_header;
 945         if (!ReadItem(&shard_header, file_.get(), &in_context))
 946           return OnCorruptDatabase();
 947
 948         if (!db_state.AppendData(shard_header.add_prefix_count,
 949                                  shard_header.sub_prefix_count,
 950                                  shard_header.add_hash_count,
 951                                  shard_header.sub_hash_count,
 952                                  file_.get(), &in_context))
 953           return OnCorruptDatabase();
 954
 955         in_min += in_stride;
 956       } while (in_min <= kMaxSBPrefix && in_min < process_max);
 957     }
 958
 959     // Shard the update data to match the database data, then merge the update
 960     // data and process the results.
 961     {
 962       StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
 963       db_state.MergeDataAndProcess(new_pos, new_end,
 964                                    add_del_cache_, sub_del_cache_);
 965       new_pos = new_end;
 966     }
 967
 968     // Collect the processed data for return to caller.
 969     for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
 970       builder->AddPrefix(db_state.add_prefixes_[i].prefix);
 971     }
 972     add_full_hashes_result->insert(add_full_hashes_result->end(),
 973                                    db_state.add_full_hashes_.begin(),
 974                                    db_state.add_full_hashes_.end());
 975     add_prefix_count += db_state.add_prefixes_.size();
 976     sub_prefix_count += db_state.sub_prefixes_.size();
 977
 978     // Write one or more shards of processed output.
 979     StateInternalPos out_pos = db_state.StateBegin();
 980     do {
 981       SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
 982       DCHECK_GT(out_max, out_min);
 983
 984       StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
 985       if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
 986         return false;
 987       out_pos = out_end;
 988
 989       out_min += out_stride;
 990     } while (out_min == static_cast<SBPrefix>(out_min) &&
 991              out_min < process_max);
 992
 993     process_min += process_stride;
 994   } while (process_min <= kMaxSBPrefix);
 995
 996   // Verify the overall checksum.
 997   if (!empty_) {
 998     if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
 999       RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
1000       return OnCorruptDatabase();
1001     }
1002
1003     // TODO(shess): Verify EOF?
1004
1005     // Close the input file so the new file can be renamed over it.
1006     file_.reset();
1007   }
1008   DCHECK(!file_.get());
1009
1010   // Write the overall checksum.
1011   base::MD5Digest out_digest;
1012   base::MD5Final(&out_digest, &out_context);
1013   if (!WriteItem(out_digest, new_file_.get(), NULL))
1014     return false;
1015
1016   // Trim any excess left over from the temporary chunk data.
1017   if (!base::TruncateFile(new_file_.get()))
1018     return false;
1019
1020   // Close the file handle and swizzle the file into place.
1021   new_file_.reset();
1022   if (!base::DeleteFile(filename_, false) &&
1023       base::PathExists(filename_))
1024     return false;
1025
1026   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1027   if (!base::Move(new_filename, filename_))
1028     return false;
1029
1030   // Record counts before swapping to caller.
1031   UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
1032   UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);
1033
1034   return true;
1035 }
1036
1037 bool SafeBrowsingStoreFile::FinishUpdate(
1038     safe_browsing::PrefixSetBuilder* builder,
1039     std::vector<SBAddFullHash>* add_full_hashes_result) {
1040   DCHECK(builder);
1041   DCHECK(add_full_hashes_result);
1042
1043   if (!DoUpdate(builder, add_full_hashes_result)) {
1044     CancelUpdate();
1045     return false;
1046   }
1047
1048   DCHECK(!new_file_.get());
1049   DCHECK(!file_.get());
1050
1051   return Close();
1052 }
1053
1054 bool SafeBrowsingStoreFile::CancelUpdate() {
1055   bool ret = Close();
1056
1057   // Delete stale staging file.
1058   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1059   base::DeleteFile(new_filename, false);
1060
1061   return ret;
1062 }
1063
1064 void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
1065   add_chunks_cache_.insert(chunk_id);
1066 }
1067
1068 bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
1069   return add_chunks_cache_.count(chunk_id) > 0;
1070 }
1071
1072 void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
1073   out->clear();
1074   out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
1075 }
1076
1077 void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
1078   sub_chunks_cache_.insert(chunk_id);
1079 }
1080
1081 bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
1082   return sub_chunks_cache_.count(chunk_id) > 0;
1083 }
1084
1085 void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
1086   out->clear();
1087   out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
1088 }
1089
1090 void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
1091   add_del_cache_.insert(chunk_id);
1092 }
1093
1094 void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
1095   sub_del_cache_.insert(chunk_id);
1096 }
1097
1098 // static
1099 bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
1100   if (!base::DeleteFile(basename, false) &&
1101       base::PathExists(basename)) {
1102     NOTREACHED();
1103     return false;
1104   }
1105
1106   const base::FilePath new_filename = TemporaryFileForFilename(basename);
1107   if (!base::DeleteFile(new_filename, false) &&
1108       base::PathExists(new_filename)) {
1109     NOTREACHED();
1110     return false;
1111   }
1112
1113   // With SQLite support gone, one way to get to this code is if the
1114   // existing file is a SQLite file.  Make sure the journal file is
1115   // also removed.
1116   const base::FilePath journal_filename(
1117       basename.value() + FILE_PATH_LITERAL("-journal"));
1118   if (base::PathExists(journal_filename))
1119     base::DeleteFile(journal_filename, false);
1120
1121   return true;
1122 }