#include <algorithm>
#include "base/logging.h"
+#include "base/metrics/histogram.h"
namespace {
return true;
}
-// Find items matching between |subs| and |adds|, and remove them,
-// recording the item from |adds| in |adds_removed|. To minimize
-// copies, the inputs are processing in parallel, so |subs| and |adds|
-// should be compatibly ordered (either by SBAddPrefixLess or
-// SBAddPrefixHashLess).
+// Find items matching between |subs| and |adds|, and remove them. To minimize
+// copies, the inputs are processing in parallel, so |subs| and |adds| should be
+// compatibly ordered (either by SBAddPrefixLess or SBAddPrefixHashLess).
//
-// |predAddSub| provides add < sub, |predSubAdd| provides sub < add,
-// for the tightest compare appropriate (see calls in SBProcessSubs).
+// |predAddSub| provides add < sub, |predSubAdd| provides sub < add, for the
+// tightest compare appropriate (see calls in SBProcessSubs).
template <typename SubsT, typename AddsT,
typename PredAddSubT, typename PredSubAddT>
void KnockoutSubs(SubsT* subs, AddsT* adds,
- PredAddSubT predAddSub, PredSubAddT predSubAdd,
- AddsT* adds_removed) {
+ PredAddSubT predAddSub, PredSubAddT predSubAdd) {
// Keep a pair of output iterators for writing kept items. Due to
// deletions, these may lag the main iterators. Using erase() on
// individual items would result in O(N^2) copies. Using std::list
++add_out;
++add_iter;
- // Record equal items and drop them.
+ // Drop equal items.
} else {
- adds_removed->push_back(*add_iter);
++add_iter;
++sub_iter;
}
subs->erase(sub_out, sub_iter);
}
-// Remove items in |removes| from |full_hashes|. |full_hashes| and
-// |removes| should be ordered by SBAddPrefix component.
-template <typename HashesT, typename AddsT>
-void RemoveMatchingPrefixes(const AddsT& removes, HashesT* full_hashes) {
- // This is basically an inline of std::set_difference().
- // Unfortunately, that algorithm requires that the two iterator
- // pairs use the same value types.
-
- // Where to store kept items.
- typename HashesT::iterator out = full_hashes->begin();
-
- typename HashesT::iterator hash_iter = full_hashes->begin();
- typename AddsT::const_iterator remove_iter = removes.begin();
-
- while (hash_iter != full_hashes->end() && remove_iter != removes.end()) {
- // Keep items less than |*remove_iter|.
- if (SBAddPrefixLess(*hash_iter, *remove_iter)) {
- *out = *hash_iter;
- ++out;
- ++hash_iter;
-
- // No hit for |*remove_iter|, bump it forward.
- } else if (SBAddPrefixLess(*remove_iter, *hash_iter)) {
- ++remove_iter;
-
- // Drop equal items, there may be multiple hits.
- } else {
- do {
- ++hash_iter;
- } while (hash_iter != full_hashes->end() &&
- !SBAddPrefixLess(*remove_iter, *hash_iter));
- ++remove_iter;
- }
- }
-
- // Erase any leftover gap.
- full_hashes->erase(out, hash_iter);
-}
-
// Remove deleted items (|chunk_id| in |del_set|) from the container.
template <typename ItemsT>
void RemoveDeleted(ItemsT* items, const base::hash_set<int32>& del_set) {
items->erase(end_iter, items->end());
}
+// Remove prefixes which are in the same chunk as their fullhash. This was a
+// mistake in earlier implementations.
+template <typename HashesT, typename PrefixesT>
+size_t KnockoutPrefixVolunteers(const HashesT& full_hashes,
+ PrefixesT* prefixes) {
+ typename PrefixesT::iterator prefixes_process = prefixes->begin();
+ typename PrefixesT::iterator prefixes_out = prefixes->begin();
+ typename HashesT::const_iterator hashes_process = full_hashes.begin();
+
+ size_t skipped_count = 0;
+
+ while (hashes_process != full_hashes.end()) {
+ // Scan prefixes forward until an item is not less than the current hash.
+ while (prefixes_process != prefixes->end() &&
+ SBAddPrefixLess(*prefixes_process, *hashes_process)) {
+ if (prefixes_process != prefixes_out) {
+ *prefixes_out = *prefixes_process;
+ }
+ prefixes_out++;
+ prefixes_process++;
+ }
+
+ // If the current hash is also not less than the prefix, that implies they
+ // are equal. Skip the prefix.
+ if (prefixes_process != prefixes->end() &&
+ !SBAddPrefixLess(*hashes_process, *prefixes_process)) {
+ skipped_count++;
+ prefixes_process++;
+ }
+
+ hashes_process++;
+ }
+
+ // If any prefixes were skipped, copy over the tail and erase the excess.
+ if (prefixes_process != prefixes_out) {
+ prefixes_out = std::copy(prefixes_process, prefixes->end(), prefixes_out);
+ prefixes->erase(prefixes_out, prefixes->end());
+ }
+
+ return skipped_count;
+}
+
} // namespace
void SBProcessSubs(SBAddPrefixes* add_prefixes,
DCHECK(sorted(sub_full_hashes->begin(), sub_full_hashes->end(),
SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>));
+ // Earlier database code added prefixes when it saw fullhashes. The protocol
+ // should never send a chunk of mixed prefixes and fullhashes, the following
+ // removes any such cases which are seen.
+ // TODO(shess): Remove this code once most databases have been processed.
+ // Chunk churn should clean up anyone left over. This only takes a few ms to
+ // run through my current database, so it doesn't seem worthwhile to do much
+ // more than that.
+ size_t skipped = KnockoutPrefixVolunteers(*add_full_hashes, add_prefixes);
+ skipped += KnockoutPrefixVolunteers(*sub_full_hashes, sub_prefixes);
+ UMA_HISTOGRAM_COUNTS("SB2.VolunteerPrefixesRemoved", skipped);
+
// Factor out the prefix subs.
- SBAddPrefixes removed_adds;
KnockoutSubs(sub_prefixes, add_prefixes,
SBAddPrefixLess<SBAddPrefix,SBSubPrefix>,
- SBAddPrefixLess<SBSubPrefix,SBAddPrefix>,
- &removed_adds);
-
- // Remove the full-hashes corrosponding to the adds which
- // KnockoutSubs() removed. Processing these w/in KnockoutSubs()
- // would make the code more complicated, and they are very small
- // relative to the prefix lists so the gain would be modest.
- RemoveMatchingPrefixes(removed_adds, add_full_hashes);
- RemoveMatchingPrefixes(removed_adds, sub_full_hashes);
+ SBAddPrefixLess<SBSubPrefix,SBAddPrefix>);
// Factor out the full-hash subs.
- std::vector<SBAddFullHash> removed_full_adds;
KnockoutSubs(sub_full_hashes, add_full_hashes,
SBAddPrefixHashLess<SBAddFullHash,SBSubFullHash>,
- SBAddPrefixHashLess<SBSubFullHash,SBAddFullHash>,
- &removed_full_adds);
+ SBAddPrefixHashLess<SBSubFullHash,SBAddFullHash>);
// Remove items from the deleted chunks. This is done after other
// processing to allow subs to knock out adds (and be removed) even