1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // We want to measure the similarity of two sequences of bytes as a surrogate
6 // for measuring how well a second sequence will compress differentially to the
9 #include "courgette/difference_estimator.h"
15 #include <unordered_set>
17 #include "base/macros.h"
21 // Our difference measure is the number of k-tuples that occur in Subject that
22 // don't occur in Base.
23 const int kTupleSize = 4;
27 static_assert(kTupleSize >= 4 && kTupleSize <= 8,
28 "kTupleSize should be between 4 and 8");
30 size_t HashTuple(const uint8_t* source) {
31 size_t hash1 = *reinterpret_cast<const uint32_t*>(source);
32 size_t hash2 = *reinterpret_cast<const uint32_t*>(source + kTupleSize - 4);
33 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23);
37 bool RegionsEqual(const Region& a, const Region& b) {
38 if (a.length() != b.length())
40 return memcmp(a.start(), b.start(), a.length()) == 0;
43 } // anonymous namepace
45 class DifferenceEstimator::Base {
47 explicit Base(const Region& region) : region_(region) { }
50 if (region_.length() < kTupleSize)
52 const uint8_t* start = region_.start();
53 const uint8_t* end = region_.end() - (kTupleSize - 1);
54 for (const uint8_t* p = start; p < end; ++p) {
55 size_t hash = HashTuple(p);
60 const Region& region() const { return region_; }
64 std::unordered_set<size_t> hashes_;
66 friend class DifferenceEstimator;
67 DISALLOW_COPY_AND_ASSIGN(Base);
70 class DifferenceEstimator::Subject {
72 explicit Subject(const Region& region) : region_(region) {}
74 const Region& region() const { return region_; }
79 DISALLOW_COPY_AND_ASSIGN(Subject);
82 DifferenceEstimator::DifferenceEstimator() = default;
84 DifferenceEstimator::~DifferenceEstimator() {
85 for (size_t i = 0; i < owned_bases_.size(); ++i)
86 delete owned_bases_[i];
87 for (size_t i = 0; i < owned_subjects_.size(); ++i)
88 delete owned_subjects_[i];
91 DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) {
92 Base* base = new Base(region);
94 owned_bases_.push_back(base);
98 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject(
99 const Region& region) {
100 Subject* subject = new Subject(region);
101 owned_subjects_.push_back(subject);
105 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) {
106 size_t mismatches = 0;
107 if (subject->region().length() >= kTupleSize) {
108 const uint8_t* start = subject->region().start();
109 const uint8_t* end = subject->region().end() - (kTupleSize - 1);
111 const uint8_t* p = start;
113 size_t hash = HashTuple(p);
114 if (base->hashes_.find(hash) == base->hashes_.end()) {
121 if (mismatches == 0) {
122 if (RegionsEqual(base->region(), subject->region()))
125 ++mismatches; // Guarantee not zero.