1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "courgette/adjustment_method.h"
18 #include "base/format_macros.h"
19 #include "base/logging.h"
20 #include "base/macros.h"
21 #include "base/strings/stringprintf.h"
22 #include "base/time/time.h"
23 #include "courgette/assembly_program.h"
24 #include "courgette/courgette.h"
25 #include "courgette/encoded_program.h"
29 Shingle weighting matching.
31 We have a sequence S1 of symbols from alphabet A1={A,B,C,...} called the 'model'
32 and a second sequence of S2 of symbols from alphabet A2={U,V,W,....} called the
33 'program'. Each symbol in A1 has a unique numerical name or index. We can
34 transcribe the sequence S1 to a sequence T1 of indexes of the symbols. We wish
35 to assign indexes to the symbols in A2 so that when we transcribe S2 into T2, T2
36 has long subsequences that occur in T1. This will ensure that the sequence
37 T1;T2 compresses to be only slightly larger than the compressed T1.
39 The algorithm for matching members of S2 with members of S1 is eager - it makes
40 matches without backtracking, until no more matches can be made. Each variable
41 (symbol) U,V,... in A2 has a set of candidates from A1, each candidate with a
42 weight summarizing the evidence for the match. We keep a VariableQueue of
43 U,V,... sorted by how much the evidence for the best choice outweighs the
44 evidence for the second choice, i.e. prioritized by how 'clear cut' the best
45 assignment is. We pick the variable with the most clear-cut candidate, make the
46 assignment, adjust the evidence and repeat.
48 What has not been described so far is how the evidence is gathered and
49 maintained. We are working under the assumption that S1 and S2 are largely
50 similar. (A different assumption might be that S1 and S2 are dissimilar except
51 for many long subsequences.)
53 A naive algorithm would consider all pairs (A,U) and for each pair assess the
54 benefit, or score, the assignment U:=A. The score might count the number of
55 occurrences of U in S2 which appear in similar contexts to A in S1.
57 To distinguish contexts we view S1 and S2 as a sequence of overlapping k-length
58 substrings or 'shingles'. Two shingles are compatible if the symbols in one
59 shingle could be matched with the symbols in the other symbol. For example, ABC
60 is *not* compatible with UVU because it would require conflicting matches A=U
61 and C=U. ABC is compatible with UVW, UWV, WUV, VUW etc. We can't tell which
62 until we make an assignment - the compatible shingles form an equivalence class.
63 After assigning U:=A then only UVW and UWV (equivalently AVW, AWV) are
64 compatible. As we make assignments the number of equivalence classes of
65 shingles increases and the number of members of each equivalence class
66 decreases. The compatibility test becomes more restrictive.
68 We gather evidence for the potential assignment U:=A by counting how many
69 shingles containing U are compatible with shingles containing A. Thus symbols
70 occurring a large number of times in compatible contexts will be assigned first.
72 Finding the 'most clear-cut' assignment by considering all pairs symbols and for
73 each pair comparing the contexts of each pair of occurrences of the symbols is
74 computationally infeasible. We get the job done in a reasonable time by
75 approaching it 'backwards' and making incremental changes as we make
78 First the shingles are partitioned according to compatibility. In S1=ABCDD and
79 S2=UVWXX we have a total of 6 shingles, each occuring once. (ABC:1 BCD:1 CDD:1;
80 UVW:1 VWX: WXX:1) all fit the pattern <V0 V1 V2> or the pattern <V0 V1 V1>. The
81 first pattern indicates that each position matches a different symbol, the
82 second pattern indicates that the second symbol is repeated.
84 pattern S1 members S2 members
85 <V0 V1 V2>: {ABC:1, BCD:1}; {UVW:1, VWX:1}
86 <V0 V1 V1>: {CDD:1} {WXX:1}
88 The second pattern appears to have a unique assignment but we don't make the
89 assignment on such scant evidence. If S1 and S2 do not match exactly, there
90 will be numerous spurious low-score matches like this. Instead we must see what
91 assignments are indicated by considering all of the evidence.
93 First pattern has 2 x 2 = 4 shingle pairs. For each pair we count the number
94 of symbol assignments. For ABC:a * UVW:b accumulate min(a,b) to each of
96 After accumulating over all 2 x 2 pairs:
101 The second pattern contributes:
110 From this we decide to assign X:=D (because this assignment has both the largest
111 difference above the next candidate (X:=C) and this is also the largest
112 proportionately over the sum of alternatives).
114 Lets assume D has numerical 'name' 77. The assignment X:=D sets X to 77 too.
115 Next we repartition all the shingles containing X or D:
117 pattern S1 members S2 members
118 <V0 V1 V2>: {ABC:1}; {UVW:1}
119 <V0 V1 77>: {BCD:1}; {VWX:1}
120 <V0 77 77>: {CDD:1} {WXX:1}
121 As we repartition, we recalculate the contributions to the scores:
125 All the remaining assignments are now fixed.
127 There is one step in the incremental algorithm that is still infeasibly
128 expensive: the contributions due to the cross product of large equivalence
129 classes. We settle for making an approximation by computing the contribution of
130 the cross product of only the most common shingles. The hope is that the noise
131 from the long tail of uncounted shingles is well below the scores being used to
132 pick assignments. The second hope is that as assignment are made, the large
133 equivalence class will be partitioned into smaller equivalence classes, reducing
136 In the code below the shingles are bigger (Shingle::kWidth = 5).
137 Class ShinglePattern holds the data for one pattern.
139 There is an optimization for this case:
140 <V0 V1 V1>: {CDD:1} {WXX:1}
142 Above we said that we don't make an assignment on this "scant evidence". There
143 is an exception: if there is only one variable unassigned (more like the <V0 77
144 77> pattern) AND there are no occurrences of C and W other than those counted in
145 this pattern, then there is no competing evidence and we go ahead with the
146 assignment immediately. This produces slightly better results because these
147 cases tend to be low-scoring and susceptible to small mistakes made in
148 low-scoring assignments in the approximation for large equivalence classes.
152 namespace courgette {
153 namespace adjustment_method_2 {
155 ////////////////////////////////////////////////////////////////////////////////
157 class AssignmentCandidates;
158 class LabelInfoMaker;
160 class ShinglePattern;
162 // The purpose of adjustment is to assign indexes to Labels of a program 'p' to
163 // make the sequence of indexes similar to a 'model' program 'm'. Labels
164 // themselves don't have enough information to do this job, so we work with a
165 // LabelInfo surrogate for each label.
169 // Just a no-argument constructor and copy constructor. Actual LabelInfo
170 // objects are allocated in std::pair structs in a std::map.
176 assignment_(nullptr),
177 candidates_(nullptr) {}
181 AssignmentCandidates* candidates();
183 Label* label_; // The label that this info a surrogate for.
185 uint32_t is_model_ : 1; // Is the label in the model?
186 uint32_t debug_index_ : 31; // A small number for naming the label in debug
187 // output. The pair (is_model_, debug_index_) is
190 int refs_; // Number of times this Label is referenced.
192 LabelInfo* assignment_; // Label from other program corresponding to this.
194 std::vector<uint32_t> positions_; // Offsets into the trace of references.
197 AssignmentCandidates* candidates_;
199 void operator=(const LabelInfo*); // Disallow assignment only.
200 // Public compiler generated copy constructor is needed to constuct
201 // std::pair<Label*, LabelInfo> so that fresh LabelInfos can be allocated
202 // inside a std::map.
205 typedef std::vector<LabelInfo*> Trace;
207 std::string ToString(const LabelInfo* info) {
209 base::StringAppendF(&s, "%c%d", "pm"[info->is_model_], info->debug_index_);
210 if (info->label_->index_ != Label::kNoIndex)
211 base::StringAppendF(&s, " (%d)", info->label_->index_);
213 base::StringAppendF(&s, " #%u", info->refs_);
217 // LabelInfoMaker maps labels to their surrogate LabelInfo objects.
218 class LabelInfoMaker {
220 LabelInfoMaker() : debug_label_index_gen_(0) {}
222 LabelInfo* MakeLabelInfo(Label* label, bool is_model, uint32_t position) {
223 LabelInfo& slot = label_infos_[label];
224 if (slot.label_ == nullptr) {
226 slot.is_model_ = is_model;
227 slot.debug_index_ = ++debug_label_index_gen_;
229 slot.positions_.push_back(position);
234 void ResetDebugLabel() { debug_label_index_gen_ = 0; }
237 int debug_label_index_gen_;
239 // Note LabelInfo is allocated 'flat' inside map::value_type, so the LabelInfo
240 // lifetimes are managed by the map.
241 std::map<Label*, LabelInfo> label_infos_;
243 DISALLOW_COPY_AND_ASSIGN(LabelInfoMaker);
246 struct OrderLabelInfo {
247 bool operator()(const LabelInfo* a, const LabelInfo* b) const {
248 if (a->label_->rva_ < b->label_->rva_) return true;
249 if (a->label_->rva_ > b->label_->rva_) return false;
250 if (a == b) return false;
251 return a->positions_ < b->positions_; // Lexicographic ordering of vector.
255 // AssignmentCandidates is a priority queue of candidate assignments to
256 // a single program LabelInfo, |program_info_|.
257 class AssignmentCandidates {
259 explicit AssignmentCandidates(LabelInfo* program_info)
260 : program_info_(program_info) {}
262 LabelInfo* program_info() const { return program_info_; }
264 bool empty() const { return label_to_score_.empty(); }
266 LabelInfo* top_candidate() const { return queue_.begin()->second; }
268 void Update(LabelInfo* model_info, int delta_score) {
269 LOG_ASSERT(delta_score != 0);
272 LabelToScore::iterator p = label_to_score_.find(model_info);
273 if (p != label_to_score_.end()) {
274 old_score = p->second;
275 new_score = old_score + delta_score;
276 queue_.erase(ScoreAndLabel(old_score, p->first));
277 if (new_score == 0) {
278 label_to_score_.erase(p);
280 p->second = new_score;
281 queue_.insert(ScoreAndLabel(new_score, model_info));
284 new_score = delta_score;
285 label_to_score_.insert(std::make_pair(model_info, new_score));
286 queue_.insert(ScoreAndLabel(new_score, model_info));
288 LOG_ASSERT(queue_.size() == label_to_score_.size());
291 int TopScore() const {
293 int second_value = 0;
294 Queue::const_iterator p = queue_.begin();
295 if (p != queue_.end()) {
296 first_value = p->first;
298 if (p != queue_.end()) {
299 second_value = p->first;
302 return first_value - second_value;
305 bool HasPendingUpdates() { return !pending_updates_.empty(); }
307 void AddPendingUpdate(LabelInfo* model_info, int delta_score) {
308 LOG_ASSERT(delta_score != 0);
309 pending_updates_[model_info] += delta_score;
312 void ApplyPendingUpdates() {
313 // TODO(sra): try to walk |pending_updates_| and |label_to_score_| in
314 // lockstep. Try to batch updates to |queue_|.
316 for (LabelToScore::iterator p = pending_updates_.begin();
317 p != pending_updates_.end();
320 Update(p->first, p->second);
324 pending_updates_.clear();
327 void Print(int max) {
328 VLOG(2) << "score " << TopScore() << " " << ToString(program_info_)
330 if (!pending_updates_.empty())
331 VLOG(2) << pending_updates_.size() << " pending";
333 for (Queue::iterator q = queue_.begin(); q != queue_.end(); ++q) {
334 if (++count > max) break;
335 VLOG(2) << " " << q->first << " " << ToString(q->second);
340 typedef std::map<LabelInfo*, int, OrderLabelInfo> LabelToScore;
341 typedef std::pair<int, LabelInfo*> ScoreAndLabel;
342 struct OrderScoreAndLabelByScoreDecreasing {
343 OrderLabelInfo tie_breaker;
344 bool operator()(const ScoreAndLabel& a, const ScoreAndLabel& b) const {
345 if (a.first > b.first) return true;
346 if (a.first < b.first) return false;
347 return tie_breaker(a.second, b.second);
350 typedef std::set<ScoreAndLabel, OrderScoreAndLabelByScoreDecreasing> Queue;
352 LabelInfo* program_info_;
353 LabelToScore label_to_score_;
354 LabelToScore pending_updates_;
358 AssignmentCandidates* LabelInfo::candidates() {
359 if (candidates_ == nullptr)
360 candidates_ = new AssignmentCandidates(this);
364 LabelInfo::~LabelInfo() {
368 // A Shingle is a short fixed-length string of LabelInfos that actually occurs
369 // in a Trace. A Shingle may occur many times. We repesent the Shingle by the
370 // position of one of the occurrences in the Trace.
373 static const uint8_t kWidth = 5;
375 struct InterningLess {
376 bool operator()(const Shingle& a, const Shingle& b) const;
379 typedef std::set<Shingle, InterningLess> OwningSet;
381 static Shingle* Find(const Trace& trace, size_t position,
382 OwningSet* owning_set) {
383 std::pair<OwningSet::iterator, bool> pair =
384 owning_set->insert(Shingle(trace, position));
385 // pair.first iterator 'points' to the newly inserted Shingle or the
386 // previouly inserted one that looks the same according to the comparator.
388 // const_cast required because key is const. We modify the Shingle
389 // extensively but not in a way that affects InterningLess.
390 Shingle* shingle = const_cast<Shingle*>(&*pair.first);
391 shingle->add_position(position);
395 LabelInfo* at(size_t i) const { return trace_[exemplar_position_ + i]; }
396 void add_position(size_t position) {
397 positions_.push_back(static_cast<uint32_t>(position));
399 int position_count() const { return static_cast<int>(positions_.size()); }
401 bool InModel() const { return at(0)->is_model_; }
403 ShinglePattern* pattern() const { return pattern_; }
404 void set_pattern(ShinglePattern* pattern) { pattern_ = pattern; }
407 bool operator()(const Shingle* a, const Shingle* b) const {
408 // Arbitrary but repeatable (memory-address) independent ordering:
409 return a->exemplar_position_ < b->exemplar_position_;
410 // return InterningLess()(*a, *b);
415 Shingle(const Trace& trace, size_t exemplar_position)
417 exemplar_position_(exemplar_position),
420 const Trace& trace_; // The shingle lives inside trace_.
421 size_t exemplar_position_; // At this position (and other positions).
422 std::vector<uint32_t> positions_; // Includes exemplar_position_.
424 ShinglePattern* pattern_; // Pattern changes as LabelInfos are assigned.
426 friend std::string ToString(const Shingle* instance);
428 // We can't disallow the copy constructor because we use std::set<Shingle> and
429 // VS2005's implementation of std::set<T>::set() requires T to have a copy
431 // DISALLOW_COPY_AND_ASSIGN(Shingle);
432 void operator=(const Shingle&) = delete; // Disallow assignment only.
435 std::string ToString(const Shingle* instance) {
437 const char* sep = "<";
438 for (uint8_t i = 0; i < Shingle::kWidth; ++i) {
439 // base::StringAppendF(&s, "%s%x ", sep, instance.at(i)->label_->rva_);
441 s += ToString(instance->at(i));
444 base::StringAppendF(&s, ">(%" PRIuS ")@{%d}",
445 instance->exemplar_position_,
446 instance->position_count());
451 bool Shingle::InterningLess::operator()(
453 const Shingle& b) const {
454 for (uint8_t i = 0; i < kWidth; ++i) {
455 LabelInfo* info_a = a.at(i);
456 LabelInfo* info_b = b.at(i);
457 if (info_a->label_->rva_ < info_b->label_->rva_)
459 if (info_a->label_->rva_ > info_b->label_->rva_)
461 if (info_a->is_model_ < info_b->is_model_)
463 if (info_a->is_model_ > info_b->is_model_)
465 if (info_a != info_b) {
472 class ShinglePattern {
474 enum { kOffsetMask = 7, // Offset lives in low bits.
475 kFixed = 0, // kind & kVariable == 0 => fixed.
476 kVariable = 8 // kind & kVariable == 1 => variable.
478 // sequence[position + (kinds_[i] & kOffsetMask)] gives LabelInfo for position
479 // i of shingle. Below, second 'A' is duplicate of position 1, second '102'
480 // is duplicate of position 0.
482 // <102, A, 103, A , 102>
483 // --> <kFixed+0, kVariable+1, kFixed+2, kVariable+1, kFixed+0>
485 explicit Index(const Shingle* instance);
486 uint8_t kinds_[Shingle::kWidth];
488 uint8_t unique_variables_;
489 uint8_t first_variable_index_;
491 int assigned_indexes_[Shingle::kWidth];
494 // ShinglePattern keeps histograms of member Shingle instances, ordered by
495 // decreasing number of occurrences. We don't have a pair (occurrence count,
496 // Shingle instance), so we use a FreqView adapter to make the instance
497 // pointer look like the pair.
500 explicit FreqView(const Shingle* instance) : instance_(instance) {}
501 int count() const { return instance_->position_count(); }
502 const Shingle* instance() const { return instance_; }
504 bool operator()(const FreqView& a, const FreqView& b) const {
505 if (a.count() > b.count()) return true;
506 if (a.count() < b.count()) return false;
507 return resolve_ties(a.instance(), b.instance());
510 Shingle::PointerLess resolve_ties;
513 const Shingle* instance_;
516 typedef std::set<FreqView, FreqView::Greater> Histogram;
519 : index_(nullptr), model_coverage_(0), program_coverage_(0) {}
521 const Index* index_; // Points to the key in the owning map value_type.
522 Histogram model_histogram_;
523 Histogram program_histogram_;
525 int program_coverage_;
528 std::string ToString(const ShinglePattern::Index* index) {
530 if (index == nullptr) {
533 base::StringAppendF(&s, "<%d: ", index->variables_);
534 const char* sep = "";
535 for (uint8_t i = 0; i < Shingle::kWidth; ++i) {
538 uint32_t kind = index->kinds_[i];
539 int offset = kind & ShinglePattern::kOffsetMask;
540 if (kind & ShinglePattern::kVariable)
541 base::StringAppendF(&s, "V%d", offset);
543 base::StringAppendF(&s, "%d", index->assigned_indexes_[offset]);
545 base::StringAppendF(&s, " %x", index->hash_);
551 std::string HistogramToString(const ShinglePattern::Histogram& histogram,
552 size_t snippet_max) {
554 size_t histogram_size = histogram.size();
555 size_t snippet_size = 0;
556 for (ShinglePattern::Histogram::const_iterator p = histogram.begin();
557 p != histogram.end();
559 if (++snippet_size > snippet_max && snippet_size != histogram_size) {
563 base::StringAppendF(&s, " %d", p->count());
568 std::string HistogramToStringFull(const ShinglePattern::Histogram& histogram,
570 size_t snippet_max) {
573 size_t histogram_size = histogram.size();
574 size_t snippet_size = 0;
575 for (ShinglePattern::Histogram::const_iterator p = histogram.begin();
576 p != histogram.end();
579 if (++snippet_size > snippet_max && snippet_size != histogram_size) {
583 base::StringAppendF(&s, "(%d) ", p->count());
584 s += ToString(&(*p->instance()));
590 std::string ToString(const ShinglePattern* pattern, size_t snippet_max = 3) {
592 if (pattern == nullptr) {
596 s += ToString(pattern->index_);
597 base::StringAppendF(&s, "; %d(%d):",
598 static_cast<int>(pattern->model_histogram_.size()),
599 pattern->model_coverage_);
601 s += HistogramToString(pattern->model_histogram_, snippet_max);
602 base::StringAppendF(&s, "; %d(%d):",
603 static_cast<int>(pattern->program_histogram_.size()),
604 pattern->program_coverage_);
605 s += HistogramToString(pattern->program_histogram_, snippet_max);
611 std::string ShinglePatternToStringFull(const ShinglePattern* pattern,
614 s += ToString(pattern->index_);
616 size_t model_size = pattern->model_histogram_.size();
617 size_t program_size = pattern->program_histogram_.size();
618 base::StringAppendF(&s, " model shingles %" PRIuS "\n", model_size);
619 s += HistogramToStringFull(pattern->model_histogram_, " ", max);
620 base::StringAppendF(&s, " program shingles %" PRIuS "\n", program_size);
621 s += HistogramToStringFull(pattern->program_histogram_, " ", max);
625 struct ShinglePatternIndexLess {
626 bool operator()(const ShinglePattern::Index& a,
627 const ShinglePattern::Index& b) const {
628 if (a.hash_ < b.hash_) return true;
629 if (a.hash_ > b.hash_) return false;
631 for (uint8_t i = 0; i < Shingle::kWidth; ++i) {
632 if (a.kinds_[i] < b.kinds_[i]) return true;
633 if (a.kinds_[i] > b.kinds_[i]) return false;
634 if ((a.kinds_[i] & ShinglePattern::kVariable) == 0) {
635 if (a.assigned_indexes_[i] < b.assigned_indexes_[i])
637 if (a.assigned_indexes_[i] > b.assigned_indexes_[i])
645 static uint32_t hash_combine(uint32_t h, uint32_t v) {
647 return (h * (37 + 0x0000d100)) ^ (h >> 13);
650 ShinglePattern::Index::Index(const Shingle* instance) {
653 unique_variables_ = 0;
654 first_variable_index_ = 255;
656 for (uint8_t i = 0; i < Shingle::kWidth; ++i) {
657 LabelInfo* info = instance->at(i);
661 for ( ; j < i; ++j) {
662 if (info == instance->at(j)) { // Duplicate LabelInfo
667 if (j == i) { // Not found above.
668 if (info->assignment_) {
669 code = info->label_->index_;
670 assigned_indexes_[i] = code;
673 kind = kVariable + i;
675 if (i < first_variable_index_)
676 first_variable_index_ = i;
679 if (kind & kVariable) ++variables_;
680 hash = hash_combine(hash, code);
681 hash = hash_combine(hash, kind);
683 assigned_indexes_[i] = code;
688 struct ShinglePatternLess {
689 bool operator()(const ShinglePattern& a, const ShinglePattern& b) const {
690 return index_less(*a.index_, *b.index_);
692 ShinglePatternIndexLess index_less;
695 struct ShinglePatternPointerLess {
696 bool operator()(const ShinglePattern* a, const ShinglePattern* b) const {
697 return pattern_less(*a, *b);
699 ShinglePatternLess pattern_less;
702 template<int (*Scorer)(const ShinglePattern*)>
703 struct OrderShinglePatternByScoreDescending {
704 bool operator()(const ShinglePattern* a, const ShinglePattern* b) const {
705 int score_a = Scorer(a);
706 int score_b = Scorer(b);
707 if (score_a > score_b) return true;
708 if (score_a < score_b) return false;
709 return break_ties(a, b);
711 ShinglePatternPointerLess break_ties;
714 // Returns a score for a 'Single Use' rule. Returns -1 if the rule is not
716 int SingleUseScore(const ShinglePattern* pattern) {
717 if (pattern->index_->variables_ != 1)
720 if (pattern->model_histogram_.size() != 1 ||
721 pattern->program_histogram_.size() != 1)
724 // Does this pattern account for all uses of the variable?
725 const ShinglePattern::FreqView& program_freq =
726 *pattern->program_histogram_.begin();
727 const ShinglePattern::FreqView& model_freq =
728 *pattern->model_histogram_.begin();
729 int p1 = program_freq.count();
730 int m1 = model_freq.count();
732 const Shingle* program_instance = program_freq.instance();
733 const Shingle* model_instance = model_freq.instance();
734 size_t variable_index = pattern->index_->first_variable_index_;
735 LabelInfo* program_info = program_instance->at(variable_index);
736 LabelInfo* model_info = model_instance->at(variable_index);
737 if (!program_info->assignment_) {
738 if (program_info->refs_ == p1 && model_info->refs_ == m1) {
746 // The VariableQueue is a priority queue of unassigned LabelInfos from
747 // the 'program' (the 'variables') and their AssignmentCandidates.
748 class VariableQueue {
750 typedef std::pair<int, LabelInfo*> ScoreAndLabel;
752 VariableQueue() = default;
754 bool empty() const { return queue_.empty(); }
756 const ScoreAndLabel& first() const { return *queue_.begin(); }
758 // For debugging only.
760 for (Queue::const_iterator p = queue_.begin(); p != queue_.end(); ++p) {
761 AssignmentCandidates* candidates = p->second->candidates();
762 candidates->Print(std::numeric_limits<int>::max());
766 void AddPendingUpdate(LabelInfo* program_info, LabelInfo* model_info,
768 AssignmentCandidates* candidates = program_info->candidates();
769 if (!candidates->HasPendingUpdates()) {
770 pending_update_candidates_.push_back(candidates);
772 candidates->AddPendingUpdate(model_info, delta_score);
775 void ApplyPendingUpdates() {
776 for (size_t i = 0; i < pending_update_candidates_.size(); ++i) {
777 AssignmentCandidates* candidates = pending_update_candidates_[i];
778 int old_score = candidates->TopScore();
779 queue_.erase(ScoreAndLabel(old_score, candidates->program_info()));
780 candidates->ApplyPendingUpdates();
781 if (!candidates->empty()) {
782 int new_score = candidates->TopScore();
783 queue_.insert(ScoreAndLabel(new_score, candidates->program_info()));
786 pending_update_candidates_.clear();
790 struct OrderScoreAndLabelByScoreDecreasing {
791 bool operator()(const ScoreAndLabel& a, const ScoreAndLabel& b) const {
792 if (a.first > b.first) return true;
793 if (a.first < b.first) return false;
794 return OrderLabelInfo()(a.second, b.second);
797 typedef std::set<ScoreAndLabel, OrderScoreAndLabelByScoreDecreasing> Queue;
800 std::vector<AssignmentCandidates*> pending_update_candidates_;
802 DISALLOW_COPY_AND_ASSIGN(VariableQueue);
806 class AssignmentProblem {
808 AssignmentProblem(const Trace& trace, size_t model_end)
810 model_end_(model_end) {
811 VLOG(2) << "AssignmentProblem::AssignmentProblem " << model_end << ", "
816 if (model_end_ < Shingle::kWidth ||
817 trace_.size() - model_end_ < Shingle::kWidth) {
818 // Nothing much we can do with such a short problem.
821 instances_.resize(trace_.size() - Shingle::kWidth + 1, nullptr);
822 AddShingles(0, model_end_);
823 AddShingles(model_end_, trace_.size());
825 AddPatternsNeedingUpdatesToQueues();
827 patterns_needing_updates_.clear();
828 while (FindAndAssignBestLeader())
829 patterns_needing_updates_.clear();
830 PrintActivePatterns();
836 typedef std::set<Shingle*, Shingle::PointerLess> ShingleSet;
838 typedef std::set<const ShinglePattern*, ShinglePatternPointerLess>
841 // Patterns are partitioned into the following sets:
843 // * Retired patterns (not stored). No shingles exist for this pattern (they
844 // all now match more specialized patterns).
845 // * Useless patterns (not stored). There are no 'program' shingles for this
846 // pattern (they all now match more specialized patterns).
847 // * Single-use patterns - single_use_pattern_queue_.
848 // * Other patterns - active_non_single_use_patterns_ / variable_queue_.
850 typedef std::set<const ShinglePattern*,
851 OrderShinglePatternByScoreDescending<&SingleUseScore> >
852 SingleUsePatternQueue;
854 void PrintPatternsHeader() const {
855 VLOG(2) << shingle_instances_.size() << " instances "
856 << trace_.size() << " trace length "
857 << patterns_.size() << " shingle indexes "
858 << single_use_pattern_queue_.size() << " single use patterns "
859 << active_non_single_use_patterns_.size() << " active patterns";
862 void PrintActivePatterns() const {
863 for (ShinglePatternSet::const_iterator p =
864 active_non_single_use_patterns_.begin();
865 p != active_non_single_use_patterns_.end();
867 const ShinglePattern* pattern = *p;
868 VLOG(2) << ToString(pattern, 10);
872 void PrintPatterns() const {
874 PrintActivePatterns();
878 void PrintAllPatterns() const {
879 for (IndexToPattern::const_iterator p = patterns_.begin();
880 p != patterns_.end();
882 const ShinglePattern& pattern = p->second;
883 VLOG(2) << ToString(&pattern, 10);
887 void PrintAllShingles() const {
888 for (Shingle::OwningSet::const_iterator p = shingle_instances_.begin();
889 p != shingle_instances_.end();
891 const Shingle& instance = *p;
892 VLOG(2) << ToString(&instance) << " " << ToString(instance.pattern());
897 void AddShingles(size_t begin, size_t end) {
898 for (size_t i = begin; i + Shingle::kWidth - 1 < end; ++i) {
899 instances_[i] = Shingle::Find(trace_, i, &shingle_instances_);
903 void Declassify(Shingle* shingle) {
904 ShinglePattern* pattern = shingle->pattern();
905 if (shingle->InModel()) {
906 pattern->model_histogram_.erase(ShinglePattern::FreqView(shingle));
907 pattern->model_coverage_ -= shingle->position_count();
909 pattern->program_histogram_.erase(ShinglePattern::FreqView(shingle));
910 pattern->program_coverage_ -= shingle->position_count();
912 shingle->set_pattern(nullptr);
915 void Reclassify(Shingle* shingle) {
916 ShinglePattern* pattern = shingle->pattern();
917 LOG_ASSERT(pattern == nullptr);
919 ShinglePattern::Index index(shingle);
920 if (index.variables_ == 0)
923 std::pair<IndexToPattern::iterator, bool> inserted =
924 patterns_.insert(std::make_pair(index, ShinglePattern()));
926 pattern = &inserted.first->second;
927 pattern->index_ = &inserted.first->first;
928 shingle->set_pattern(pattern);
929 patterns_needing_updates_.insert(pattern);
931 if (shingle->InModel()) {
932 pattern->model_histogram_.insert(ShinglePattern::FreqView(shingle));
933 pattern->model_coverage_ += shingle->position_count();
935 pattern->program_histogram_.insert(ShinglePattern::FreqView(shingle));
936 pattern->program_coverage_ += shingle->position_count();
940 void InitialClassify() {
941 for (Shingle::OwningSet::iterator p = shingle_instances_.begin();
942 p != shingle_instances_.end();
944 // GCC's set<T>::iterator::operator *() returns a const object.
945 Reclassify(const_cast<Shingle*>(&*p));
949 // For the positions in |info|, find the shingles that overlap that position.
950 void AddAffectedPositions(LabelInfo* info, ShingleSet* affected_shingles) {
951 const uint8_t kWidth = Shingle::kWidth;
952 for (size_t i = 0; i < info->positions_.size(); ++i) {
953 size_t position = info->positions_[i];
954 // Find bounds to the subrange of |trace_| we are in.
955 size_t start = position < model_end_ ? 0 : model_end_;
956 size_t end = position < model_end_ ? model_end_ : trace_.size();
958 // Clip [position-kWidth+1, position+1)
960 position > start + kWidth - 1 ? position - kWidth + 1 : start;
961 size_t high = position + kWidth < end ? position + 1 : end - kWidth + 1;
963 for (size_t shingle_position = low;
964 shingle_position < high;
965 ++shingle_position) {
966 Shingle* overlapping_shingle = instances_.at(shingle_position);
967 affected_shingles->insert(overlapping_shingle);
972 void RemovePatternsNeedingUpdatesFromQueues() {
973 for (ShinglePatternSet::iterator p = patterns_needing_updates_.begin();
974 p != patterns_needing_updates_.end();
976 RemovePatternFromQueues(*p);
980 void AddPatternsNeedingUpdatesToQueues() {
981 for (ShinglePatternSet::iterator p = patterns_needing_updates_.begin();
982 p != patterns_needing_updates_.end();
984 AddPatternToQueues(*p);
986 variable_queue_.ApplyPendingUpdates();
989 void RemovePatternFromQueues(const ShinglePattern* pattern) {
990 int single_use_score = SingleUseScore(pattern);
991 if (single_use_score > 0) {
992 size_t n = single_use_pattern_queue_.erase(pattern);
994 } else if (pattern->program_histogram_.empty() &&
995 pattern->model_histogram_.empty()) {
996 NOTREACHED(); // Should not come back to life.
997 } else if (pattern->program_histogram_.empty()) {
1000 active_non_single_use_patterns_.erase(pattern);
1001 AddPatternToLabelQueue(pattern, -1);
1005 void AddPatternToQueues(const ShinglePattern* pattern) {
1006 int single_use_score = SingleUseScore(pattern);
1007 if (single_use_score > 0) {
1008 single_use_pattern_queue_.insert(pattern);
1009 } else if (pattern->program_histogram_.empty() &&
1010 pattern->model_histogram_.empty()) {
1011 } else if (pattern->program_histogram_.empty()) {
1014 active_non_single_use_patterns_.insert(pattern);
1015 AddPatternToLabelQueue(pattern, +1);
1019 void AddPatternToLabelQueue(const ShinglePattern* pattern, int sign) {
1020 // For each possible assignment in this pattern, update the potential
1021 // contributions to the LabelInfo queues.
1023 // We want to find for each symbol (LabelInfo) the maximum contribution that
1024 // could be achieved by making shingle-wise assignments between shingles in
1025 // the model and shingles in the program.
1027 // If the shingles in the histograms are independent (no two shingles have a
1028 // symbol in common) then any permutation of the assignments is possible,
1029 // and the maximum contribution can be found by taking the maximum over all
1032 // If the shingles are dependent two things happen. The maximum
1033 // contribution to any given symbol is a sum because the symbol has
1034 // contributions from all the shingles containing it. Second, some
1035 // assignments are blocked by previous incompatible assignments. We want to
1036 // avoid a combinatorial search, so we ignore the blocking.
1038 const size_t kUnwieldy = 5;
1040 typedef std::map<LabelInfo*, int> LabelToScore;
1041 typedef std::map<LabelInfo*, LabelToScore > ScoreSet;
1044 size_t n_model_samples = 0;
1045 for (ShinglePattern::Histogram::const_iterator model_iter =
1046 pattern->model_histogram_.begin();
1047 model_iter != pattern->model_histogram_.end();
1049 if (++n_model_samples > kUnwieldy) break;
1050 const ShinglePattern::FreqView& model_freq = *model_iter;
1051 int m1 = model_freq.count();
1052 const Shingle* model_instance = model_freq.instance();
1055 size_t n_program_samples = 0;
1056 for (ShinglePattern::Histogram::const_iterator program_iter =
1057 pattern->program_histogram_.begin();
1058 program_iter != pattern->program_histogram_.end();
1060 if (++n_program_samples > kUnwieldy) break;
1061 const ShinglePattern::FreqView& program_freq = *program_iter;
1062 int p1 = program_freq.count();
1063 const Shingle* program_instance = program_freq.instance();
1065 // int score = p1; // ? weigh all equally??
1066 int score = std::min(p1, m1);
1068 for (uint8_t i = 0; i < Shingle::kWidth; ++i) {
1069 LabelInfo* program_info = program_instance->at(i);
1070 LabelInfo* model_info = model_instance->at(i);
1071 if ((model_info->assignment_ == nullptr) !=
1072 (program_info->assignment_ == nullptr)) {
1073 VLOG(2) << "ERROR " << i
1074 << "\n\t" << ToString(pattern, 10)
1075 << "\n\t" << ToString(program_instance)
1076 << "\n\t" << ToString(model_instance);
1078 if (!program_info->assignment_ && !model_info->assignment_) {
1079 sums[program_info][model_info] += score;
1084 for (ScoreSet::iterator assignee_iterator = sums.begin();
1085 assignee_iterator != sums.end();
1086 ++assignee_iterator) {
1087 LabelInfo* program_info = assignee_iterator->first;
1088 for (LabelToScore::iterator p = assignee_iterator->second.begin();
1089 p != assignee_iterator->second.end();
1091 LabelInfo* model_info = p->first;
1092 int score = p->second;
1093 int* slot = &maxima[program_info][model_info];
1094 *slot = std::max(*slot, score);
1099 for (ScoreSet::iterator assignee_iterator = maxima.begin();
1100 assignee_iterator != maxima.end();
1101 ++assignee_iterator) {
1102 LabelInfo* program_info = assignee_iterator->first;
1103 for (LabelToScore::iterator p = assignee_iterator->second.begin();
1104 p != assignee_iterator->second.end();
1106 LabelInfo* model_info = p->first;
1107 int score = sign * p->second;
1108 variable_queue_.AddPendingUpdate(program_info, model_info, score);
1113 void AssignOne(LabelInfo* model_info, LabelInfo* program_info) {
1114 LOG_ASSERT(!model_info->assignment_);
1115 LOG_ASSERT(!program_info->assignment_);
1116 LOG_ASSERT(model_info->is_model_);
1117 LOG_ASSERT(!program_info->is_model_);
1119 VLOG(3) << "Assign " << ToString(program_info)
1120 << " := " << ToString(model_info);
1122 ShingleSet affected_shingles;
1123 AddAffectedPositions(model_info, &affected_shingles);
1124 AddAffectedPositions(program_info, &affected_shingles);
1126 for (ShingleSet::iterator p = affected_shingles.begin();
1127 p != affected_shingles.end();
1129 patterns_needing_updates_.insert((*p)->pattern());
1132 RemovePatternsNeedingUpdatesFromQueues();
1134 for (ShingleSet::iterator p = affected_shingles.begin();
1135 p != affected_shingles.end();
1140 program_info->label_->index_ = model_info->label_->index_;
1142 model_info->assignment_ = program_info;
1143 program_info->assignment_ = model_info;
1145 for (ShingleSet::iterator p = affected_shingles.begin();
1146 p != affected_shingles.end();
1151 AddPatternsNeedingUpdatesToQueues();
1154 bool AssignFirstVariableOfHistogramHead(const ShinglePattern& pattern) {
1155 const ShinglePattern::FreqView& program_1 =
1156 *pattern.program_histogram_.begin();
1157 const ShinglePattern::FreqView& model_1 = *pattern.model_histogram_.begin();
1158 const Shingle* program_instance = program_1.instance();
1159 const Shingle* model_instance = model_1.instance();
1160 size_t variable_index = pattern.index_->first_variable_index_;
1161 LabelInfo* program_info = program_instance->at(variable_index);
1162 LabelInfo* model_info = model_instance->at(variable_index);
1163 AssignOne(model_info, program_info);
1167 bool FindAndAssignBestLeader() {
1168 LOG_ASSERT(patterns_needing_updates_.empty());
1170 if (!single_use_pattern_queue_.empty()) {
1171 const ShinglePattern& pattern = **single_use_pattern_queue_.begin();
1172 return AssignFirstVariableOfHistogramHead(pattern);
1175 if (variable_queue_.empty())
1178 const VariableQueue::ScoreAndLabel best = variable_queue_.first();
1179 int score = best.first;
1180 LabelInfo* assignee = best.second;
1182 // TODO(sra): score (best.first) can be zero. A zero score means we are
1183 // blindly picking between two (or more) alternatives which look the same.
1184 // If we exit on the first zero-score we sometimes get 3-4% better total
1185 // compression. This indicates that 'infill' is doing a better job than
1186 // picking blindly. Perhaps we can use an extended region around the
1187 // undistinguished competing alternatives to break the tie.
1189 variable_queue_.Print();
1193 AssignmentCandidates* candidates = assignee->candidates();
1194 if (candidates->empty())
1195 return false; // Should not happen.
1197 AssignOne(candidates->top_candidate(), assignee);
1202 // The trace vector contains the model sequence [0, model_end_) followed by
1203 // the program sequence [model_end_, trace.end())
1204 const Trace& trace_;
1207 // |shingle_instances_| is the set of 'interned' shingles.
1208 Shingle::OwningSet shingle_instances_;
1210 // |instances_| maps from position in |trace_| to Shingle at that position.
1211 std::vector<Shingle*> instances_;
1213 SingleUsePatternQueue single_use_pattern_queue_;
1214 ShinglePatternSet active_non_single_use_patterns_;
1215 VariableQueue variable_queue_;
1217 // Transient information: when we make an assignment, we need to recompute
1218 // priority queue information derived from these ShinglePatterns.
1219 ShinglePatternSet patterns_needing_updates_;
1221 typedef std::map<ShinglePattern::Index,
1222 ShinglePattern, ShinglePatternIndexLess> IndexToPattern;
1223 IndexToPattern patterns_;
1225 DISALLOW_COPY_AND_ASSIGN(AssignmentProblem);
1228 class Adjuster : public AdjustmentMethod {
1230 Adjuster() : prog_(nullptr), model_(nullptr) {}
1231 ~Adjuster() = default;
1233 bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) {
1234 VLOG(1) << "Adjuster::Adjust";
1241 prog_->UnassignIndexes();
1244 CollectTraces(model_, &abs32_trace_, &rel32_trace_, true);
1245 size_t abs32_model_end = abs32_trace_.size();
1246 size_t rel32_model_end = rel32_trace_.size();
1247 CollectTraces(prog_, &abs32_trace_, &rel32_trace_, false);
1248 Solve(abs32_trace_, abs32_model_end);
1249 Solve(rel32_trace_, rel32_model_end);
1250 prog_->AssignRemainingIndexes();
1255 void CollectTraces(const AssemblyProgram* program, Trace* abs32, Trace* rel32,
1257 label_info_maker_.ResetDebugLabel();
1259 for (Label* label : program->abs32_label_annotations())
1260 ReferenceLabel(abs32, is_model, label);
1261 for (Label* label : program->rel32_label_annotations())
1262 ReferenceLabel(rel32, is_model, label);
1264 // TODO(sra): we could simply append all the labels in index order to
1265 // incorporate some costing for entropy (bigger deltas) that will be
1266 // introduced into the label address table by non-monotonic ordering. This
1267 // would have some knock-on effects to parts of the algorithm that work on
1268 // single-occurrence labels.
1271 void Solve(const Trace& model, size_t model_end) {
1272 base::Time start_time = base::Time::Now();
1273 AssignmentProblem a(model, model_end);
1275 VLOG(1) << " Adjuster::Solve "
1276 << (base::Time::Now() - start_time).InSecondsF();
1279 void ReferenceLabel(Trace* trace, bool is_model, Label* label) {
1280 trace->push_back(label_info_maker_.MakeLabelInfo(
1281 label, is_model, static_cast<uint32_t>(trace->size())));
1284 AssemblyProgram* prog_; // Program to be adjusted, owned by caller.
1285 const AssemblyProgram* model_; // Program to be mimicked, owned by caller.
1287 LabelInfoMaker label_info_maker_;
1290 DISALLOW_COPY_AND_ASSIGN(Adjuster);
1293 ////////////////////////////////////////////////////////////////////////////////
1295 } // namespace adjustment_method_2
1297 AdjustmentMethod* AdjustmentMethod::MakeShingleAdjustmentMethod() {
1298 return new adjustment_method_2::Adjuster();
1301 } // namespace courgette