1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "sync/internal_api/public/base/unique_position.h"
7 #include "base/basictypes.h"
8 #include "base/logging.h"
9 #include "base/stl_util.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "sync/protocol/unique_position.pb.h"
12 #include "third_party/zlib/zlib.h"
16 const size_t UniquePosition::kSuffixLength = 28;
17 const size_t UniquePosition::kCompressBytesThreshold = 128;
20 bool UniquePosition::IsValidSuffix(const std::string& suffix) {
21 // The suffix must be exactly the specified length, otherwise unique suffixes
22 // are not sufficient to guarantee unique positions (because prefix + suffix
23 // == p + refixsuffix).
24 return suffix.length() == kSuffixLength;
28 bool UniquePosition::IsValidBytes(const std::string& bytes) {
29 // The first condition ensures that our suffix uniqueness is sufficient to
30 // guarantee position uniqueness. Otherwise, it's possible the end of some
31 // prefix + some short suffix == some long suffix.
32 // The second condition ensures that FindSmallerWithSuffix can always return a
34 return bytes.length() >= kSuffixLength
35 && bytes[bytes.length()-1] != 0;
39 UniquePosition UniquePosition::CreateInvalid() {
41 DCHECK(!pos.IsValid());
46 UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) {
47 if (proto.has_custom_compressed_v1()) {
48 return UniquePosition(proto.custom_compressed_v1());
49 } else if (proto.has_value() && !proto.value().empty()) {
50 return UniquePosition(Compress(proto.value()));
51 } else if (proto.has_compressed_value() && proto.has_uncompressed_length()) {
52 uLongf uncompressed_len = proto.uncompressed_length();
53 std::string un_gzipped;
55 un_gzipped.resize(uncompressed_len);
56 int result = uncompress(
57 reinterpret_cast<Bytef*>(string_as_array(&un_gzipped)),
59 reinterpret_cast<const Bytef*>(proto.compressed_value().data()),
60 proto.compressed_value().size());
62 DLOG(ERROR) << "Unzip failed " << result;
63 return UniquePosition::CreateInvalid();
65 if (uncompressed_len != proto.uncompressed_length()) {
67 << "Uncompressed length " << uncompressed_len
68 << " did not match specified length " << proto.uncompressed_length();
69 return UniquePosition::CreateInvalid();
71 return UniquePosition(Compress(un_gzipped));
73 return UniquePosition::CreateInvalid();
78 UniquePosition UniquePosition::FromInt64(
79 int64 x, const std::string& suffix) {
80 uint64 y = static_cast<uint64>(x);
81 y ^= 0x8000000000000000ULL; // Make it non-negative.
82 std::string bytes(8, 0);
83 for (int i = 7; i >= 0; --i) {
84 bytes[i] = static_cast<uint8>(y);
87 return UniquePosition(bytes + suffix, suffix);
91 UniquePosition UniquePosition::InitialPosition(
92 const std::string& suffix) {
93 DCHECK(IsValidSuffix(suffix));
94 return UniquePosition(suffix, suffix);
98 UniquePosition UniquePosition::Before(
99 const UniquePosition& x,
100 const std::string& suffix) {
101 DCHECK(IsValidSuffix(suffix));
103 const std::string& before = FindSmallerWithSuffix(
104 Uncompress(x.compressed_), suffix);
105 return UniquePosition(before + suffix, suffix);
109 UniquePosition UniquePosition::After(
110 const UniquePosition& x,
111 const std::string& suffix) {
112 DCHECK(IsValidSuffix(suffix));
114 const std::string& after = FindGreaterWithSuffix(
115 Uncompress(x.compressed_), suffix);
116 return UniquePosition(after + suffix, suffix);
120 UniquePosition UniquePosition::Between(
121 const UniquePosition& before,
122 const UniquePosition& after,
123 const std::string& suffix) {
124 DCHECK(before.IsValid());
125 DCHECK(after.IsValid());
126 DCHECK(before.LessThan(after));
127 DCHECK(IsValidSuffix(suffix));
128 const std::string& mid = FindBetweenWithSuffix(
129 Uncompress(before.compressed_),
130 Uncompress(after.compressed_),
132 return UniquePosition(mid + suffix, suffix);
135 UniquePosition::UniquePosition() : is_valid_(false) {}
137 bool UniquePosition::LessThan(const UniquePosition& other) const {
138 DCHECK(this->IsValid());
139 DCHECK(other.IsValid());
141 return compressed_ < other.compressed_;
144 bool UniquePosition::Equals(const UniquePosition& other) const {
145 if (!this->IsValid() && !other.IsValid())
148 return compressed_ == other.compressed_;
151 void UniquePosition::ToProto(sync_pb::UniquePosition* proto) const {
154 // This is the current preferred foramt.
155 proto->set_custom_compressed_v1(compressed_);
157 // Some older clients (M28) don't know how to read that format. We don't want
158 // to break them until they're obsolete. We'll serialize to the old-style in
159 // addition to the new so they won't be confused.
160 std::string bytes = Uncompress(compressed_);
161 if (bytes.size() < kCompressBytesThreshold) {
162 // If it's small, then just write it. This is the common case.
163 proto->set_value(bytes);
165 // We've got a large one. Compress it.
166 proto->set_uncompressed_length(bytes.size());
167 std::string* compressed = proto->mutable_compressed_value();
169 uLongf compressed_len = compressBound(bytes.size());
170 compressed->resize(compressed_len);
171 int result = compress(reinterpret_cast<Bytef*>(string_as_array(compressed)),
173 reinterpret_cast<const Bytef*>(bytes.data()),
175 if (result != Z_OK) {
176 NOTREACHED() << "Failed to compress position: " << result;
177 // Maybe we can write an uncompressed version?
179 proto->set_value(bytes);
180 } else if (compressed_len >= bytes.size()) {
181 // Oops, we made it bigger. Just write the uncompressed version instead.
183 proto->set_value(bytes);
185 // Success! Don't forget to adjust the string's length.
186 compressed->resize(compressed_len);
191 void UniquePosition::SerializeToString(std::string* blob) const {
193 sync_pb::UniquePosition proto;
195 proto.SerializeToString(blob);
198 int64 UniquePosition::ToInt64() const {
200 const std::string& s = Uncompress(compressed_);
201 size_t l = sizeof(int64);
202 if (s.length() < l) {
206 for (size_t i = 0; i < l; ++i) {
207 const uint8 byte = s[l - i - 1];
208 y |= static_cast<uint64>(byte) << (i * 8);
210 y ^= 0x8000000000000000ULL;
211 // This is technically implementation-defined if y > INT64_MAX, so
212 // we're assuming that we're on a twos-complement machine.
213 return static_cast<int64>(y);
216 bool UniquePosition::IsValid() const {
220 std::string UniquePosition::ToDebugString() const {
221 const std::string bytes = Uncompress(compressed_);
223 return std::string("INVALID[]");
225 std::string debug_string = base::HexEncode(bytes.data(), bytes.length());
227 debug_string = "INVALID[" + debug_string + "]";
230 std::string compressed_string =
231 base::HexEncode(compressed_.data(), compressed_.length());
232 debug_string.append(", compressed: " + compressed_string);
236 std::string UniquePosition::GetSuffixForTest() const {
237 const std::string bytes = Uncompress(compressed_);
238 const size_t prefix_len = bytes.length() - kSuffixLength;
239 return bytes.substr(prefix_len, std::string::npos);
242 std::string UniquePosition::FindSmallerWithSuffix(
243 const std::string& reference,
244 const std::string& suffix) {
245 size_t ref_zeroes = reference.find_first_not_of('\0');
246 size_t suffix_zeroes = suffix.find_first_not_of('\0');
248 // Neither of our inputs are allowed to have trailing zeroes, so the following
250 DCHECK_NE(ref_zeroes, std::string::npos);
251 DCHECK_NE(suffix_zeroes, std::string::npos);
253 if (suffix_zeroes > ref_zeroes) {
254 // Implies suffix < ref.
255 return std::string();
258 if (suffix.substr(suffix_zeroes) < reference.substr(ref_zeroes)) {
259 // Prepend zeroes so the result has as many zero digits as |reference|.
260 return std::string(ref_zeroes - suffix_zeroes, '\0');
261 } else if (suffix_zeroes > 1) {
262 // Prepend zeroes so the result has one more zero digit than |reference|.
263 // We could also take the "else" branch below, but taking this branch will
264 // give us a smaller result.
265 return std::string(ref_zeroes - suffix_zeroes + 1, '\0');
267 // Prepend zeroes to match those in the |reference|, then something smaller
268 // than the first non-zero digit in |reference|.
269 char lt_digit = static_cast<uint8>(reference[ref_zeroes])/2;
270 return std::string(ref_zeroes, '\0') + lt_digit;
275 std::string UniquePosition::FindGreaterWithSuffix(
276 const std::string& reference,
277 const std::string& suffix) {
278 size_t ref_FFs = reference.find_first_not_of(kuint8max);
279 size_t suffix_FFs = suffix.find_first_not_of(kuint8max);
281 if (ref_FFs == std::string::npos) {
282 ref_FFs = reference.length();
284 if (suffix_FFs == std::string::npos) {
285 suffix_FFs = suffix.length();
288 if (suffix_FFs > ref_FFs) {
289 // Implies suffix > reference.
290 return std::string();
293 if (suffix.substr(suffix_FFs) > reference.substr(ref_FFs)) {
294 // Prepend FF digits to match those in |reference|.
295 return std::string(ref_FFs - suffix_FFs, kuint8max);
296 } else if (suffix_FFs > 1) {
297 // Prepend enough leading FF digits so result has one more of them than
298 // |reference| does. We could also take the "else" branch below, but this
299 // gives us a smaller result.
300 return std::string(ref_FFs - suffix_FFs + 1, kuint8max);
302 // Prepend FF digits to match those in |reference|, then something larger
303 // than the first non-FF digit in |reference|.
304 char gt_digit = static_cast<uint8>(reference[ref_FFs]) +
305 (kuint8max - static_cast<uint8>(reference[ref_FFs]) + 1) / 2;
306 return std::string(ref_FFs, kuint8max) + gt_digit;
311 std::string UniquePosition::FindBetweenWithSuffix(
312 const std::string& before,
313 const std::string& after,
314 const std::string& suffix) {
315 DCHECK(IsValidSuffix(suffix));
316 DCHECK_NE(before, after);
317 DCHECK_LT(before, after);
321 // Sometimes our suffix puts us where we want to be.
322 if (before < suffix && suffix < after) {
323 return std::string();
327 for ( ; i < std::min(before.length(), after.length()); ++i) {
328 uint8 a_digit = before[i];
329 uint8 b_digit = after[i];
331 if (b_digit - a_digit >= 2) {
332 mid.push_back(a_digit + (b_digit - a_digit)/2);
334 } else if (a_digit == b_digit) {
335 mid.push_back(a_digit);
337 // Both strings are equal so far. Will appending the suffix at this point
338 // give us the comparison we're looking for?
339 if (before.substr(i+1) < suffix && suffix < after.substr(i+1)) {
343 DCHECK_EQ(b_digit - a_digit, 1); // Implied by above if branches.
345 // The two options are off by one digit. The choice of whether to round
346 // up or down here will have consequences on what we do with the remaining
347 // digits. Exploring both options is an optimization and is not required
348 // for the correctness of this algorithm.
350 // Option A: Round down the current digit. This makes our |mid| <
351 // |after|, no matter what we append afterwards. We then focus on
352 // appending digits until |mid| > |before|.
353 std::string mid_a = mid;
354 mid_a.push_back(a_digit);
355 mid_a.append(FindGreaterWithSuffix(before.substr(i+1), suffix));
357 // Option B: Round up the current digit. This makes our |mid| > |before|,
358 // no matter what we append afterwards. We then focus on appending digits
359 // until |mid| < |after|. Note that this option may not be viable if the
360 // current digit is the last one in |after|, so we skip the option in that
362 if (after.length() > i+1) {
363 std::string mid_b = mid;
364 mid_b.push_back(b_digit);
365 mid_b.append(FindSmallerWithSuffix(after.substr(i+1), suffix));
367 // Does this give us a shorter position value? If so, use it.
368 if (mid_b.length() < mid_a.length()) {
376 // If we haven't found a midpoint yet, the following must be true:
377 DCHECK_EQ(before.substr(0, i), after.substr(0, i));
378 DCHECK_EQ(before, mid);
379 DCHECK_LT(before.length(), after.length());
381 // We know that we'll need to append at least one more byte to |mid| in the
382 // process of making it < |after|. Appending any digit, regardless of the
383 // value, will make |before| < |mid|. Therefore, the following will get us a
386 mid.append(FindSmallerWithSuffix(after.substr(i), suffix));
390 UniquePosition::UniquePosition(const std::string& internal_rep)
391 : compressed_(internal_rep),
392 is_valid_(IsValidBytes(Uncompress(internal_rep))) {
395 UniquePosition::UniquePosition(
396 const std::string& uncompressed,
397 const std::string& suffix)
398 : compressed_(Compress(uncompressed)),
399 is_valid_(IsValidBytes(uncompressed)) {
400 DCHECK(uncompressed.rfind(suffix) + kSuffixLength == uncompressed.length());
401 DCHECK(IsValidSuffix(suffix));
405 // On custom compression:
407 // Let C(x) be the compression function and U(x) be the uncompression function.
409 // This compression scheme has a few special properties. For one, it is
410 // order-preserving. For any two valid position strings x and y:
411 // x < y <=> C(x) < C(y)
412 // This allows us keep the position strings compressed as we sort them.
414 // The compressed format and the decode algorithm:
416 // The compressed string is a series of blocks, almost all of which are 8 bytes
417 // in length. The only exception is the last block in the compressed string,
418 // which may be a remainder block, which has length no greater than 7. The
419 // full-length blocks are either repeated character blocks or plain data blocks.
420 // All blocks are entirely self-contained. Their decoded values are independent
421 // from that of their neighbours.
423 // A repeated character block is encoded into eight bytes and represents between
424 // 4 and 2^31 repeated instances of a given character in the unencoded stream.
425 // The encoding consists of a single character repeated four times, followed by
426 // an encoded count. The encoded count is stored as a big-endian 32 bit
427 // integer. There are 2^31 possible count values, and two encodings for each.
428 // The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc =
429 // count'. At compression time, the algorithm will choose between the two
430 // encodings based on which of the two will maintain the appropriate sort
431 // ordering (by a process which will be described below). The decompression
432 // algorithm need not concern itself with which encoding was used; it needs only
433 // to decode it. The decoded value of this block is "count" instances of the
434 // character that was repeated four times in the first half of this block.
436 // A plain data block is encoded into eight bytes and represents exactly eight
437 // bytes of data in the unencoded stream. The plain data block must not begin
438 // with the same character repeated four times. It is allowed to contain such a
439 // four-character sequence, just not at the start of the block. The decoded
440 // value of a plain data block is identical to its encoded value.
442 // A remainder block has length of at most seven. It is a shorter version of
443 // the plain data block. It occurs only at the end of the encoded stream and
444 // represents exactly as many bytes of unencoded data as its own length. Like a
445 // plain data block, the remainder block never begins with the same character
446 // repeated four times. The decoded value of this block is identical to its
449 // The encode algorithm:
451 // From the above description, it can be seen that there may be more than one
452 // way to encode a given input string. The encoder must be careful to choose
453 // the encoding that guarantees sort ordering.
455 // The rules for the encoder are as follows:
456 // 1. Iterate through the input string and produce output blocks one at a time.
457 // 2. Where possible (ie. where the next four bytes of input consist of the
458 // same character repeated four times), produce a repeated data block of
459 // maximum possible length.
460 // 3. If there is at least 8 bytes of data remaining and it is not possible
461 // to produce a repeated character block, produce a plain data block.
462 // 4. If there are less than 8 bytes of data remaining and it is not possible
463 // to produce a repeated character block, produce a remainder block.
464 // 5. When producing a repeated character block, the count encoding must be
465 // chosen in such a way that the sort ordering is maintained. The choice is
466 // best illustrated by way of example:
468 // When comparing two strings, the first of which begins with of 8
469 // instances of the letter 'B' and the second with 10 instances of the
470 // letter 'B', which of the two should compare lower? The result depends
471 // on the 9th character of the first string, since it will be compared
472 // against the 9th 'B' in the second string. If that character is an 'A',
473 // then the first string will compare lower. If it is a 'C', then the
474 // first string will compare higher.
476 // The key insight is that the comparison value of a repeated character block
477 // depends on the value of the character that follows it. If the character
478 // follows the repeated character has a value greater than the repeated
479 // character itself, then a shorter run length should translate to a higher
480 // comparison value. Therefore, we encode its count using the low encoding.
481 // Similarly, if the following character is lower, we use the high encoding.
485 // Appends an encoded run length to |output_str|.
486 static void WriteEncodedRunLength(uint32 length,
488 std::string* output_str) {
489 CHECK_GE(length, 4U);
490 CHECK_LT(length, 0x80000000);
492 // Step 1: Invert the count, if necessary, to account for the following digit.
493 uint32 encoded_length;
495 encoded_length = 0xffffffff - length;
497 encoded_length = length;
500 // Step 2: Write it as big-endian so it compares correctly with memcmp(3).
501 output_str->append(1, 0xff & (encoded_length >> 24U));
502 output_str->append(1, 0xff & (encoded_length >> 16U));
503 output_str->append(1, 0xff & (encoded_length >> 8U));
504 output_str->append(1, 0xff & (encoded_length >> 0U));
507 // Reads an encoded run length for |str| at position |i|.
508 static uint32 ReadEncodedRunLength(const std::string& str, size_t i) {
509 DCHECK_LE(i + 4, str.length());
511 // Step 1: Extract the big-endian count.
512 uint32 encoded_length =
513 ((uint8)(str[i+3]) << 0) |
514 ((uint8)(str[i+2]) << 8) |
515 ((uint8)(str[i+1]) << 16) |
516 ((uint8)(str[i+0]) << 24);
518 // Step 2: If this was an inverted count, un-invert it.
520 if (encoded_length & 0x80000000) {
521 length = 0xffffffff - encoded_length;
523 length = encoded_length;
529 // A series of four identical chars at the beginning of a block indicates
530 // the beginning of a repeated character block.
531 static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) {
532 return chars[start_index] == chars[start_index+1]
533 && chars[start_index] == chars[start_index+2]
534 && chars[start_index] == chars[start_index+3];
540 // Wraps the CompressImpl function with a bunch of DCHECKs.
541 std::string UniquePosition::Compress(const std::string& str) {
542 DCHECK(IsValidBytes(str));
543 std::string compressed = CompressImpl(str);
544 DCHECK(IsValidCompressed(compressed));
545 DCHECK_EQ(str, Uncompress(compressed));
550 // Performs the order preserving run length compression of a given input string.
551 std::string UniquePosition::CompressImpl(const std::string& str) {
554 // The compressed length will usually be at least as long as the suffix (28),
555 // since the suffix bytes are mostly random. Most are a few bytes longer; a
556 // small few are tens of bytes longer. Some early tests indicated that
557 // roughly 99% had length 40 or smaller. We guess that pre-sizing for 48 is a
558 // good trade-off, but that has not been confirmed through profiling.
561 // Each loop iteration will consume 8, or N bytes, where N >= 4 and is the
562 // length of a string of identical digits starting at i.
563 for (size_t i = 0; i < str.length(); ) {
564 if (i + 4 <= str.length() && IsRepeatedCharPrefix(str, i)) {
565 // Four identical bytes in a row at this position means that we must start
566 // a repeated character block. Begin by outputting those four bytes.
567 output.append(str, i, 4);
569 // Determine the size of the run.
570 const char rep_digit = str[i];
571 const size_t runs_until = str.find_first_not_of(rep_digit, i+4);
573 // Handle the 'runs until end' special case specially.
575 bool encode_high; // True if the next byte is greater than |rep_digit|.
576 if (runs_until == std::string::npos) {
577 run_length = str.length() - i;
580 run_length = runs_until - i;
581 encode_high = static_cast<uint8>(str[runs_until]) >
582 static_cast<uint8>(rep_digit);
584 DCHECK_LT(run_length, static_cast<size_t>(kint32max))
585 << "This implementation can't encode run-lengths greater than 2^31.";
587 WriteEncodedRunLength(run_length, encode_high, &output);
588 i += run_length; // Jump forward by the size of the run length.
590 // Output up to eight bytes without any encoding.
591 const size_t len = std::min(static_cast<size_t>(8), str.length() - i);
592 output.append(str, i, len);
593 i += len; // Jump forward by the amount of input consumed (usually 8).
601 // Uncompresses strings that were compresed with UniquePosition::Compress.
602 std::string UniquePosition::Uncompress(const std::string& str) {
605 // Iterate through the compressed string one block at a time.
606 for (i = 0; i + 8 <= str.length(); i += 8) {
607 if (IsRepeatedCharPrefix(str, i)) {
608 // Found a repeated character block. Expand it.
609 const char rep_digit = str[i];
610 uint32 length = ReadEncodedRunLength(str, i+4);
611 output.append(length, rep_digit);
613 // Found a regular block. Copy it.
614 output.append(str, i, 8);
617 // Copy the remaining bytes that were too small to form a block.
618 output.append(str, i, std::string::npos);
622 bool UniquePosition::IsValidCompressed(const std::string& str) {
623 for (size_t i = 0; i + 8 <= str.length(); i += 8) {
624 if (IsRepeatedCharPrefix(str, i)) {
625 uint32 count = ReadEncodedRunLength(str, i+4);
627 // A repeated character block should at least represent the four
628 // characters that started it.
631 if (str[i] == str[i+4]) {
632 // Does the next digit after a count match the repeated character? Then
633 // this is not the highest possible count.
638 // We don't bother looking for the existence or checking the validity of
639 // any partial blocks. There's no way they could be invalid anyway.
643 } // namespace syncer