src/sync/internal_api/public/base/unique_position.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "sync/internal_api/public/base/unique_position.h"
   6
   7 #include "base/basictypes.h"
   8 #include "base/logging.h"
   9 #include "base/stl_util.h"
  10 #include "base/strings/string_number_conversions.h"
  11 #include "sync/protocol/unique_position.pb.h"
  12 #include "third_party/zlib/zlib.h"
  13
  14 namespace syncer {
  15
  16 const size_t UniquePosition::kSuffixLength = 28;
  17 const size_t UniquePosition::kCompressBytesThreshold = 128;
  18
  19 // static.
  20 bool UniquePosition::IsValidSuffix(const std::string& suffix) {
  21   // The suffix must be exactly the specified length, otherwise unique suffixes
  22   // are not sufficient to guarantee unique positions (because prefix + suffix
  23   // == p + refixsuffix).
  24   return suffix.length() == kSuffixLength;
  25 }
  26
  27 // static.
  28 bool UniquePosition::IsValidBytes(const std::string& bytes) {
  29   // The first condition ensures that our suffix uniqueness is sufficient to
  30   // guarantee position uniqueness.  Otherwise, it's possible the end of some
  31   // prefix + some short suffix == some long suffix.
  32   // The second condition ensures that FindSmallerWithSuffix can always return a
  33   // result.
  34   return bytes.length() >= kSuffixLength
  35       && bytes[bytes.length()-1] != 0;
  36 }
  37
  38 // static.
  39 UniquePosition UniquePosition::CreateInvalid() {
  40   UniquePosition pos;
  41   DCHECK(!pos.IsValid());
  42   return pos;
  43 }
  44
  45 // static.
  46 UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) {
  47   if (proto.has_custom_compressed_v1()) {
  48     return UniquePosition(proto.custom_compressed_v1());
  49   } else if (proto.has_value() && !proto.value().empty()) {
  50     return UniquePosition(Compress(proto.value()));
  51   } else if (proto.has_compressed_value() && proto.has_uncompressed_length()) {
  52     uLongf uncompressed_len = proto.uncompressed_length();
  53     std::string un_gzipped;
  54
  55     un_gzipped.resize(uncompressed_len);
  56     int result = uncompress(
  57         reinterpret_cast<Bytef*>(string_as_array(&un_gzipped)),
  58         &uncompressed_len,
  59         reinterpret_cast<const Bytef*>(proto.compressed_value().data()),
  60         proto.compressed_value().size());
  61     if (result != Z_OK) {
  62       DLOG(ERROR) << "Unzip failed " << result;
  63       return UniquePosition::CreateInvalid();
  64     }
  65     if (uncompressed_len != proto.uncompressed_length()) {
  66       DLOG(ERROR)
  67           << "Uncompressed length " << uncompressed_len
  68           << " did not match specified length " << proto.uncompressed_length();
  69       return UniquePosition::CreateInvalid();
  70     }
  71     return UniquePosition(Compress(un_gzipped));
  72   } else {
  73     return UniquePosition::CreateInvalid();
  74   }
  75 }
  76
  77 // static.
  78 UniquePosition UniquePosition::FromInt64(
  79     int64 x, const std::string& suffix) {
  80   uint64 y = static_cast<uint64>(x);
  81   y ^= 0x8000000000000000ULL; // Make it non-negative.
  82   std::string bytes(8, 0);
  83   for (int i = 7; i >= 0; --i) {
  84     bytes[i] = static_cast<uint8>(y);
  85     y >>= 8;
  86   }
  87   return UniquePosition(bytes + suffix, suffix);
  88 }
  89
  90 // static.
  91 UniquePosition UniquePosition::InitialPosition(
  92     const std::string& suffix) {
  93   DCHECK(IsValidSuffix(suffix));
  94   return UniquePosition(suffix, suffix);
  95 }
  96
  97 // static.
  98 UniquePosition UniquePosition::Before(
  99     const UniquePosition& x,
 100     const std::string& suffix) {
 101   DCHECK(IsValidSuffix(suffix));
 102   DCHECK(x.IsValid());
 103   const std::string& before = FindSmallerWithSuffix(
 104       Uncompress(x.compressed_), suffix);
 105   return UniquePosition(before + suffix, suffix);
 106 }
 107
 108 // static.
 109 UniquePosition UniquePosition::After(
 110     const UniquePosition& x,
 111     const std::string& suffix) {
 112   DCHECK(IsValidSuffix(suffix));
 113   DCHECK(x.IsValid());
 114   const std::string& after = FindGreaterWithSuffix(
 115       Uncompress(x.compressed_), suffix);
 116   return UniquePosition(after + suffix, suffix);
 117 }
 118
 119 // static.
 120 UniquePosition UniquePosition::Between(
 121     const UniquePosition& before,
 122     const UniquePosition& after,
 123     const std::string& suffix) {
 124   DCHECK(before.IsValid());
 125   DCHECK(after.IsValid());
 126   DCHECK(before.LessThan(after));
 127   DCHECK(IsValidSuffix(suffix));
 128   const std::string& mid = FindBetweenWithSuffix(
 129       Uncompress(before.compressed_),
 130       Uncompress(after.compressed_),
 131       suffix);
 132   return UniquePosition(mid + suffix, suffix);
 133 }
 134
 135 UniquePosition::UniquePosition() : is_valid_(false) {}
 136
 137 bool UniquePosition::LessThan(const UniquePosition& other) const {
 138   DCHECK(this->IsValid());
 139   DCHECK(other.IsValid());
 140
 141   return compressed_ < other.compressed_;
 142 }
 143
 144 bool UniquePosition::Equals(const UniquePosition& other) const {
 145   if (!this->IsValid() && !other.IsValid())
 146     return true;
 147
 148   return compressed_ == other.compressed_;
 149 }
 150
 151 void UniquePosition::ToProto(sync_pb::UniquePosition* proto) const {
 152   proto->Clear();
 153
 154   // This is the current preferred foramt.
 155   proto->set_custom_compressed_v1(compressed_);
 156
 157   // Some older clients (M28) don't know how to read that format.  We don't want
 158   // to break them until they're obsolete.  We'll serialize to the old-style in
 159   // addition to the new so they won't be confused.
 160   std::string bytes = Uncompress(compressed_);
 161   if (bytes.size() < kCompressBytesThreshold) {
 162     // If it's small, then just write it.  This is the common case.
 163     proto->set_value(bytes);
 164   } else {
 165     // We've got a large one.  Compress it.
 166     proto->set_uncompressed_length(bytes.size());
 167     std::string* compressed = proto->mutable_compressed_value();
 168
 169     uLongf compressed_len = compressBound(bytes.size());
 170     compressed->resize(compressed_len);
 171     int result = compress(reinterpret_cast<Bytef*>(string_as_array(compressed)),
 172              &compressed_len,
 173              reinterpret_cast<const Bytef*>(bytes.data()),
 174              bytes.size());
 175     if (result != Z_OK) {
 176       NOTREACHED() << "Failed to compress position: " << result;
 177       // Maybe we can write an uncompressed version?
 178       proto->Clear();
 179       proto->set_value(bytes);
 180     } else if (compressed_len >= bytes.size()) {
 181       // Oops, we made it bigger.  Just write the uncompressed version instead.
 182       proto->Clear();
 183       proto->set_value(bytes);
 184     } else {
 185       // Success!  Don't forget to adjust the string's length.
 186       compressed->resize(compressed_len);
 187     }
 188   }
 189 }
 190
 191 void UniquePosition::SerializeToString(std::string* blob) const {
 192   DCHECK(blob);
 193   sync_pb::UniquePosition proto;
 194   ToProto(&proto);
 195   proto.SerializeToString(blob);
 196 }
 197
 198 int64 UniquePosition::ToInt64() const {
 199   uint64 y = 0;
 200   const std::string& s = Uncompress(compressed_);
 201   size_t l = sizeof(int64);
 202   if (s.length() < l) {
 203     NOTREACHED();
 204     l = s.length();
 205   }
 206   for (size_t i = 0; i < l; ++i) {
 207     const uint8 byte = s[l - i - 1];
 208     y |= static_cast<uint64>(byte) << (i * 8);
 209   }
 210   y ^= 0x8000000000000000ULL;
 211   // This is technically implementation-defined if y > INT64_MAX, so
 212   // we're assuming that we're on a twos-complement machine.
 213   return static_cast<int64>(y);
 214 }
 215
 216 bool UniquePosition::IsValid() const {
 217   return is_valid_;
 218 }
 219
 220 std::string UniquePosition::ToDebugString() const {
 221   const std::string bytes = Uncompress(compressed_);
 222   if (bytes.empty())
 223     return std::string("INVALID[]");
 224
 225   std::string debug_string = base::HexEncode(bytes.data(), bytes.length());
 226   if (!IsValid()) {
 227     debug_string = "INVALID[" + debug_string + "]";
 228   }
 229
 230   std::string compressed_string =
 231       base::HexEncode(compressed_.data(), compressed_.length());
 232   debug_string.append(", compressed: " + compressed_string);
 233   return debug_string;
 234 }
 235
 236 std::string UniquePosition::GetSuffixForTest() const {
 237   const std::string bytes = Uncompress(compressed_);
 238   const size_t prefix_len = bytes.length() - kSuffixLength;
 239   return bytes.substr(prefix_len, std::string::npos);
 240 }
 241
 242 std::string UniquePosition::FindSmallerWithSuffix(
 243     const std::string& reference,
 244     const std::string& suffix) {
 245   size_t ref_zeroes = reference.find_first_not_of('\0');
 246   size_t suffix_zeroes = suffix.find_first_not_of('\0');
 247
 248   // Neither of our inputs are allowed to have trailing zeroes, so the following
 249   // must be true.
 250   DCHECK_NE(ref_zeroes, std::string::npos);
 251   DCHECK_NE(suffix_zeroes, std::string::npos);
 252
 253   if (suffix_zeroes > ref_zeroes) {
 254     // Implies suffix < ref.
 255     return std::string();
 256   }
 257
 258   if (suffix.substr(suffix_zeroes) < reference.substr(ref_zeroes)) {
 259     // Prepend zeroes so the result has as many zero digits as |reference|.
 260     return std::string(ref_zeroes - suffix_zeroes, '\0');
 261   } else if (suffix_zeroes > 1) {
 262     // Prepend zeroes so the result has one more zero digit than |reference|.
 263     // We could also take the "else" branch below, but taking this branch will
 264     // give us a smaller result.
 265     return std::string(ref_zeroes - suffix_zeroes + 1, '\0');
 266   } else {
 267     // Prepend zeroes to match those in the |reference|, then something smaller
 268     // than the first non-zero digit in |reference|.
 269     char lt_digit = static_cast<uint8>(reference[ref_zeroes])/2;
 270     return std::string(ref_zeroes, '\0') + lt_digit;
 271   }
 272 }
 273
 274 // static
 275 std::string UniquePosition::FindGreaterWithSuffix(
 276     const std::string& reference,
 277     const std::string& suffix) {
 278   size_t ref_FFs = reference.find_first_not_of(kuint8max);
 279   size_t suffix_FFs = suffix.find_first_not_of(kuint8max);
 280
 281   if (ref_FFs == std::string::npos) {
 282     ref_FFs = reference.length();
 283   }
 284   if (suffix_FFs == std::string::npos) {
 285     suffix_FFs = suffix.length();
 286   }
 287
 288   if (suffix_FFs > ref_FFs) {
 289     // Implies suffix > reference.
 290     return std::string();
 291   }
 292
 293   if (suffix.substr(suffix_FFs) > reference.substr(ref_FFs)) {
 294     // Prepend FF digits to match those in |reference|.
 295     return std::string(ref_FFs - suffix_FFs, kuint8max);
 296   } else if (suffix_FFs > 1) {
 297     // Prepend enough leading FF digits so result has one more of them than
 298     // |reference| does.  We could also take the "else" branch below, but this
 299     // gives us a smaller result.
 300     return std::string(ref_FFs - suffix_FFs + 1, kuint8max);
 301   } else {
 302     // Prepend FF digits to match those in |reference|, then something larger
 303     // than the first non-FF digit in |reference|.
 304     char gt_digit = static_cast<uint8>(reference[ref_FFs]) +
 305         (kuint8max - static_cast<uint8>(reference[ref_FFs]) + 1) / 2;
 306     return std::string(ref_FFs, kuint8max) + gt_digit;
 307   }
 308 }
 309
 310 // static
 311 std::string UniquePosition::FindBetweenWithSuffix(
 312     const std::string& before,
 313     const std::string& after,
 314     const std::string& suffix) {
 315   DCHECK(IsValidSuffix(suffix));
 316   DCHECK_NE(before, after);
 317   DCHECK_LT(before, after);
 318
 319   std::string mid;
 320
 321   // Sometimes our suffix puts us where we want to be.
 322   if (before < suffix && suffix < after) {
 323     return std::string();
 324   }
 325
 326   size_t i = 0;
 327   for ( ; i < std::min(before.length(), after.length()); ++i) {
 328     uint8 a_digit = before[i];
 329     uint8 b_digit = after[i];
 330
 331     if (b_digit - a_digit >= 2) {
 332       mid.push_back(a_digit + (b_digit - a_digit)/2);
 333       return mid;
 334     } else if (a_digit == b_digit) {
 335       mid.push_back(a_digit);
 336
 337       // Both strings are equal so far.  Will appending the suffix at this point
 338       // give us the comparison we're looking for?
 339       if (before.substr(i+1) < suffix && suffix < after.substr(i+1)) {
 340         return mid;
 341       }
 342     } else {
 343       DCHECK_EQ(b_digit - a_digit, 1);  // Implied by above if branches.
 344
 345       // The two options are off by one digit.  The choice of whether to round
 346       // up or down here will have consequences on what we do with the remaining
 347       // digits.  Exploring both options is an optimization and is not required
 348       // for the correctness of this algorithm.
 349
 350       // Option A: Round down the current digit.  This makes our |mid| <
 351       // |after|, no matter what we append afterwards.  We then focus on
 352       // appending digits until |mid| > |before|.
 353       std::string mid_a = mid;
 354       mid_a.push_back(a_digit);
 355       mid_a.append(FindGreaterWithSuffix(before.substr(i+1), suffix));
 356
 357       // Option B: Round up the current digit.  This makes our |mid| > |before|,
 358       // no matter what we append afterwards.  We then focus on appending digits
 359       // until |mid| < |after|.  Note that this option may not be viable if the
 360       // current digit is the last one in |after|, so we skip the option in that
 361       // case.
 362       if (after.length() > i+1) {
 363         std::string mid_b = mid;
 364         mid_b.push_back(b_digit);
 365         mid_b.append(FindSmallerWithSuffix(after.substr(i+1), suffix));
 366
 367         // Does this give us a shorter position value?  If so, use it.
 368         if (mid_b.length() < mid_a.length()) {
 369           return mid_b;
 370         }
 371       }
 372       return mid_a;
 373     }
 374   }
 375
 376   // If we haven't found a midpoint yet, the following must be true:
 377   DCHECK_EQ(before.substr(0, i), after.substr(0, i));
 378   DCHECK_EQ(before, mid);
 379   DCHECK_LT(before.length(), after.length());
 380
 381   // We know that we'll need to append at least one more byte to |mid| in the
 382   // process of making it < |after|.  Appending any digit, regardless of the
 383   // value, will make |before| < |mid|.  Therefore, the following will get us a
 384   // valid position.
 385
 386   mid.append(FindSmallerWithSuffix(after.substr(i), suffix));
 387   return mid;
 388 }
 389
 390 UniquePosition::UniquePosition(const std::string& internal_rep)
 391     : compressed_(internal_rep),
 392       is_valid_(IsValidBytes(Uncompress(internal_rep))) {
 393 }
 394
 395 UniquePosition::UniquePosition(
 396     const std::string& uncompressed,
 397     const std::string& suffix)
 398   : compressed_(Compress(uncompressed)),
 399     is_valid_(IsValidBytes(uncompressed)) {
 400   DCHECK(uncompressed.rfind(suffix) + kSuffixLength == uncompressed.length());
 401   DCHECK(IsValidSuffix(suffix));
 402   DCHECK(IsValid());
 403 }
 404
 405 // On custom compression:
 406 //
 407 // Let C(x) be the compression function and U(x) be the uncompression function.
 408 //
 409 // This compression scheme has a few special properties.  For one, it is
 410 // order-preserving.  For any two valid position strings x and y:
 411 //   x < y <=> C(x) < C(y)
 412 // This allows us keep the position strings compressed as we sort them.
 413 //
 414 // The compressed format and the decode algorithm:
 415 //
 416 // The compressed string is a series of blocks, almost all of which are 8 bytes
 417 // in length.  The only exception is the last block in the compressed string,
 418 // which may be a remainder block, which has length no greater than 7.  The
 419 // full-length blocks are either repeated character blocks or plain data blocks.
 420 // All blocks are entirely self-contained.  Their decoded values are independent
 421 // from that of their neighbours.
 422 //
 423 // A repeated character block is encoded into eight bytes and represents between
 424 // 4 and 2^31 repeated instances of a given character in the unencoded stream.
 425 // The encoding consists of a single character repeated four times, followed by
 426 // an encoded count.  The encoded count is stored as a big-endian 32 bit
 427 // integer.  There are 2^31 possible count values, and two encodings for each.
 428 // The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc =
 429 // count'.  At compression time, the algorithm will choose between the two
 430 // encodings based on which of the two will maintain the appropriate sort
 431 // ordering (by a process which will be described below).  The decompression
 432 // algorithm need not concern itself with which encoding was used; it needs only
 433 // to decode it.  The decoded value of this block is "count" instances of the
 434 // character that was repeated four times in the first half of this block.
 435 //
 436 // A plain data block is encoded into eight bytes and represents exactly eight
 437 // bytes of data in the unencoded stream.  The plain data block must not begin
 438 // with the same character repeated four times.  It is allowed to contain such a
 439 // four-character sequence, just not at the start of the block.  The decoded
 440 // value of a plain data block is identical to its encoded value.
 441 //
 442 // A remainder block has length of at most seven.  It is a shorter version of
 443 // the plain data block.  It occurs only at the end of the encoded stream and
 444 // represents exactly as many bytes of unencoded data as its own length.  Like a
 445 // plain data block, the remainder block never begins with the same character
 446 // repeated four times.  The decoded value of this block is identical to its
 447 // encoded value.
 448 //
 449 // The encode algorithm:
 450 //
 451 // From the above description, it can be seen that there may be more than one
 452 // way to encode a given input string.  The encoder must be careful to choose
 453 // the encoding that guarantees sort ordering.
 454 //
 455 // The rules for the encoder are as follows:
 456 // 1. Iterate through the input string and produce output blocks one at a time.
 457 // 2. Where possible (ie. where the next four bytes of input consist of the
 458 //    same character repeated four times), produce a repeated data block of
 459 //    maximum possible length.
 460 // 3. If there is at least 8 bytes of data remaining and it is not possible
 461 //    to produce a repeated character block, produce a plain data block.
 462 // 4. If there are less than 8 bytes of data remaining and it is not possible
 463 //    to produce a repeated character block, produce a remainder block.
 464 // 5. When producing a repeated character block, the count encoding must be
 465 //    chosen in such a way that the sort ordering is maintained.  The choice is
 466 //    best illustrated by way of example:
 467 //
 468 //      When comparing two strings, the first of which begins with of 8
 469 //      instances of the letter 'B' and the second with 10 instances of the
 470 //      letter 'B', which of the two should compare lower?  The result depends
 471 //      on the 9th character of the first string, since it will be compared
 472 //      against the 9th 'B' in the second string.  If that character is an 'A',
 473 //      then the first string will compare lower.  If it is a 'C', then the
 474 //      first string will compare higher.
 475 //
 476 //    The key insight is that the comparison value of a repeated character block
 477 //    depends on the value of the character that follows it.  If the character
 478 //    follows the repeated character has a value greater than the repeated
 479 //    character itself, then a shorter run length should translate to a higher
 480 //    comparison value.  Therefore, we encode its count using the low encoding.
 481 //    Similarly, if the following character is lower, we use the high encoding.
 482
 483 namespace {
 484
 485 // Appends an encoded run length to |output_str|.
 486 static void WriteEncodedRunLength(uint32 length,
 487                                   bool high_encoding,
 488                                   std::string* output_str) {
 489   CHECK_GE(length, 4U);
 490   CHECK_LT(length, 0x80000000);
 491
 492   // Step 1: Invert the count, if necessary, to account for the following digit.
 493   uint32 encoded_length;
 494   if (high_encoding) {
 495     encoded_length = 0xffffffff - length;
 496   } else {
 497     encoded_length = length;
 498   }
 499
 500   // Step 2: Write it as big-endian so it compares correctly with memcmp(3).
 501   output_str->append(1, 0xff & (encoded_length >> 24U));
 502   output_str->append(1, 0xff & (encoded_length >> 16U));
 503   output_str->append(1, 0xff & (encoded_length >> 8U));
 504   output_str->append(1, 0xff & (encoded_length >> 0U));
 505 }
 506
 507 // Reads an encoded run length for |str| at position |i|.
 508 static uint32 ReadEncodedRunLength(const std::string& str, size_t i) {
 509   DCHECK_LE(i + 4, str.length());
 510
 511   // Step 1: Extract the big-endian count.
 512   uint32 encoded_length =
 513       ((uint8)(str[i+3]) << 0)  |
 514       ((uint8)(str[i+2]) << 8)  |
 515       ((uint8)(str[i+1]) << 16) |
 516       ((uint8)(str[i+0]) << 24);
 517
 518   // Step 2: If this was an inverted count, un-invert it.
 519   uint32 length;
 520   if (encoded_length & 0x80000000) {
 521     length = 0xffffffff - encoded_length;
 522   } else {
 523     length = encoded_length;
 524   }
 525
 526   return length;
 527 }
 528
 529 // A series of four identical chars at the beginning of a block indicates
 530 // the beginning of a repeated character block.
 531 static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) {
 532   return chars[start_index] == chars[start_index+1]
 533       && chars[start_index] == chars[start_index+2]
 534       && chars[start_index] == chars[start_index+3];
 535 }
 536
 537 }  // namespace
 538
 539 // static
 540 // Wraps the CompressImpl function with a bunch of DCHECKs.
 541 std::string UniquePosition::Compress(const std::string& str) {
 542   DCHECK(IsValidBytes(str));
 543   std::string compressed = CompressImpl(str);
 544   DCHECK(IsValidCompressed(compressed));
 545   DCHECK_EQ(str, Uncompress(compressed));
 546   return compressed;
 547 }
 548
 549 // static
 550 // Performs the order preserving run length compression of a given input string.
 551 std::string UniquePosition::CompressImpl(const std::string& str) {
 552   std::string output;
 553
 554   // The compressed length will usually be at least as long as the suffix (28),
 555   // since the suffix bytes are mostly random.  Most are a few bytes longer; a
 556   // small few are tens of bytes longer.  Some early tests indicated that
 557   // roughly 99% had length 40 or smaller.  We guess that pre-sizing for 48 is a
 558   // good trade-off, but that has not been confirmed through profiling.
 559   output.reserve(48);
 560
 561   // Each loop iteration will consume 8, or N bytes, where N >= 4 and is the
 562   // length of a string of identical digits starting at i.
 563   for (size_t i = 0; i < str.length(); ) {
 564     if (i + 4 <= str.length() && IsRepeatedCharPrefix(str, i)) {
 565       // Four identical bytes in a row at this position means that we must start
 566       // a repeated character block.  Begin by outputting those four bytes.
 567       output.append(str, i, 4);
 568
 569       // Determine the size of the run.
 570       const char rep_digit = str[i];
 571       const size_t runs_until = str.find_first_not_of(rep_digit, i+4);
 572
 573       // Handle the 'runs until end' special case specially.
 574       size_t run_length;
 575       bool encode_high;  // True if the next byte is greater than |rep_digit|.
 576       if (runs_until == std::string::npos) {
 577         run_length = str.length() - i;
 578         encode_high = false;
 579       } else {
 580         run_length = runs_until - i;
 581         encode_high = static_cast<uint8>(str[runs_until]) >
 582             static_cast<uint8>(rep_digit);
 583       }
 584       DCHECK_LT(run_length, static_cast<size_t>(kint32max))
 585           << "This implementation can't encode run-lengths greater than 2^31.";
 586
 587       WriteEncodedRunLength(run_length, encode_high, &output);
 588       i += run_length;  // Jump forward by the size of the run length.
 589     } else {
 590       // Output up to eight bytes without any encoding.
 591       const size_t len = std::min(static_cast<size_t>(8), str.length() - i);
 592       output.append(str, i, len);
 593       i += len;  // Jump forward by the amount of input consumed (usually 8).
 594     }
 595   }
 596
 597   return output;
 598 }
 599
 600 // static
 601 // Uncompresses strings that were compresed with UniquePosition::Compress.
 602 std::string UniquePosition::Uncompress(const std::string& str) {
 603   std::string output;
 604   size_t i = 0;
 605   // Iterate through the compressed string one block at a time.
 606   for (i = 0; i + 8 <= str.length(); i += 8) {
 607     if (IsRepeatedCharPrefix(str, i)) {
 608       // Found a repeated character block.  Expand it.
 609       const char rep_digit = str[i];
 610       uint32 length = ReadEncodedRunLength(str, i+4);
 611       output.append(length, rep_digit);
 612     } else {
 613       // Found a regular block.  Copy it.
 614       output.append(str, i, 8);
 615     }
 616   }
 617   // Copy the remaining bytes that were too small to form a block.
 618   output.append(str, i, std::string::npos);
 619   return output;
 620 }
 621
 622 bool UniquePosition::IsValidCompressed(const std::string& str) {
 623   for (size_t i = 0; i + 8 <= str.length(); i += 8) {
 624     if (IsRepeatedCharPrefix(str, i)) {
 625       uint32 count = ReadEncodedRunLength(str, i+4);
 626       if (count < 4) {
 627         // A repeated character block should at least represent the four
 628         // characters that started it.
 629         return false;
 630       }
 631       if (str[i] == str[i+4]) {
 632         // Does the next digit after a count match the repeated character?  Then
 633         // this is not the highest possible count.
 634         return false;
 635       }
 636     }
 637   }
 638   // We don't bother looking for the existence or checking the validity of
 639   // any partial blocks.  There's no way they could be invalid anyway.
 640   return true;
 641 }
 642
 643 }  // namespace syncer