src/base/strings/utf_offset_string_conversions.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/strings/utf_offset_string_conversions.h"
   6
   7 #include <algorithm>
   8
   9 #include "base/logging.h"
  10 #include "base/memory/scoped_ptr.h"
  11 #include "base/strings/string_piece.h"
  12 #include "base/strings/utf_string_conversion_utils.h"
  13
  14 namespace base {
  15
  16 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
  17                                        size_t original_length,
  18                                        size_t output_length)
  19     : original_offset(original_offset),
  20       original_length(original_length),
  21       output_length(output_length) {
  22 }
  23
  24 // static
  25 void OffsetAdjuster::AdjustOffsets(
  26     const Adjustments& adjustments,
  27     std::vector<size_t>* offsets_for_adjustment) {
  28   if (!offsets_for_adjustment || adjustments.empty())
  29     return;
  30   for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
  31        i != offsets_for_adjustment->end(); ++i)
  32     AdjustOffset(adjustments, &(*i));
  33 }
  34
  35 // static
  36 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
  37                                   size_t* offset) {
  38   if (*offset == string16::npos)
  39     return;
  40   int adjustment = 0;
  41   for (Adjustments::const_iterator i = adjustments.begin();
  42        i != adjustments.end(); ++i) {
  43     if (*offset <= i->original_offset)
  44       break;
  45     if (*offset < (i->original_offset + i->original_length)) {
  46       *offset = string16::npos;
  47       return;
  48     }
  49     adjustment += static_cast<int>(i->original_length - i->output_length);
  50   }
  51   *offset -= adjustment;
  52 }
  53
  54 // static
  55 void OffsetAdjuster::UnadjustOffsets(
  56     const Adjustments& adjustments,
  57     std::vector<size_t>* offsets_for_unadjustment) {
  58   if (!offsets_for_unadjustment || adjustments.empty())
  59     return;
  60   for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
  61        i != offsets_for_unadjustment->end(); ++i)
  62     UnadjustOffset(adjustments, &(*i));
  63 }
  64
  65 // static
  66 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
  67                                     size_t* offset) {
  68   if (*offset == string16::npos)
  69     return;
  70   int adjustment = 0;
  71   for (Adjustments::const_iterator i = adjustments.begin();
  72        i != adjustments.end(); ++i) {
  73     if (*offset + adjustment <= i->original_offset)
  74       break;
  75     adjustment += static_cast<int>(i->original_length - i->output_length);
  76     if ((*offset + adjustment) <
  77         (i->original_offset + i->original_length)) {
  78       *offset = string16::npos;
  79       return;
  80     }
  81   }
  82   *offset += adjustment;
  83 }
  84
  85 // static
  86 void OffsetAdjuster::MergeSequentialAdjustments(
  87     const Adjustments& first_adjustments,
  88     Adjustments* adjustments_on_adjusted_string) {
  89   Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
  90   Adjustments::const_iterator first_iter = first_adjustments.begin();
  91   // Simultaneously iterate over all |adjustments_on_adjusted_string| and
  92   // |first_adjustments|, adding adjustments to or correcting the adjustments
  93   // in |adjustments_on_adjusted_string| as we go.  |shift| keeps track of the
  94   // current number of characters collapsed by |first_adjustments| up to this
  95   // point.  |currently_collapsing| keeps track of the number of characters
  96   // collapsed by |first_adjustments| into the current |adjusted_iter|'s
  97   // length.  These are characters that will change |shift| as soon as we're
  98   // done processing the current |adjusted_iter|; they are not yet reflected in
  99   // |shift|.
 100   size_t shift = 0;
 101   size_t currently_collapsing = 0;
 102   while (adjusted_iter != adjustments_on_adjusted_string->end()) {
 103     if ((first_iter == first_adjustments.end()) ||
 104         ((adjusted_iter->original_offset + shift +
 105           adjusted_iter->original_length) <= first_iter->original_offset)) {
 106       // Entire |adjusted_iter| (accounting for its shift and including its
 107       // whole original length) comes before |first_iter|.
 108       //
 109       // Correct the offset at |adjusted_iter| and move onto the next
 110       // adjustment that needs revising.
 111       adjusted_iter->original_offset += shift;
 112       shift += currently_collapsing;
 113       currently_collapsing = 0;
 114       ++adjusted_iter;
 115     } else if ((adjusted_iter->original_offset + shift) >
 116                first_iter->original_offset) {
 117       // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
 118
 119       // It's not possible for the adjustments to overlap.  (It shouldn't
 120       // be possible that we have an |adjusted_iter->original_offset| that,
 121       // when adjusted by the computed |shift|, is in the middle of
 122       // |first_iter|'s output's length.  After all, that would mean the
 123       // current adjustment_on_adjusted_string somehow points to an offset
 124       // that was supposed to have been eliminated by the first set of
 125       // adjustments.)
 126       DCHECK_LE(first_iter->original_offset + first_iter->output_length,
 127                 adjusted_iter->original_offset + shift);
 128
 129       // Add the |first_adjustment_iter| to the full set of adjustments while
 130       // making sure |adjusted_iter| continues pointing to the same element.
 131       // We do this by inserting the |first_adjustment_iter| right before
 132       // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
 133       // the following element.
 134       shift += first_iter->original_length - first_iter->output_length;
 135       adjusted_iter = adjustments_on_adjusted_string->insert(
 136           adjusted_iter, *first_iter);
 137       ++adjusted_iter;
 138       ++first_iter;
 139     } else {
 140       // The first adjustment adjusted something that then got further adjusted
 141       // by the second set of adjustments.  In other words, |first_iter| points
 142       // to something in the range covered by |adjusted_iter|'s length (after
 143       // accounting for |shift|).  Precisely,
 144       //   adjusted_iter->original_offset + shift
 145       //   <=
 146       //   first_iter->original_offset
 147       //   <=
 148       //   adjusted_iter->original_offset + shift +
 149       //       adjusted_iter->original_length
 150
 151       // Modify the current |adjusted_iter| to include whatever collapsing
 152       // happened in |first_iter|, then advance to the next |first_adjustments|
 153       // because we dealt with the current one.
 154       const int collapse = static_cast<int>(first_iter->original_length) -
 155           static_cast<int>(first_iter->output_length);
 156       // This function does not know how to deal with a string that expands and
 157       // then gets modified, only strings that collapse and then get modified.
 158       DCHECK_GT(collapse, 0);
 159       adjusted_iter->original_length += collapse;
 160       currently_collapsing += collapse;
 161       ++first_iter;
 162     }
 163   }
 164   DCHECK_EQ(0u, currently_collapsing);
 165   if (first_iter != first_adjustments.end()) {
 166     // Only first adjustments are left.  These do not need to be modified.
 167     // (Their offsets are already correct with respect to the original string.)
 168     // Append them all.
 169     DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
 170     adjustments_on_adjusted_string->insert(
 171         adjustments_on_adjusted_string->end(), first_iter,
 172         first_adjustments.end());
 173   }
 174 }
 175
 176 // Converts the given source Unicode character type to the given destination
 177 // Unicode character type as a STL string. The given input buffer and size
 178 // determine the source, and the given output STL string will be replaced by
 179 // the result.  If non-NULL, |adjustments| is set to reflect the all the
 180 // alterations to the string that are not one-character-to-one-character.
 181 // It will always be sorted by increasing offset.
 182 template<typename SrcChar, typename DestStdString>
 183 bool ConvertUnicode(const SrcChar* src,
 184                     size_t src_len,
 185                     DestStdString* output,
 186                     OffsetAdjuster::Adjustments* adjustments) {
 187   if (adjustments)
 188     adjustments->clear();
 189   // ICU requires 32-bit numbers.
 190   bool success = true;
 191   int32 src_len32 = static_cast<int32>(src_len);
 192   for (int32 i = 0; i < src_len32; i++) {
 193     uint32 code_point;
 194     size_t original_i = i;
 195     size_t chars_written = 0;
 196     if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
 197       chars_written = WriteUnicodeCharacter(code_point, output);
 198     } else {
 199       chars_written = WriteUnicodeCharacter(0xFFFD, output);
 200       success = false;
 201     }
 202
 203     // Only bother writing an adjustment if this modification changed the
 204     // length of this character.
 205     // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
 206     // character read, not after it (so that incrementing it in the loop
 207     // increment will place it at the right location), so we need to account
 208     // for that in determining the amount that was read.
 209     if (adjustments && ((i - original_i + 1) != chars_written)) {
 210       adjustments->push_back(OffsetAdjuster::Adjustment(
 211           original_i, i - original_i + 1, chars_written));
 212     }
 213   }
 214   return success;
 215 }
 216
 217 bool UTF8ToUTF16WithAdjustments(
 218     const char* src,
 219     size_t src_len,
 220     string16* output,
 221     base::OffsetAdjuster::Adjustments* adjustments) {
 222   PrepareForUTF16Or32Output(src, src_len, output);
 223   return ConvertUnicode(src, src_len, output, adjustments);
 224 }
 225
 226 string16 UTF8ToUTF16WithAdjustments(
 227     const base::StringPiece& utf8,
 228     base::OffsetAdjuster::Adjustments* adjustments) {
 229   string16 result;
 230   UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
 231   return result;
 232 }
 233
 234 string16 UTF8ToUTF16AndAdjustOffsets(
 235     const base::StringPiece& utf8,
 236     std::vector<size_t>* offsets_for_adjustment) {
 237   std::for_each(offsets_for_adjustment->begin(),
 238                 offsets_for_adjustment->end(),
 239                 LimitOffset<base::StringPiece>(utf8.length()));
 240   OffsetAdjuster::Adjustments adjustments;
 241   string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
 242   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
 243   return result;
 244 }
 245
 246 std::string UTF16ToUTF8AndAdjustOffsets(
 247     const base::StringPiece16& utf16,
 248     std::vector<size_t>* offsets_for_adjustment) {
 249   std::for_each(offsets_for_adjustment->begin(),
 250                 offsets_for_adjustment->end(),
 251                 LimitOffset<base::StringPiece16>(utf16.length()));
 252   std::string result;
 253   PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
 254   OffsetAdjuster::Adjustments adjustments;
 255   ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
 256   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
 257   return result;
 258 }
 259
 260 }  // namespace base