Upstream version 7.36.149.0
[platform/framework/web/crosswalk.git] / src / base / strings / utf_offset_string_conversions.cc
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/utf_offset_string_conversions.h"
6
7 #include <algorithm>
8
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string_piece.h"
12 #include "base/strings/utf_string_conversion_utils.h"
13
14 namespace base {
15
16 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
17                                        size_t original_length,
18                                        size_t output_length)
19     : original_offset(original_offset),
20       original_length(original_length),
21       output_length(output_length) {
22 }
23
24 // static
25 void OffsetAdjuster::AdjustOffsets(
26     const Adjustments& adjustments,
27     std::vector<size_t>* offsets_for_adjustment) {
28   if (!offsets_for_adjustment || adjustments.empty())
29     return;
30   for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
31        i != offsets_for_adjustment->end(); ++i)
32     AdjustOffset(adjustments, &(*i));
33 }
34
35 // static
36 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
37                                   size_t* offset) {
38   if (*offset == string16::npos)
39     return;
40   int adjustment = 0;
41   for (Adjustments::const_iterator i = adjustments.begin();
42        i != adjustments.end(); ++i) {
43     if (*offset <= i->original_offset)
44       break;
45     if (*offset < (i->original_offset + i->original_length)) {
46       *offset = string16::npos;
47       return;
48     }
49     adjustment += static_cast<int>(i->original_length - i->output_length);
50   }
51   *offset -= adjustment;
52 }
53
54 // static
55 void OffsetAdjuster::UnadjustOffsets(
56     const Adjustments& adjustments,
57     std::vector<size_t>* offsets_for_unadjustment) {
58   if (!offsets_for_unadjustment || adjustments.empty())
59     return;
60   for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
61        i != offsets_for_unadjustment->end(); ++i)
62     UnadjustOffset(adjustments, &(*i));
63 }
64
65 // static
66 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
67                                     size_t* offset) {
68   if (*offset == string16::npos)
69     return;
70   int adjustment = 0;
71   for (Adjustments::const_iterator i = adjustments.begin();
72        i != adjustments.end(); ++i) {
73     if (*offset + adjustment <= i->original_offset)
74       break;
75     adjustment += static_cast<int>(i->original_length - i->output_length);
76     if ((*offset + adjustment) <
77         (i->original_offset + i->original_length)) {
78       *offset = string16::npos;
79       return;
80     }
81   }
82   *offset += adjustment;
83 }
84
85 // static
86 void OffsetAdjuster::MergeSequentialAdjustments(
87     const Adjustments& first_adjustments,
88     Adjustments* adjustments_on_adjusted_string) {
89   Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
90   Adjustments::const_iterator first_iter = first_adjustments.begin();
91   // Simultaneously iterate over all |adjustments_on_adjusted_string| and
92   // |first_adjustments|, adding adjustments to or correcting the adjustments
93   // in |adjustments_on_adjusted_string| as we go.  |shift| keeps track of the
94   // current number of characters collapsed by |first_adjustments| up to this
95   // point.  |currently_collapsing| keeps track of the number of characters
96   // collapsed by |first_adjustments| into the current |adjusted_iter|'s
97   // length.  These are characters that will change |shift| as soon as we're
98   // done processing the current |adjusted_iter|; they are not yet reflected in
99   // |shift|.
100   size_t shift = 0;
101   size_t currently_collapsing = 0;
102   while (adjusted_iter != adjustments_on_adjusted_string->end()) {
103     if ((first_iter == first_adjustments.end()) ||
104         ((adjusted_iter->original_offset + shift +
105           adjusted_iter->original_length) <= first_iter->original_offset)) {
106       // Entire |adjusted_iter| (accounting for its shift and including its
107       // whole original length) comes before |first_iter|.
108       //
109       // Correct the offset at |adjusted_iter| and move onto the next
110       // adjustment that needs revising.
111       adjusted_iter->original_offset += shift;
112       shift += currently_collapsing;
113       currently_collapsing = 0;
114       ++adjusted_iter;
115     } else if ((adjusted_iter->original_offset + shift) >
116                first_iter->original_offset) {
117       // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
118
119       // It's not possible for the adjustments to overlap.  (It shouldn't
120       // be possible that we have an |adjusted_iter->original_offset| that,
121       // when adjusted by the computed |shift|, is in the middle of
122       // |first_iter|'s output's length.  After all, that would mean the
123       // current adjustment_on_adjusted_string somehow points to an offset
124       // that was supposed to have been eliminated by the first set of
125       // adjustments.)
126       DCHECK_LE(first_iter->original_offset + first_iter->output_length,
127                 adjusted_iter->original_offset + shift);
128
129       // Add the |first_adjustment_iter| to the full set of adjustments while
130       // making sure |adjusted_iter| continues pointing to the same element.
131       // We do this by inserting the |first_adjustment_iter| right before
132       // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
133       // the following element.
134       shift += first_iter->original_length - first_iter->output_length;
135       adjusted_iter = adjustments_on_adjusted_string->insert(
136           adjusted_iter, *first_iter);
137       ++adjusted_iter;
138       ++first_iter;
139     } else {
140       // The first adjustment adjusted something that then got further adjusted
141       // by the second set of adjustments.  In other words, |first_iter| points
142       // to something in the range covered by |adjusted_iter|'s length (after
143       // accounting for |shift|).  Precisely,
144       //   adjusted_iter->original_offset + shift
145       //   <=
146       //   first_iter->original_offset
147       //   <=
148       //   adjusted_iter->original_offset + shift +
149       //       adjusted_iter->original_length
150
151       // Modify the current |adjusted_iter| to include whatever collapsing
152       // happened in |first_iter|, then advance to the next |first_adjustments|
153       // because we dealt with the current one.
154       const int collapse = static_cast<int>(first_iter->original_length) -
155           static_cast<int>(first_iter->output_length);
156       // This function does not know how to deal with a string that expands and
157       // then gets modified, only strings that collapse and then get modified.
158       DCHECK_GT(collapse, 0);
159       adjusted_iter->original_length += collapse;
160       currently_collapsing += collapse;
161       ++first_iter;
162     }
163   }
164   DCHECK_EQ(0u, currently_collapsing);
165   if (first_iter != first_adjustments.end()) {
166     // Only first adjustments are left.  These do not need to be modified.
167     // (Their offsets are already correct with respect to the original string.)
168     // Append them all.
169     DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
170     adjustments_on_adjusted_string->insert(
171         adjustments_on_adjusted_string->end(), first_iter,
172         first_adjustments.end());
173   }
174 }
175
176 // Converts the given source Unicode character type to the given destination
177 // Unicode character type as a STL string. The given input buffer and size
178 // determine the source, and the given output STL string will be replaced by
179 // the result.  If non-NULL, |adjustments| is set to reflect the all the
180 // alterations to the string that are not one-character-to-one-character.
181 // It will always be sorted by increasing offset.
182 template<typename SrcChar, typename DestStdString>
183 bool ConvertUnicode(const SrcChar* src,
184                     size_t src_len,
185                     DestStdString* output,
186                     OffsetAdjuster::Adjustments* adjustments) {
187   if (adjustments)
188     adjustments->clear();
189   // ICU requires 32-bit numbers.
190   bool success = true;
191   int32 src_len32 = static_cast<int32>(src_len);
192   for (int32 i = 0; i < src_len32; i++) {
193     uint32 code_point;
194     size_t original_i = i;
195     size_t chars_written = 0;
196     if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
197       chars_written = WriteUnicodeCharacter(code_point, output);
198     } else {
199       chars_written = WriteUnicodeCharacter(0xFFFD, output);
200       success = false;
201     }
202
203     // Only bother writing an adjustment if this modification changed the
204     // length of this character.
205     // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
206     // character read, not after it (so that incrementing it in the loop
207     // increment will place it at the right location), so we need to account
208     // for that in determining the amount that was read.
209     if (adjustments && ((i - original_i + 1) != chars_written)) {
210       adjustments->push_back(OffsetAdjuster::Adjustment(
211           original_i, i - original_i + 1, chars_written));
212     }
213   }
214   return success;
215 }
216
217 bool UTF8ToUTF16WithAdjustments(
218     const char* src,
219     size_t src_len,
220     string16* output,
221     base::OffsetAdjuster::Adjustments* adjustments) {
222   PrepareForUTF16Or32Output(src, src_len, output);
223   return ConvertUnicode(src, src_len, output, adjustments);
224 }
225
226 string16 UTF8ToUTF16WithAdjustments(
227     const base::StringPiece& utf8,
228     base::OffsetAdjuster::Adjustments* adjustments) {
229   string16 result;
230   UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
231   return result;
232 }
233
234 string16 UTF8ToUTF16AndAdjustOffsets(
235     const base::StringPiece& utf8,
236     std::vector<size_t>* offsets_for_adjustment) {
237   std::for_each(offsets_for_adjustment->begin(),
238                 offsets_for_adjustment->end(),
239                 LimitOffset<base::StringPiece>(utf8.length()));
240   OffsetAdjuster::Adjustments adjustments;
241   string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
242   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
243   return result;
244 }
245
246 std::string UTF16ToUTF8AndAdjustOffsets(
247     const base::StringPiece16& utf16,
248     std::vector<size_t>* offsets_for_adjustment) {
249   std::for_each(offsets_for_adjustment->begin(),
250                 offsets_for_adjustment->end(),
251                 LimitOffset<base::StringPiece16>(utf16.length()));
252   std::string result;
253   PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
254   OffsetAdjuster::Adjustments adjustments;
255   ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
256   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
257   return result;
258 }
259
260 }  // namespace base