// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
21 #include "phonenumbers/utf/unicodetext.h"
22 #include "phonenumbers/utf/stringpiece.h"
23 //#include "utf/stringprintf.h"
24 #include "phonenumbers/utf/utf.h"
25 #include "phonenumbers/utf/unilib.h"
28 namespace phonenumbers {
30 using std::stringstream;
// Returns the number of code points encoded by the UTF-8 bytes in
// [start, end). Returns 0 when the range is empty.
//
// Every code point contributes exactly one non-trail byte (an ASCII byte or
// a lead byte), so counting those bytes counts code points. Trail bytes have
// the bit pattern 10xxxxxx, which is -0x80..-0x41 when read as signed char;
// everything >= -0x40 is therefore a non-trail byte.
static int CodepointDistance(const char* start, const char* end) {
  int n = 0;
  // Increment n on every non-trail-byte.
  for (const char* p = start; p < end; ++p) {
    n += (*reinterpret_cast<const signed char*>(p) >= -0x40);
  }
  return n;
}
46 static int CodepointCount(const char* utf8, int len) {
47 return CodepointDistance(utf8, utf8 + len);
50 UnicodeText::const_iterator::difference_type
51 distance(const UnicodeText::const_iterator& first,
52 const UnicodeText::const_iterator& last) {
53 return CodepointDistance(first.it_, last.it_);
// ---------- Utility ----------
58 static int ConvertToInterchangeValid(char* start, int len) {
59 // This routine is called only when we've discovered that a UTF-8 buffer
60 // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
61 // was not interchange valid. This indicates a bug in the caller, and
62 // a LOG(WARNING) is done in that case.
63 // This is similar to CoerceToInterchangeValid, but it replaces each
64 // structurally valid byte with a space, and each non-interchange
65 // character with a space, even when that character requires more
66 // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
67 // structurally valid UTF8, but U+FDD0 is not an interchange-valid
68 // code point. The result should contain one space, not three.
70 // Since the conversion never needs to write more data than it
71 // reads, it is safe to change the buffer in place. It returns the
72 // number of bytes written.
73 char* const in = start;
75 char* const end = start + len;
77 int good = UniLib::SpanInterchangeValid(start, end - start);
80 memmove(out, start, good);
88 // Is the current string invalid UTF8 or just non-interchange UTF8?
91 if (isvalidcharntorune(start, end - start, &rune, &n)) {
92 // structurally valid UTF8, but not interchange valid
93 start += n; // Skip over the whole character.
95 start += 1; // Skip over just one byte
// *************** Data representation **********

// Note: the copy constructor is undefined.

// After reserve(), resize(), or clear(), we're an owner, not an alias.
109 void UnicodeText::Repr::reserve(int new_capacity) {
110 // If there's already enough capacity, and we're an owner, do nothing.
111 if (capacity_ >= new_capacity && ours_) return;
113 // Otherwise, allocate a new buffer.
114 capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20);
115 char* new_data = new char[capacity_];
117 // If there is an old buffer, copy it into the new buffer.
119 memcpy(new_data, data_, size_);
120 if (ours_) delete[] data_; // If we owned the old buffer, free it.
123 ours_ = true; // We own the new buffer.
124 // size_ is unchanged.
127 void UnicodeText::Repr::resize(int new_size) {
131 if (!ours_ || new_size > capacity_) reserve(new_size);
132 // Clear the memory in the expanded part.
133 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
139 // This implementation of clear() deallocates the buffer if we're an owner.
140 // That's not strictly necessary; we could just set size_ to 0.
141 void UnicodeText::Repr::clear() {
142 if (ours_) delete[] data_;
144 size_ = capacity_ = 0;
148 void UnicodeText::Repr::Copy(const char* data, int size) {
150 memcpy(data_, data, size);
153 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
154 if (data == data_) return; // We already own this memory. (Weird case.)
155 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
158 capacity_ = capacity;
162 void UnicodeText::Repr::PointTo(const char* data, int size) {
163 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
164 data_ = const_cast<char*>(data);
170 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
171 reserve(size_ + byte_length);
172 memcpy(data_ + size_, bytes, byte_length);
173 size_ += byte_length;
176 string UnicodeText::Repr::DebugString() const {
179 ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec
180 << size_ << " capacity=" << capacity_ << " "
181 << (ours_ ? "Owned" : "Alias") << "}";
// *************** UnicodeText ******************

// ----- Constructors -----
195 // Default constructor
196 UnicodeText::UnicodeText() {
200 UnicodeText::UnicodeText(const UnicodeText& src) {
204 // Substring constructor
205 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
206 const UnicodeText::const_iterator& last) {
207 assert(first <= last && "Incompatible iterators");
208 repr_.append(first.it_, last.it_ - first.it_);
211 string UnicodeText::UTF8Substring(const const_iterator& first,
212 const const_iterator& last) {
213 assert(first <= last && "Incompatible iterators");
214 return string(first.it_, last.it_ - first.it_);
220 UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
227 UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
228 repr_.Copy(src.repr_.data_, src.repr_.size_);
232 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
233 repr_.Copy(buffer, byte_length);
234 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
235 cerr << "UTF-8 buffer is not interchange-valid." << endl;
236 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
241 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
243 repr_.Copy(buffer, byte_length);
// ----- TakeOwnershipOf -----
249 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
252 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
253 if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
254 cerr << "UTF-8 buffer is not interchange-valid." << endl;
255 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
260 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
263 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
// ----- PointTo -----
269 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
270 if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
271 repr_.PointTo(buffer, byte_length);
273 cerr << "UTF-8 buffer is not interchange-valid." << endl;
274 repr_.Copy(buffer, byte_length);
275 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
280 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
282 repr_.PointTo(buffer, byte_length);
286 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
287 repr_.PointTo(src.repr_.data_, src.repr_.size_);
291 UnicodeText& UnicodeText::PointTo(const const_iterator &first,
292 const const_iterator &last) {
293 assert(first <= last && " Incompatible iterators");
294 repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
// ----- Append -----
300 UnicodeText& UnicodeText::append(const UnicodeText& u) {
301 repr_.append(u.repr_.data_, u.repr_.size_);
305 UnicodeText& UnicodeText::append(const const_iterator& first,
306 const const_iterator& last) {
307 assert(first <= last && "Incompatible iterators");
308 repr_.append(first.it_, last.it_ - first.it_);
312 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
313 repr_.append(utf8, len);
// ----- substring searching -----
319 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
320 const_iterator start_pos) const {
321 assert(start_pos.utf8_data() >= utf8_data());
322 assert(start_pos.utf8_data() <= utf8_data() + utf8_length());
323 return UnsafeFind(look, start_pos);
326 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
327 return UnsafeFind(look, begin());
330 UnicodeText::const_iterator UnicodeText::UnsafeFind(
331 const UnicodeText& look, const_iterator start_pos) const {
332 // Due to the magic of the UTF8 encoding, searching for a sequence of
333 // letters is equivalent to substring search.
334 StringPiece searching(utf8_data(), utf8_length());
335 StringPiece look_piece(look.utf8_data(), look.utf8_length());
336 StringPiece::size_type found =
337 searching.find(look_piece, start_pos.utf8_data() - utf8_data());
338 if (found == StringPiece::npos) return end();
339 return const_iterator(utf8_data() + found);
342 bool UnicodeText::HasReplacementChar() const {
344 // UnicodeText replacement_char;
345 // replacement_char.push_back(0xFFFD);
346 // return find(replacement_char) != end();
347 StringPiece searching(utf8_data(), utf8_length());
348 StringPiece looking_for("\xEF\xBF\xBD", 3);
349 return searching.find(looking_for) != StringPiece::npos;
// ----- other methods -----
355 void UnicodeText::clear() {
360 UnicodeText::~UnicodeText() {}
363 void UnicodeText::push_back(char32 c) {
364 if (UniLib::IsValidCodepoint(c)) {
366 int len = runetochar(buf, &c);
367 if (UniLib::IsInterchangeValid(buf, len)) {
368 repr_.append(buf, len);
370 cerr << "Unicode value 0x" << hex << c
371 << " is not valid for interchange" << endl;
372 repr_.append(" ", 1);
375 cerr << "Illegal Unicode value: 0x" << hex << c << endl;
376 repr_.append(" ", 1);
380 int UnicodeText::size() const {
381 return CodepointCount(repr_.data_, repr_.size_);
384 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
385 if (&lhs == &rhs) return true;
386 if (lhs.repr_.size_ != rhs.repr_.size_) return false;
387 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
390 string UnicodeText::DebugString() const {
393 ss << "{UnicodeText " << hex << this << dec << " chars="
394 << size() << " repr=" << repr_.DebugString() << "}";
396 return StringPrintf("{UnicodeText %p chars=%d repr=%s}",
399 repr_.DebugString().c_str());
// ******************* UnicodeText::const_iterator *********************

// The implementation of const_iterator would be nicer if it
// inherited from boost::iterator_facade
// (http://boost.org/libs/iterator/doc/iterator_facade.html).
414 UnicodeText::const_iterator::const_iterator() : it_(0) {}
416 UnicodeText::const_iterator::const_iterator(const const_iterator& other)
420 UnicodeText::const_iterator&
421 UnicodeText::const_iterator::operator=(const const_iterator& other) {
427 UnicodeText::const_iterator UnicodeText::begin() const {
428 return const_iterator(repr_.data_);
431 UnicodeText::const_iterator UnicodeText::end() const {
432 return const_iterator(repr_.data_ + repr_.size_);
435 bool operator<(const UnicodeText::const_iterator& lhs,
436 const UnicodeText::const_iterator& rhs) {
437 return lhs.it_ < rhs.it_;
440 char32 UnicodeText::const_iterator::operator*() const {
441 // (We could call chartorune here, but that does some
442 // error-checking, and we're guaranteed that our data is valid
443 // UTF-8. Also, we expect this routine to be called very often. So
444 // for speed, we do the calculation ourselves.)
446 // Convert from UTF-8
447 uint8 byte1 = static_cast<uint8>(it_[0]);
451 uint8 byte2 = static_cast<uint8>(it_[1]);
453 return ((byte1 & 0x1F) << 6)
456 uint8 byte3 = static_cast<uint8>(it_[2]);
458 return ((byte1 & 0x0F) << 12)
459 | ((byte2 & 0x3F) << 6)
462 uint8 byte4 = static_cast<uint8>(it_[3]);
463 return ((byte1 & 0x07) << 18)
464 | ((byte2 & 0x3F) << 12)
465 | ((byte3 & 0x3F) << 6)
469 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
470 it_ += UniLib::OneCharLen(it_);
474 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
475 while (UniLib::IsTrailByte(*--it_)) { }
479 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
480 utf8_output[0] = it_[0];
481 if (static_cast<unsigned char>(it_[0]) < 0x80)
484 utf8_output[1] = it_[1];
485 if (static_cast<unsigned char>(it_[0]) < 0xE0)
488 utf8_output[2] = it_[2];
489 if (static_cast<unsigned char>(it_[0]) < 0xF0)
492 utf8_output[3] = it_[3];
497 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
499 const char* start = utf8_data();
500 int len = utf8_length();
501 const char* end = start + len;
504 assert(p == end || !UniLib::IsTrailByte(*p));
505 return const_iterator(p);
508 string UnicodeText::const_iterator::DebugString() const {
511 ss << "{iter " << hex << it_ << "}";
518 } // namespace phonenumbers