cpp/src/phonenumbers/utf/unicodetext.cc

   1 // Copyright (C) 2006 Google Inc.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 // http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 // Author: Jim Meehan
  16
  17 #include <iostream>
  18 #include <sstream>
  19 #include <cassert>
  20
  21 #include "phonenumbers/utf/unicodetext.h"
  22 #include "phonenumbers/utf/stringpiece.h"
  23 //#include "utf/stringprintf.h"
  24 #include "phonenumbers/utf/utf.h"
  25 #include "phonenumbers/utf/unilib.h"
  26
  27 namespace i18n {
  28 namespace phonenumbers {
  29
  30 using std::stringstream;
  31 using std::max;
  32 using std::hex;
  33 using std::dec;
  34 using std::cerr;
  35 using std::endl;
  36
  37 static int CodepointDistance(const char* start, const char* end) {
  38   int n = 0;
  39   // Increment n on every non-trail-byte.
  40   for (const char* p = start; p < end; ++p) {
  41     n += (*reinterpret_cast<const signed char*>(p) >= -0x40);
  42   }
  43   return n;
  44 }
  45
  46 static int CodepointCount(const char* utf8, int len) {
  47   return CodepointDistance(utf8, utf8 + len);
  48 }
  49
  50 UnicodeText::const_iterator::difference_type
  51 distance(const UnicodeText::const_iterator& first,
  52          const UnicodeText::const_iterator& last) {
  53   return CodepointDistance(first.it_, last.it_);
  54 }
  55
  56 // ---------- Utility ----------
  57
  58 static int ConvertToInterchangeValid(char* start, int len) {
  59   // This routine is called only when we've discovered that a UTF-8 buffer
  60   // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
  61   // was not interchange valid. This indicates a bug in the caller, and
  62   // a LOG(WARNING) is done in that case.
  63   // This is similar to CoerceToInterchangeValid, but it replaces each
  64   // structurally valid byte with a space, and each non-interchange
  65   // character with a space, even when that character requires more
  66   // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
  67   // structurally valid UTF8, but U+FDD0 is not an interchange-valid
  68   // code point. The result should contain one space, not three.
  69   //
  70   // Since the conversion never needs to write more data than it
  71   // reads, it is safe to change the buffer in place. It returns the
  72   // number of bytes written.
  73   char* const in = start;
  74   char* out = start;
  75   char* const end = start + len;
  76   while (start < end) {
  77     int good = UniLib::SpanInterchangeValid(start, end - start);
  78     if (good > 0) {
  79       if (out != start) {
  80         memmove(out, start, good);
  81       }
  82       out += good;
  83       start += good;
  84       if (start == end) {
  85         break;
  86       }
  87     }
  88     // Is the current string invalid UTF8 or just non-interchange UTF8?
  89     char32 rune;
  90     int n;
  91     if (isvalidcharntorune(start, end - start, &rune, &n)) {
  92       // structurally valid UTF8, but not interchange valid
  93       start += n;  // Skip over the whole character.
  94     } else {  // bad UTF8
  95       start += 1;  // Skip over just one byte
  96     }
  97     *out++ = ' ';
  98   }
  99   return out - in;
 100 }
 101
 102
 103 // *************** Data representation **********
 104
 105 // Note: the copy constructor is undefined.
 106
 107 // After reserve(), resize(), or clear(), we're an owner, not an alias.
 108
 109 void UnicodeText::Repr::reserve(int new_capacity) {
 110   // If there's already enough capacity, and we're an owner, do nothing.
 111   if (capacity_ >= new_capacity && ours_) return;
 112
 113   // Otherwise, allocate a new buffer.
 114   capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20);
 115   char* new_data = new char[capacity_];
 116
 117   // If there is an old buffer, copy it into the new buffer.
 118   if (data_) {
 119     memcpy(new_data, data_, size_);
 120     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
 121   }
 122   data_ = new_data;
 123   ours_ = true;  // We own the new buffer.
 124   // size_ is unchanged.
 125 }
 126
 127 void UnicodeText::Repr::resize(int new_size) {
 128   if (new_size == 0) {
 129     clear();
 130   } else {
 131     if (!ours_ || new_size > capacity_) reserve(new_size);
 132     // Clear the memory in the expanded part.
 133     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
 134     size_ = new_size;
 135     ours_ = true;
 136   }
 137 }
 138
 139 // This implementation of clear() deallocates the buffer if we're an owner.
 140 // That's not strictly necessary; we could just set size_ to 0.
 141 void UnicodeText::Repr::clear() {
 142   if (ours_) delete[] data_;
 143   data_ = NULL;
 144   size_ = capacity_ = 0;
 145   ours_ = true;
 146 }
 147
 148 void UnicodeText::Repr::Copy(const char* data, int size) {
 149   resize(size);
 150   memcpy(data_, data, size);
 151 }
 152
 153 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
 154   if (data == data_) return;  // We already own this memory. (Weird case.)
 155   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
 156   data_ = data;
 157   size_ = size;
 158   capacity_ = capacity;
 159   ours_ = true;
 160 }
 161
 162 void UnicodeText::Repr::PointTo(const char* data, int size) {
 163   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
 164   data_ = const_cast<char*>(data);
 165   size_ = size;
 166   capacity_ = size;
 167   ours_ = false;
 168 }
 169
 170 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
 171   reserve(size_ + byte_length);
 172   memcpy(data_ + size_, bytes, byte_length);
 173   size_ += byte_length;
 174 }
 175
 176 string UnicodeText::Repr::DebugString() const {
 177   stringstream ss;
 178
 179   ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec
 180      << size_ << " capacity=" << capacity_ << " "
 181      << (ours_ ? "Owned" : "Alias") << "}";
 182
 183   string result;
 184   ss >> result;
 185
 186   return result;
 187 }
 188
 189
 190
 191 // *************** UnicodeText ******************
 192
 193 // ----- Constructors -----
 194
 195 // Default constructor
 196 UnicodeText::UnicodeText() {
 197 }
 198
 199 // Copy constructor
 200 UnicodeText::UnicodeText(const UnicodeText& src) {
 201   Copy(src);
 202 }
 203
 204 // Substring constructor
 205 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
 206                          const UnicodeText::const_iterator& last) {
 207   assert(first <= last && "Incompatible iterators");
 208   repr_.append(first.it_, last.it_ - first.it_);
 209 }
 210
 211 string UnicodeText::UTF8Substring(const const_iterator& first,
 212                                   const const_iterator& last) {
 213   assert(first <= last && "Incompatible iterators");
 214   return string(first.it_, last.it_ - first.it_);
 215 }
 216
 217
 218 // ----- Copy -----
 219
 220 UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
 221   if (this != &src) {
 222     Copy(src);
 223   }
 224   return *this;
 225 }
 226
 227 UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
 228   repr_.Copy(src.repr_.data_, src.repr_.size_);
 229   return *this;
 230 }
 231
 232 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
 233   repr_.Copy(buffer, byte_length);
 234   if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
 235     cerr << "UTF-8 buffer is not interchange-valid." << endl;
 236     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
 237   }
 238   return *this;
 239 }
 240
 241 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
 242                                            int byte_length) {
 243   repr_.Copy(buffer, byte_length);
 244   return *this;
 245 }
 246
 247 // ----- TakeOwnershipOf  -----
 248
 249 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
 250                                               int byte_length,
 251                                               int byte_capacity) {
 252   repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
 253   if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
 254     cerr << "UTF-8 buffer is not interchange-valid." << endl;
 255     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
 256   }
 257   return *this;
 258 }
 259
 260 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
 261                                                     int byte_length,
 262                                                     int byte_capacity) {
 263   repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
 264   return *this;
 265 }
 266
 267 // ----- PointTo -----
 268
 269 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
 270   if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
 271     repr_.PointTo(buffer, byte_length);
 272   } else {
 273     cerr << "UTF-8 buffer is not interchange-valid." << endl;
 274     repr_.Copy(buffer, byte_length);
 275     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
 276   }
 277   return *this;
 278 }
 279
 280 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
 281                                           int byte_length) {
 282   repr_.PointTo(buffer, byte_length);
 283   return *this;
 284 }
 285
 286 UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
 287   repr_.PointTo(src.repr_.data_, src.repr_.size_);
 288   return *this;
 289 }
 290
 291 UnicodeText& UnicodeText::PointTo(const const_iterator &first,
 292                                   const const_iterator &last) {
 293   assert(first <= last && " Incompatible iterators");
 294   repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
 295   return *this;
 296 }
 297
 298 // ----- Append -----
 299
 300 UnicodeText& UnicodeText::append(const UnicodeText& u) {
 301   repr_.append(u.repr_.data_, u.repr_.size_);
 302   return *this;
 303 }
 304
 305 UnicodeText& UnicodeText::append(const const_iterator& first,
 306                                  const const_iterator& last) {
 307   assert(first <= last && "Incompatible iterators");
 308   repr_.append(first.it_, last.it_ - first.it_);
 309   return *this;
 310 }
 311
 312 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
 313   repr_.append(utf8, len);
 314   return *this;
 315 }
 316
 317 // ----- substring searching -----
 318
 319 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
 320                                               const_iterator start_pos) const {
 321   assert(start_pos.utf8_data() >= utf8_data());
 322   assert(start_pos.utf8_data() <= utf8_data() + utf8_length());
 323   return UnsafeFind(look, start_pos);
 324 }
 325
 326 UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
 327   return UnsafeFind(look, begin());
 328 }
 329
 330 UnicodeText::const_iterator UnicodeText::UnsafeFind(
 331     const UnicodeText& look, const_iterator start_pos) const {
 332   // Due to the magic of the UTF8 encoding, searching for a sequence of
 333   // letters is equivalent to substring search.
 334   StringPiece searching(utf8_data(), utf8_length());
 335   StringPiece look_piece(look.utf8_data(), look.utf8_length());
 336   StringPiece::size_type found =
 337       searching.find(look_piece, start_pos.utf8_data() - utf8_data());
 338   if (found == StringPiece::npos) return end();
 339   return const_iterator(utf8_data() + found);
 340 }
 341
 342 bool UnicodeText::HasReplacementChar() const {
 343   // Equivalent to:
 344   //   UnicodeText replacement_char;
 345   //   replacement_char.push_back(0xFFFD);
 346   //   return find(replacement_char) != end();
 347   StringPiece searching(utf8_data(), utf8_length());
 348   StringPiece looking_for("\xEF\xBF\xBD", 3);
 349   return searching.find(looking_for) != StringPiece::npos;
 350 }
 351
 352 // ----- other methods -----
 353
 354 // Clear operator
 355 void UnicodeText::clear() {
 356   repr_.clear();
 357 }
 358
 359 // Destructor
 360 UnicodeText::~UnicodeText() {}
 361
 362
 363 void UnicodeText::push_back(char32 c) {
 364   if (UniLib::IsValidCodepoint(c)) {
 365     char buf[UTFmax];
 366     int len = runetochar(buf, &c);
 367     if (UniLib::IsInterchangeValid(buf, len)) {
 368       repr_.append(buf, len);
 369     } else {
 370       cerr << "Unicode value 0x" << hex << c
 371            << " is not valid for interchange" << endl;
 372       repr_.append(" ", 1);
 373     }
 374   } else {
 375     cerr << "Illegal Unicode value: 0x" << hex << c << endl;
 376     repr_.append(" ", 1);
 377   }
 378 }
 379
 380 int UnicodeText::size() const {
 381   return CodepointCount(repr_.data_, repr_.size_);
 382 }
 383
 384 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
 385   if (&lhs == &rhs) return true;
 386   if (lhs.repr_.size_ != rhs.repr_.size_) return false;
 387   return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
 388 }
 389
 390 string UnicodeText::DebugString() const {
 391   stringstream ss;
 392
 393   ss << "{UnicodeText " << hex << this << dec << " chars="
 394      << size() << " repr=" << repr_.DebugString() << "}";
 395 #if 0
 396   return StringPrintf("{UnicodeText %p chars=%d repr=%s}",
 397                       this,
 398                       size(),
 399                       repr_.DebugString().c_str());
 400 #endif
 401   string result;
 402   ss >> result;
 403
 404   return result;
 405 }
 406
 407
 408 // ******************* UnicodeText::const_iterator *********************
 409
 410 // The implementation of const_iterator would be nicer if it
 411 // inherited from boost::iterator_facade
 412 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
 413
 414 UnicodeText::const_iterator::const_iterator() : it_(0) {}
 415
 416 UnicodeText::const_iterator::const_iterator(const const_iterator& other)
 417     : it_(other.it_) {
 418 }
 419
 420 UnicodeText::const_iterator&
 421 UnicodeText::const_iterator::operator=(const const_iterator& other) {
 422   if (&other != this)
 423     it_ = other.it_;
 424   return *this;
 425 }
 426
 427 UnicodeText::const_iterator UnicodeText::begin() const {
 428   return const_iterator(repr_.data_);
 429 }
 430
 431 UnicodeText::const_iterator UnicodeText::end() const {
 432   return const_iterator(repr_.data_ + repr_.size_);
 433 }
 434
 435 bool operator<(const UnicodeText::const_iterator& lhs,
 436                const UnicodeText::const_iterator& rhs) {
 437   return lhs.it_ < rhs.it_;
 438 }
 439
 440 char32 UnicodeText::const_iterator::operator*() const {
 441   // (We could call chartorune here, but that does some
 442   // error-checking, and we're guaranteed that our data is valid
 443   // UTF-8. Also, we expect this routine to be called very often. So
 444   // for speed, we do the calculation ourselves.)
 445
 446   // Convert from UTF-8
 447   uint8 byte1 = static_cast<uint8>(it_[0]);
 448   if (byte1 < 0x80)
 449     return byte1;
 450
 451   uint8 byte2 = static_cast<uint8>(it_[1]);
 452   if (byte1 < 0xE0)
 453     return ((byte1 & 0x1F) << 6)
 454           | (byte2 & 0x3F);
 455
 456   uint8 byte3 = static_cast<uint8>(it_[2]);
 457   if (byte1 < 0xF0)
 458     return ((byte1 & 0x0F) << 12)
 459          | ((byte2 & 0x3F) << 6)
 460          |  (byte3 & 0x3F);
 461
 462   uint8 byte4 = static_cast<uint8>(it_[3]);
 463   return ((byte1 & 0x07) << 18)
 464        | ((byte2 & 0x3F) << 12)
 465        | ((byte3 & 0x3F) << 6)
 466        |  (byte4 & 0x3F);
 467 }
 468
 469 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
 470   it_ += UniLib::OneCharLen(it_);
 471   return *this;
 472 }
 473
 474 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
 475   while (UniLib::IsTrailByte(*--it_)) { }
 476   return *this;
 477 }
 478
 479 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
 480   utf8_output[0] = it_[0];
 481   if (static_cast<unsigned char>(it_[0]) < 0x80)
 482     return 1;
 483
 484   utf8_output[1] = it_[1];
 485   if (static_cast<unsigned char>(it_[0]) < 0xE0)
 486     return 2;
 487
 488   utf8_output[2] = it_[2];
 489   if (static_cast<unsigned char>(it_[0]) < 0xF0)
 490     return 3;
 491
 492   utf8_output[3] = it_[3];
 493   return 4;
 494 }
 495
 496
 497 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
 498   assert(p != NULL);
 499   const char* start = utf8_data();
 500   int len = utf8_length();
 501   const char* end = start + len;
 502   assert(p >= start);
 503   assert(p <= end);
 504   assert(p == end || !UniLib::IsTrailByte(*p));
 505   return const_iterator(p);
 506 }
 507
 508 string UnicodeText::const_iterator::DebugString() const {
 509   stringstream ss;
 510
 511   ss << "{iter " << hex << it_ << "}";
 512   string result;
 513   ss >> result;
 514
 515   return result;
 516 }
 517
 518 }  // namespace phonenumbers
 519 }  // namespace i18n