src/3rdparty/v8/src/scanner-character-streams.cc

   1 // Copyright 2011 the V8 project authors. All rights reserved.
   2 // Redistribution and use in source and binary forms, with or without
   3 // modification, are permitted provided that the following conditions are
   4 // met:
   5 //
   6 //     * Redistributions of source code must retain the above copyright
   7 //       notice, this list of conditions and the following disclaimer.
   8 //     * Redistributions in binary form must reproduce the above
   9 //       copyright notice, this list of conditions and the following
  10 //       disclaimer in the documentation and/or other materials provided
  11 //       with the distribution.
  12 //     * Neither the name of Google Inc. nor the names of its
  13 //       contributors may be used to endorse or promote products derived
  14 //       from this software without specific prior written permission.
  15 //
  16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27
  28 #include "v8.h"
  29
  30 #include "scanner-character-streams.h"
  31
  32 #include "handles.h"
  33 #include "unicode-inl.h"
  34
  35 namespace v8 {
  36 namespace internal {
  37
  38 // ----------------------------------------------------------------------------
  39 // BufferedUtf16CharacterStreams
  40
  41 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
  42     : Utf16CharacterStream(),
  43       pushback_limit_(NULL) {
  44   // Initialize buffer as being empty. First read will fill the buffer.
  45   buffer_cursor_ = buffer_;
  46   buffer_end_ = buffer_;
  47 }
  48
  49 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
  50
  51 void BufferedUtf16CharacterStream::PushBack(uc32 character) {
  52   if (character == kEndOfInput) {
  53     pos_--;
  54     return;
  55   }
  56   if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
  57     // buffer_ is writable, buffer_cursor_ is const pointer.
  58     buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
  59     pos_--;
  60     return;
  61   }
  62   SlowPushBack(static_cast<uc16>(character));
  63 }
  64
  65
  66 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
  67   // In pushback mode, the end of the buffer contains pushback,
  68   // and the start of the buffer (from buffer start to pushback_limit_)
  69   // contains valid data that comes just after the pushback.
  70   // We NULL the pushback_limit_ if pushing all the way back to the
  71   // start of the buffer.
  72
  73   if (pushback_limit_ == NULL) {
  74     // Enter pushback mode.
  75     pushback_limit_ = buffer_end_;
  76     buffer_end_ = buffer_ + kBufferSize;
  77     buffer_cursor_ = buffer_end_;
  78   }
  79   // Ensure that there is room for at least one pushback.
  80   ASSERT(buffer_cursor_ > buffer_);
  81   ASSERT(pos_ > 0);
  82   buffer_[--buffer_cursor_ - buffer_] = character;
  83   if (buffer_cursor_ == buffer_) {
  84     pushback_limit_ = NULL;
  85   } else if (buffer_cursor_ < pushback_limit_) {
  86     pushback_limit_ = buffer_cursor_;
  87   }
  88   pos_--;
  89 }
  90
  91
  92 bool BufferedUtf16CharacterStream::ReadBlock() {
  93   buffer_cursor_ = buffer_;
  94   if (pushback_limit_ != NULL) {
  95     // Leave pushback mode.
  96     buffer_end_ = pushback_limit_;
  97     pushback_limit_ = NULL;
  98     // If there were any valid characters left at the
  99     // start of the buffer, use those.
 100     if (buffer_cursor_ < buffer_end_) return true;
 101     // Otherwise read a new block.
 102   }
 103   unsigned length = FillBuffer(pos_, kBufferSize);
 104   buffer_end_ = buffer_ + length;
 105   return length > 0;
 106 }
 107
 108
 109 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
 110   // Leave pushback mode (i.e., ignore that there might be valid data
 111   // in the buffer before the pushback_limit_ point).
 112   pushback_limit_ = NULL;
 113   return BufferSeekForward(delta);
 114 }
 115
 116 // ----------------------------------------------------------------------------
 117 // GenericStringUtf16CharacterStream
 118
 119
 120 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
 121     Handle<String> data,
 122     unsigned start_position,
 123     unsigned end_position)
 124     : string_(data),
 125       length_(end_position) {
 126   ASSERT(end_position >= start_position);
 127   buffer_cursor_ = buffer_;
 128   buffer_end_ = buffer_;
 129   pos_ = start_position;
 130 }
 131
 132
 133 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
 134
 135
 136 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
 137   unsigned old_pos = pos_;
 138   pos_ = Min(pos_ + delta, length_);
 139   ReadBlock();
 140   return pos_ - old_pos;
 141 }
 142
 143
 144 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos,
 145                                                       unsigned length) {
 146   if (from_pos >= length_) return 0;
 147   if (from_pos + length > length_) {
 148     length = length_ - from_pos;
 149   }
 150   String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
 151   return length;
 152 }
 153
 154
 155 // ----------------------------------------------------------------------------
 156 // Utf8ToUtf16CharacterStream
 157 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
 158                                                        unsigned length)
 159     : BufferedUtf16CharacterStream(),
 160       raw_data_(data),
 161       raw_data_length_(length),
 162       raw_data_pos_(0),
 163       raw_character_position_(0) {
 164   ReadBlock();
 165 }
 166
 167
 168 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
 169
 170
 171 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
 172   unsigned old_pos = pos_;
 173   unsigned target_pos = pos_ + delta;
 174   SetRawPosition(target_pos);
 175   pos_ = raw_character_position_;
 176   ReadBlock();
 177   return pos_ - old_pos;
 178 }
 179
 180
 181 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
 182                                                 unsigned length) {
 183   static const unibrow::uchar kMaxUtf16Character = 0xffff;
 184   SetRawPosition(char_position);
 185   if (raw_character_position_ != char_position) {
 186     // char_position was not a valid position in the stream (hit the end
 187     // while spooling to it).
 188     return 0u;
 189   }
 190   unsigned i = 0;
 191   while (i < length - 1) {
 192     if (raw_data_pos_ == raw_data_length_) break;
 193     unibrow::uchar c = raw_data_[raw_data_pos_];
 194     if (c <= unibrow::Utf8::kMaxOneByteChar) {
 195       raw_data_pos_++;
 196     } else {
 197       c =  unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
 198                                          raw_data_length_ - raw_data_pos_,
 199                                          &raw_data_pos_);
 200     }
 201     if (c > kMaxUtf16Character) {
 202       buffer_[i++] = unibrow::Utf16::LeadSurrogate(c);
 203       buffer_[i++] = unibrow::Utf16::TrailSurrogate(c);
 204     } else {
 205       buffer_[i++] = static_cast<uc16>(c);
 206     }
 207   }
 208   raw_character_position_ = char_position + i;
 209   return i;
 210 }
 211
 212
// Classification of UTF-8 bytes by their two most significant bits.
// Mask selecting the top two bits of a byte.
static const byte kUtf8MultiByteMask = 0xC0;
// Top two bits are 11: first byte of a multi-byte encoding.
static const byte kUtf8MultiByteCharStart = 0xC0;
// Top two bits are 10: a non-first byte of a multi-byte encoding.
static const byte kUtf8MultiByteCharFollower = 0x80;
 216
 217
 218 #ifdef DEBUG
 219 static bool IsUtf8MultiCharacterStart(byte first_byte) {
 220   return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
 221 }
 222 #endif
 223
 224
 225 static bool IsUtf8MultiCharacterFollower(byte later_byte) {
 226   return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
 227 }
 228
 229
 230 // Move the cursor back to point at the preceding UTF-8 character start
 231 // in the buffer.
 232 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
 233   byte character = buffer[--*cursor];
 234   if (character > unibrow::Utf8::kMaxOneByteChar) {
 235     ASSERT(IsUtf8MultiCharacterFollower(character));
 236     // Last byte of a multi-byte character encoding. Step backwards until
 237     // pointing to the first byte of the encoding, recognized by having the
 238     // top two bits set.
 239     while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
 240     ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));
 241   }
 242 }
 243
 244
 245 // Move the cursor forward to point at the next following UTF-8 character start
 246 // in the buffer.
 247 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
 248   byte character = buffer[(*cursor)++];
 249   if (character > unibrow::Utf8::kMaxOneByteChar) {
 250     // First character of a multi-byte character encoding.
 251     // The number of most-significant one-bits determines the length of the
 252     // encoding:
 253     //  110..... - (0xCx, 0xDx) one additional byte (minimum).
 254     //  1110.... - (0xEx) two additional bytes.
 255     //  11110... - (0xFx) three additional bytes (maximum).
 256     ASSERT(IsUtf8MultiCharacterStart(character));
 257     // Additional bytes is:
 258     // 1 if value in range 0xC0 .. 0xDF.
 259     // 2 if value in range 0xE0 .. 0xEF.
 260     // 3 if value in range 0xF0 .. 0xF7.
 261     // Encode that in a single value.
 262     unsigned additional_bytes =
 263         ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
 264     *cursor += additional_bytes;
 265     ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
 266   }
 267 }
 268
 269
 270 // This can't set a raw position between two surrogate pairs, since there
 271 // is no position in the UTF8 stream that corresponds to that.  This assumes
 272 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If
 273 // it is illegally coded as two 3 byte sequences then there is no problem here.
 274 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
 275   if (raw_character_position_ > target_position) {
 276     // Spool backwards in utf8 buffer.
 277     do {
 278       int old_pos = raw_data_pos_;
 279       Utf8CharacterBack(raw_data_, &raw_data_pos_);
 280       raw_character_position_--;
 281       ASSERT(old_pos - raw_data_pos_ <= 4);
 282       // Step back over both code units for surrogate pairs.
 283       if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
 284     } while (raw_character_position_ > target_position);
 285     // No surrogate pair splitting.
 286     ASSERT(raw_character_position_ == target_position);
 287     return;
 288   }
 289   // Spool forwards in the utf8 buffer.
 290   while (raw_character_position_ < target_position) {
 291     if (raw_data_pos_ == raw_data_length_) return;
 292     int old_pos = raw_data_pos_;
 293     Utf8CharacterForward(raw_data_, &raw_data_pos_);
 294     raw_character_position_++;
 295     ASSERT(raw_data_pos_ - old_pos <= 4);
 296     if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
 297   }
 298   // No surrogate pair splitting.
 299   ASSERT(raw_character_position_ == target_position);
 300 }
 301
 302
 303 // ----------------------------------------------------------------------------
 304 // ExternalTwoByteStringUtf16CharacterStream
 305
 306 ExternalTwoByteStringUtf16CharacterStream::
 307     ~ExternalTwoByteStringUtf16CharacterStream() { }
 308
 309
 310 ExternalTwoByteStringUtf16CharacterStream
 311     ::ExternalTwoByteStringUtf16CharacterStream(
 312         Handle<ExternalTwoByteString> data,
 313         int start_position,
 314         int end_position)
 315     : Utf16CharacterStream(),
 316       source_(data),
 317       raw_data_(data->GetTwoByteData(start_position)) {
 318   buffer_cursor_ = raw_data_,
 319   buffer_end_ = raw_data_ + (end_position - start_position);
 320   pos_ = start_position;
 321 }
 322
 323 } }  // namespace v8::internal