src/v8/src/scanner-character-streams.cc

   1 // Copyright 2011 the V8 project authors. All rights reserved.
   2 // Redistribution and use in source and binary forms, with or without
   3 // modification, are permitted provided that the following conditions are
   4 // met:
   5 //
   6 //     * Redistributions of source code must retain the above copyright
   7 //       notice, this list of conditions and the following disclaimer.
   8 //     * Redistributions in binary form must reproduce the above
   9 //       copyright notice, this list of conditions and the following
  10 //       disclaimer in the documentation and/or other materials provided
  11 //       with the distribution.
  12 //     * Neither the name of Google Inc. nor the names of its
  13 //       contributors may be used to endorse or promote products derived
  14 //       from this software without specific prior written permission.
  15 //
  16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27
  28 #include "v8.h"
  29
  30 #include "scanner-character-streams.h"
  31
  32 #include "handles.h"
  33 #include "unicode-inl.h"
  34
  35 namespace v8 {
  36 namespace internal {
  37
  38 // ----------------------------------------------------------------------------
  39 // BufferedUtf16CharacterStreams
  40
  41 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
  42     : Utf16CharacterStream(),
  43       pushback_limit_(NULL) {
  44   // Initialize buffer as being empty. First read will fill the buffer.
  45   buffer_cursor_ = buffer_;
  46   buffer_end_ = buffer_;
  47 }
  48
  49
  50 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
  51
  52 void BufferedUtf16CharacterStream::PushBack(uc32 character) {
  53   if (character == kEndOfInput) {
  54     pos_--;
  55     return;
  56   }
  57   if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
  58     // buffer_ is writable, buffer_cursor_ is const pointer.
  59     buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
  60     pos_--;
  61     return;
  62   }
  63   SlowPushBack(static_cast<uc16>(character));
  64 }
  65
  66
  67 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
  68   // In pushback mode, the end of the buffer contains pushback,
  69   // and the start of the buffer (from buffer start to pushback_limit_)
  70   // contains valid data that comes just after the pushback.
  71   // We NULL the pushback_limit_ if pushing all the way back to the
  72   // start of the buffer.
  73
  74   if (pushback_limit_ == NULL) {
  75     // Enter pushback mode.
  76     pushback_limit_ = buffer_end_;
  77     buffer_end_ = buffer_ + kBufferSize;
  78     buffer_cursor_ = buffer_end_;
  79   }
  80   // Ensure that there is room for at least one pushback.
  81   ASSERT(buffer_cursor_ > buffer_);
  82   ASSERT(pos_ > 0);
  83   buffer_[--buffer_cursor_ - buffer_] = character;
  84   if (buffer_cursor_ == buffer_) {
  85     pushback_limit_ = NULL;
  86   } else if (buffer_cursor_ < pushback_limit_) {
  87     pushback_limit_ = buffer_cursor_;
  88   }
  89   pos_--;
  90 }
  91
  92
  93 bool BufferedUtf16CharacterStream::ReadBlock() {
  94   buffer_cursor_ = buffer_;
  95   if (pushback_limit_ != NULL) {
  96     // Leave pushback mode.
  97     buffer_end_ = pushback_limit_;
  98     pushback_limit_ = NULL;
  99     // If there were any valid characters left at the
 100     // start of the buffer, use those.
 101     if (buffer_cursor_ < buffer_end_) return true;
 102     // Otherwise read a new block.
 103   }
 104   unsigned length = FillBuffer(pos_, kBufferSize);
 105   buffer_end_ = buffer_ + length;
 106   return length > 0;
 107 }
 108
 109
 110 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
 111   // Leave pushback mode (i.e., ignore that there might be valid data
 112   // in the buffer before the pushback_limit_ point).
 113   pushback_limit_ = NULL;
 114   return BufferSeekForward(delta);
 115 }
 116
 117
 118 // ----------------------------------------------------------------------------
 119 // GenericStringUtf16CharacterStream
 120
 121
 122 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
 123     Handle<String> data,
 124     unsigned start_position,
 125     unsigned end_position)
 126     : string_(data),
 127       length_(end_position) {
 128   ASSERT(end_position >= start_position);
 129   buffer_cursor_ = buffer_;
 130   buffer_end_ = buffer_;
 131   pos_ = start_position;
 132 }
 133
 134
 135 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
 136
 137
 138 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
 139   unsigned old_pos = pos_;
 140   pos_ = Min(pos_ + delta, length_);
 141   ReadBlock();
 142   return pos_ - old_pos;
 143 }
 144
 145
 146 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos,
 147                                                       unsigned length) {
 148   if (from_pos >= length_) return 0;
 149   if (from_pos + length > length_) {
 150     length = length_ - from_pos;
 151   }
 152   String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
 153   return length;
 154 }
 155
 156
 157 // ----------------------------------------------------------------------------
 158 // Utf8ToUtf16CharacterStream
 159 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
 160                                                        unsigned length)
 161     : BufferedUtf16CharacterStream(),
 162       raw_data_(data),
 163       raw_data_length_(length),
 164       raw_data_pos_(0),
 165       raw_character_position_(0) {
 166   ReadBlock();
 167 }
 168
 169
 170 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
 171
 172
 173 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
 174   unsigned old_pos = pos_;
 175   unsigned target_pos = pos_ + delta;
 176   SetRawPosition(target_pos);
 177   pos_ = raw_character_position_;
 178   ReadBlock();
 179   return pos_ - old_pos;
 180 }
 181
 182
 183 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
 184                                                 unsigned length) {
 185   static const unibrow::uchar kMaxUtf16Character = 0xffff;
 186   SetRawPosition(char_position);
 187   if (raw_character_position_ != char_position) {
 188     // char_position was not a valid position in the stream (hit the end
 189     // while spooling to it).
 190     return 0u;
 191   }
 192   unsigned i = 0;
 193   while (i < length - 1) {
 194     if (raw_data_pos_ == raw_data_length_) break;
 195     unibrow::uchar c = raw_data_[raw_data_pos_];
 196     if (c <= unibrow::Utf8::kMaxOneByteChar) {
 197       raw_data_pos_++;
 198     } else {
 199       c =  unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
 200                                          raw_data_length_ - raw_data_pos_,
 201                                          &raw_data_pos_);
 202     }
 203     if (c > kMaxUtf16Character) {
 204       buffer_[i++] = unibrow::Utf16::LeadSurrogate(c);
 205       buffer_[i++] = unibrow::Utf16::TrailSurrogate(c);
 206     } else {
 207       buffer_[i++] = static_cast<uc16>(c);
 208     }
 209   }
 210   raw_character_position_ = char_position + i;
 211   return i;
 212 }
 213
 214
// Bit patterns used to classify a UTF-8 byte by its two most significant
// bits.
// Mask that selects the top two bits of a byte.
static const byte kUtf8MultiByteMask = 0xC0;
// Top two bits both set (11......): the value IsUtf8MultiCharacterStart
// compares the masked byte against.
static const byte kUtf8MultiByteCharStart = 0xC0;
// Top two bits 10......: the value IsUtf8MultiCharacterFollower compares the
// masked byte against.
static const byte kUtf8MultiByteCharFollower = 0x80;
 218
 219
 220 #ifdef DEBUG
 221 static bool IsUtf8MultiCharacterStart(byte first_byte) {
 222   return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
 223 }
 224 #endif
 225
 226
 227 static bool IsUtf8MultiCharacterFollower(byte later_byte) {
 228   return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
 229 }
 230
 231
 232 // Move the cursor back to point at the preceding UTF-8 character start
 233 // in the buffer.
 234 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
 235   byte character = buffer[--*cursor];
 236   if (character > unibrow::Utf8::kMaxOneByteChar) {
 237     ASSERT(IsUtf8MultiCharacterFollower(character));
 238     // Last byte of a multi-byte character encoding. Step backwards until
 239     // pointing to the first byte of the encoding, recognized by having the
 240     // top two bits set.
 241     while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
 242     ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));
 243   }
 244 }
 245
 246
 247 // Move the cursor forward to point at the next following UTF-8 character start
 248 // in the buffer.
 249 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
 250   byte character = buffer[(*cursor)++];
 251   if (character > unibrow::Utf8::kMaxOneByteChar) {
 252     // First character of a multi-byte character encoding.
 253     // The number of most-significant one-bits determines the length of the
 254     // encoding:
 255     //  110..... - (0xCx, 0xDx) one additional byte (minimum).
 256     //  1110.... - (0xEx) two additional bytes.
 257     //  11110... - (0xFx) three additional bytes (maximum).
 258     ASSERT(IsUtf8MultiCharacterStart(character));
 259     // Additional bytes is:
 260     // 1 if value in range 0xC0 .. 0xDF.
 261     // 2 if value in range 0xE0 .. 0xEF.
 262     // 3 if value in range 0xF0 .. 0xF7.
 263     // Encode that in a single value.
 264     unsigned additional_bytes =
 265         ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
 266     *cursor += additional_bytes;
 267     ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
 268   }
 269 }
 270
 271
 272 // This can't set a raw position between two surrogate pairs, since there
 273 // is no position in the UTF8 stream that corresponds to that.  This assumes
 274 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If
 275 // it is illegally coded as two 3 byte sequences then there is no problem here.
 276 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
 277   if (raw_character_position_ > target_position) {
 278     // Spool backwards in utf8 buffer.
 279     do {
 280       int old_pos = raw_data_pos_;
 281       Utf8CharacterBack(raw_data_, &raw_data_pos_);
 282       raw_character_position_--;
 283       ASSERT(old_pos - raw_data_pos_ <= 4);
 284       // Step back over both code units for surrogate pairs.
 285       if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
 286     } while (raw_character_position_ > target_position);
 287     // No surrogate pair splitting.
 288     ASSERT(raw_character_position_ == target_position);
 289     return;
 290   }
 291   // Spool forwards in the utf8 buffer.
 292   while (raw_character_position_ < target_position) {
 293     if (raw_data_pos_ == raw_data_length_) return;
 294     int old_pos = raw_data_pos_;
 295     Utf8CharacterForward(raw_data_, &raw_data_pos_);
 296     raw_character_position_++;
 297     ASSERT(raw_data_pos_ - old_pos <= 4);
 298     if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
 299   }
 300   // No surrogate pair splitting.
 301   ASSERT(raw_character_position_ == target_position);
 302 }
 303
 304
 305 // ----------------------------------------------------------------------------
 306 // ExternalTwoByteStringUtf16CharacterStream
 307
 308 ExternalTwoByteStringUtf16CharacterStream::
 309     ~ExternalTwoByteStringUtf16CharacterStream() { }
 310
 311
 312 ExternalTwoByteStringUtf16CharacterStream
 313     ::ExternalTwoByteStringUtf16CharacterStream(
 314         Handle<ExternalTwoByteString> data,
 315         int start_position,
 316         int end_position)
 317     : Utf16CharacterStream(),
 318       source_(data),
 319       raw_data_(data->GetTwoByteData(start_position)) {
 320   buffer_cursor_ = raw_data_,
 321   buffer_end_ = raw_data_ + (end_position - start_position);
 322   pos_ = start_position;
 323 }
 324
 325 } }  // namespace v8::internal