src/v8/src/scanner-character-streams.cc

   1 // Copyright 2011 the V8 project authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "src/v8.h"
   6
   7 #include "src/scanner-character-streams.h"
   8
   9 #include "include/v8.h"
  10 #include "src/handles.h"
  11 #include "src/unicode-inl.h"
  12
  13 namespace v8 {
  14 namespace internal {
  15
  16 namespace {
  17
  18 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,
  19                          unsigned* src_pos, unsigned src_length,
  20                          ScriptCompiler::StreamedSource::Encoding encoding) {
  21   if (encoding == ScriptCompiler::StreamedSource::UTF8) {
  22     return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(
  23         dest, length, src, src_pos, src_length);
  24   }
  25
  26   unsigned to_fill = length;
  27   if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos;
  28
  29   if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) {
  30     v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill);
  31   } else {
  32     DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE);
  33     v8::internal::CopyChars<uint16_t, uint16_t>(
  34         dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill);
  35   }
  36   *src_pos += to_fill;
  37   return to_fill;
  38 }
  39
  40 }  // namespace
  41
  42
  43 // ----------------------------------------------------------------------------
  44 // BufferedUtf16CharacterStreams
  45
  46 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
  47     : Utf16CharacterStream(),
  48       pushback_limit_(NULL) {
  49   // Initialize buffer as being empty. First read will fill the buffer.
  50   buffer_cursor_ = buffer_;
  51   buffer_end_ = buffer_;
  52 }
  53
  54
  55 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
  56
  57 void BufferedUtf16CharacterStream::PushBack(uc32 character) {
  58   if (character == kEndOfInput) {
  59     pos_--;
  60     return;
  61   }
  62   if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
  63     // buffer_ is writable, buffer_cursor_ is const pointer.
  64     buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
  65     pos_--;
  66     return;
  67   }
  68   SlowPushBack(static_cast<uc16>(character));
  69 }
  70
  71
  72 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
  73   // In pushback mode, the end of the buffer contains pushback,
  74   // and the start of the buffer (from buffer start to pushback_limit_)
  75   // contains valid data that comes just after the pushback.
  76   // We NULL the pushback_limit_ if pushing all the way back to the
  77   // start of the buffer.
  78
  79   if (pushback_limit_ == NULL) {
  80     // Enter pushback mode.
  81     pushback_limit_ = buffer_end_;
  82     buffer_end_ = buffer_ + kBufferSize;
  83     buffer_cursor_ = buffer_end_;
  84   }
  85   // Ensure that there is room for at least one pushback.
  86   DCHECK(buffer_cursor_ > buffer_);
  87   DCHECK(pos_ > 0);
  88   buffer_[--buffer_cursor_ - buffer_] = character;
  89   if (buffer_cursor_ == buffer_) {
  90     pushback_limit_ = NULL;
  91   } else if (buffer_cursor_ < pushback_limit_) {
  92     pushback_limit_ = buffer_cursor_;
  93   }
  94   pos_--;
  95 }
  96
  97
  98 bool BufferedUtf16CharacterStream::ReadBlock() {
  99   buffer_cursor_ = buffer_;
 100   if (pushback_limit_ != NULL) {
 101     // Leave pushback mode.
 102     buffer_end_ = pushback_limit_;
 103     pushback_limit_ = NULL;
 104     // If there were any valid characters left at the
 105     // start of the buffer, use those.
 106     if (buffer_cursor_ < buffer_end_) return true;
 107     // Otherwise read a new block.
 108   }
 109   unsigned length = FillBuffer(pos_);
 110   buffer_end_ = buffer_ + length;
 111   return length > 0;
 112 }
 113
 114
 115 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
 116   // Leave pushback mode (i.e., ignore that there might be valid data
 117   // in the buffer before the pushback_limit_ point).
 118   pushback_limit_ = NULL;
 119   return BufferSeekForward(delta);
 120 }
 121
 122
 123 // ----------------------------------------------------------------------------
 124 // GenericStringUtf16CharacterStream
 125
 126
 127 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
 128     Handle<String> data,
 129     unsigned start_position,
 130     unsigned end_position)
 131     : string_(data),
 132       length_(end_position) {
 133   DCHECK(end_position >= start_position);
 134   pos_ = start_position;
 135 }
 136
 137
 138 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
 139
 140
 141 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
 142   unsigned old_pos = pos_;
 143   pos_ = Min(pos_ + delta, length_);
 144   ReadBlock();
 145   return pos_ - old_pos;
 146 }
 147
 148
 149 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos) {
 150   if (from_pos >= length_) return 0;
 151   unsigned length = kBufferSize;
 152   if (from_pos + length > length_) {
 153     length = length_ - from_pos;
 154   }
 155   String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
 156   return length;
 157 }
 158
 159
 160 // ----------------------------------------------------------------------------
 161 // Utf8ToUtf16CharacterStream
 162 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
 163                                                        unsigned length)
 164     : BufferedUtf16CharacterStream(),
 165       raw_data_(data),
 166       raw_data_length_(length),
 167       raw_data_pos_(0),
 168       raw_character_position_(0) {
 169   ReadBlock();
 170 }
 171
 172
 173 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
 174
 175
 176 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length,
 177                                                const byte* src,
 178                                                unsigned* src_pos,
 179                                                unsigned src_length) {
 180   static const unibrow::uchar kMaxUtf16Character = 0xffff;
 181   unsigned i = 0;
 182   // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer
 183   // one character early (in the normal case), because we need to have at least
 184   // two free spaces in the buffer to be sure that the next character will fit.
 185   while (i < length - 1) {
 186     if (*src_pos == src_length) break;
 187     unibrow::uchar c = src[*src_pos];
 188     if (c <= unibrow::Utf8::kMaxOneByteChar) {
 189       *src_pos = *src_pos + 1;
 190     } else {
 191       c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,
 192                                         src_pos);
 193     }
 194     if (c > kMaxUtf16Character) {
 195       dest[i++] = unibrow::Utf16::LeadSurrogate(c);
 196       dest[i++] = unibrow::Utf16::TrailSurrogate(c);
 197     } else {
 198       dest[i++] = static_cast<uc16>(c);
 199     }
 200   }
 201   return i;
 202 }
 203
 204
 205 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
 206   unsigned old_pos = pos_;
 207   unsigned target_pos = pos_ + delta;
 208   SetRawPosition(target_pos);
 209   pos_ = raw_character_position_;
 210   ReadBlock();
 211   return pos_ - old_pos;
 212 }
 213
 214
 215 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {
 216   SetRawPosition(char_position);
 217   if (raw_character_position_ != char_position) {
 218     // char_position was not a valid position in the stream (hit the end
 219     // while spooling to it).
 220     return 0u;
 221   }
 222   unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,
 223                          raw_data_length_);
 224   raw_character_position_ = char_position + i;
 225   return i;
 226 }
 227
 228
// Mask selecting the two most significant bits of a byte.
static const byte kUtf8MultiByteMask = 0xC0;
// Masked value (bit pattern 10xxxxxx) that IsUtf8MultiCharacterFollower
// compares against.
static const byte kUtf8MultiByteCharFollower = 0x80;


#ifdef DEBUG
// Masked value (bit pattern 11xxxxxx) that IsUtf8MultiCharacterStart compares
// against.
static const byte kUtf8MultiByteCharStart = 0xC0;
// Returns true if the two most significant bits of |first_byte| are both set.
// Only compiled in DEBUG builds; in this file it is used from DCHECKs.
static bool IsUtf8MultiCharacterStart(byte first_byte) {
  return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
}
#endif


// Returns true if the two most significant bits of |later_byte| are 1 and 0,
// i.e. the byte has the form 10xxxxxx.
static bool IsUtf8MultiCharacterFollower(byte later_byte) {
  return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
}
 244
 245
 246 // Move the cursor back to point at the preceding UTF-8 character start
 247 // in the buffer.
 248 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
 249   byte character = buffer[--*cursor];
 250   if (character > unibrow::Utf8::kMaxOneByteChar) {
 251     DCHECK(IsUtf8MultiCharacterFollower(character));
 252     // Last byte of a multi-byte character encoding. Step backwards until
 253     // pointing to the first byte of the encoding, recognized by having the
 254     // top two bits set.
 255     while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
 256     DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor]));
 257   }
 258 }
 259
 260
 261 // Move the cursor forward to point at the next following UTF-8 character start
 262 // in the buffer.
 263 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
 264   byte character = buffer[(*cursor)++];
 265   if (character > unibrow::Utf8::kMaxOneByteChar) {
 266     // First character of a multi-byte character encoding.
 267     // The number of most-significant one-bits determines the length of the
 268     // encoding:
 269     //  110..... - (0xCx, 0xDx) one additional byte (minimum).
 270     //  1110.... - (0xEx) two additional bytes.
 271     //  11110... - (0xFx) three additional bytes (maximum).
 272     DCHECK(IsUtf8MultiCharacterStart(character));
 273     // Additional bytes is:
 274     // 1 if value in range 0xC0 .. 0xDF.
 275     // 2 if value in range 0xE0 .. 0xEF.
 276     // 3 if value in range 0xF0 .. 0xF7.
 277     // Encode that in a single value.
 278     unsigned additional_bytes =
 279         ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
 280     *cursor += additional_bytes;
 281     DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
 282   }
 283 }
 284
 285
 286 // This can't set a raw position between two surrogate pairs, since there
 287 // is no position in the UTF8 stream that corresponds to that.  This assumes
 288 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If
 289 // it is illegally coded as two 3 byte sequences then there is no problem here.
 290 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
 291   if (raw_character_position_ > target_position) {
 292     // Spool backwards in utf8 buffer.
 293     do {
 294       int old_pos = raw_data_pos_;
 295       Utf8CharacterBack(raw_data_, &raw_data_pos_);
 296       raw_character_position_--;
 297       DCHECK(old_pos - raw_data_pos_ <= 4);
 298       // Step back over both code units for surrogate pairs.
 299       if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
 300     } while (raw_character_position_ > target_position);
 301     // No surrogate pair splitting.
 302     DCHECK(raw_character_position_ == target_position);
 303     return;
 304   }
 305   // Spool forwards in the utf8 buffer.
 306   while (raw_character_position_ < target_position) {
 307     if (raw_data_pos_ == raw_data_length_) return;
 308     int old_pos = raw_data_pos_;
 309     Utf8CharacterForward(raw_data_, &raw_data_pos_);
 310     raw_character_position_++;
 311     DCHECK(raw_data_pos_ - old_pos <= 4);
 312     if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
 313   }
 314   // No surrogate pair splitting.
 315   DCHECK(raw_character_position_ == target_position);
 316 }
 317
 318
// Fills buffer_ with decoded characters taken from the chunks delivered by
// source_stream_, requesting further chunks as needed. Returns the number of
// UTF-16 code units placed in buffer_; the loop ends when the buffer is
// (nearly) full or the source reports the end of data.
unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
  // Ignore "position" which is the position in the decoded data. Instead,
  // ExternalStreamingStream keeps track of the position in the raw data.
  unsigned data_in_buffer = 0;
  // Note that the UTF-8 decoder might not be able to fill the buffer
  // completely; it will typically leave the last character empty (see
  // Utf8ToUtf16CharacterStream::CopyChars).
  while (data_in_buffer < kBufferSize - 1) {
    if (current_data_ == NULL) {
      // No chunk is currently held: ask the source for the next one.
      // Here's an interface between the API which uses size_t (which is the
      // correct type here) and the internal parts which use unsigned.
      // TODO(marja): make the internal parts use size_t too.
      // NOTE(review): presumably GetMoreData blocks until the embedder has
      // data available - confirm against the API documentation.
      current_data_length_ =
          static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));
      current_data_offset_ = 0;
      // A zero-length result is treated as the end of the data stream.
      bool data_ends = current_data_length_ == 0;

      // A caveat: a data chunk might end with bytes from an incomplete UTF-8
      // character (the rest of the bytes will be in the next chunk).
      if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
        // This may consume bytes from the front of the chunk (advancing
        // current_data_offset_), trim bytes off its end (reducing
        // current_data_length_) and append decoded characters to buffer_.
        HandleUtf8SplitCharacters(&data_in_buffer);
        if (!data_ends && current_data_offset_ == current_data_length_) {
          // The data stream didn't end, but we used all the data in the
          // chunk. This will only happen when the chunk was really small. We
          // don't handle the case where a UTF-8 character is split over several
          // chunks; in that case V8 won't crash, but it will be a parse error.
          delete[] current_data_;
          current_data_ = NULL;
          current_data_length_ = 0;
          current_data_offset_ = 0;
          continue;  // Request a new chunk.
        }
      }

      // Did the data stream end?
      if (data_ends) {
        // Any bytes held back from the previous chunk must have been
        // converted by now.
        DCHECK(utf8_split_char_buffer_length_ == 0);
        return data_in_buffer;
      }
    }

    // Fill the buffer from current_data_.
    // new_offset receives the number of chunk bytes consumed by this call,
    // counted from current_data_ + current_data_offset_.
    unsigned new_offset = 0;
    unsigned new_chars_in_buffer =
        CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,
                        current_data_ + current_data_offset_, &new_offset,
                        current_data_length_ - current_data_offset_, encoding_);
    data_in_buffer += new_chars_in_buffer;
    current_data_offset_ += new_offset;
    DCHECK(data_in_buffer <= kBufferSize);

    // Did we use all the data in the data chunk?
    if (current_data_offset_ == current_data_length_) {
      // Release the chunk; the next iteration (or call) requests a new one.
      delete[] current_data_;
      current_data_ = NULL;
      current_data_length_ = 0;
      current_data_offset_ = 0;
    }
  }
  return data_in_buffer;
}
 381
 382 void ExternalStreamingStream::HandleUtf8SplitCharacters(
 383     unsigned* data_in_buffer) {
 384   // First check if we have leftover data from the last chunk.
 385   unibrow::uchar c;
 386   if (utf8_split_char_buffer_length_ > 0) {
 387     // Move the bytes which are part of the split character (which started in
 388     // the previous chunk) into utf8_split_char_buffer_.
 389     while (current_data_offset_ < current_data_length_ &&
 390            utf8_split_char_buffer_length_ < 4 &&
 391            (c = current_data_[current_data_offset_]) >
 392                unibrow::Utf8::kMaxOneByteChar) {
 393       utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
 394       ++utf8_split_char_buffer_length_;
 395       ++current_data_offset_;
 396     }
 397
 398     // Convert the data in utf8_split_char_buffer_.
 399     unsigned new_offset = 0;
 400     unsigned new_chars_in_buffer =
 401         CopyCharsHelper(buffer_ + *data_in_buffer,
 402                         kBufferSize - *data_in_buffer, utf8_split_char_buffer_,
 403                         &new_offset, utf8_split_char_buffer_length_, encoding_);
 404     *data_in_buffer += new_chars_in_buffer;
 405     // Make sure we used all the data.
 406     DCHECK(new_offset == utf8_split_char_buffer_length_);
 407     DCHECK(*data_in_buffer <= kBufferSize);
 408
 409     utf8_split_char_buffer_length_ = 0;
 410   }
 411
 412   // Move bytes which are part of an incomplete character from the end of the
 413   // current chunk to utf8_split_char_buffer_. They will be converted when the
 414   // next data chunk arrives. Note that all valid UTF-8 characters are at most 4
 415   // bytes long, but if the data is invalid, we can have character values bigger
 416   // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
 417   while (current_data_length_ > current_data_offset_ &&
 418          (c = current_data_[current_data_length_ - 1]) >
 419              unibrow::Utf8::kMaxOneByteChar &&
 420          utf8_split_char_buffer_length_ < 4) {
 421     --current_data_length_;
 422     ++utf8_split_char_buffer_length_;
 423   }
 424   CHECK(utf8_split_char_buffer_length_ <= 4);
 425   for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {
 426     utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];
 427   }
 428 }
 429
 430
 431 // ----------------------------------------------------------------------------
 432 // ExternalTwoByteStringUtf16CharacterStream
 433
 434 ExternalTwoByteStringUtf16CharacterStream::
 435     ~ExternalTwoByteStringUtf16CharacterStream() { }
 436
 437
 438 ExternalTwoByteStringUtf16CharacterStream
 439     ::ExternalTwoByteStringUtf16CharacterStream(
 440         Handle<ExternalTwoByteString> data,
 441         int start_position,
 442         int end_position)
 443     : Utf16CharacterStream(),
 444       source_(data),
 445       raw_data_(data->GetTwoByteData(start_position)) {
 446   buffer_cursor_ = raw_data_,
 447   buffer_end_ = raw_data_ + (end_position - start_position);
 448   pos_ = start_position;
 449 }
 450
 451 } }  // namespace v8::internal