src/v8/src/scanner-character-streams.cc

   1 // Copyright 2011 the V8 project authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "src/v8.h"
   6
   7 #include "src/scanner-character-streams.h"
   8
   9 #include "include/v8.h"
  10 #include "src/handles.h"
  11 #include "src/unicode-inl.h"
  12
  13 namespace v8 {
  14 namespace internal {
  15
  16 namespace {
  17
  18 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,
  19                          unsigned* src_pos, unsigned src_length,
  20                          ScriptCompiler::StreamedSource::Encoding encoding) {
  21   // It's possible that this will be called with length 0, but don't assume that
  22   // the functions this calls handle it gracefully.
  23   if (length == 0) return 0;
  24
  25   if (encoding == ScriptCompiler::StreamedSource::UTF8) {
  26     return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(
  27         dest, length, src, src_pos, src_length);
  28   }
  29
  30   unsigned to_fill = length;
  31   if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos;
  32
  33   if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) {
  34     v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill);
  35   } else {
  36     DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE);
  37     v8::internal::CopyChars<uint16_t, uint16_t>(
  38         dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill);
  39   }
  40   *src_pos += to_fill;
  41   return to_fill;
  42 }
  43
  44 }  // namespace
  45
  46
  47 // ----------------------------------------------------------------------------
  48 // BufferedUtf16CharacterStreams
  49
  50 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
  51     : Utf16CharacterStream(),
  52       pushback_limit_(NULL) {
  53   // Initialize buffer as being empty. First read will fill the buffer.
  54   buffer_cursor_ = buffer_;
  55   buffer_end_ = buffer_;
  56 }
  57
  58
  59 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
  60
  61 void BufferedUtf16CharacterStream::PushBack(uc32 character) {
  62   if (character == kEndOfInput) {
  63     pos_--;
  64     return;
  65   }
  66   if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
  67     // buffer_ is writable, buffer_cursor_ is const pointer.
  68     buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
  69     pos_--;
  70     return;
  71   }
  72   SlowPushBack(static_cast<uc16>(character));
  73 }
  74
  75
  76 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
  77   // In pushback mode, the end of the buffer contains pushback,
  78   // and the start of the buffer (from buffer start to pushback_limit_)
  79   // contains valid data that comes just after the pushback.
  80   // We NULL the pushback_limit_ if pushing all the way back to the
  81   // start of the buffer.
  82
  83   if (pushback_limit_ == NULL) {
  84     // Enter pushback mode.
  85     pushback_limit_ = buffer_end_;
  86     buffer_end_ = buffer_ + kBufferSize;
  87     buffer_cursor_ = buffer_end_;
  88   }
  89   // Ensure that there is room for at least one pushback.
  90   DCHECK(buffer_cursor_ > buffer_);
  91   DCHECK(pos_ > 0);
  92   buffer_[--buffer_cursor_ - buffer_] = character;
  93   if (buffer_cursor_ == buffer_) {
  94     pushback_limit_ = NULL;
  95   } else if (buffer_cursor_ < pushback_limit_) {
  96     pushback_limit_ = buffer_cursor_;
  97   }
  98   pos_--;
  99 }
 100
 101
 102 bool BufferedUtf16CharacterStream::ReadBlock() {
 103   buffer_cursor_ = buffer_;
 104   if (pushback_limit_ != NULL) {
 105     // Leave pushback mode.
 106     buffer_end_ = pushback_limit_;
 107     pushback_limit_ = NULL;
 108     // If there were any valid characters left at the
 109     // start of the buffer, use those.
 110     if (buffer_cursor_ < buffer_end_) return true;
 111     // Otherwise read a new block.
 112   }
 113   unsigned length = FillBuffer(pos_);
 114   buffer_end_ = buffer_ + length;
 115   return length > 0;
 116 }
 117
 118
 119 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
 120   // Leave pushback mode (i.e., ignore that there might be valid data
 121   // in the buffer before the pushback_limit_ point).
 122   pushback_limit_ = NULL;
 123   return BufferSeekForward(delta);
 124 }
 125
 126
 127 // ----------------------------------------------------------------------------
 128 // GenericStringUtf16CharacterStream
 129
 130
 131 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
 132     Handle<String> data,
 133     unsigned start_position,
 134     unsigned end_position)
 135     : string_(data),
 136       length_(end_position) {
 137   DCHECK(end_position >= start_position);
 138   pos_ = start_position;
 139 }
 140
 141
 142 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
 143
 144
 145 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
 146   unsigned old_pos = pos_;
 147   pos_ = Min(pos_ + delta, length_);
 148   ReadBlock();
 149   return pos_ - old_pos;
 150 }
 151
 152
 153 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos) {
 154   if (from_pos >= length_) return 0;
 155   unsigned length = kBufferSize;
 156   if (from_pos + length > length_) {
 157     length = length_ - from_pos;
 158   }
 159   String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
 160   return length;
 161 }
 162
 163
 164 // ----------------------------------------------------------------------------
 165 // Utf8ToUtf16CharacterStream
 166 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
 167                                                        unsigned length)
 168     : BufferedUtf16CharacterStream(),
 169       raw_data_(data),
 170       raw_data_length_(length),
 171       raw_data_pos_(0),
 172       raw_character_position_(0) {
 173   ReadBlock();
 174 }
 175
 176
 177 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
 178
 179
 180 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length,
 181                                                const byte* src,
 182                                                unsigned* src_pos,
 183                                                unsigned src_length) {
 184   static const unibrow::uchar kMaxUtf16Character = 0xffff;
 185   unsigned i = 0;
 186   // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer
 187   // one character early (in the normal case), because we need to have at least
 188   // two free spaces in the buffer to be sure that the next character will fit.
 189   while (i < length - 1) {
 190     if (*src_pos == src_length) break;
 191     unibrow::uchar c = src[*src_pos];
 192     if (c <= unibrow::Utf8::kMaxOneByteChar) {
 193       *src_pos = *src_pos + 1;
 194     } else {
 195       c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,
 196                                         src_pos);
 197     }
 198     if (c > kMaxUtf16Character) {
 199       dest[i++] = unibrow::Utf16::LeadSurrogate(c);
 200       dest[i++] = unibrow::Utf16::TrailSurrogate(c);
 201     } else {
 202       dest[i++] = static_cast<uc16>(c);
 203     }
 204   }
 205   return i;
 206 }
 207
 208
 209 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
 210   unsigned old_pos = pos_;
 211   unsigned target_pos = pos_ + delta;
 212   SetRawPosition(target_pos);
 213   pos_ = raw_character_position_;
 214   ReadBlock();
 215   return pos_ - old_pos;
 216 }
 217
 218
 219 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {
 220   SetRawPosition(char_position);
 221   if (raw_character_position_ != char_position) {
 222     // char_position was not a valid position in the stream (hit the end
 223     // while spooling to it).
 224     return 0u;
 225   }
 226   unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,
 227                          raw_data_length_);
 228   raw_character_position_ = char_position + i;
 229   return i;
 230 }
 231
 232
 233 static const byte kUtf8MultiByteMask = 0xC0;
 234 static const byte kUtf8MultiByteCharFollower = 0x80;
 235
 236
 237 #ifdef DEBUG
 238 static const byte kUtf8MultiByteCharStart = 0xC0;
 239 static bool IsUtf8MultiCharacterStart(byte first_byte) {
 240   return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
 241 }
 242 #endif
 243
 244
 245 static bool IsUtf8MultiCharacterFollower(byte later_byte) {
 246   return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
 247 }
 248
 249
 250 // Move the cursor back to point at the preceding UTF-8 character start
 251 // in the buffer.
 252 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
 253   byte character = buffer[--*cursor];
 254   if (character > unibrow::Utf8::kMaxOneByteChar) {
 255     DCHECK(IsUtf8MultiCharacterFollower(character));
 256     // Last byte of a multi-byte character encoding. Step backwards until
 257     // pointing to the first byte of the encoding, recognized by having the
 258     // top two bits set.
 259     while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
 260     DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor]));
 261   }
 262 }
 263
 264
 265 // Move the cursor forward to point at the next following UTF-8 character start
 266 // in the buffer.
 267 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
 268   byte character = buffer[(*cursor)++];
 269   if (character > unibrow::Utf8::kMaxOneByteChar) {
 270     // First character of a multi-byte character encoding.
 271     // The number of most-significant one-bits determines the length of the
 272     // encoding:
 273     //  110..... - (0xCx, 0xDx) one additional byte (minimum).
 274     //  1110.... - (0xEx) two additional bytes.
 275     //  11110... - (0xFx) three additional bytes (maximum).
 276     DCHECK(IsUtf8MultiCharacterStart(character));
 277     // Additional bytes is:
 278     // 1 if value in range 0xC0 .. 0xDF.
 279     // 2 if value in range 0xE0 .. 0xEF.
 280     // 3 if value in range 0xF0 .. 0xF7.
 281     // Encode that in a single value.
 282     unsigned additional_bytes =
 283         ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
 284     *cursor += additional_bytes;
 285     DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
 286   }
 287 }
 288
 289
 290 // This can't set a raw position between two surrogate pairs, since there
 291 // is no position in the UTF8 stream that corresponds to that.  This assumes
 292 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If
 293 // it is illegally coded as two 3 byte sequences then there is no problem here.
 294 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
 295   if (raw_character_position_ > target_position) {
 296     // Spool backwards in utf8 buffer.
 297     do {
 298       int old_pos = raw_data_pos_;
 299       Utf8CharacterBack(raw_data_, &raw_data_pos_);
 300       raw_character_position_--;
 301       DCHECK(old_pos - raw_data_pos_ <= 4);
 302       // Step back over both code units for surrogate pairs.
 303       if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
 304     } while (raw_character_position_ > target_position);
 305     // No surrogate pair splitting.
 306     DCHECK(raw_character_position_ == target_position);
 307     return;
 308   }
 309   // Spool forwards in the utf8 buffer.
 310   while (raw_character_position_ < target_position) {
 311     if (raw_data_pos_ == raw_data_length_) return;
 312     int old_pos = raw_data_pos_;
 313     Utf8CharacterForward(raw_data_, &raw_data_pos_);
 314     raw_character_position_++;
 315     DCHECK(raw_data_pos_ - old_pos <= 4);
 316     if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
 317   }
 318   // No surrogate pair splitting.
 319   DCHECK(raw_character_position_ == target_position);
 320 }
 321
 322
 323 unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
 324   // Ignore "position" which is the position in the decoded data. Instead,
 325   // ExternalStreamingStream keeps track of the position in the raw data.
 326   unsigned data_in_buffer = 0;
 327   // Note that the UTF-8 decoder might not be able to fill the buffer
 328   // completely; it will typically leave the last character empty (see
 329   // Utf8ToUtf16CharacterStream::CopyChars).
 330   while (data_in_buffer < kBufferSize - 1) {
 331     if (current_data_ == NULL) {
 332       // GetSomeData will wait until the embedder has enough data. Here's an
 333       // interface between the API which uses size_t (which is the correct type
 334       // here) and the internal parts which use unsigned. TODO(marja): make the
 335       // internal parts use size_t too.
 336       current_data_length_ =
 337           static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));
 338       current_data_offset_ = 0;
 339       bool data_ends = current_data_length_ == 0;
 340
 341       // A caveat: a data chunk might end with bytes from an incomplete UTF-8
 342       // character (the rest of the bytes will be in the next chunk).
 343       if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
 344         HandleUtf8SplitCharacters(&data_in_buffer);
 345         if (!data_ends && current_data_offset_ == current_data_length_) {
 346           // The data stream didn't end, but we used all the data in the
 347           // chunk. This will only happen when the chunk was really small. We
 348           // don't handle the case where a UTF-8 character is split over several
 349           // chunks; in that case V8 won't crash, but it will be a parse error.
 350           delete[] current_data_;
 351           current_data_ = NULL;
 352           current_data_length_ = 0;
 353           current_data_offset_ = 0;
 354           continue;  // Request a new chunk.
 355         }
 356       }
 357
 358       // Did the data stream end?
 359       if (data_ends) {
 360         DCHECK(utf8_split_char_buffer_length_ == 0);
 361         return data_in_buffer;
 362       }
 363     }
 364
 365     // Fill the buffer from current_data_.
 366     unsigned new_offset = 0;
 367     unsigned new_chars_in_buffer =
 368         CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,
 369                         current_data_ + current_data_offset_, &new_offset,
 370                         current_data_length_ - current_data_offset_, encoding_);
 371     data_in_buffer += new_chars_in_buffer;
 372     current_data_offset_ += new_offset;
 373     DCHECK(data_in_buffer <= kBufferSize);
 374
 375     // Did we use all the data in the data chunk?
 376     if (current_data_offset_ == current_data_length_) {
 377       delete[] current_data_;
 378       current_data_ = NULL;
 379       current_data_length_ = 0;
 380       current_data_offset_ = 0;
 381     }
 382   }
 383   return data_in_buffer;
 384 }
 385
 386 void ExternalStreamingStream::HandleUtf8SplitCharacters(
 387     unsigned* data_in_buffer) {
 388   // Note the following property of UTF-8 which makes this function possible:
 389   // Given any byte, we can always read its local environment (in both
 390   // directions) to find out the (possibly multi-byte) character it belongs
 391   // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a
 392   // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or
 393   // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX.
 394
 395   // First check if we have leftover data from the last chunk.
 396   unibrow::uchar c;
 397   if (utf8_split_char_buffer_length_ > 0) {
 398     // Move the bytes which are part of the split character (which started in
 399     // the previous chunk) into utf8_split_char_buffer_. Note that the
 400     // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.
 401     while (current_data_offset_ < current_data_length_ &&
 402            utf8_split_char_buffer_length_ < 4 &&
 403            (c = current_data_[current_data_offset_]) >> 6 == 2) {
 404       utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
 405       ++utf8_split_char_buffer_length_;
 406       ++current_data_offset_;
 407     }
 408
 409     // Convert the data in utf8_split_char_buffer_.
 410     unsigned new_offset = 0;
 411     unsigned new_chars_in_buffer =
 412         CopyCharsHelper(buffer_ + *data_in_buffer,
 413                         kBufferSize - *data_in_buffer, utf8_split_char_buffer_,
 414                         &new_offset, utf8_split_char_buffer_length_, encoding_);
 415     *data_in_buffer += new_chars_in_buffer;
 416     // Make sure we used all the data.
 417     DCHECK(new_offset == utf8_split_char_buffer_length_);
 418     DCHECK(*data_in_buffer <= kBufferSize);
 419
 420     utf8_split_char_buffer_length_ = 0;
 421   }
 422
 423   // Move bytes which are part of an incomplete character from the end of the
 424   // current chunk to utf8_split_char_buffer_. They will be converted when the
 425   // next data chunk arrives. Note that all valid UTF-8 characters are at most 4
 426   // bytes long, but if the data is invalid, we can have character values bigger
 427   // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
 428   while (current_data_length_ > current_data_offset_ &&
 429          (c = current_data_[current_data_length_ - 1]) >
 430              unibrow::Utf8::kMaxOneByteChar &&
 431          utf8_split_char_buffer_length_ < 4) {
 432     --current_data_length_;
 433     ++utf8_split_char_buffer_length_;
 434     if (c >= (3 << 6)) {
 435       // 3 << 6 = 0b11000000; this is the first byte of the multi-byte
 436       // character. No need to copy the previous characters into the conversion
 437       // buffer (even if they're multi-byte).
 438       break;
 439     }
 440   }
 441   CHECK(utf8_split_char_buffer_length_ <= 4);
 442   for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {
 443     utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];
 444   }
 445 }
 446
 447
 448 // ----------------------------------------------------------------------------
 449 // ExternalTwoByteStringUtf16CharacterStream
 450
 451 ExternalTwoByteStringUtf16CharacterStream::
 452     ~ExternalTwoByteStringUtf16CharacterStream() { }
 453
 454
 455 ExternalTwoByteStringUtf16CharacterStream
 456     ::ExternalTwoByteStringUtf16CharacterStream(
 457         Handle<ExternalTwoByteString> data,
 458         int start_position,
 459         int end_position)
 460     : Utf16CharacterStream(),
 461       source_(data),
 462       raw_data_(data->GetTwoByteData(start_position)) {
 463   buffer_cursor_ = raw_data_,
 464   buffer_end_ = raw_data_ + (end_position - start_position);
 465   pos_ = start_position;
 466 }
 467
 468 } }  // namespace v8::internal