Update To 11.40.268.0
[platform/framework/web/crosswalk.git] / src / v8 / src / scanner-character-streams.cc
index cbef3f9..50c3955 100644 (file)
@@ -1,40 +1,49 @@
 // Copyright 2011 the V8 project authors. All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-//       notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-//       copyright notice, this list of conditions and the following
-//       disclaimer in the documentation and/or other materials provided
-//       with the distribution.
-//     * Neither the name of Google Inc. nor the names of its
-//       contributors may be used to endorse or promote products derived
-//       from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include "v8.h"
-
-#include "scanner-character-streams.h"
-
-#include "handles.h"
-#include "unicode-inl.h"
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/v8.h"
+
+#include "src/scanner-character-streams.h"
+
+#include "include/v8.h"
+#include "src/handles.h"
+#include "src/unicode-inl.h"
 
 namespace v8 {
 namespace internal {
 
+namespace {
+
+unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,
+                         unsigned* src_pos, unsigned src_length,
+                         ScriptCompiler::StreamedSource::Encoding encoding) {
+  // Copies up to |length| code units from |src| (from *src_pos) into |dest|,
+  // advancing *src_pos. Bail out on length 0; the callees may not handle it.
+  if (length == 0) return 0;
+
+  if (encoding == ScriptCompiler::StreamedSource::UTF8) {
+    return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(
+        dest, length, src, src_pos, src_length);
+  }
+
+  unsigned to_fill = length;  // Clamped below to the input still unread.
+  if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos;
+
+  if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) {
+    v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill);
+  } else {
+    DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE);
+    v8::internal::CopyChars<uint16_t, uint16_t>(
+        dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill);
+  }
+  *src_pos += to_fill;  // NOTE(review): byte or uint16_t units for TWO_BYTE?
+  return to_fill;  // Number of UTF-16 code units written to |dest|.
+}
+
+}  // namespace
+
+
 // ----------------------------------------------------------------------------
 // BufferedUtf16CharacterStreams
 
@@ -78,8 +87,8 @@ void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
     buffer_cursor_ = buffer_end_;
   }
   // Ensure that there is room for at least one pushback.
-  ASSERT(buffer_cursor_ > buffer_);
-  ASSERT(pos_ > 0);
+  DCHECK(buffer_cursor_ > buffer_);
+  DCHECK(pos_ > 0);
   buffer_[--buffer_cursor_ - buffer_] = character;
   if (buffer_cursor_ == buffer_) {
     pushback_limit_ = NULL;
@@ -101,7 +110,7 @@ bool BufferedUtf16CharacterStream::ReadBlock() {
     if (buffer_cursor_ < buffer_end_) return true;
     // Otherwise read a new block.
   }
-  unsigned length = FillBuffer(pos_, kBufferSize);
+  unsigned length = FillBuffer(pos_);
   buffer_end_ = buffer_ + length;
   return length > 0;
 }
@@ -125,9 +134,7 @@ GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
     unsigned end_position)
     : string_(data),
       length_(end_position) {
-  ASSERT(end_position >= start_position);
-  buffer_cursor_ = buffer_;
-  buffer_end_ = buffer_;
+  DCHECK(end_position >= start_position);
   pos_ = start_position;
 }
 
@@ -143,9 +150,9 @@ unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
 }
 
 
-unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos,
-                                                      unsigned length) {
+unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos) {
   if (from_pos >= length_) return 0;
+  unsigned length = kBufferSize;
   if (from_pos + length > length_) {
     length = length_ - from_pos;
   }
@@ -170,6 +177,35 @@ Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
 
 
+unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length,
+                                               const byte* src,
+                                               unsigned* src_pos,
+                                               unsigned src_length) {
+  static const unibrow::uchar kMaxUtf16Character = 0xffff;
+  unsigned i = 0;  // Number of UTF-16 code units written to dest so far.
+  // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer
+  // one character early (in the normal case), because we need to have at least
+  // two free spaces in the buffer to be sure that the next character will fit.
+  while (i < length - 1) {  // Unsigned: length == 0 would wrap around here.
+    if (*src_pos == src_length) break;  // Input exhausted.
+    unibrow::uchar c = src[*src_pos];
+    if (c <= unibrow::Utf8::kMaxOneByteChar) {
+      *src_pos = *src_pos + 1;  // One-byte character; consume it directly.
+    } else {
+      c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,
+                                        src_pos);  // Presumably advances it.
+    }
+    if (c > kMaxUtf16Character) {  // Needs a surrogate pair (two units).
+      dest[i++] = unibrow::Utf16::LeadSurrogate(c);
+      dest[i++] = unibrow::Utf16::TrailSurrogate(c);
+    } else {
+      dest[i++] = static_cast<uc16>(c);
+    }
+  }
+  return i;
+}
+
+
 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
   unsigned old_pos = pos_;
   unsigned target_pos = pos_ + delta;
@@ -180,33 +216,15 @@ unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
 }
 
 
-unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
-                                                unsigned length) {
-  static const unibrow::uchar kMaxUtf16Character = 0xffff;
+unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {
   SetRawPosition(char_position);
   if (raw_character_position_ != char_position) {
     // char_position was not a valid position in the stream (hit the end
     // while spooling to it).
     return 0u;
   }
-  unsigned i = 0;
-  while (i < length - 1) {
-    if (raw_data_pos_ == raw_data_length_) break;
-    unibrow::uchar c = raw_data_[raw_data_pos_];
-    if (c <= unibrow::Utf8::kMaxOneByteChar) {
-      raw_data_pos_++;
-    } else {
-      c =  unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
-                                         raw_data_length_ - raw_data_pos_,
-                                         &raw_data_pos_);
-    }
-    if (c > kMaxUtf16Character) {
-      buffer_[i++] = unibrow::Utf16::LeadSurrogate(c);
-      buffer_[i++] = unibrow::Utf16::TrailSurrogate(c);
-    } else {
-      buffer_[i++] = static_cast<uc16>(c);
-    }
-  }
+  unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,
+                         raw_data_length_);  // Advances raw_data_pos_.
   raw_character_position_ = char_position + i;
   return i;
 }
@@ -234,12 +252,12 @@ static bool IsUtf8MultiCharacterFollower(byte later_byte) {
 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
   byte character = buffer[--*cursor];
   if (character > unibrow::Utf8::kMaxOneByteChar) {
-    ASSERT(IsUtf8MultiCharacterFollower(character));
+    DCHECK(IsUtf8MultiCharacterFollower(character));  // Trail byte expected.
     // Last byte of a multi-byte character encoding. Step backwards until
     // pointing to the first byte of the encoding, recognized by having the
     // top two bits set.
     while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
-    ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));
+    DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor]));
   }
 }
 
@@ -255,7 +273,7 @@ static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
     //  110..... - (0xCx, 0xDx) one additional byte (minimum).
     //  1110.... - (0xEx) two additional bytes.
     //  11110... - (0xFx) three additional bytes (maximum).
-    ASSERT(IsUtf8MultiCharacterStart(character));
+    DCHECK(IsUtf8MultiCharacterStart(character));
     // Additional bytes is:
     // 1 if value in range 0xC0 .. 0xDF.
     // 2 if value in range 0xE0 .. 0xEF.
@@ -264,7 +282,7 @@ static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
     unsigned additional_bytes =
         ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
     *cursor += additional_bytes;
-    ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
+    DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
   }
 }
 
@@ -280,12 +298,12 @@ void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
       int old_pos = raw_data_pos_;
       Utf8CharacterBack(raw_data_, &raw_data_pos_);
       raw_character_position_--;
-      ASSERT(old_pos - raw_data_pos_ <= 4);
+      DCHECK(old_pos - raw_data_pos_ <= 4);
       // Step back over both code units for surrogate pairs.
       if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
     } while (raw_character_position_ > target_position);
     // No surrogate pair splitting.
-    ASSERT(raw_character_position_ == target_position);
+    DCHECK(raw_character_position_ == target_position);
     return;
   }
   // Spool forwards in the utf8 buffer.
@@ -294,11 +312,136 @@ void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
     int old_pos = raw_data_pos_;
     Utf8CharacterForward(raw_data_, &raw_data_pos_);
     raw_character_position_++;
-    ASSERT(raw_data_pos_ - old_pos <= 4);
+    DCHECK(raw_data_pos_ - old_pos <= 4);
     if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
   }
   // No surrogate pair splitting.
-  ASSERT(raw_character_position_ == target_position);
+  DCHECK(raw_character_position_ == target_position);
+}
+
+
+unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
+  // Ignore "position" which is the position in the decoded data. Instead,
+  // ExternalStreamingStream keeps track of the position in the raw data.
+  unsigned data_in_buffer = 0;  // UTF-16 code units placed in buffer_ so far.
+  // Note that the UTF-8 decoder might not be able to fill the buffer
+  // completely; it will typically leave the last character empty (see
+  // Utf8ToUtf16CharacterStream::CopyChars).
+  while (data_in_buffer < kBufferSize - 1) {
+    if (current_data_ == NULL) {
+      // GetMoreData will wait until the embedder has enough data. Here's an
+      // interface between the API which uses size_t (which is the correct type
+      // here) and the internal parts which use unsigned. TODO(marja): make the
+      // internal parts use size_t too.
+      current_data_length_ =
+          static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));
+      current_data_offset_ = 0;
+      bool data_ends = current_data_length_ == 0;  // Empty chunk == end.
+
+      // A caveat: a data chunk might end with bytes from an incomplete UTF-8
+      // character (the rest of the bytes will be in the next chunk).
+      if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
+        HandleUtf8SplitCharacters(&data_in_buffer);
+        if (!data_ends && current_data_offset_ == current_data_length_) {
+          // The data stream didn't end, but we used all the data in the
+          // chunk. This will only happen when the chunk was really small. We
+          // don't handle the case where a UTF-8 character is split over several
+          // chunks; in that case V8 won't crash, but it will be a parse error.
+          delete[] current_data_;  // Chunk fully consumed; free it.
+          current_data_ = NULL;
+          current_data_length_ = 0;
+          current_data_offset_ = 0;
+          continue;  // Request a new chunk.
+        }
+      }
+
+      // Did the data stream end?
+      if (data_ends) {
+        DCHECK(utf8_split_char_buffer_length_ == 0);  // No pending split bytes.
+        return data_in_buffer;
+      }
+    }
+
+    // Fill the buffer from current_data_.
+    unsigned new_offset = 0;  // How far this call advanced within the chunk.
+    unsigned new_chars_in_buffer =
+        CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,
+                        current_data_ + current_data_offset_, &new_offset,
+                        current_data_length_ - current_data_offset_, encoding_);
+    data_in_buffer += new_chars_in_buffer;
+    current_data_offset_ += new_offset;
+    DCHECK(data_in_buffer <= kBufferSize);
+
+    // Did we use all the data in the data chunk?
+    if (current_data_offset_ == current_data_length_) {
+      delete[] current_data_;  // Release it so the next pass fetches a chunk.
+      current_data_ = NULL;
+      current_data_length_ = 0;
+      current_data_offset_ = 0;
+    }
+  }
+  return data_in_buffer;
+}
+
+void ExternalStreamingStream::HandleUtf8SplitCharacters(
+    unsigned* data_in_buffer) {
+  // Note the following property of UTF-8 which makes this function possible:
+  // Given any byte, we can always read its local environment (in both
+  // directions) to find out the (possibly multi-byte) character it belongs
+  // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a
+  // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or
+  // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX.
+
+  // First check if we have leftover data from the last chunk.
+  unibrow::uchar c;  // Scratch for the byte currently being examined.
+  if (utf8_split_char_buffer_length_ > 0) {
+    // Move the bytes which are part of the split character (which started in
+    // the previous chunk) into utf8_split_char_buffer_. Note that the
+    // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.
+    while (current_data_offset_ < current_data_length_ &&
+           utf8_split_char_buffer_length_ < 4 &&
+           (c = current_data_[current_data_offset_]) >> 6 == 2) {
+      utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
+      ++utf8_split_char_buffer_length_;
+      ++current_data_offset_;  // These bytes are consumed from the new chunk.
+    }
+
+    // Convert the data in utf8_split_char_buffer_.
+    unsigned new_offset = 0;  // Bytes of the split buffer that got decoded.
+    unsigned new_chars_in_buffer =
+        CopyCharsHelper(buffer_ + *data_in_buffer,
+                        kBufferSize - *data_in_buffer, utf8_split_char_buffer_,
+                        &new_offset, utf8_split_char_buffer_length_, encoding_);
+    *data_in_buffer += new_chars_in_buffer;
+    // Make sure we used all the data.
+    DCHECK(new_offset == utf8_split_char_buffer_length_);
+    DCHECK(*data_in_buffer <= kBufferSize);
+
+    utf8_split_char_buffer_length_ = 0;  // Leftover handled; start afresh.
+  }
+
+  // Move bytes which are part of an incomplete character from the end of the
+  // current chunk to utf8_split_char_buffer_. They will be converted when the
+  // next data chunk arrives. Note that all valid UTF-8 characters are at most 4
+  // bytes long, but if the data is invalid, we can have character values bigger
+  // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
+  while (current_data_length_ > current_data_offset_ &&
+         (c = current_data_[current_data_length_ - 1]) >
+             unibrow::Utf8::kMaxOneByteChar &&
+         utf8_split_char_buffer_length_ < 4) {
+    --current_data_length_;  // Hide the byte from the normal decoding path.
+    ++utf8_split_char_buffer_length_;
+    if (c >= (3 << 6)) {
+      // 3 << 6 = 0b11000000; this is the first byte of the multi-byte
+      // character. No need to copy the previous characters into the conversion
+      // buffer (even if they're multi-byte).
+      break;
+    }
+  }
+  CHECK(utf8_split_char_buffer_length_ <= 4);
+  for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {  // Save tail.
+    utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];
+  }
+}