Update To 11.40.268.0

[platform/framework/web/crosswalk.git] / src / v8 / src / scanner-character-streams.cc
diff --git a/src/v8/src/scanner-character-streams.cc b/src/v8/src/scanner-character-streams.cc

index d06f479..50c3955 100644 (file)
--- a/src/v8/src/scanner-character-streams.cc
+++ b/src/v8/src/scanner-character-streams.cc
@@ -18,6 +18,10 @@ namespace {
  unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,
                           unsigned* src_pos, unsigned src_length,
                           ScriptCompiler::StreamedSource::Encoding encoding) {
+  // It's possible that this will be called with length 0, but don't assume that
+  // the functions this calls handle it gracefully.
+  if (length == 0) return 0;
+
    if (encoding == ScriptCompiler::StreamedSource::UTF8) {
      return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(
          dest, length, src, src_pos, src_length);
@@ -381,15 +385,22 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
  
  void ExternalStreamingStream::HandleUtf8SplitCharacters(
      unsigned* data_in_buffer) {
+  // Note the following property of UTF-8 which makes this function possible:
+  // Given any byte, we can always read its local environment (in both
+  // directions) to find out the (possibly multi-byte) character it belongs
+  // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a
+  // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or
+  // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX.
+
    // First check if we have leftover data from the last chunk.
    unibrow::uchar c;
    if (utf8_split_char_buffer_length_ > 0) {
      // Move the bytes which are part of the split character (which started in
-    // the previous chunk) into utf8_split_char_buffer_.
+    // the previous chunk) into utf8_split_char_buffer_. Note that the
+    // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2.
      while (current_data_offset_ < current_data_length_ &&
             utf8_split_char_buffer_length_ < 4 &&
-           (c = current_data_[current_data_offset_]) >
-               unibrow::Utf8::kMaxOneByteChar) {
+           (c = current_data_[current_data_offset_]) >> 6 == 2) {
        utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
        ++utf8_split_char_buffer_length_;
        ++current_data_offset_;
@@ -420,6 +431,12 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters(
           utf8_split_char_buffer_length_ < 4) {
      --current_data_length_;
      ++utf8_split_char_buffer_length_;
+    if (c >= (3 << 6)) {
+      // 3 << 6 = 0b11000000; this is the first byte of the multi-byte
+      // character. No need to copy the previous characters into the conversion
+      // buffer (even if they're multi-byte).
+      break;
+    }
    }
    CHECK(utf8_split_char_buffer_length_ <= 4);
    for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {