Script streaming: fix split UTF-8 character handling.

author marja@chromium.org <marja@chromium.org>

Fri, 26 Sep 2014 11:17:31 +0000 (11:17 +0000)

committer marja@chromium.org <marja@chromium.org>

Fri, 26 Sep 2014 11:17:31 +0000 (11:17 +0000)
author marja@chromium.org <marja@chromium.org>
Fri, 26 Sep 2014 11:17:31 +0000 (11:17 +0000)
committer marja@chromium.org <marja@chromium.org>
Fri, 26 Sep 2014 11:17:31 +0000 (11:17 +0000)
diff --git a/src/scanner-character-streams.cc b/src/scanner-character-streams.cc

index 31b4ee4..d06f479 100644 (file)
--- a/src/scanner-character-streams.cc
+++ b/src/scanner-character-streams.cc
@@ -411,13 +411,17 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters(
  
    // Move bytes which are part of an incomplete character from the end of the
    // current chunk to utf8_split_char_buffer_. They will be converted when the
-  // next data chunk arrives.
+  // next data chunk arrives. Note that all valid UTF-8 characters are at most 4
+  // bytes long, but if the data is invalid, we can have character values bigger
+  // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
    while (current_data_length_ > current_data_offset_ &&
           (c = current_data_[current_data_length_ - 1]) >
-             unibrow::Utf8::kMaxOneByteChar) {
+             unibrow::Utf8::kMaxOneByteChar &&
+         utf8_split_char_buffer_length_ < 4) {
      --current_data_length_;
      ++utf8_split_char_buffer_length_;
    }
+  CHECK(utf8_split_char_buffer_length_ <= 4);
    for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {
      utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];
    }
diff --git a/test/cctest/test-api.cc b/test/cctest/test-api.cc

index b4cf5cf..66dc5a0 100644 (file)
--- a/test/cctest/test-api.cc
+++ b/test/cctest/test-api.cc
@@ -23381,3 +23381,23 @@ TEST(StreamingProducesParserCache) {
    CHECK(cached_data->data != NULL);
    CHECK_GT(cached_data->length, 0);
  }
+
+
+TEST(StreamingScriptWithInvalidUtf8) {
+  // Regression test: a run of 5 non-ASCII (invalid UTF-8) bytes at the end of
+  // a chunk, continued by the next chunk, must not crash script streaming.
+  const char* reference = "\xeb\x91\x80\x80\x80";
+  char chunk1[] =
+      "function foo() {\n"
+      "  // This function will contain an UTF-8 character which is not in\n"
+      "  // ASCII.\n"
+      "  var foobXXXXX";  // Too many bytes which look like incomplete chars!
+  char chunk2[] =
+      "r = 13;\n"
+      "  return foob\xeb\x91\x80\x80\x80r;\n"
+      "}\n";
+  for (int i = 0; i < 5; ++i) chunk1[strlen(chunk1) - 5 + i] = reference[i];
+
+  const char* chunks[] = {chunk1, chunk2, "foo();", NULL};
+  RunStreamingTest(chunks, v8::ScriptCompiler::StreamedSource::UTF8, false);
+}
author	marja@chromium.org <marja@chromium.org>
	Fri, 26 Sep 2014 11:17:31 +0000 (11:17 +0000)
committer	marja@chromium.org <marja@chromium.org>
	Fri, 26 Sep 2014 11:17:31 +0000 (11:17 +0000)
src/scanner-character-streams.cc		patch \| blob \| history
test/cctest/test-api.cc		patch \| blob \| history