From f93f8ded96365c29a60bc79175286e9bbe799fcf Mon Sep 17 00:00:00 2001
From: "dcarney@chromium.org"
 <dcarney@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Date: Mon, 20 Jan 2014 09:52:54 +0000
Subject: [PATCH] String:WriteUtf8: Add REPLACE_INVALID_UTF8 option
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This patch makes String::WriteUtf8 replace invalid code points (i.e. unmatched
surrogates) with the unicode replacement character when REPLACE_INVALID_UTF8 is
set.  This is done to avoid creating invalid UTF-8 output which can lead to
compatibility issues with software requiring valid UTF-8 inputs (e.g. the
WebSocket protocol requires valid UTF-8 and terminates connections when invalid
UTF-8 is encountered).

R=dcarney@chromium.org

BUG=

Review URL: https://codereview.chromium.org/121173009

Patch from Felix GeisendÃ¶rfer <haimuiba@gmail.com>.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@18683 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 include/v8.h            |  6 ++++-
 src/api.cc              | 67 ++++++++++++++++++++++++++++++++++++-------------
 src/unicode-inl.h       | 20 +++++++++++----
 src/unicode.h           | 15 +++++++++--
 test/cctest/test-api.cc | 63 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 145 insertions(+), 26 deletions(-)

diff --git a/include/v8.h b/include/v8.h
index fbdc82f..96d8c6f 100644
--- a/include/v8.h
+++ b/include/v8.h
@@ -1628,7 +1628,11 @@ class V8_EXPORT String : public Primitive {
     NO_OPTIONS = 0,
     HINT_MANY_WRITES_EXPECTED = 1,
     NO_NULL_TERMINATION = 2,
-    PRESERVE_ASCII_NULL = 4
+    PRESERVE_ASCII_NULL = 4,
+    // Used by WriteUtf8 to replace orphan surrogate code units with the
+    // unicode replacement character. Needs to be set to guarantee valid UTF-8
+    // output.
+    REPLACE_INVALID_UTF8 = 8
   };
 
   // 16-bit character codes.
diff --git a/src/api.cc b/src/api.cc
index 50879a1..6b5c1a9 100644
--- a/src/api.cc
+++ b/src/api.cc
@@ -4454,28 +4454,35 @@ int String::Utf8Length() const {
 class Utf8WriterVisitor {
  public:
   Utf8WriterVisitor(
-      char* buffer, int capacity, bool skip_capacity_check)
-      : early_termination_(false),
-        last_character_(unibrow::Utf16::kNoPreviousCharacter),
-        buffer_(buffer),
-        start_(buffer),
-        capacity_(capacity),
-        skip_capacity_check_(capacity == -1 || skip_capacity_check),
-        utf16_chars_read_(0) {
+      char* buffer,
+      int capacity,
+      bool skip_capacity_check,
+      bool replace_invalid_utf8)
+    : early_termination_(false),
+      last_character_(unibrow::Utf16::kNoPreviousCharacter),
+      buffer_(buffer),
+      start_(buffer),
+      capacity_(capacity),
+      skip_capacity_check_(capacity == -1 || skip_capacity_check),
+      replace_invalid_utf8_(replace_invalid_utf8),
+      utf16_chars_read_(0) {
   }
 
   static int WriteEndCharacter(uint16_t character,
                                int last_character,
                                int remaining,
-                               char* const buffer) {
+                               char* const buffer,
+                               bool replace_invalid_utf8) {
     using namespace unibrow;
     ASSERT(remaining > 0);
     // We can't use a local buffer here because Encode needs to modify
     // previous characters in the stream.  We know, however, that
     // exactly one character will be advanced.
-    if (Utf16::IsTrailSurrogate(character) &&
-        Utf16::IsLeadSurrogate(last_character)) {
-      int written = Utf8::Encode(buffer, character, last_character);
+    if (Utf16::IsSurrogatePair(last_character, character)) {
+      int written = Utf8::Encode(buffer,
+                                 character,
+                                 last_character,
+                                 replace_invalid_utf8);
       ASSERT(written == 1);
       return written;
     }
@@ -4484,7 +4491,8 @@ class Utf8WriterVisitor {
     // Can't encode using last_character as gcc has array bounds issues.
     int written = Utf8::Encode(temp_buffer,
                                character,
-                               Utf16::kNoPreviousCharacter);
+                               Utf16::kNoPreviousCharacter,
+                               replace_invalid_utf8);
     // Won't fit.
     if (written > remaining) return 0;
     // Copy over the character from temp_buffer.
@@ -4494,6 +4502,16 @@ class Utf8WriterVisitor {
     return written;
   }
 
+  // Visit writes out a group of code units (chars) of a v8::String to the
+  // internal buffer_. This is done in two phases. The first phase calculates a
+  // pesimistic estimate (writable_length) on how many code units can be safely
+  // written without exceeding the buffer capacity and without writing the last
+  // code unit (it could be a lead surrogate). The estimated number of code
+  // units is then written out in one go, and the reported byte usage is used
+  // to correct the estimate. This is repeated until the estimate becomes <= 0
+  // or all code units have been written out. The second phase writes out code
+  // units until the buffer capacity is reached, would be exceeded by the next
+  // unit, or all units have been written out.
   template<typename Char>
   void Visit(const Char* chars, const int length) {
     using namespace unibrow;
@@ -4531,7 +4549,10 @@ class Utf8WriterVisitor {
       } else {
         for (; i < fast_length; i++) {
           uint16_t character = *chars++;
-          buffer += Utf8::Encode(buffer, character, last_character);
+          buffer += Utf8::Encode(buffer,
+                                 character,
+                                 last_character,
+                                 replace_invalid_utf8_);
           last_character = character;
           ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
         }
@@ -4551,10 +4572,17 @@ class Utf8WriterVisitor {
     ASSERT(remaining_capacity >= 0);
     for (; i < length && remaining_capacity > 0; i++) {
       uint16_t character = *chars++;
+      // remaining_capacity is <= 3 bytes at this point, so we do not write out
+      // an umatched lead surrogate.
+      if (replace_invalid_utf8_ && Utf16::IsLeadSurrogate(character)) {
+        early_termination_ = true;
+        break;
+      }
       int written = WriteEndCharacter(character,
                                       last_character,
                                       remaining_capacity,
-                                      buffer);
+                                      buffer,
+                                      replace_invalid_utf8_);
       if (written == 0) {
         early_termination_ = true;
         break;
@@ -4602,6 +4630,7 @@ class Utf8WriterVisitor {
   char* const start_;
   int capacity_;
   bool const skip_capacity_check_;
+  bool const replace_invalid_utf8_;
   int utf16_chars_read_;
   DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
 };
@@ -4640,9 +4669,11 @@ int String::WriteUtf8(char* buffer,
   }
   const int string_length = str->length();
   bool write_null = !(options & NO_NULL_TERMINATION);
+  bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);
+  int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;
   // First check if we can just write the string without checking capacity.
-  if (capacity == -1 || capacity / 3 >= string_length) {
-    Utf8WriterVisitor writer(buffer, capacity, true);
+  if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) {
+    Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8);
     const int kMaxRecursion = 100;
     bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion);
     if (success) return writer.CompleteWrite(write_null, nchars_ref);
@@ -4670,7 +4701,7 @@ int String::WriteUtf8(char* buffer,
   }
   // Recursive slow path can potentially be unreasonable slow. Flatten.
   str = FlattenGetString(str);
-  Utf8WriterVisitor writer(buffer, capacity, false);
+  Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);
   i::String::VisitFlat(&writer, *str);
   return writer.CompleteWrite(write_null, nchars_ref);
 }
diff --git a/src/unicode-inl.h b/src/unicode-inl.h
index f861f9f..99eca64 100644
--- a/src/unicode-inl.h
+++ b/src/unicode-inl.h
@@ -107,8 +107,14 @@ unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
   return 2;
 }
 
-
-unsigned Utf8::Encode(char* str, uchar c, int previous) {
+// Encode encodes the UTF-16 code units c and previous into the given str
+// buffer, and combines surrogate code units into single code points. If
+// replace_invalid is set to true, orphan surrogate code units will be replaced
+// with kBadChar.
+unsigned Utf8::Encode(char* str,
+                      uchar c,
+                      int previous,
+                      bool replace_invalid) {
   static const int kMask = ~(1 << 6);
   if (c <= kMaxOneByteChar) {
     str[0] = c;
@@ -118,12 +124,16 @@ unsigned Utf8::Encode(char* str, uchar c, int previous) {
     str[1] = 0x80 | (c & kMask);
     return 2;
   } else if (c <= kMaxThreeByteChar) {
-    if (Utf16::IsTrailSurrogate(c) &&
-        Utf16::IsLeadSurrogate(previous)) {
+    if (Utf16::IsSurrogatePair(previous, c)) {
       const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
       return Encode(str - kUnmatchedSize,
                     Utf16::CombineSurrogatePair(previous, c),
-                    Utf16::kNoPreviousCharacter) - kUnmatchedSize;
+                    Utf16::kNoPreviousCharacter,
+                    replace_invalid) - kUnmatchedSize;
+    } else if (replace_invalid &&
+               (Utf16::IsLeadSurrogate(c) ||
+               Utf16::IsTrailSurrogate(c))) {
+      c = kBadChar;
     }
     str[0] = 0xE0 | (c >> 12);
     str[1] = 0x80 | ((c >> 6) & kMask);
diff --git a/src/unicode.h b/src/unicode.h
index 6ba61d0..bb5506d 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -102,6 +102,9 @@ class UnicodeData {
 
 class Utf16 {
  public:
+  static inline bool IsSurrogatePair(int lead, int trail) {
+    return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
+  }
   static inline bool IsLeadSurrogate(int code) {
     if (code == kNoPreviousCharacter) return false;
     return (code & 0xfc00) == 0xd800;
@@ -146,11 +149,16 @@ class Utf8 {
  public:
   static inline uchar Length(uchar chr, int previous);
   static inline unsigned EncodeOneByte(char* out, uint8_t c);
-  static inline unsigned Encode(
-      char* out, uchar c, int previous);
+  static inline unsigned Encode(char* out,
+                                uchar c,
+                                int previous,
+                                bool replace_invalid = false);
   static uchar CalculateValue(const byte* str,
                               unsigned length,
                               unsigned* cursor);
+
+  // The unicode replacement character, used to signal invalid unicode
+  // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
   static const uchar kBadChar = 0xFFFD;
   static const unsigned kMaxEncodedSize   = 4;
   static const unsigned kMaxOneByteChar   = 0x7f;
@@ -162,6 +170,9 @@ class Utf8 {
   // that match are coded as a 4 byte UTF-8 sequence.
   static const unsigned kBytesSavedByCombiningSurrogates = 2;
   static const unsigned kSizeOfUnmatchedSurrogate = 3;
+  // The maximum size a single UTF-16 code unit may take up when encoded as
+  // UTF-8.
+  static const unsigned kMax16BitCodeUnitSize  = 3;
   static inline uchar ValueOf(const byte* str,
                               unsigned length,
                               unsigned* cursor);
diff --git a/test/cctest/test-api.cc b/test/cctest/test-api.cc
index fc4d630..b54aa16 100644
--- a/test/cctest/test-api.cc
+++ b/test/cctest/test-api.cc
@@ -7614,6 +7614,22 @@ THREADED_TEST(StringWrite) {
   v8::Handle<String> str2 = v8_str("abc\303\260\342\230\203");
   v8::Handle<String> str3 = v8::String::NewFromUtf8(
       context->GetIsolate(), "abc\0def", v8::String::kNormalString, 7);
+  // "ab" + lead surrogate + "cd" + trail surrogate + "ef"
+  uint16_t orphans[8] = { 0x61, 0x62, 0xd800, 0x63, 0x64, 0xdc00, 0x65, 0x66 };
+  v8::Handle<String> orphans_str = v8::String::NewFromTwoByte(
+      context->GetIsolate(), orphans, v8::String::kNormalString, 8);
+  // single lead surrogate
+  uint16_t lead[1] = { 0xd800 };
+  v8::Handle<String> lead_str = v8::String::NewFromTwoByte(
+      context->GetIsolate(), lead, v8::String::kNormalString, 1);
+  // single trail surrogate
+  uint16_t trail[1] = { 0xdc00 };
+  v8::Handle<String> trail_str = v8::String::NewFromTwoByte(
+      context->GetIsolate(), trail, v8::String::kNormalString, 1);
+  // surrogate pair
+  uint16_t pair[2] = { 0xd800,  0xdc00 };
+  v8::Handle<String> pair_str = v8::String::NewFromTwoByte(
+      context->GetIsolate(), pair, v8::String::kNormalString, 2);
   const int kStride = 4;  // Must match stride in for loops in JS below.
   CompileRun(
       "var left = '';"
@@ -7687,6 +7703,53 @@ THREADED_TEST(StringWrite) {
   CHECK_EQ(2, charlen);
   CHECK_EQ(0, strncmp(utf8buf, "ab\1", 3));
 
+  // allow orphan surrogates by default
+  memset(utf8buf, 0x1, 1000);
+  len = orphans_str->WriteUtf8(utf8buf, sizeof(utf8buf), &charlen);
+  CHECK_EQ(13, len);
+  CHECK_EQ(8, charlen);
+  CHECK_EQ(0, strcmp(utf8buf, "ab\355\240\200cd\355\260\200ef"));
+
+  // replace orphan surrogates with unicode replacement character
+  memset(utf8buf, 0x1, 1000);
+  len = orphans_str->WriteUtf8(utf8buf,
+                               sizeof(utf8buf),
+                               &charlen,
+                               String::REPLACE_INVALID_UTF8);
+  CHECK_EQ(13, len);
+  CHECK_EQ(8, charlen);
+  CHECK_EQ(0, strcmp(utf8buf, "ab\357\277\275cd\357\277\275ef"));
+
+  // replace single lead surrogate with unicode replacement character
+  memset(utf8buf, 0x1, 1000);
+  len = lead_str->WriteUtf8(utf8buf,
+                            sizeof(utf8buf),
+                            &charlen,
+                            String::REPLACE_INVALID_UTF8);
+  CHECK_EQ(4, len);
+  CHECK_EQ(1, charlen);
+  CHECK_EQ(0, strcmp(utf8buf, "\357\277\275"));
+
+  // replace single trail surrogate with unicode replacement character
+  memset(utf8buf, 0x1, 1000);
+  len = trail_str->WriteUtf8(utf8buf,
+                             sizeof(utf8buf),
+                             &charlen,
+                             String::REPLACE_INVALID_UTF8);
+  CHECK_EQ(4, len);
+  CHECK_EQ(1, charlen);
+  CHECK_EQ(0, strcmp(utf8buf, "\357\277\275"));
+
+  // do not replace / write anything if surrogate pair does not fit the buffer
+  // space
+  memset(utf8buf, 0x1, 1000);
+  len = pair_str->WriteUtf8(utf8buf,
+                             3,
+                             &charlen,
+                             String::REPLACE_INVALID_UTF8);
+  CHECK_EQ(0, len);
+  CHECK_EQ(0, charlen);
+
   memset(utf8buf, 0x1, sizeof(utf8buf));
   len = GetUtf8Length(left_tree);
   int utf8_expected =
-- 
2.7.4