From f93f8ded96365c29a60bc79175286e9bbe799fcf Mon Sep 17 00:00:00 2001 From: "dcarney@chromium.org" Date: Mon, 20 Jan 2014 09:52:54 +0000 Subject: [PATCH] String:WriteUtf8: Add REPLACE_INVALID_UTF8 option MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch makes String::WriteUtf8 replace invalid code points (i.e. unmatched surrogates) with the unicode replacement character when REPLACE_INVALID_UTF8 is set. This is done to avoid creating invalid UTF-8 output which can lead to compatibility issues with software requiring valid UTF-8 inputs (e.g. the WebSocket protocol requires valid UTF-8 and terminates connections when invalid UTF-8 is encountered). R=dcarney@chromium.org BUG= Review URL: https://codereview.chromium.org/121173009 Patch from Felix Geisendörfer . git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@18683 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- include/v8.h | 6 ++++- src/api.cc | 67 ++++++++++++++++++++++++++++++++++++------------- src/unicode-inl.h | 20 +++++++++++---- src/unicode.h | 15 +++++++++-- test/cctest/test-api.cc | 63 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 145 insertions(+), 26 deletions(-) diff --git a/include/v8.h b/include/v8.h index fbdc82f..96d8c6f 100644 --- a/include/v8.h +++ b/include/v8.h @@ -1628,7 +1628,11 @@ class V8_EXPORT String : public Primitive { NO_OPTIONS = 0, HINT_MANY_WRITES_EXPECTED = 1, NO_NULL_TERMINATION = 2, - PRESERVE_ASCII_NULL = 4 + PRESERVE_ASCII_NULL = 4, + // Used by WriteUtf8 to replace orphan surrogate code units with the + // unicode replacement character. Needs to be set to guarantee valid UTF-8 + // output. + REPLACE_INVALID_UTF8 = 8 }; // 16-bit character codes. diff --git a/src/api.cc b/src/api.cc index 50879a1..6b5c1a9 100644 --- a/src/api.cc +++ b/src/api.cc @@ -4454,28 +4454,35 @@ int String::Utf8Length() const { class Utf8WriterVisitor { public: Utf8WriterVisitor( - char* buffer, int capacity, bool skip_capacity_check) - : early_termination_(false), - last_character_(unibrow::Utf16::kNoPreviousCharacter), - buffer_(buffer), - start_(buffer), - capacity_(capacity), - skip_capacity_check_(capacity == -1 || skip_capacity_check), - utf16_chars_read_(0) { + char* buffer, + int capacity, + bool skip_capacity_check, + bool replace_invalid_utf8) + : early_termination_(false), + last_character_(unibrow::Utf16::kNoPreviousCharacter), + buffer_(buffer), + start_(buffer), + capacity_(capacity), + skip_capacity_check_(capacity == -1 || skip_capacity_check), + replace_invalid_utf8_(replace_invalid_utf8), + utf16_chars_read_(0) { } static int WriteEndCharacter(uint16_t character, int last_character, int remaining, - char* const buffer) { + char* const buffer, + bool replace_invalid_utf8) { using namespace unibrow; ASSERT(remaining > 0); // We can't use a local buffer here because Encode needs to modify // previous characters in the stream. We know, however, that // exactly one character will be advanced. - if (Utf16::IsTrailSurrogate(character) && - Utf16::IsLeadSurrogate(last_character)) { - int written = Utf8::Encode(buffer, character, last_character); + if (Utf16::IsSurrogatePair(last_character, character)) { + int written = Utf8::Encode(buffer, + character, + last_character, + replace_invalid_utf8); ASSERT(written == 1); return written; } @@ -4484,7 +4491,8 @@ class Utf8WriterVisitor { // Can't encode using last_character as gcc has array bounds issues. int written = Utf8::Encode(temp_buffer, character, - Utf16::kNoPreviousCharacter); + Utf16::kNoPreviousCharacter, + replace_invalid_utf8); // Won't fit. if (written > remaining) return 0; // Copy over the character from temp_buffer. @@ -4494,6 +4502,16 @@ class Utf8WriterVisitor { return written; } + // Visit writes out a group of code units (chars) of a v8::String to the + // internal buffer_. This is done in two phases. The first phase calculates a + // pesimistic estimate (writable_length) on how many code units can be safely + // written without exceeding the buffer capacity and without writing the last + // code unit (it could be a lead surrogate). The estimated number of code + // units is then written out in one go, and the reported byte usage is used + // to correct the estimate. This is repeated until the estimate becomes <= 0 + // or all code units have been written out. The second phase writes out code + // units until the buffer capacity is reached, would be exceeded by the next + // unit, or all units have been written out. template void Visit(const Char* chars, const int length) { using namespace unibrow; @@ -4531,7 +4549,10 @@ class Utf8WriterVisitor { } else { for (; i < fast_length; i++) { uint16_t character = *chars++; - buffer += Utf8::Encode(buffer, character, last_character); + buffer += Utf8::Encode(buffer, + character, + last_character, + replace_invalid_utf8_); last_character = character; ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_); } @@ -4551,10 +4572,17 @@ class Utf8WriterVisitor { ASSERT(remaining_capacity >= 0); for (; i < length && remaining_capacity > 0; i++) { uint16_t character = *chars++; + // remaining_capacity is <= 3 bytes at this point, so we do not write out + // an umatched lead surrogate. + if (replace_invalid_utf8_ && Utf16::IsLeadSurrogate(character)) { + early_termination_ = true; + break; + } int written = WriteEndCharacter(character, last_character, remaining_capacity, - buffer); + buffer, + replace_invalid_utf8_); if (written == 0) { early_termination_ = true; break; @@ -4602,6 +4630,7 @@ class Utf8WriterVisitor { char* const start_; int capacity_; bool const skip_capacity_check_; + bool const replace_invalid_utf8_; int utf16_chars_read_; DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor); }; @@ -4640,9 +4669,11 @@ int String::WriteUtf8(char* buffer, } const int string_length = str->length(); bool write_null = !(options & NO_NULL_TERMINATION); + bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8); + int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize; // First check if we can just write the string without checking capacity. - if (capacity == -1 || capacity / 3 >= string_length) { - Utf8WriterVisitor writer(buffer, capacity, true); + if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) { + Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8); const int kMaxRecursion = 100; bool success = RecursivelySerializeToUtf8(*str, &writer, kMaxRecursion); if (success) return writer.CompleteWrite(write_null, nchars_ref); @@ -4670,7 +4701,7 @@ int String::WriteUtf8(char* buffer, } // Recursive slow path can potentially be unreasonable slow. Flatten. str = FlattenGetString(str); - Utf8WriterVisitor writer(buffer, capacity, false); + Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8); i::String::VisitFlat(&writer, *str); return writer.CompleteWrite(write_null, nchars_ref); } diff --git a/src/unicode-inl.h b/src/unicode-inl.h index f861f9f..99eca64 100644 --- a/src/unicode-inl.h +++ b/src/unicode-inl.h @@ -107,8 +107,14 @@ unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { return 2; } - -unsigned Utf8::Encode(char* str, uchar c, int previous) { +// Encode encodes the UTF-16 code units c and previous into the given str +// buffer, and combines surrogate code units into single code points. If +// replace_invalid is set to true, orphan surrogate code units will be replaced +// with kBadChar. +unsigned Utf8::Encode(char* str, + uchar c, + int previous, + bool replace_invalid) { static const int kMask = ~(1 << 6); if (c <= kMaxOneByteChar) { str[0] = c; @@ -118,12 +124,16 @@ unsigned Utf8::Encode(char* str, uchar c, int previous) { str[1] = 0x80 | (c & kMask); return 2; } else if (c <= kMaxThreeByteChar) { - if (Utf16::IsTrailSurrogate(c) && - Utf16::IsLeadSurrogate(previous)) { + if (Utf16::IsSurrogatePair(previous, c)) { const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; return Encode(str - kUnmatchedSize, Utf16::CombineSurrogatePair(previous, c), - Utf16::kNoPreviousCharacter) - kUnmatchedSize; + Utf16::kNoPreviousCharacter, + replace_invalid) - kUnmatchedSize; + } else if (replace_invalid && + (Utf16::IsLeadSurrogate(c) || + Utf16::IsTrailSurrogate(c))) { + c = kBadChar; } str[0] = 0xE0 | (c >> 12); str[1] = 0x80 | ((c >> 6) & kMask); diff --git a/src/unicode.h b/src/unicode.h index 6ba61d0..bb5506d 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -102,6 +102,9 @@ class UnicodeData { class Utf16 { public: + static inline bool IsSurrogatePair(int lead, int trail) { + return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); + } static inline bool IsLeadSurrogate(int code) { if (code == kNoPreviousCharacter) return false; return (code & 0xfc00) == 0xd800; @@ -146,11 +149,16 @@ class Utf8 { public: static inline uchar Length(uchar chr, int previous); static inline unsigned EncodeOneByte(char* out, uint8_t c); - static inline unsigned Encode( - char* out, uchar c, int previous); + static inline unsigned Encode(char* out, + uchar c, + int previous, + bool replace_invalid = false); static uchar CalculateValue(const byte* str, unsigned length, unsigned* cursor); + + // The unicode replacement character, used to signal invalid unicode + // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding. static const uchar kBadChar = 0xFFFD; static const unsigned kMaxEncodedSize = 4; static const unsigned kMaxOneByteChar = 0x7f; @@ -162,6 +170,9 @@ class Utf8 { // that match are coded as a 4 byte UTF-8 sequence. static const unsigned kBytesSavedByCombiningSurrogates = 2; static const unsigned kSizeOfUnmatchedSurrogate = 3; + // The maximum size a single UTF-16 code unit may take up when encoded as + // UTF-8. + static const unsigned kMax16BitCodeUnitSize = 3; static inline uchar ValueOf(const byte* str, unsigned length, unsigned* cursor); diff --git a/test/cctest/test-api.cc b/test/cctest/test-api.cc index fc4d630..b54aa16 100644 --- a/test/cctest/test-api.cc +++ b/test/cctest/test-api.cc @@ -7614,6 +7614,22 @@ THREADED_TEST(StringWrite) { v8::Handle str2 = v8_str("abc\303\260\342\230\203"); v8::Handle str3 = v8::String::NewFromUtf8( context->GetIsolate(), "abc\0def", v8::String::kNormalString, 7); + // "ab" + lead surrogate + "cd" + trail surrogate + "ef" + uint16_t orphans[8] = { 0x61, 0x62, 0xd800, 0x63, 0x64, 0xdc00, 0x65, 0x66 }; + v8::Handle orphans_str = v8::String::NewFromTwoByte( + context->GetIsolate(), orphans, v8::String::kNormalString, 8); + // single lead surrogate + uint16_t lead[1] = { 0xd800 }; + v8::Handle lead_str = v8::String::NewFromTwoByte( + context->GetIsolate(), lead, v8::String::kNormalString, 1); + // single trail surrogate + uint16_t trail[1] = { 0xdc00 }; + v8::Handle trail_str = v8::String::NewFromTwoByte( + context->GetIsolate(), trail, v8::String::kNormalString, 1); + // surrogate pair + uint16_t pair[2] = { 0xd800, 0xdc00 }; + v8::Handle pair_str = v8::String::NewFromTwoByte( + context->GetIsolate(), pair, v8::String::kNormalString, 2); const int kStride = 4; // Must match stride in for loops in JS below. CompileRun( "var left = '';" @@ -7687,6 +7703,53 @@ THREADED_TEST(StringWrite) { CHECK_EQ(2, charlen); CHECK_EQ(0, strncmp(utf8buf, "ab\1", 3)); + // allow orphan surrogates by default + memset(utf8buf, 0x1, 1000); + len = orphans_str->WriteUtf8(utf8buf, sizeof(utf8buf), &charlen); + CHECK_EQ(13, len); + CHECK_EQ(8, charlen); + CHECK_EQ(0, strcmp(utf8buf, "ab\355\240\200cd\355\260\200ef")); + + // replace orphan surrogates with unicode replacement character + memset(utf8buf, 0x1, 1000); + len = orphans_str->WriteUtf8(utf8buf, + sizeof(utf8buf), + &charlen, + String::REPLACE_INVALID_UTF8); + CHECK_EQ(13, len); + CHECK_EQ(8, charlen); + CHECK_EQ(0, strcmp(utf8buf, "ab\357\277\275cd\357\277\275ef")); + + // replace single lead surrogate with unicode replacement character + memset(utf8buf, 0x1, 1000); + len = lead_str->WriteUtf8(utf8buf, + sizeof(utf8buf), + &charlen, + String::REPLACE_INVALID_UTF8); + CHECK_EQ(4, len); + CHECK_EQ(1, charlen); + CHECK_EQ(0, strcmp(utf8buf, "\357\277\275")); + + // replace single trail surrogate with unicode replacement character + memset(utf8buf, 0x1, 1000); + len = trail_str->WriteUtf8(utf8buf, + sizeof(utf8buf), + &charlen, + String::REPLACE_INVALID_UTF8); + CHECK_EQ(4, len); + CHECK_EQ(1, charlen); + CHECK_EQ(0, strcmp(utf8buf, "\357\277\275")); + + // do not replace / write anything if surrogate pair does not fit the buffer + // space + memset(utf8buf, 0x1, 1000); + len = pair_str->WriteUtf8(utf8buf, + 3, + &charlen, + String::REPLACE_INVALID_UTF8); + CHECK_EQ(0, len); + CHECK_EQ(0, charlen); + memset(utf8buf, 0x1, sizeof(utf8buf)); len = GetUtf8Length(left_tree); int utf8_expected = -- 2.7.4