From: Vedant Kumar Date: Wed, 3 Jun 2020 18:52:29 +0000 (-0700) Subject: [lldb/StringPrinter] Support strings with invalid utf8 sub-sequences X-Git-Tag: llvmorg-12-init~4244 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=4699a7e23010b7c0df49b64f8bea63919825a787;p=platform%2Fupstream%2Fllvm.git [lldb/StringPrinter] Support strings with invalid utf8 sub-sequences Support printing strings which contain invalid utf8 sub-sequences, e.g. strings like "hello world \xfe", instead of bailing out with "Summary Unavailable". I took the opportunity here to delete some hand-rolled utf8 -> utf32 conversion code and replace it with calls into llvm's Support library. rdar://61554346 --- diff --git a/lldb/source/DataFormatters/StringPrinter.cpp b/lldb/source/DataFormatters/StringPrinter.cpp index 7f7d6c1..139f1ec 100644 --- a/lldb/source/DataFormatters/StringPrinter.cpp +++ b/lldb/source/DataFormatters/StringPrinter.cpp @@ -15,6 +15,7 @@ #include "lldb/Target/Target.h" #include "lldb/Utility/Status.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/ConvertUTF.h" #include @@ -92,7 +93,7 @@ static bool isprint32(char32_t codepoint) { return true; } -DecodedCharBuffer attemptASCIIEscape(char32_t c, +DecodedCharBuffer attemptASCIIEscape(llvm::UTF32 c, StringPrinter::EscapeStyle escape_style) { const bool is_swift_escape_style = escape_style == StringPrinter::EscapeStyle::Swift; @@ -141,7 +142,10 @@ DecodedCharBuffer GetPrintableImpl( DecodedCharBuffer retval = attemptASCIIEscape(*buffer, escape_style); if (retval.GetSize()) return retval; - if (isprint(*buffer)) + + // Use llvm's locale-independent isPrint(char), instead of the libc + // implementation which may give different results on different platforms. + if (llvm::isPrint(*buffer)) return {buffer, 1}; unsigned escaped_len; @@ -161,60 +165,30 @@ DecodedCharBuffer GetPrintableImpl( return {data, escaped_len}; } -static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1) { - return (c0 - 192) * 64 + (c1 - 128); -} -static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1, - unsigned char c2) { - return (c0 - 224) * 4096 + (c1 - 128) * 64 + (c2 - 128); -} -static char32_t ConvertUTF8ToCodePoint(unsigned char c0, unsigned char c1, - unsigned char c2, unsigned char c3) { - return (c0 - 240) * 262144 + (c2 - 128) * 4096 + (c2 - 128) * 64 + (c3 - 128); -} - template <> DecodedCharBuffer GetPrintableImpl( uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, StringPrinter::EscapeStyle escape_style) { - const unsigned utf8_encoded_len = llvm::getNumBytesForUTF8(*buffer); - - // If the utf8 encoded length is invalid, or if there aren't enough bytes to - // print, this is some kind of corrupted string. - if (utf8_encoded_len == 0 || utf8_encoded_len > 4) - return nullptr; - if ((buffer_end - buffer) < utf8_encoded_len) - // There's no room in the buffer for the utf8 sequence. - return nullptr; - - char32_t codepoint = 0; - switch (utf8_encoded_len) { - case 1: - // this is just an ASCII byte - ask ASCII + // If the utf8 encoded length is invalid (i.e., not in the closed interval + // [1;4]), or if there aren't enough bytes to print, or if the subsequence + // isn't valid utf8, fall back to printing an ASCII-escaped subsequence. + if (!llvm::isLegalUTF8Sequence(buffer, buffer_end)) return GetPrintableImpl(buffer, buffer_end, next, escape_style); - case 2: - codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer, - (unsigned char)*(buffer + 1)); - break; - case 3: - codepoint = ConvertUTF8ToCodePoint((unsigned char)*buffer, - (unsigned char)*(buffer + 1), - (unsigned char)*(buffer + 2)); - break; - case 4: - codepoint = ConvertUTF8ToCodePoint( - (unsigned char)*buffer, (unsigned char)*(buffer + 1), - (unsigned char)*(buffer + 2), (unsigned char)*(buffer + 3)); - break; - } - // We couldn't figure out how to print this codepoint. - if (!codepoint) - return nullptr; + // Convert the valid utf8 sequence to a utf32 codepoint. This cannot fail. + llvm::UTF32 codepoint = 0; + const llvm::UTF8 *buffer_for_conversion = buffer; + llvm::ConversionResult result = llvm::convertUTF8Sequence( + &buffer_for_conversion, buffer_end, &codepoint, llvm::strictConversion); + assert(result == llvm::conversionOK && + "Failed to convert legal utf8 sequence"); + (void)result; // The UTF8 helper always advances by the utf8 encoded length. + const unsigned utf8_encoded_len = buffer_for_conversion - buffer; next = buffer + utf8_encoded_len; + DecodedCharBuffer retval = attemptASCIIEscape(codepoint, escape_style); if (retval.GetSize()) return retval; @@ -227,11 +201,11 @@ DecodedCharBuffer GetPrintableImpl( switch (escape_style) { case StringPrinter::EscapeStyle::CXX: // Prints 10 characters, then a \0 terminator. - escaped_len = sprintf((char *)data, "\\U%08x", (unsigned)codepoint); + escaped_len = sprintf((char *)data, "\\U%08x", codepoint); break; case StringPrinter::EscapeStyle::Swift: // Prints up to 12 characters, then a \0 terminator. - escaped_len = sprintf((char *)data, "\\u{%x}", (unsigned)codepoint); + escaped_len = sprintf((char *)data, "\\u{%x}", codepoint); break; } lldbassert(escaped_len > 0 && "unknown string escape style"); diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py index c6f95d7..863e7bc 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py @@ -120,7 +120,7 @@ class LibcxxStringDataFormatterTestCase(TestBase): is_alternate_layout = ('arm' in self.getArchitecture()) and self.platformIsDarwin() if is_64_bit and not is_alternate_layout: self.expect("frame variable garbage1", substrs=['garbage1 = Summary Unavailable']) - self.expect("frame variable garbage2", substrs=['garbage2 = Summary Unavailable']) - self.expect("frame variable garbage3", substrs=['garbage3 = Summary Unavailable']) + self.expect("frame variable garbage2", substrs=[r'garbage2 = "\xfa\xfa\xfa\xfa"']) + self.expect("frame variable garbage3", substrs=[r'garbage3 = "\xf0\xf0"']) self.expect("frame variable garbage4", substrs=['garbage4 = Summary Unavailable']) self.expect("frame variable garbage5", substrs=['garbage5 = Summary Unavailable']) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/main.cpp index 05af880..e9a3a4b 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/main.cpp @@ -18,7 +18,9 @@ static struct { // A corrupt libcxx string in long mode with a payload that contains a utf8 // sequence that's inherently too long. static unsigned char garbage_utf8_payload1[] = { - 250 // This means that we expect a 5-byte sequence, this is invalid. + 250, // This means that we expect a 5-byte sequence, this is invalid. LLDB + // should fall back to ASCII printing. + 250, 250, 250 }; static struct { uint64_t cap = 5; @@ -29,13 +31,14 @@ static struct { // A corrupt libcxx string in long mode with a payload that contains a utf8 // sequence that's too long to fit in the buffer. static unsigned char garbage_utf8_payload2[] = { - 240 // This means that we expect a 4-byte sequence, but the buffer is too - // small for this. + 240, // This means that we expect a 4-byte sequence, but the buffer is too + // small for this. LLDB should fall back to ASCII printing. + 240 }; static struct { uint64_t cap = 3; uint64_t size = 2; - unsigned char *data = &garbage_utf8_payload1[0]; + unsigned char *data = &garbage_utf8_payload2[0]; } garbage_string_long_mode2; // A corrupt libcxx string which has an invalid size (i.e. a size greater than diff --git a/lldb/unittests/DataFormatter/StringPrinterTests.cpp b/lldb/unittests/DataFormatter/StringPrinterTests.cpp index 180b137..84a9372 100644 --- a/lldb/unittests/DataFormatter/StringPrinterTests.cpp +++ b/lldb/unittests/DataFormatter/StringPrinterTests.cpp @@ -77,11 +77,8 @@ TEST(StringPrinterTests, CxxASCII) { EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C")); EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348")); - // FIXME: These strings are all rejected, but shouldn't be AFAICT. LLDB finds - // that these are not valid utf8 sequences, but that's OK, the raw values - // should still be printed out. - EXPECT_NE(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal. - EXPECT_NE(fmt("\xfe"), QUOTE(R"(\xfe)")); // \xfe is 254 in decimal. + EXPECT_EQ(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal. + EXPECT_EQ(fmt("\xfe"), QUOTE(R"(\xfe)")); // \xfe is 254 in decimal. } // Test UTF8 formatting for C++. @@ -114,11 +111,8 @@ TEST(StringPrinterTests, CxxUTF8) { EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C")); EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348")); - // FIXME: These strings are all rejected, but shouldn't be AFAICT. LLDB finds - // that these are not valid utf8 sequences, but that's OK, the raw values - // should still be printed out. - EXPECT_NE(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal. - EXPECT_NE(fmt("\xfe"), QUOTE(R"(\xfe)")); // \xfe is 254 in decimal. + EXPECT_EQ(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal. + EXPECT_EQ(fmt("\xfe"), QUOTE(R"(\xfe)")); // \xfe is 254 in decimal. } // Test UTF8 formatting for Swift. @@ -151,9 +145,6 @@ TEST(StringPrinterTests, SwiftUTF8) { EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C")); EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348")); - // FIXME: These strings are all rejected, but shouldn't be AFAICT. LLDB finds - // that these are not valid utf8 sequences, but that's OK, the raw values - // should still be printed out. - EXPECT_NE(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal. - EXPECT_NE(fmt("\xfe"), QUOTE(R"(\xfe)")); // \xfe is 254 in decimal. + EXPECT_EQ(fmt("\376"), QUOTE(R"(\u{fe})")); // \376 is 254 in decimal. + EXPECT_EQ(fmt("\xfe"), QUOTE(R"(\u{fe})")); // \xfe is 254 in decimal. }