From bcd682e1faed71fd861518ca43235706fc39a7cd Mon Sep 17 00:00:00 2001 From: Jonathan Wakely Date: Thu, 16 Mar 2017 15:27:51 +0000 Subject: [PATCH] PR libstdc++/79980 fix BOM detection, maxcode checks, UCS2 handling PR libstdc++/79980 * include/bits/locale_conv.h (__do_str_codecvt): Set __count on error path. * src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads for manipulating codecvt_mode values. (read_utf16_bom): Compare input to BOM constants instead of integral constants that depend on endianness. Take mode parameter by reference and adjust it, to distinguish between no BOM present and UTF-16BE BOM present. (ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom. (surrogates): New enumeration type. (utf16_in, utf16_out): Add surrogates parameter to choose between UTF-16 and UCS2 behaviour. (utf16_span, ucs2_span): Use std::min not std::max. (ucs2_out): Use std::min not std::max. Disallow surrogate pairs. (ucs2_in): Likewise. Adjust calls to read_utf16_bom. * testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test. * testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test. From-SVN: r246200 --- libstdc++-v3/ChangeLog | 19 ++++ libstdc++-v3/include/bits/locale_conv.h | 5 +- libstdc++-v3/src/c++11/codecvt.cc | 94 +++++++++++------ .../22_locale/codecvt/codecvt_utf16/79980.cc | 115 +++++++++++++++++++++ .../22_locale/codecvt/codecvt_utf8/79980.cc | 94 +++++++++++++++++ 5 files changed, 296 insertions(+), 31 deletions(-) create mode 100644 libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc create mode 100644 libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 98735ca..83f74ef 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,5 +1,24 @@ 2017-03-16 Jonathan Wakely + PR libstdc++/79980 + * include/bits/locale_conv.h (__do_str_codecvt): Set __count on + error path. + * src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads + for manipulating codecvt_mode values. + (read_utf16_bom): Compare input to BOM constants instead of integral + constants that depend on endianness. Take mode parameter by + reference and adjust it, to distinguish between no BOM present and + UTF-16BE BOM present. + (ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom. + (surrogates): New enumeration type. + (utf16_in, utf16_out): Add surrogates parameter to choose between + UTF-16 and UCS2 behaviour. + (utf16_span, ucs2_span): Use std::min not std::max. + (ucs2_out): Use std::min not std::max. Disallow surrogate pairs. + (ucs2_in): Likewise. Adjust calls to read_utf16_bom. + * testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test. + * testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test. + PR libstdc++/79511 * src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff as a surrogate pair. diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h index cd8f146..9b952d4 100644 --- a/libstdc++-v3/include/bits/locale_conv.h +++ b/libstdc++-v3/include/bits/locale_conv.h @@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION && (__outstr.size() - __outchars) < __maxlen); if (__result == codecvt_base::error) - return false; + { + __count = __next - __first; + return false; + } if (__result == codecvt_base::noconv) { diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc index 9b63e2b..a50804c 100644 --- a/libstdc++-v3/src/c++11/codecvt.cc +++ b/libstdc++-v3/src/c++11/codecvt.cc @@ -24,13 +24,27 @@ #include #include // std::memcpy, std::memcmp -#include // std::max +#include // std::min #ifdef _GLIBCXX_USE_C99_STDINT_TR1 namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION + // The standard doesn't define these operators, which is annoying. + static underlying_type::type + to_integer(codecvt_mode m) + { return static_cast(m); } + + static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n) + { return m = codecvt_mode(to_integer(m) & to_integer(n)); } + + static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n) + { return m = codecvt_mode(to_integer(m) | to_integer(n)); } + + static codecvt_mode operator~(codecvt_mode m) + { return codecvt_mode(~to_integer(m)); } + namespace { // Largest code point that fits in a single UTF-16 code unit. @@ -117,22 +131,26 @@ namespace read_bom(from, utf8_bom); } - // If consume_header is set in mode update from.next to after any BOM. - // Return little_endian iff the UTF-16LE BOM was present. - codecvt_mode - read_utf16_bom(range& from, codecvt_mode mode) + // If consume_header is not set in mode, no effects. + // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then: + // - if the UTF-16BE BOM was found unset little_endian in mode, or + // - if the UTF-16LE BOM was found set little_endian in mode. + void + read_utf16_bom(range& from, codecvt_mode& mode) { if (mode & consume_header && from.size()) { - if (*from.next == 0xFEFF) - ++from.next; - else if (*from.next == 0xFFFE) + if (!memcmp(from.next, utf16_bom, 2)) + { + ++from.next; + mode &= ~little_endian; + } + else if (!memcmp(from.next, utf16le_bom, 2)) { ++from.next; - return little_endian; + mode |= little_endian; } } - return {}; } // Read a codepoint from a UTF-8 multibyte sequence. @@ -380,8 +398,7 @@ namespace ucs4_in(range& from, range& to, unsigned long maxcode = max_code_point, codecvt_mode mode = {}) { - if (read_utf16_bom(from, mode) == little_endian) - mode = codecvt_mode(mode & little_endian); + read_utf16_bom(from, mode); while (from.size() && to.size()) { const char32_t codepoint = read_utf16_code_point(from, maxcode, mode); @@ -413,11 +430,15 @@ namespace return codecvt_base::ok; } - // utf8 -> utf16 + // Flag indicating whether to process UTF-16 or UCS2 + enum class surrogates { allowed, disallowed }; + + // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed) template codecvt_base::result utf16_in(range& from, range& to, - unsigned long maxcode = max_code_point, codecvt_mode mode = {}) + unsigned long maxcode = max_code_point, codecvt_mode mode = {}, + surrogates s = surrogates::allowed) { read_utf8_bom(from, mode); while (from.size() && to.size()) @@ -425,7 +446,12 @@ namespace const char* const first = from.next; const char32_t codepoint = read_utf8_code_point(from, maxcode); if (codepoint == incomplete_mb_character) - return codecvt_base::partial; + { + if (s == surrogates::allowed) + return codecvt_base::partial; + else + return codecvt_base::error; // No surrogates in UCS2 + } if (codepoint > maxcode) return codecvt_base::error; if (!write_utf16_code_point(to, codepoint, mode)) @@ -437,11 +463,12 @@ namespace return codecvt_base::ok; } - // utf16 -> utf8 + // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed) template codecvt_base::result utf16_out(range& from, range& to, - unsigned long maxcode = max_code_point, codecvt_mode mode = {}) + unsigned long maxcode = max_code_point, codecvt_mode mode = {}, + surrogates s = surrogates::allowed) { if (!write_utf8_bom(to, mode)) return codecvt_base::partial; @@ -451,6 +478,9 @@ namespace int inc = 1; if (is_high_surrogate(c)) { + if (s == surrogates::disallowed) + return codecvt_base::error; // No surrogates in UCS-2 + if (from.size() < 2) return codecvt_base::ok; // stop converting at this point @@ -492,7 +522,7 @@ namespace ++count; } if (count+1 == max) // take one more character if it fits in a single unit - read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode)); + read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode)); return from.next; } @@ -501,7 +531,9 @@ namespace ucs2_in(range& from, range& to, char32_t maxcode = max_code_point, codecvt_mode mode = {}) { - return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode); + // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: + maxcode = std::min(max_single_utf16_unit, maxcode); + return utf16_in(from, to, maxcode, mode, surrogates::disallowed); } // ucs2 -> utf8 @@ -509,7 +541,9 @@ namespace ucs2_out(range& from, range& to, char32_t maxcode = max_code_point, codecvt_mode mode = {}) { - return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode); + // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: + maxcode = std::min(max_single_utf16_unit, maxcode); + return utf16_out(from, to, maxcode, mode, surrogates::disallowed); } // ucs2 -> utf16 @@ -537,14 +571,14 @@ namespace ucs2_in(range& from, range& to, char32_t maxcode = max_code_point, codecvt_mode mode = {}) { - if (read_utf16_bom(from, mode) == little_endian) - mode = codecvt_mode(mode & little_endian); - maxcode = std::max(max_single_utf16_unit, maxcode); + read_utf16_bom(from, mode); + // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: + maxcode = std::min(max_single_utf16_unit, maxcode); while (from.size() && to.size()) { const char32_t c = read_utf16_code_point(from, maxcode, mode); if (c == incomplete_mb_character) - return codecvt_base::partial; + return codecvt_base::error; // UCS-2 only supports single units. if (c > maxcode) return codecvt_base::error; *to.next++ = c; @@ -557,9 +591,9 @@ namespace char32_t maxcode, codecvt_mode mode) { range from{ begin, end }; - if (read_utf16_bom(from, mode) == little_endian) - mode = codecvt_mode(mode & little_endian); - maxcode = std::max(max_single_utf16_unit, maxcode); + read_utf16_bom(from, mode); + // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: + maxcode = std::min(max_single_utf16_unit, maxcode); char32_t c = 0; while (max-- && c <= maxcode) c = read_utf16_code_point(from, maxcode, mode); @@ -572,7 +606,8 @@ namespace { range from{ begin, end }; read_utf8_bom(from, mode); - maxcode = std::max(max_single_utf16_unit, maxcode); + // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit: + maxcode = std::min(max_single_utf16_unit, maxcode); char32_t c = 0; while (max-- && c <= maxcode) c = read_utf8_code_point(from, maxcode); @@ -598,8 +633,7 @@ namespace char32_t maxcode = max_code_point, codecvt_mode mode = {}) { range from{ begin, end }; - if (read_utf16_bom(from, mode) == little_endian) - mode = codecvt_mode(mode & little_endian); + read_utf16_bom(from, mode); char32_t c = 0; while (max-- && c <= maxcode) c = read_utf16_code_point(from, maxcode, mode); diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc new file mode 100644 index 0000000..9383818 --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc @@ -0,0 +1,115 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +// { dg-do run { target c++11 } } + +#include +#include +#include + +// PR libstdc++/79980 + +constexpr std::codecvt_mode mode(std::codecvt_mode m) +{ return static_cast(m | std::consume_header); } + +template + using Conv + = std::wstring_convert, WCh>; + +void +test01() +{ + const char src[] = "\xFE\xFF\xAB\xCD"; + Conv conv; + auto dst = conv.from_bytes(src, src+4); + VERIFY( dst[0] == 0xabcd ); +} + +void +test02() +{ + const char src[] = "\xFF\xFE\xAB\xCD"; + Conv conv; + auto dst = conv.from_bytes(src, src+4); + VERIFY( dst[0] == 0xcdab ); +} + +void +test03() +{ + const char src[] = "\xFE\xFF\xAB\xCD"; + Conv conv; + auto dst = conv.from_bytes(src, src+4); + VERIFY( dst[0] == 0xabcd ); +} + +void +test04() +{ + const char src[] = "\xFF\xFE\xAB\xCD"; + Conv conv; + auto dst = conv.from_bytes(src, src+4); + VERIFY( dst[0] == 0xcdab ); +} + +void +test05() +{ + const char src[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF + Conv conv("to_bytes failed", u"from_bytes failed"); + std::u16string result = conv.from_bytes(src, src+4); + VERIFY( result == u"from_bytes failed" ); + VERIFY( conv.converted() == 2 ); +} + +void +test06() +{ + const char src[] = "\0\x61\xAB\xCD"; + Conv conv("to_bytes failed", u"from_bytes failed"); + std::u16string result = conv.from_bytes(src, src+3); // incomplete character + VERIFY( result == u"from_bytes failed" ); + VERIFY( conv.converted() == 2 ); +} + +void +test07() +{ + Conv conv("to_bytes failed", u"from_bytes failed"); + // ucs2 to utf-16 conversion should fail on invalid ucs2 input: + std::u16string utf16 = u"1234\U00001111\U0001ffff"; + auto out = conv.to_bytes(utf16); + VERIFY( out == "to_bytes failed" ); + VERIFY( conv.converted() == 5 ); + + // And should also fail on incomplete surrogate pair (not return partial): + out = conv.to_bytes(utf16.substr(0, utf16.size()-1)); + VERIFY( out == "to_bytes failed" ); + VERIFY( conv.converted() == 5 ); +} + +int main() +{ + test01(); + test02(); + test03(); + test04(); + test05(); + test06(); + test07(); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc new file mode 100644 index 0000000..1251acb --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc @@ -0,0 +1,94 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +// { dg-do run { target c++11 } } + +#include +#include +#include +#include + +using std::wstring_convert; +using std::codecvt_utf8; + +void +test01() +{ + std::string src = u8"1234\U00001111\U0001ffff"; + wstring_convert, char16_t> c("bad", u"BAD"); + + // utf-8 to ucs2 conversion should fail on character outside BMP + auto ucs2 = c.from_bytes(src); + VERIFY( ucs2 == u"BAD" ); + VERIFY( c.converted() == 7 ); + + // ucs2 to utf-8 conversion should fail on invalid ucs2 input: + std::u16string utf16 = u"1234\U00001111\U0001ffff"; + auto out = c.to_bytes(utf16); + VERIFY( out == "bad" ); + VERIFY( c.converted() == 5 ); + + // And should also fail on incomplete surrogate pair (not return partial): + out = c.to_bytes(utf16.substr(0, utf16.size()-1)); + VERIFY( out == "bad" ); + VERIFY( c.converted() == 5 ); +} + +void +test02() +{ + std::string src = u8"1234\U00001111\U0001ffff"; + wstring_convert, char16_t> c("bad", u"BAD"); + + // utf-8 to ucs2 conversion should fail on character above Maxcode=0x1000 + auto ucs2 = c.from_bytes(src); + VERIFY( ucs2 == u"BAD" ); + VERIFY( c.converted() == 4 ); +} + +void +test03() +{ + std::string src = u8"1234\U00001111\U0001ffff"; + wstring_convert, char32_t> c("bad", U"BAD"); + + // utf-8 to ucs4 conversion should fail on character above Maxcode=0x10000 + auto ucs4 = c.from_bytes(src); + VERIFY( ucs4 == U"BAD" ); + VERIFY( c.converted() == 7 ); +} + +void +test04() +{ + std::string src = u8"1234\U00001111\U0001ffff"; + wstring_convert, char32_t> c("bad", U"BAD"); + + // utf-8 to ucs4 conversion should fail on character above Maxcode=0x1000 + auto ucs4 = c.from_bytes(src); + VERIFY( ucs4 == U"BAD" ); + VERIFY( c.converted() == 4 ); +} + +int +main() +{ + test01(); + test02(); + test03(); + test04(); +} -- 2.7.4