PR libstdc++/79980 fix BOM detection, maxcode checks, UCS2 handling

author Jonathan Wakely <jwakely@redhat.com>

Thu, 16 Mar 2017 15:27:51 +0000 (15:27 +0000)

committer Jonathan Wakely <redi@gcc.gnu.org>

Thu, 16 Mar 2017 15:27:51 +0000 (15:27 +0000)
author Jonathan Wakely <jwakely@redhat.com>
Thu, 16 Mar 2017 15:27:51 +0000 (15:27 +0000)
committer Jonathan Wakely <redi@gcc.gnu.org>
Thu, 16 Mar 2017 15:27:51 +0000 (15:27 +0000)
diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog

index 98735ca..83f74ef 100644 (file)
--- a/libstdc++-v3/ChangeLog
+++ b/libstdc++-v3/ChangeLog
@@ -1,5 +1,24 @@
  2017-03-16  Jonathan Wakely  <jwakely@redhat.com>
  
+       PR libstdc++/79980
+       * include/bits/locale_conv.h (__do_str_codecvt): Set __count on
+       error path.
+       * src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads
+       for manipulating codecvt_mode values.
+       (read_utf16_bom): Compare input to BOM constants instead of integral
+       constants that depend on endianness.  Take mode parameter by
+       reference and adjust it, to distinguish between no BOM present and
+       UTF-16BE BOM present.
+       (ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
+       (surrogates): New enumeration type.
+       (utf16_in, utf16_out): Add surrogates parameter to choose between
+       UTF-16 and UCS2 behaviour.
+       (utf16_span, ucs2_span): Use std::min not std::max.
+       (ucs2_out): Use std::min not std::max.  Disallow surrogate pairs.
+       (ucs2_in): Likewise. Adjust calls to read_utf16_bom.
+       * testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
+       * testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.
+
         PR libstdc++/79511
         * src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff
         as a surrogate pair.
diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h

index cd8f146..9b952d4 100644 (file)
--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
              && (__outstr.size() - __outchars) < __maxlen);
  
        if (__result == codecvt_base::error)
-       return false;
+       {
+         __count = __next - __first;
+         return false;
+       }
  
        if (__result == codecvt_base::noconv)
         {
diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc

index 9b63e2b..a50804c 100644 (file)
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -24,13 +24,27 @@
  
  #include <codecvt>
  #include <cstring>             // std::memcpy, std::memcmp
-#include <bits/stl_algobase.h> // std::max
+#include <bits/stl_algobase.h> // std::min
  
  #ifdef _GLIBCXX_USE_C99_STDINT_TR1
  namespace std _GLIBCXX_VISIBILITY(default)
  {
  _GLIBCXX_BEGIN_NAMESPACE_VERSION
  
+  // The standard doesn't define these operators, which is annoying.
+  static underlying_type<codecvt_mode>::type
+  to_integer(codecvt_mode m) // value of m as the enum's underlying integer
+  { return static_cast<underlying_type<codecvt_mode>::type>(m); }
+
+  static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
+  { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
+
+  static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
+  { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
+
+  static codecvt_mode operator~(codecvt_mode m)
+  { return codecvt_mode(~to_integer(m)); }
+
  namespace
  {
    // Largest code point that fits in a single UTF-16 code unit.
@@ -117,22 +131,26 @@ namespace
        read_bom(from, utf8_bom);
    }
  
-  // If consume_header is set in mode update from.next to after any BOM.
-  // Return little_endian iff the UTF-16LE BOM was present.
-  codecvt_mode
-  read_utf16_bom(range<const char16_t>& from, codecvt_mode mode)
+  // If consume_header is not set in mode, no effects.
+  // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
+  // - if the UTF-16BE BOM was found unset little_endian in mode, or
+  // - if the UTF-16LE BOM was found set little_endian in mode.
+  void
+  read_utf16_bom(range<const char16_t>& from, codecvt_mode& mode)
    {
      if (mode & consume_header && from.size())
        {
-       if (*from.next == 0xFEFF)
-         ++from.next;
-       else if (*from.next == 0xFFFE)
+       if (!memcmp(from.next, utf16_bom, 2))
+         {
+           ++from.next;
+           mode &= ~little_endian;
+         }
+       else if (!memcmp(from.next, utf16le_bom, 2))
           {
             ++from.next;
-           return little_endian;
+           mode |= little_endian;
           }
        }
-    return {};
    }
  
    // Read a codepoint from a UTF-8 multibyte sequence.
@@ -380,8 +398,7 @@ namespace
    ucs4_in(range<const char16_t>& from, range<char32_t>& to,
            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
    {
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
+    read_utf16_bom(from, mode);
      while (from.size() && to.size())
        {
         const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
@@ -413,11 +430,15 @@ namespace
      return codecvt_base::ok;
    }
  
-  // utf8 -> utf16
+  // Flag indicating whether to process UTF-16 or UCS2
+  enum class surrogates { allowed, disallowed };
+
+  // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
    template<typename C>
    codecvt_base::result
    utf16_in(range<const char>& from, range<C>& to,
-           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+          unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+          surrogates s = surrogates::allowed)
    {
      read_utf8_bom(from, mode);
      while (from.size() && to.size())
@@ -425,7 +446,12 @@ namespace
         const char* const first = from.next;
         const char32_t codepoint = read_utf8_code_point(from, maxcode);
         if (codepoint == incomplete_mb_character)
-         return codecvt_base::partial;
+         {
+           if (s == surrogates::allowed)
+             return codecvt_base::partial;
+           else
+             return codecvt_base::error; // No surrogates in UCS2
+         }
         if (codepoint > maxcode)
           return codecvt_base::error;
         if (!write_utf16_code_point(to, codepoint, mode))
@@ -437,11 +463,12 @@ namespace
      return codecvt_base::ok;
    }
  
-  // utf16 -> utf8
+  // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
    template<typename C>
    codecvt_base::result
    utf16_out(range<const C>& from, range<char>& to,
-            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+           unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+           surrogates s = surrogates::allowed)
    {
      if (!write_utf8_bom(to, mode))
        return codecvt_base::partial;
@@ -451,6 +478,9 @@ namespace
         int inc = 1;
         if (is_high_surrogate(c))
           {
+           if (s == surrogates::disallowed)
+             return codecvt_base::error; // No surrogates in UCS-2
+
             if (from.size() < 2)
               return codecvt_base::ok; // stop converting at this point
  
@@ -492,7 +522,7 @@ namespace
         ++count;
        }
      if (count+1 == max) // take one more character if it fits in a single unit
-      read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode));
+      read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
      return from.next;
    }
  
@@ -501,7 +531,9 @@ namespace
    ucs2_in(range<const char>& from, range<char16_t>& to,
           char32_t maxcode = max_code_point, codecvt_mode mode = {})
    {
-    return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
+    return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
    }
  
    // ucs2 -> utf8
@@ -509,7 +541,9 @@ namespace
    ucs2_out(range<const char16_t>& from, range<char>& to,
            char32_t maxcode = max_code_point, codecvt_mode mode = {})
    {
-    return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
+    return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
    }
  
    // ucs2 -> utf16
@@ -537,14 +571,14 @@ namespace
    ucs2_in(range<const char16_t>& from, range<char16_t>& to,
           char32_t maxcode = max_code_point, codecvt_mode mode = {})
    {
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    read_utf16_bom(from, mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
      while (from.size() && to.size())
        {
         const char32_t c = read_utf16_code_point(from, maxcode, mode);
         if (c == incomplete_mb_character)
-         return codecvt_base::partial;
+         return codecvt_base::error; // UCS-2 only supports single units.
         if (c > maxcode)
           return codecvt_base::error;
         *to.next++ = c;
@@ -557,9 +591,9 @@ namespace
              char32_t maxcode, codecvt_mode mode)
    {
      range<const char16_t> from{ begin, end };
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    read_utf16_bom(from, mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
      char32_t c = 0;
      while (max-- && c <= maxcode)
        c = read_utf16_code_point(from, maxcode, mode);
@@ -572,7 +606,8 @@ namespace
    {
      range<const char> from{ begin, end };
      read_utf8_bom(from, mode);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
      char32_t c = 0;
      while (max-- && c <= maxcode)
        c = read_utf8_code_point(from, maxcode);
@@ -598,8 +633,7 @@ namespace
              char32_t maxcode = max_code_point, codecvt_mode mode = {})
    {
      range<const char16_t> from{ begin, end };
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
+    read_utf16_bom(from, mode);
      char32_t c = 0;
      while (max-- && c <= maxcode)
        c = read_utf16_code_point(from, maxcode, mode);
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc

new file mode 100644 (file)

index 0000000..9383818
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
@@ -0,0 +1,115 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+// PR libstdc++/79980
+
+constexpr std::codecvt_mode mode(std::codecvt_mode m)
+{ return static_cast<std::codecvt_mode>(m | std::consume_header); }
+
+template<typename WCh, unsigned long Max = 0x10FFFF,
+        std::codecvt_mode Mode = std::consume_header>
+  using Conv
+    = std::wstring_convert<std::codecvt_utf16<WCh, Max, mode(Mode)>, WCh>;
+
+void
+test01()
+{
+  const char src[] = "\xFE\xFF\xAB\xCD";
+  Conv<char16_t> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xabcd );
+}
+
+void
+test02()
+{
+  const char src[] = "\xFF\xFE\xAB\xCD";
+  Conv<char16_t> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xcdab );
+}
+
+void
+test03()
+{
+  const char src[] = "\xFE\xFF\xAB\xCD";
+  Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xabcd );
+}
+
+void
+test04()
+{
+  const char src[] = "\xFF\xFE\xAB\xCD";
+  Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xcdab );
+}
+
+void
+test05()
+{
+  const char src[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF
+  Conv<char16_t, 0xFF> conv("to_bytes failed", u"from_bytes failed");
+  std::u16string result = conv.from_bytes(src, src+4);
+  VERIFY( result == u"from_bytes failed" );
+  VERIFY( conv.converted() == 2 );
+}
+
+void
+test06()
+{
+  const char src[] = "\0\x61\xAB\xCD";
+  Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+  std::u16string result = conv.from_bytes(src, src+3); // incomplete character
+  VERIFY( result == u"from_bytes failed" );
+  VERIFY( conv.converted() == 2 );
+}
+
+void
+test07()
+{
+  Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+  // ucs2 to utf-16 conversion should fail on invalid ucs2 input:
+  std::u16string utf16 = u"1234\U00001111\U0001ffff";
+  auto out = conv.to_bytes(utf16);
+  VERIFY( out == "to_bytes failed" );
+  VERIFY( conv.converted() == 5 );
+
+  // And should also fail on incomplete surrogate pair (not return partial):
+  out = conv.to_bytes(utf16.substr(0, utf16.size()-1));
+  VERIFY( out == "to_bytes failed" );
+  VERIFY( conv.converted() == 5 );
+}
+
+int main()
+{
+  test01();
+  test02();
+  test03();
+  test04();
+  test05();
+  test06();
+  test07();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc

new file mode 100644 (file)

index 0000000..1251acb
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc
@@ -0,0 +1,94 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <locale>
+#include <string>
+#include <testsuite_hooks.h>
+
+using std::wstring_convert;
+using std::codecvt_utf8;
+
+void
+test01()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char16_t>, char16_t> c("bad", u"BAD");
+
+  // utf-8 to ucs2 conversion should fail on character outside BMP
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 7 );
+
+  // ucs2 to utf-8 conversion should fail on invalid ucs2 input:
+  std::u16string utf16 = u"1234\U00001111\U0001ffff";
+  auto out = c.to_bytes(utf16);
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );
+
+  // And should also fail on incomplete surrogate pair (not return partial):
+  out = c.to_bytes(utf16.substr(0, utf16.size()-1));
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );
+}
+
+void
+test02()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char16_t, 0x1000>, char16_t> c("bad", u"BAD");
+
+  // utf-8 to ucs2 conversion should fail on character above Maxcode=0x1000
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 4 );
+}
+
+void
+test03()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char32_t, 0x10000>, char32_t> c("bad", U"BAD");
+
+  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x10000
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 7 );
+}
+
+void
+test04()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char32_t, 0x1000>, char32_t> c("bad", U"BAD");
+
+  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x1000
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 4 );
+}
+
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+  test04();
+}
author	Jonathan Wakely <jwakely@redhat.com>
	Thu, 16 Mar 2017 15:27:51 +0000 (15:27 +0000)
committer	Jonathan Wakely <redi@gcc.gnu.org>
	Thu, 16 Mar 2017 15:27:51 +0000 (15:27 +0000)
libstdc++-v3/ChangeLog		patch \| blob \| history
libstdc++-v3/include/bits/locale_conv.h		patch \| blob \| history
libstdc++-v3/src/c++11/codecvt.cc		patch \| blob \| history
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc	[new file with mode: 0644]	patch \| blob
libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc	[new file with mode: 0644]	patch \| blob