[libc++][format] Improves Unicode decoders.

author Mark de Wever <koraq@xs4all.nl>

Thu, 9 Feb 2023 20:38:42 +0000 (21:38 +0100)

committer Mark de Wever <koraq@xs4all.nl>

Wed, 8 Mar 2023 21:01:49 +0000 (22:01 +0100)
author Mark de Wever <koraq@xs4all.nl>
Thu, 9 Feb 2023 20:38:42 +0000 (21:38 +0100)
committer Mark de Wever <koraq@xs4all.nl>
Wed, 8 Mar 2023 21:01:49 +0000 (22:01 +0100)
diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h

index 6c8cadb..2b3c5ae 100644 (file)
--- a/libcxx/include/__format/formatter_output.h
+++ b/libcxx/include/__format/formatter_output.h
@@ -12,6 +12,7 @@
  
  #include <__algorithm/ranges_copy.h>
  #include <__algorithm/ranges_fill_n.h>
+#include <__algorithm/ranges_for_each.h>
  #include <__algorithm/ranges_transform.h>
  #include <__chrono/statically_widen.h>
  #include <__concepts/same_as.h>
@@ -503,36 +504,17 @@ __escape(basic_string<_CharT>& __str, basic_string_view<_CharT> __values, __esca
    __unicode::__code_point_view<_CharT> __view{__values.begin(), __values.end()};
  
    while (!__view.__at_end()) {
-    auto __first                                        = __view.__position();
-    typename __unicode::__consume_p2286_result __result = __view.__consume_p2286();
-    if (__result.__ill_formed_size == 0) {
-      if (!__formatter::__is_escaped_sequence_written(__str, __result.__value, __mark))
+    auto __first                                  = __view.__position();
+    typename __unicode::__consume_result __result = __view.__consume();
+    if (__result.__status == __unicode::__consume_result::__ok) {
+      if (!__formatter::__is_escaped_sequence_written(__str, __result.__code_point, __mark))
          // 2.2.1.3 - Add the character
          ranges::copy(__first, __view.__position(), std::back_insert_iterator(__str));
-
      } else {
        // 2.2.3 sequence of ill-formed code units
-      // The number of code-units in __result.__value depends on the character type being used.
-      if constexpr (sizeof(_CharT) == 1) {
-        _LIBCPP_ASSERT(__result.__ill_formed_size == 1 || __result.__ill_formed_size == 4,
-                       "illegal number of invalid code units.");
-        if (__result.__ill_formed_size == 1) // ill-formed, one code unit
-          __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value & 0xff);
-        else { // out of valid range, four code units
-               // The code point was properly encoded, decode the value.
-          __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value >> 18 | 0xf0);
-          __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value >> 12 & 0x3f) | 0x80);
-          __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value >> 6 & 0x3f) | 0x80);
-          __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value & 0x3f) | 0x80);
-        }
-      } else if constexpr (sizeof(_CharT) == 2) {
-        _LIBCPP_ASSERT(__result.__ill_formed_size == 1, "for UTF-16 at most one invalid code unit");
-        __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value & 0xffff);
-      } else {
-        static_assert(sizeof(_CharT) == 4, "unsupported character width");
-        _LIBCPP_ASSERT(__result.__ill_formed_size == 1, "for UTF-32 one code unit is one code point");
-        __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value);
-      }
+      ranges::for_each(__first, __view.__position(), [&](_CharT __value) {
+        __formatter::__write_escape_ill_formed_code_unit(__str, __formatter::__to_char32(__value));
+      });
      }
    }
  }
diff --git a/libcxx/include/__format/unicode.h b/libcxx/include/__format/unicode.h

index 53b5320..12aed50 100644 (file)
--- a/libcxx/include/__format/unicode.h
+++ b/libcxx/include/__format/unicode.h
@@ -31,23 +31,28 @@ _LIBCPP_BEGIN_NAMESPACE_STD
  
  namespace __unicode {
  
-#  if _LIBCPP_STD_VER >= 23
-
-/// The result of consuming a code point using P2286' semantics
-///
-/// TODO FMT Combine __consume and  __consume_p2286 in one function.
-struct __consume_p2286_result {
-  // A size of 0 means well formed. This to differenciate between
-  // a valid code point and a code unit that's invalid like 0b11111xxx.
-  int __ill_formed_size;
-
-  // If well formed the consumed code point.
-  // Otherwise the ill-formed code units as unsigned 8-bit values. They are
-  // stored in reverse order, to make it easier to extract the values.
-  char32_t __value;
+// Helper struct for the result of a consume operation.
+//
+// The status value for a correct code point is 0. This allows a valid value to
+// be used without masking.
+// When the decoding fails it knows the number of code units affected. For the
+// current use-cases that value is not needed, therefore it is not stored.
+// The escape routine needs the number of code units for both a valid and
+// invalid character and keeps track of it itself. Doing it in this result
+// unconditionally would give some overhead when the value is unneeded.
+struct __consume_result {
+  // When __status == __ok it contains the decoded code point.
+  // Else it contains the replacement character U+FFFD
+  char32_t __code_point : 31;
+
+  enum : char32_t {
+    // Consumed a well-formed code point.
+    __ok = 0,
+    // Encountered ill-formed input (invalid UTF-8, UTF-16, or UTF-32)
+    __error = 1
+  } __status : 1 {__ok};
  };
-
-#  endif // _LIBCPP_STD_VER >= 23
+static_assert(sizeof(__consume_result) == sizeof(char32_t));
  
  #  ifndef _LIBCPP_HAS_NO_UNICODE
  
@@ -66,6 +71,36 @@ struct __consume_p2286_result {
  
  inline constexpr char32_t __replacement_character = U'\ufffd';
  
+// The error of a consume operation.
+//
+// This sets the code point to the replacement character. This code point does
+// not participate in the grapheme clustering, so grapheme clustering code can
+// ignore the error status and always use the code point.
+inline constexpr __consume_result __consume_result_error{__replacement_character, __consume_result::__error};
+
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) {
+  return __value >= 0xd800 && __value <= 0xdbff;
+}
+
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) {
+  return __value >= 0xdc00 && __value <= 0xdfff;
+}
+
+// https://www.unicode.org/glossary/#surrogate_code_point
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) {
+  return __value >= 0xd800 && __value <= 0xdfff;
+}
+
+// https://www.unicode.org/glossary/#code_point
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) {
+  return __value <= 0x10ffff;
+}
+
+// https://www.unicode.org/glossary/#unicode_scalar_value
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) {
+  return __unicode::__is_code_point(__value) && !__unicode::__is_surrogate(__value);
+}
+
  template <contiguous_iterator _Iterator>
    requires same_as<iter_value_t<_Iterator>, char>
  _LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
@@ -97,122 +132,103 @@ public:
    _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
    _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
  
-  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+  // https://www.unicode.org/versions/latest/ch03.pdf#G7404
+  // Based on Table 3-7, Well-Formed UTF-8 Byte Sequences
+  //
+  // Code Points        First Byte Second Byte Third Byte Fourth Byte  Remarks
+  // U+0000..U+007F     00..7F                                         U+0000..U+007F 1 code unit range
+  //                    C0..C1     80..BF                              invalid overlong encoding
+  // U+0080..U+07FF     C2..DF     80..BF                              U+0080..U+07FF 2 code unit range
+  //                    E0         80..9F      80..BF                  invalid overlong encoding
+  // U+0800..U+0FFF     E0         A0..BF      80..BF                  U+0800..U+FFFF 3 code unit range
+  // U+1000..U+CFFF     E1..EC     80..BF      80..BF
+  // U+D000..U+D7FF     ED         80..9F      80..BF
+  // U+D800..U+DFFF     ED         A0..BF      80..BF                  invalid encoding of surrogate code point
+  // U+E000..U+FFFF     EE..EF     80..BF      80..BF
+  //                    F0         80..8F      80..BF     80..BF       invalid overlong encoding
+  // U+10000..U+3FFFF   F0         90..BF      80..BF     80..BF       U+10000..U+10FFFF 4 code unit range
+  // U+40000..U+FFFFF   F1..F3     80..BF      80..BF     80..BF
+  // U+100000..U+10FFFF F4         80..8F      80..BF     80..BF
+  //                    F4         90..BF      80..BF     80..BF       U+110000.. invalid code point range
+  //
+  // Unlike other parsers, these invalid entries are tested after decoding.
+  // - The parser always needs to consume these code units
+  // - The code is optimized for well-formed UTF-8
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
      _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
  
      // Based on the number of leading 1 bits the number of code units in the
      // code point can be determined. See
      // https://en.wikipedia.org/wiki/UTF-8#Encoding
-    switch (_VSTD::countl_one(static_cast<unsigned char>(*__first_))) {
+    switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
      case 0:
-      return *__first_++;
+      return {static_cast<unsigned char>(*__first_++)};
  
-    case 2:
+    case 2: {
        if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
          break;
-      else {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return __value;
-      }
  
-    case 3:
-      if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
-        break;
-      else {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return __value;
-      }
+      char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  
-    case 4:
-      if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
-        break;
-      else {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return __value;
-      }
-    }
-    // An invalid number of leading ones can be garbage or a code unit in the
-    // middle of a code point. By consuming one code unit the parser may get
-    // "in sync" after a few code units.
-    ++__first_;
-    return __replacement_character;
-  }
-
-#    if _LIBCPP_STD_VER >= 23
-  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
-    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
+      // These values should be encoded in 1 UTF-8 code unit.
+      if (__value < 0x0080) [[unlikely]]
+        return __consume_result_error;
  
-    // Based on the number of leading 1 bits the number of code units in the
-    // code point can be determined. See
-    // https://en.wikipedia.org/wiki/UTF-8#Encoding
-    switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
-    case 0:
-      return {0, static_cast<unsigned char>(*__first_++)};
+      return {__value};
+    }
  
-    case 2:
-      if (__last_ - __first_ < 2) [[unlikely]]
+    case 3: {
+      if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
          break;
  
-      if (__unicode::__is_continuation(__first_ + 1, 1)) {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return {0, __value};
-      }
-      break;
+      char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  
-    case 3:
-      if (__last_ - __first_ < 3) [[unlikely]]
-        break;
+      // These values should be encoded in 1 or 2 UTF-8 code units.
+      if (__value < 0x0800) [[unlikely]]
+        return __consume_result_error;
  
-      if (__unicode::__is_continuation(__first_ + 1, 2)) {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return {0, __value};
-      }
-      break;
+      // A surrogate value is always encoded in 3 UTF-8 code units.
+      if (__unicode::__is_surrogate(__value)) [[unlikely]]
+        return __consume_result_error;
+
+      return {__value};
+    }
  
-    case 4:
-      if (__last_ - __first_ < 4) [[unlikely]]
+    case 4: {
+      if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
          break;
  
-      if (__unicode::__is_continuation(__first_ + 1, 3)) {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
  
-        if (__value > 0x10FFFF) // Outside the valid Unicode range?
-          return {4, __value};
+      // These values should be encoded in 1, 2, or 3 UTF-8 code units.
+      if (__value < 0x10000) [[unlikely]]
+        return __consume_result_error;
  
-        return {0, __value};
-      }
-      break;
+      // A value too large is always encoded in 4 UTF-8 code units.
+      if (!__unicode::__is_code_point(__value)) [[unlikely]]
+        return __consume_result_error;
+
+      return {__value};
+    }
      }
      // An invalid number of leading ones can be garbage or a code unit in the
      // middle of a code point. By consuming one code unit the parser may get
      // "in sync" after a few code units.
-    return {1, static_cast<unsigned char>(*__first_++)};
+    ++__first_;
+    return __consume_result_error;
    }
-#    endif // _LIBCPP_STD_VER >= 23
  
  private:
    _Iterator __first_;
@@ -244,62 +260,33 @@ public:
    _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
    _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
  
-  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
      _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
  
+    char32_t __value = static_cast<char32_t>(*__first_++);
      if constexpr (sizeof(wchar_t) == 2) {
-      char32_t __result = *__first_++;
-      // Is the code unit part of a surrogate pair? See
-      // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
-      if (__result >= 0xd800 && __result <= 0xDfff) {
-        // Malformed Unicode.
-        if (__first_ == __last_) [[unlikely]]
-          return __replacement_character;
-
-        __result -= 0xd800;
-        __result <<= 10;
-        __result += *__first_++ - 0xdc00;
-        __result += 0x10000;
-      }
-      return __result;
+      if (__unicode::__is_low_surrogate(__value)) [[unlikely]]
+        return __consume_result_error;
  
-    } else if constexpr (sizeof(wchar_t) == 4) {
-      char32_t __result = *__first_++;
-      if (__result > 0x10FFFF) [[unlikely]]
-        return __replacement_character;
-      return __result;
-    } else {
-      __libcpp_unreachable();
-    }
-  }
+      if (__unicode::__is_high_surrogate(__value)) {
+        if (__first_ == __last_ || !__unicode::__is_low_surrogate(static_cast<char32_t>(*__first_))) [[unlikely]]
+          return __consume_result_error;
  
-#      if _LIBCPP_STD_VER >= 23
-  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
-    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
+        __value -= 0xd800;
+        __value <<= 10;
+        __value += static_cast<char32_t>(*__first_++) - 0xdc00;
+        __value += 0x10000;
  
-    char32_t __result = *__first_++;
-    if constexpr (sizeof(wchar_t) == 2) {
-      // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
-      if (__is_surrogate_pair_high(__result)) {
-        // Malformed Unicode.
-        if (__first_ == __last_ || !__is_surrogate_pair_low(*(__first_ + 1))) [[unlikely]]
-          return {1, __result};
-
-        __result -= 0xd800;
-        __result <<= 10;
-        __result += *__first_++ - 0xdc00;
-        __result += 0x10000;
-      } else if (__is_surrogate_pair_low(__result))
-        // A code point shouldn't start with the low surrogate pair
-        return {1, __result};
+        if (!__unicode::__is_code_point(__value)) [[unlikely]]
+          return __consume_result_error;
+      }
      } else {
-      if (__result > 0x10FFFF) [[unlikely]]
-        return {1, __result};
+      if (!__unicode::__is_scalar_value(__value)) [[unlikely]]
+        return __consume_result_error;
      }
  
-    return {0, __result};
+    return {__value};
    }
-#      endif // _LIBCPP_STD_VER >= 23
  
  private:
    _Iterator __first_;
@@ -399,7 +386,7 @@ class __extended_grapheme_cluster_view {
  public:
    _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
        : __code_point_view_(__first, __last),
-        __next_code_point_(__code_point_view_.__consume()),
+        __next_code_point_(__code_point_view_.__consume().__code_point),
          __next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
  
    struct __cluster {
@@ -420,6 +407,7 @@ public:
      _LIBCPP_ASSERT(
          __next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
          "can't move beyond the end of input");
+
      char32_t __code_point = __next_code_point_;
      if (!__code_point_view_.__at_end())
        return {__code_point, __get_break()};
@@ -444,7 +432,7 @@ private:
          __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
          return __result;
        }
-      __next_code_point_ = __code_point_view_.__consume();
+      __next_code_point_ = __code_point_view_.__consume().__code_point;
        __next_prop_       = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
  
        __has_extened_pictographic |=
@@ -474,18 +462,10 @@ public:
    _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
    _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
  
-  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
      _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
-    return *__first_++;
-  }
-
-#    if _LIBCPP_STD_VER >= 23
-  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
-    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
-
-    return {0, std::make_unsigned_t<_CharT>(*__first_++)};
+    return {static_cast<char32_t>(*__first_++)};
    }
-#    endif // _LIBCPP_STD_VER >= 23
  
  private:
    _Iterator __first_;
diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp

index 90bd305..9caa3a2 100644 (file)
--- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
@@ -202,14 +202,9 @@ void test_char() {
      test_format(V{L"'\\u{600}'"}, L"{:?}", L'\x600');   // ARABIC NUMBER SIGN
      test_format(V{L"'\\u{feff}'"}, L"{:?}", L'\xfeff'); // ZERO WIDTH NO-BREAK SPACE
  
-    if constexpr (sizeof(CharT) == 2) {
-      // Incomplete surrogate pair in UTF-16
-      test_format(V{L"'\\x{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
-      test_format(V{L"'\\x{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
-    } else {
-      test_format(V{L"'\\u{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
-      test_format(V{L"'\\u{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
-    }
+    // Incomplete surrogate pair in UTF-16
+    test_format(V{L"'\\x{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
+    test_format(V{L"'\\x{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
  
      // Private_Use
      test_format(V{L"'\\u{e000}'"}, L"{:?}", L'\xe000'); // <private-use-E000>
@@ -277,6 +272,48 @@ void test_string() {
      // Ill-formend UTF-8
      test_format(SV(R"(["\x{c3}"])"), SV("[{:?}]"), "\xc3");
      test_format(SV(R"(["\x{c3}("])"), SV("[{:?}]"), "\xc3\x28");
+
+    /* U+0000..U+007F 1 code unit range, encoded in 2 code units. */
+    test_format(SV(R"(["\x{c0}\x{80}"])"), SV("[{:?}]"), "\xc0\x80"); // U+0000
+    test_format(SV(R"(["\x{c1}\x{bf}"])"), SV("[{:?}]"), "\xc1\xbf"); // U+007F
+    test_format(SV(R"(["\u{80}"])"), SV("[{:?}]"), "\xc2\x80");       // U+0080 first valid (General_Category=Control)
+
+    /* U+0000..U+07FF 1 and 2 code unit range, encoded in 3 code units. */
+    test_format(SV(R"(["\x{e0}\x{80}\x{80}"])"), SV("[{:?}]"), "\xe0\x80\x80"); // U+0000
+    test_format(SV(R"(["\x{e0}\x{81}\x{bf}"])"), SV("[{:?}]"), "\xe0\x81\xbf"); // U+007F
+    test_format(SV(R"(["\x{e0}\x{82}\x{80}"])"), SV("[{:?}]"), "\xe0\x82\x80"); // U+0080
+    test_format(SV(R"(["\x{e0}\x{9f}\x{bf}"])"), SV("[{:?}]"), "\xe0\x9f\xbf"); // U+07FF
+    test_format(SV("[\"\u0800\"]"), SV("[{:?}]"), "\xe0\xa0\x80");              // U+0800 first valid
+
+#if 0
+       // This code point is in the Hangul Jamo Extended-B block and at the time of writing
+       // it's unassigned. When it becomes defined, this branch might become true.
+    test_format(SV("[\"\ud7ff\"]"), SV("[{:?}]"), "\xed\x9f\xbf");              // U+D7FF last valid
+#else
+    /* U+D800..U+DFFF surrogate range */
+    test_format(SV(R"(["\u{d7ff}"])"), SV("[{:?}]"), "\xed\x9f\xbf");           // U+D7FF last valid
+#endif
+    test_format(SV(R"(["\x{ed}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xed\xa0\x80"); // U+D800
+    test_format(SV(R"(["\x{ed}\x{af}\x{bf}"])"), SV("[{:?}]"), "\xed\xaf\xbf"); // U+DBFF
+    test_format(SV(R"(["\x{ed}\x{bf}\x{80}"])"), SV("[{:?}]"), "\xed\xbf\x80"); // U+DC00
+    test_format(SV(R"(["\x{ed}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xed\xbf\xbf"); // U+DFFF
+    test_format(SV(R"(["\u{e000}"])"), SV("[{:?}]"), "\xee\x80\x80");           // U+E000 first valid
+                                                                                // (in the Private Use Area block)
+
+    /* U+0000..U+FFFF 1, 2, and 3 code unit range */
+    test_format(SV(R"(["\x{f0}\x{80}\x{80}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\x80\x80"); // U+0000
+    test_format(SV(R"(["\x{f0}\x{80}\x{81}\x{bf}"])"), SV("[{:?}]"), "\xf0\x80\x81\xbf"); // U+007F
+    test_format(SV(R"(["\x{f0}\x{80}\x{82}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\x82\x80"); // U+0080
+    test_format(SV(R"(["\x{f0}\x{80}\x{9f}\x{bf}"])"), SV("[{:?}]"), "\xf0\x80\x9f\xbf"); // U+07FF
+    test_format(SV(R"(["\x{f0}\x{80}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\xa0\x80"); // U+0800
+    test_format(SV(R"(["\x{f0}\x{8f}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xf0\x8f\xbf\xbf"); // U+FFFF
+    test_format(SV("[\"\U00010000\"]"), SV("[{:?}]"), "\xf0\x90\x80\x80");                // U+10000 first valid
+
+    /* U+110000..U+1FFFFF invalid range */
+    test_format(SV(R"(["\u{10ffff}"])"), SV("[{:?}]"), "\xf4\x8f\xbf\xbf"); // U+10FFFF last valid
+                                                                            // (in Supplementary Private Use Area-B)
+    test_format(SV(R"(["\x{f4}\x{90}\x{80}\x{80}"])"), SV("[{:?}]"), "\xf4\x90\x80\x80"); // U+110000
+    test_format(SV(R"(["\x{f4}\x{bf}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xf4\xbf\xbf\xbf"); // U+13FFFF
    } else {
      // Valid UTF-16 and UTF-32
      test_format(SV("[\"\u00c3\"]"), SV("[{:?}]"), L"\xc3"); // LATIN CAPITAL LETTER A WITH TILDE
@@ -320,11 +357,8 @@ void test_string() {
      // Format
      test_format(V{LR"("\u{ad}\u{600}\u{feff}")"}, L"{:?}", L"\xad\x600\xfeff");
  
-    if constexpr (sizeof(CharT) == 2)
-      // Incomplete surrogate pair in UTF-16
-      test_format(V{LR"("\x{d800}")"}, L"{:?}", L"\xd800");
-    else
-      test_format(V{LR"("\u{d800}")"}, L"{:?}", L"\xd800");
+    // Incomplete surrogate pair in UTF-16
+    test_format(V{LR"("\x{d800}")"}, L"{:?}", L"\xd800");
  
      // Private_Use
      test_format(V{LR"("\u{e000}\u{f8ff}")"}, L"{:?}", L"\xe000\xf8ff");
author	Mark de Wever <koraq@xs4all.nl>
	Thu, 9 Feb 2023 20:38:42 +0000 (21:38 +0100)
committer	Mark de Wever <koraq@xs4all.nl>
	Wed, 8 Mar 2023 21:01:49 +0000 (22:01 +0100)
libcxx/include/__format/formatter_output.h		patch \| blob \| history
libcxx/include/__format/unicode.h		patch \| blob \| history
libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp		patch \| blob \| history