#include <__algorithm/ranges_copy.h>
#include <__algorithm/ranges_fill_n.h>
+#include <__algorithm/ranges_for_each.h>
#include <__algorithm/ranges_transform.h>
#include <__chrono/statically_widen.h>
#include <__concepts/same_as.h>
__unicode::__code_point_view<_CharT> __view{__values.begin(), __values.end()};
while (!__view.__at_end()) {
- auto __first = __view.__position();
- typename __unicode::__consume_p2286_result __result = __view.__consume_p2286();
- if (__result.__ill_formed_size == 0) {
- if (!__formatter::__is_escaped_sequence_written(__str, __result.__value, __mark))
+ auto __first = __view.__position();
+ typename __unicode::__consume_result __result = __view.__consume();
+ if (__result.__status == __unicode::__consume_result::__ok) {
+ if (!__formatter::__is_escaped_sequence_written(__str, __result.__code_point, __mark))
// 2.2.1.3 - Add the character
ranges::copy(__first, __view.__position(), std::back_insert_iterator(__str));
-
} else {
// 2.2.3 sequence of ill-formed code units
- // The number of code-units in __result.__value depends on the character type being used.
- if constexpr (sizeof(_CharT) == 1) {
- _LIBCPP_ASSERT(__result.__ill_formed_size == 1 || __result.__ill_formed_size == 4,
- "illegal number of invalid code units.");
- if (__result.__ill_formed_size == 1) // ill-formed, one code unit
- __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value & 0xff);
- else { // out of valid range, four code units
- // The code point was properly encoded, decode the value.
- __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value >> 18 | 0xf0);
- __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value >> 12 & 0x3f) | 0x80);
- __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value >> 6 & 0x3f) | 0x80);
- __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value & 0x3f) | 0x80);
- }
- } else if constexpr (sizeof(_CharT) == 2) {
- _LIBCPP_ASSERT(__result.__ill_formed_size == 1, "for UTF-16 at most one invalid code unit");
- __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value & 0xffff);
- } else {
- static_assert(sizeof(_CharT) == 4, "unsupported character width");
- _LIBCPP_ASSERT(__result.__ill_formed_size == 1, "for UTF-32 one code unit is one code point");
- __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value);
- }
+ ranges::for_each(__first, __view.__position(), [&](_CharT __value) {
+ __formatter::__write_escape_ill_formed_code_unit(__str, __formatter::__to_char32(__value));
+ });
}
}
}
namespace __unicode {
-# if _LIBCPP_STD_VER >= 23
-
-/// The result of consuming a code point using P2286' semantics
-///
-/// TODO FMT Combine __consume and __consume_p2286 in one function.
-struct __consume_p2286_result {
- // A size of 0 means well formed. This to differenciate between
- // a valid code point and a code unit that's invalid like 0b11111xxx.
- int __ill_formed_size;
-
- // If well formed the consumed code point.
- // Otherwise the ill-formed code units as unsigned 8-bit values. They are
- // stored in reverse order, to make it easier to extract the values.
- char32_t __value;
+// Helper struct for the result of a consume operation.
+//
+// The status value for a correct code point is 0. This allows a valid value to
+// be used without masking.
+// When decoding fails the decoder knows how many code units were affected. The
+// current use-cases do not need that count, therefore it is not stored.
+// The escape routine needs the number of code units for both a valid and an
+// invalid character and keeps track of it itself. Storing it in this result
+// unconditionally would add overhead for callers that do not need the value.
+struct __consume_result {
+ // When __status == __ok this holds the decoded code point.
+ // Otherwise it holds the replacement character U+FFFD.
+ char32_t __code_point : 31;
+
+ enum : char32_t {
+ // Consumed a well-formed code point.
+ __ok = 0,
+ // Encountered an ill-formed or invalid code unit sequence (any encoding).
+ __error = 1
+ } __status : 1 {__ok};
};
-
-# endif // _LIBCPP_STD_VER >= 23
+static_assert(sizeof(__consume_result) == sizeof(char32_t));
# ifndef _LIBCPP_HAS_NO_UNICODE
inline constexpr char32_t __replacement_character = U'\ufffd';
+// The result returned by a consume operation that fails.
+//
+// This sets the code point to the replacement character. This code point does
+// not participate in the grapheme clustering, so grapheme clustering code can
+// ignore the error status and always use the code point.
+inline constexpr __consume_result __consume_result_error{__replacement_character, __consume_result::__error};
+
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) {
+ return __value >= 0xd800 && __value <= 0xdbff; // U+D800..U+DBFF: leading code unit of a UTF-16 surrogate pair
+}
+
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) {
+ return __value >= 0xdc00 && __value <= 0xdfff; // U+DC00..U+DFFF: trailing code unit of a UTF-16 surrogate pair
+}
+
+// Surrogate code point, see https://www.unicode.org/glossary/#surrogate_code_point
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) {
+ return __value >= 0xd800 && __value <= 0xdfff; // U+D800..U+DFFF: the high and low surrogate ranges combined
+}
+
+// Code point, see https://www.unicode.org/glossary/#code_point
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) {
+ return __value <= 0x10ffff; // U+0000..U+10FFFF; surrogates are included
+}
+
+// Unicode scalar value, see https://www.unicode.org/glossary/#unicode_scalar_value
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) {
+ return __unicode::__is_code_point(__value) && !__unicode::__is_surrogate(__value); // a code point that is not a surrogate
+}
+
template <contiguous_iterator _Iterator>
requires same_as<iter_value_t<_Iterator>, char>
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
- _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+ // https://www.unicode.org/versions/latest/ch03.pdf#G7404
+ // Based on Table 3-7, Well-Formed UTF-8 Byte Sequences
+ //
+ // Code Points First Byte Second Byte Third Byte Fourth Byte Remarks
+ // U+0000..U+007F 00..7F U+0000..U+007F 1 code unit range
+ // C0..C1 80..BF invalid overlong encoding
+ // U+0080..U+07FF C2..DF 80..BF U+0080..U+07FF 2 code unit range
+ // E0 80..9F 80..BF invalid overlong encoding
+ // U+0800..U+0FFF E0 A0..BF 80..BF U+0800..U+FFFF 3 code unit range
+ // U+1000..U+CFFF E1..EC 80..BF 80..BF
+ // U+D000..U+D7FF ED 80..9F 80..BF
+ // U+D800..U+DFFF ED A0..BF 80..BF invalid encoding of surrogate code point
+ // U+E000..U+FFFF EE..EF 80..BF 80..BF
+ // F0 80..8F 80..BF 80..BF invalid overlong encoding
+ // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF U+10000..U+10FFFF 4 code unit range
+ // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
+ // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+ // F4 90..BF 80..BF 80..BF U+110000.. invalid code point range
+ //
+ // Unlike other parsers, these invalid entries are tested after decoding.
+ // - The parser always needs to consume these code units
+ // - The code is optimized for well-formed UTF-8
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
// Based on the number of leading 1 bits the number of code units in the
// code point can be determined. See
// https://en.wikipedia.org/wiki/UTF-8#Encoding
- switch (_VSTD::countl_one(static_cast<unsigned char>(*__first_))) {
+ switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
case 0:
- return *__first_++;
+ return {static_cast<unsigned char>(*__first_++)};
- case 2:
+ case 2: {
if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
break;
- else {
- char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- return __value;
- }
- case 3:
- if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
- break;
- else {
- char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- return __value;
- }
+ char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
+ __value <<= 6;
+ __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- case 4:
- if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
- break;
- else {
- char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- return __value;
- }
- }
- // An invalid number of leading ones can be garbage or a code unit in the
- // middle of a code point. By consuming one code unit the parser may get
- // "in sync" after a few code units.
- ++__first_;
- return __replacement_character;
- }
-
-# if _LIBCPP_STD_VER >= 23
- _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
- _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
+ // These values should be encoded in 1 UTF-8 code unit.
+ if (__value < 0x0080) [[unlikely]]
+ return __consume_result_error;
- // Based on the number of leading 1 bits the number of code units in the
- // code point can be determined. See
- // https://en.wikipedia.org/wiki/UTF-8#Encoding
- switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
- case 0:
- return {0, static_cast<unsigned char>(*__first_++)};
+ return {__value};
+ }
- case 2:
- if (__last_ - __first_ < 2) [[unlikely]]
+ case 3: {
+ if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
break;
- if (__unicode::__is_continuation(__first_ + 1, 1)) {
- char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- return {0, __value};
- }
- break;
+ char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
+ __value <<= 6;
+ __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+ __value <<= 6;
+ __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- case 3:
- if (__last_ - __first_ < 3) [[unlikely]]
- break;
+ // These values should be encoded in 1 or 2 UTF-8 code units.
+ if (__value < 0x0800) [[unlikely]]
+ return __consume_result_error;
- if (__unicode::__is_continuation(__first_ + 1, 2)) {
- char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- return {0, __value};
- }
- break;
+ // A surrogate value is always encoded in 3 UTF-8 code units.
+ if (__unicode::__is_surrogate(__value)) [[unlikely]]
+ return __consume_result_error;
+
+ return {__value};
+ }
- case 4:
- if (__last_ - __first_ < 4) [[unlikely]]
+ case 4: {
+ if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
break;
- if (__unicode::__is_continuation(__first_ + 1, 3)) {
- char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- __value <<= 6;
- __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+ char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
+ __value <<= 6;
+ __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+ __value <<= 6;
+ __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+ __value <<= 6;
+ __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
- if (__value > 0x10FFFF) // Outside the valid Unicode range?
- return {4, __value};
+ // These values should be encoded in 1, 2, or 3 UTF-8 code units.
+ if (__value < 0x10000) [[unlikely]]
+ return __consume_result_error;
- return {0, __value};
- }
- break;
+ // A value too large is always encoded in 4 UTF-8 code units.
+ if (!__unicode::__is_code_point(__value)) [[unlikely]]
+ return __consume_result_error;
+
+ return {__value};
+ }
}
// An invalid number of leading ones can be garbage or a code unit in the
// middle of a code point. By consuming one code unit the parser may get
// "in sync" after a few code units.
- return {1, static_cast<unsigned char>(*__first_++)};
+ ++__first_;
+ return __consume_result_error;
}
-# endif // _LIBCPP_STD_VER >= 23
private:
_Iterator __first_;
_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
- _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
+ char32_t __value = static_cast<char32_t>(*__first_++);
if constexpr (sizeof(wchar_t) == 2) {
- char32_t __result = *__first_++;
- // Is the code unit part of a surrogate pair? See
- // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
- if (__result >= 0xd800 && __result <= 0xDfff) {
- // Malformed Unicode.
- if (__first_ == __last_) [[unlikely]]
- return __replacement_character;
-
- __result -= 0xd800;
- __result <<= 10;
- __result += *__first_++ - 0xdc00;
- __result += 0x10000;
- }
- return __result;
+ if (__unicode::__is_low_surrogate(__value)) [[unlikely]]
+ return __consume_result_error;
- } else if constexpr (sizeof(wchar_t) == 4) {
- char32_t __result = *__first_++;
- if (__result > 0x10FFFF) [[unlikely]]
- return __replacement_character;
- return __result;
- } else {
- __libcpp_unreachable();
- }
- }
+ if (__unicode::__is_high_surrogate(__value)) {
+ if (__first_ == __last_ || !__unicode::__is_low_surrogate(static_cast<char32_t>(*__first_))) [[unlikely]]
+ return __consume_result_error;
-# if _LIBCPP_STD_VER >= 23
- _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
- _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
+ __value -= 0xd800;
+ __value <<= 10;
+ __value += static_cast<char32_t>(*__first_++) - 0xdc00;
+ __value += 0x10000;
- char32_t __result = *__first_++;
- if constexpr (sizeof(wchar_t) == 2) {
- // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
- if (__is_surrogate_pair_high(__result)) {
- // Malformed Unicode.
- if (__first_ == __last_ || !__is_surrogate_pair_low(*(__first_ + 1))) [[unlikely]]
- return {1, __result};
-
- __result -= 0xd800;
- __result <<= 10;
- __result += *__first_++ - 0xdc00;
- __result += 0x10000;
- } else if (__is_surrogate_pair_low(__result))
- // A code point shouldn't start with the low surrogate pair
- return {1, __result};
+ if (!__unicode::__is_code_point(__value)) [[unlikely]]
+ return __consume_result_error;
+ }
} else {
- if (__result > 0x10FFFF) [[unlikely]]
- return {1, __result};
+ if (!__unicode::__is_scalar_value(__value)) [[unlikely]]
+ return __consume_result_error;
}
- return {0, __result};
+ return {__value};
}
-# endif // _LIBCPP_STD_VER >= 23
private:
_Iterator __first_;
public:
_LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
: __code_point_view_(__first, __last),
- __next_code_point_(__code_point_view_.__consume()),
+ __next_code_point_(__code_point_view_.__consume().__code_point),
__next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
struct __cluster {
_LIBCPP_ASSERT(
__next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
"can't move beyond the end of input");
+
char32_t __code_point = __next_code_point_;
if (!__code_point_view_.__at_end())
return {__code_point, __get_break()};
__next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
return __result;
}
- __next_code_point_ = __code_point_view_.__consume();
+ __next_code_point_ = __code_point_view_.__consume().__code_point;
__next_prop_ = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
__has_extened_pictographic |=
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
- _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
- return *__first_++;
- }
-
-# if _LIBCPP_STD_VER >= 23
- _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
- _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
-
- return {0, std::make_unsigned_t<_CharT>(*__first_++)};
+ return {static_cast<char32_t>(*__first_++)};
}
-# endif // _LIBCPP_STD_VER >= 23
private:
_Iterator __first_;
test_format(V{L"'\\u{600}'"}, L"{:?}", L'\x600'); // ARABIC NUMBER SIGN
test_format(V{L"'\\u{feff}'"}, L"{:?}", L'\xfeff'); // ZERO WIDTH NO-BREAK SPACE
- if constexpr (sizeof(CharT) == 2) {
- // Incomplete surrogate pair in UTF-16
- test_format(V{L"'\\x{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
- test_format(V{L"'\\x{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
- } else {
- test_format(V{L"'\\u{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
- test_format(V{L"'\\u{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
- }
+ // Incomplete surrogate pair in UTF-16
+ test_format(V{L"'\\x{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
+ test_format(V{L"'\\x{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
// Private_Use
test_format(V{L"'\\u{e000}'"}, L"{:?}", L'\xe000'); // <private-use-E000>
// Ill-formend UTF-8
test_format(SV(R"(["\x{c3}"])"), SV("[{:?}]"), "\xc3");
test_format(SV(R"(["\x{c3}("])"), SV("[{:?}]"), "\xc3\x28");
+
+ /* U+0000..U+007F 1 code unit range, encoded in 2 code units. */
+ test_format(SV(R"(["\x{c0}\x{80}"])"), SV("[{:?}]"), "\xc0\x80"); // U+0000
+ test_format(SV(R"(["\x{c1}\x{bf}"])"), SV("[{:?}]"), "\xc1\xbf"); // U+007F
+ test_format(SV(R"(["\u{80}"])"), SV("[{:?}]"), "\xc2\x80"); // U+0080 first valid (General_Category=Control)
+
+ /* U+0000..U+07FF 1 and 2 code unit range, encoded in 3 code units. */
+ test_format(SV(R"(["\x{e0}\x{80}\x{80}"])"), SV("[{:?}]"), "\xe0\x80\x80"); // U+0000
+ test_format(SV(R"(["\x{e0}\x{81}\x{bf}"])"), SV("[{:?}]"), "\xe0\x81\xbf"); // U+007F
+ test_format(SV(R"(["\x{e0}\x{82}\x{80}"])"), SV("[{:?}]"), "\xe0\x82\x80"); // U+0080
+ test_format(SV(R"(["\x{e0}\x{9f}\x{bf}"])"), SV("[{:?}]"), "\xe0\x9f\xbf"); // U+07FF
+ test_format(SV("[\"\u0800\"]"), SV("[{:?}]"), "\xe0\xa0\x80"); // U+0800 first valid
+
+#if 0
+ // This code point is in the Hangul Jamo Extended-B block and at the time of writing
+ // it's unassigned. When it becomes assigned, this branch might become the correct one.
+ test_format(SV("[\"\ud7ff\"]"), SV("[{:?}]"), "\xed\x9f\xbf"); // U+D7FF last valid
+#else
+ /* U+D800..U+DFFF surrogate range */
+ test_format(SV(R"(["\u{d7ff}"])"), SV("[{:?}]"), "\xed\x9f\xbf"); // U+D7FF last valid
+#endif
+ test_format(SV(R"(["\x{ed}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xed\xa0\x80"); // U+D800
+ test_format(SV(R"(["\x{ed}\x{af}\x{bf}"])"), SV("[{:?}]"), "\xed\xaf\xbf"); // U+DBFF
+ test_format(SV(R"(["\x{ed}\x{bf}\x{80}"])"), SV("[{:?}]"), "\xed\xbf\x80"); // U+DFC0 (U+DC00 would be "\xed\xb0\x80")
+ test_format(SV(R"(["\x{ed}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xed\xbf\xbf"); // U+DFFF
+ test_format(SV(R"(["\u{e000}"])"), SV("[{:?}]"), "\xee\x80\x80"); // U+E000 first valid
+ // (in the Private Use Area block)
+
+ /* U+0000..U+FFFF 1, 2, and 3 code unit range, encoded in 4 code units. */
+ test_format(SV(R"(["\x{f0}\x{80}\x{80}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\x80\x80"); // U+0000
+ test_format(SV(R"(["\x{f0}\x{80}\x{81}\x{bf}"])"), SV("[{:?}]"), "\xf0\x80\x81\xbf"); // U+007F
+ test_format(SV(R"(["\x{f0}\x{80}\x{82}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\x82\x80"); // U+0080
+ test_format(SV(R"(["\x{f0}\x{80}\x{9f}\x{bf}"])"), SV("[{:?}]"), "\xf0\x80\x9f\xbf"); // U+07FF
+ test_format(SV(R"(["\x{f0}\x{80}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\xa0\x80"); // U+0800
+ test_format(SV(R"(["\x{f0}\x{8f}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xf0\x8f\xbf\xbf"); // U+FFFF
+ test_format(SV("[\"\U00010000\"]"), SV("[{:?}]"), "\xf0\x90\x80\x80"); // U+10000 first valid
+
+ /* U+110000..U+1FFFFF invalid range */
+ test_format(SV(R"(["\u{10ffff}"])"), SV("[{:?}]"), "\xf4\x8f\xbf\xbf"); // U+10FFFF last valid
+ // (in Supplementary Private Use Area-B)
+ test_format(SV(R"(["\x{f4}\x{90}\x{80}\x{80}"])"), SV("[{:?}]"), "\xf4\x90\x80\x80"); // U+110000
+ test_format(SV(R"(["\x{f4}\x{bf}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xf4\xbf\xbf\xbf"); // U+13FFFF
} else {
// Valid UTF-16 and UTF-32
test_format(SV("[\"\u00c3\"]"), SV("[{:?}]"), L"\xc3"); // LATIN CAPITAL LETTER A WITH TILDE
// Format
test_format(V{LR"("\u{ad}\u{600}\u{feff}")"}, L"{:?}", L"\xad\x600\xfeff");
- if constexpr (sizeof(CharT) == 2)
- // Incomplete surrogate pair in UTF-16
- test_format(V{LR"("\x{d800}")"}, L"{:?}", L"\xd800");
- else
- test_format(V{LR"("\u{d800}")"}, L"{:?}", L"\xd800");
+ // Incomplete surrogate pair in UTF-16
+ test_format(V{LR"("\x{d800}")"}, L"{:?}", L"\xd800");
// Private_Use
test_format(V{LR"("\u{e000}\u{f8ff}")"}, L"{:?}", L"\xe000\xf8ff");