From 0b8c57ed40f19086e30ce54faec3222ac21cc0df Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 1 Sep 2022 09:48:01 +0200 Subject: [PATCH] libcpp: Add -Winvalid-utf8 warning [PR106655] The following patch introduces a new warning - -Winvalid-utf8 similarly to what clang now has - to diagnose invalid UTF-8 byte sequences in comments, but not just in those, but also in string/character literals and outside of them. The warning is on by default when explicit -finput-charset=UTF-8 is used and C++23 compilation is requested and, if -{,W}pedantic or -pedantic-errors is given, it is actually a pedwarn. The reason it is on by default only for -finput-charset=UTF-8 is that the sources often are UTF-8, but sometimes could be some ASCII compatible single byte encoding where non-ASCII characters only appear in comments. So having the warning off by default is IMO desirable. The C++23 pedantic mode for when the source code is UTF-8 is -std=c++23 -pedantic-errors -finput-charset=UTF-8. 2022-09-01 Jakub Jelinek PR c++/106655 libcpp/ * include/cpplib.h (struct cpp_options): Implement C++23 P2295R6 - Support for UTF-8 as a portable source file encoding. Add cpp_warn_invalid_utf8 and cpp_input_charset_explicit fields. (enum cpp_warning_reason): Add CPP_W_INVALID_UTF8 enumerator. * init.cc (cpp_create_reader): Initialize cpp_warn_invalid_utf8 and cpp_input_charset_explicit. * charset.cc (_cpp_valid_utf8): Adjust function comment. * lex.cc (UCS_LIMIT): Define. (utf8_continuation): New const variable. (utf8_signifier): Move earlier in the file. (_cpp_warn_invalid_utf8, _cpp_handle_multibyte_utf8): New functions. (_cpp_skip_block_comment): Handle -Winvalid-utf8 warning. (skip_line_comment): Likewise. (lex_raw_string, lex_string): Likewise. (_cpp_lex_direct): Likewise. gcc/ * doc/invoke.texi (-Winvalid-utf8): Document it. gcc/c-family/ * c.opt (-Winvalid-utf8): New warning. * c-opts.cc (c_common_handle_option) <case OPT_finput_charset_>: Set cpp_opts->cpp_input_charset_explicit. 
(c_common_post_options): If -finput-charset=UTF-8 is explicit in C++23, enable -Winvalid-utf8 by default and if -pedantic or -pedantic-errors, make it a pedwarn. gcc/testsuite/ * c-c++-common/cpp/Winvalid-utf8-1.c: New test. * c-c++-common/cpp/Winvalid-utf8-2.c: New test. * c-c++-common/cpp/Winvalid-utf8-3.c: New test. * g++.dg/cpp23/Winvalid-utf8-1.C: New test. * g++.dg/cpp23/Winvalid-utf8-2.C: New test. * g++.dg/cpp23/Winvalid-utf8-3.C: New test. * g++.dg/cpp23/Winvalid-utf8-4.C: New test. * g++.dg/cpp23/Winvalid-utf8-5.C: New test. * g++.dg/cpp23/Winvalid-utf8-6.C: New test. * g++.dg/cpp23/Winvalid-utf8-7.C: New test. * g++.dg/cpp23/Winvalid-utf8-8.C: New test. * g++.dg/cpp23/Winvalid-utf8-9.C: New test. * g++.dg/cpp23/Winvalid-utf8-10.C: New test. * g++.dg/cpp23/Winvalid-utf8-11.C: New test. * g++.dg/cpp23/Winvalid-utf8-12.C: New test. --- gcc/c-family/c-opts.cc | 12 ++ gcc/c-family/c.opt | 4 + gcc/doc/invoke.texi | 13 +- gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-1.c | 43 +++++ gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-2.c | 88 ++++++++++ gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-3.c | 27 +++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-1.C | 43 +++++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-10.C | 25 +++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-11.C | 25 +++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-12.C | 25 +++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-2.C | 43 +++++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-3.C | 43 +++++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-4.C | 43 +++++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-5.C | 80 +++++++++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-6.C | 80 +++++++++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-7.C | 80 +++++++++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-8.C | 80 +++++++++ gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-9.C | 25 +++ libcpp/charset.cc | 6 +- libcpp/include/cpplib.h | 10 +- libcpp/init.cc | 2 + libcpp/lex.cc | 209 ++++++++++++++++++++--- 22 files changed, 973 insertions(+), 33 deletions(-) create mode 100644 
gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-1.c create mode 100644 gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-2.c create mode 100644 gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-3.c create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-1.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-10.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-11.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-12.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-2.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-3.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-4.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-5.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-6.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-7.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-8.C create mode 100644 gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-9.C diff --git a/gcc/c-family/c-opts.cc b/gcc/c-family/c-opts.cc index 337a524..babaa2f 100644 --- a/gcc/c-family/c-opts.cc +++ b/gcc/c-family/c-opts.cc @@ -534,6 +534,7 @@ c_common_handle_option (size_t scode, const char *arg, HOST_WIDE_INT value, case OPT_finput_charset_: cpp_opts->input_charset = arg; + cpp_opts->cpp_input_charset_explicit = 1; break; case OPT_ftemplate_depth_: @@ -1152,6 +1153,17 @@ c_common_post_options (const char **pfilename) lang_hooks.preprocess_options (parse_in); cpp_post_options (parse_in); init_global_opts_from_cpp (&global_options, cpp_get_options (parse_in)); + /* For C++23 and explicit -finput-charset=UTF-8, turn on -Winvalid-utf8 + by default and make it a pedwarn unless -Wno-invalid-utf8. 
*/ + if (cxx_dialect >= cxx23 + && cpp_opts->cpp_input_charset_explicit + && strcmp (cpp_opts->input_charset, "UTF-8") == 0 + && (cpp_opts->cpp_warn_invalid_utf8 + || !global_options_set.x_warn_invalid_utf8)) + { + global_options.x_warn_invalid_utf8 = 1; + cpp_opts->cpp_warn_invalid_utf8 = cpp_opts->cpp_pedantic ? 2 : 1; + } /* Let diagnostics infrastructure know how to convert input files the same way libcpp will do it, namely using the configured input charset and diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt index f776efd..ff6fe86 100644 --- a/gcc/c-family/c.opt +++ b/gcc/c-family/c.opt @@ -821,6 +821,10 @@ Winvalid-pch C ObjC C++ ObjC++ CPP(warn_invalid_pch) CppReason(CPP_W_INVALID_PCH) Var(cpp_warn_invalid_pch) Init(0) Warning Warn about PCH files that are found but not used. +Winvalid-utf8 +C ObjC C++ ObjC++ CPP(cpp_warn_invalid_utf8) CppReason(CPP_W_INVALID_UTF8) Var(warn_invalid_utf8) Init(0) Warning +Warn about invalid UTF-8 characters. + Wjump-misses-init C ObjC Var(warn_jump_misses_init) Warning LangEnabledby(C ObjC,Wc++-compat) Warn when a jump misses a variable initialization. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index e5eb525..8def6ba 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -365,9 +365,9 @@ Objective-C and Objective-C++ Dialects}. 
-Winfinite-recursion @gol -Winit-self -Winline -Wno-int-conversion -Wint-in-bool-context @gol -Wno-int-to-pointer-cast -Wno-invalid-memory-model @gol --Winvalid-pch -Wjump-misses-init -Wlarger-than=@var{byte-size} @gol --Wlogical-not-parentheses -Wlogical-op -Wlong-long @gol --Wno-lto-type-mismatch -Wmain -Wmaybe-uninitialized @gol +-Winvalid-pch -Winvalid-utf8 -Wjump-misses-init @gol +-Wlarger-than=@var{byte-size} -Wlogical-not-parentheses -Wlogical-op @gol +-Wlong-long -Wno-lto-type-mismatch -Wmain -Wmaybe-uninitialized @gol -Wmemset-elt-size -Wmemset-transposed-args @gol -Wmisleading-indentation -Wmissing-attributes -Wmissing-braces @gol -Wmissing-field-initializers -Wmissing-format-attribute @gol @@ -9569,6 +9569,13 @@ different size. Warn if a precompiled header (@pxref{Precompiled Headers}) is found in the search path but cannot be used. +@item -Winvalid-utf8 +@opindex Winvalid-utf8 +@opindex Wno-invalid-utf8 +Warn if an invalid UTF-8 character is found. +This warning is on by default for C++23 if @option{-finput-charset=UTF-8} +is used and turned into error with @option{-pedantic-errors}. 
+ @item -Wlong-long @opindex Wlong-long @opindex Wno-long-long diff --git a/gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-1.c b/gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-1.c new file mode 100644 index 0000000..0d5a6a7 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-1.c @@ -0,0 +1,43 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess } +// { dg-options "-finput-charset=UTF-8 -Winvalid-utf8" } + +// a€߿ࠀ퟿𐀀ô¿¿a { dg-bogus "invalid UTF-8 character" } +// a€a { dg-warning "invalid UTF-8 character <80>" } +// a¿a { dg-warning "invalid UTF-8 character " } +// aÀa { dg-warning "invalid UTF-8 character " } +// aÁa { dg-warning "invalid UTF-8 character " } +// aõa { dg-warning "invalid UTF-8 character " } +// aÿa { dg-warning "invalid UTF-8 character " } +// aÂa { dg-warning "invalid UTF-8 character " } +// aàa { dg-warning "invalid UTF-8 character " } +// aà€¿a { dg-warning "invalid UTF-8 character <80>" } +// aàŸ€a { dg-warning "invalid UTF-8 character <9f><80>" } +// aà¿a { dg-warning "invalid UTF-8 character " } +// aì€a { dg-warning "invalid UTF-8 character <80>" } +// aí €a { dg-warning "invalid UTF-8 character <80>" } +// að€€€a { dg-warning "invalid UTF-8 character <80><80><80>" } +// að¿¿a { dg-warning "invalid UTF-8 character <8f>" } +// aô€€a { dg-warning "invalid UTF-8 character <90><80><80>" } +// aý¿¿¿¿¿a { dg-warning "invalid UTF-8 character " } +// { dg-warning "invalid UTF-8 character " "" { target *-*-* } .-1 } +/* a€߿ࠀ퟿𐀀ô¿¿a { dg-bogus "invalid UTF-8 character" } */ +/* a€a { dg-warning "invalid UTF-8 character <80>" } */ +/* a¿a { dg-warning "invalid UTF-8 character " } */ +/* aÀa { dg-warning "invalid UTF-8 character " } */ +/* aÁa { dg-warning "invalid UTF-8 character " } */ +/* aõa { dg-warning "invalid UTF-8 character " } */ +/* aÿa { dg-warning "invalid UTF-8 character " } */ +/* aÂa { dg-warning 
"invalid UTF-8 character " } */ +/* aàa { dg-warning "invalid UTF-8 character " } */ +/* aà€¿a { dg-warning "invalid UTF-8 character <80>" } */ +/* aàŸ€a { dg-warning "invalid UTF-8 character <9f><80>" } */ +/* aà¿a { dg-warning "invalid UTF-8 character " } */ +/* aì€a { dg-warning "invalid UTF-8 character <80>" } */ +/* aí €a { dg-warning "invalid UTF-8 character <80>" } */ +/* að€€€a { dg-warning "invalid UTF-8 character <80><80><80>" } */ +/* að¿¿a { dg-warning "invalid UTF-8 character <8f>" } */ +/* aô€€a { dg-warning "invalid UTF-8 character <90><80><80>" } */ +/* aý¿¿¿¿¿a { dg-warning "invalid UTF-8 character " } */ +/* { dg-warning "invalid UTF-8 character " "" { target *-*-* } .-1 } */ diff --git a/gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-2.c b/gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-2.c new file mode 100644 index 0000000..9ab69e1 --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-2.c @@ -0,0 +1,88 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess { target { c || c++11 } } } +// { dg-require-effective-target wchar } +// { dg-options "-finput-charset=UTF-8 -Winvalid-utf8" } +// { dg-additional-options "-std=gnu99" { target c } } + +#ifndef __cplusplus +#include +typedef __CHAR16_TYPE__ char16_t; +typedef __CHAR32_TYPE__ char32_t; +#endif + +char32_t a = U'€'; // { dg-warning "invalid UTF-8 character <80>" } +char32_t b = U'¿'; // { dg-warning "invalid UTF-8 character " } +char32_t c = U'À'; // { dg-warning "invalid UTF-8 character " } +char32_t d = U'Á'; // { dg-warning "invalid UTF-8 character " } +char32_t e = U'õ'; // { dg-warning "invalid UTF-8 character " } +char32_t f = U'ÿ'; // { dg-warning "invalid UTF-8 character " } +char32_t g = U'Â'; // { dg-warning "invalid UTF-8 character " } +char32_t h = U'à'; // { dg-warning "invalid UTF-8 character " } +char32_t i = U'à€¿'; // { dg-warning "invalid UTF-8 
character <80>" } +char32_t j = U'àŸ€'; // { dg-warning "invalid UTF-8 character <9f><80>" } +char32_t k = U'à¿'; // { dg-warning "invalid UTF-8 character " } +char32_t l = U'ì€'; // { dg-warning "invalid UTF-8 character <80>" } +char32_t m = U'í €'; // { dg-warning "invalid UTF-8 character <80>" } +char32_t n = U'ð€€€'; // { dg-warning "invalid UTF-8 character <80><80><80>" } +char32_t o = U'ð¿¿'; // { dg-warning "invalid UTF-8 character <8f>" } +char32_t p = U'ô€€'; // { dg-warning "invalid UTF-8 character <90><80><80>" } +char32_t q = U'ý¿¿¿¿¿'; // { dg-warning "invalid UTF-8 character " } + // { dg-warning "invalid UTF-8 character " "" { target *-*-* } .-1 } +const char32_t *A = U"€߿ࠀ퟿𐀀ô¿¿"; // { dg-bogus "invalid UTF-8 character" } +const char32_t *B = U"€"; // { dg-warning "invalid UTF-8 character <80>" } +const char32_t *C = U"¿"; // { dg-warning "invalid UTF-8 character " } +const char32_t *D = U"À"; // { dg-warning "invalid UTF-8 character " } +const char32_t *E = U"Á"; // { dg-warning "invalid UTF-8 character " } +const char32_t *F = U"õ"; // { dg-warning "invalid UTF-8 character " } +const char32_t *G = U"ÿ"; // { dg-warning "invalid UTF-8 character " } +const char32_t *H = U"Â"; // { dg-warning "invalid UTF-8 character " } +const char32_t *I = U"à"; // { dg-warning "invalid UTF-8 character " } +const char32_t *J = U"à€¿"; // { dg-warning "invalid UTF-8 character <80>" } +const char32_t *K = U"àŸ€"; // { dg-warning "invalid UTF-8 character <9f><80>" } +const char32_t *L = U"à¿"; // { dg-warning "invalid UTF-8 character " } +const char32_t *M = U"ì€"; // { dg-warning "invalid UTF-8 character <80>" } +const char32_t *N = U"í €"; // { dg-warning "invalid UTF-8 character <80>" } +const char32_t *O = U"ð€€€"; // { dg-warning "invalid UTF-8 character <80><80><80>" } +const char32_t *P = U"ð¿¿"; // { dg-warning "invalid UTF-8 character <8f>" } +const char32_t *Q = U"ô€€"; // { dg-warning "invalid UTF-8 character <90><80><80>" } +const char32_t *R = 
U"ý¿¿¿¿¿"; // { dg-warning "invalid UTF-8 character " } + // { dg-warning "invalid UTF-8 character " "" { target *-*-* } .-1 } +const char32_t *A1 = UR"(€߿ࠀ퟿𐀀ô¿¿)"; // { dg-bogus "invalid UTF-8 character" } +const char32_t *B1 = UR"(€)"; // { dg-warning "invalid UTF-8 character <80>" } +const char32_t *C1 = UR"(¿)"; // { dg-warning "invalid UTF-8 character " } +const char32_t *D1 = UR"(À)"; // { dg-warning "invalid UTF-8 character " } +const char32_t *E1 = UR"(Á)"; // { dg-warning "invalid UTF-8 character " } +const char32_t *F1 = UR"(õ)"; // { dg-warning "invalid UTF-8 character " } +const char32_t *G1 = UR"(ÿ)"; // { dg-warning "invalid UTF-8 character " } +const char32_t *H1 = UR"(Â)"; // { dg-warning "invalid UTF-8 character " } +const char32_t *I1 = UR"(à)"; // { dg-warning "invalid UTF-8 character " } +const char32_t *J1 = UR"(à€¿)"; // { dg-warning "invalid UTF-8 character <80>" } +const char32_t *K1 = UR"(àŸ€)"; // { dg-warning "invalid UTF-8 character <9f><80>" } +const char32_t *L1 = UR"(à¿)"; // { dg-warning "invalid UTF-8 character " } +const char32_t *M1 = UR"(ì€)"; // { dg-warning "invalid UTF-8 character <80>" } +const char32_t *N1 = UR"(í €)"; // { dg-warning "invalid UTF-8 character <80>" } +const char32_t *O1 = UR"(ð€€€)"; // { dg-warning "invalid UTF-8 character <80><80><80>" } +const char32_t *P1 = UR"(ð¿¿)"; // { dg-warning "invalid UTF-8 character <8f>" } +const char32_t *Q1 = UR"(ô€€)"; // { dg-warning "invalid UTF-8 character <90><80><80>" } +const char32_t *R1 = UR"(ý¿¿¿¿¿)"; // { dg-warning "invalid UTF-8 character " } + // { dg-warning "invalid UTF-8 character " "" { target *-*-* } .-1 } +const char *A2 = u8"€߿ࠀ퟿𐀀ô¿¿"; // { dg-bogus "invalid UTF-8 character" } +const char *B2 = u8"€"; // { dg-warning "invalid UTF-8 character <80>" } +const char *C2 = u8"¿"; // { dg-warning "invalid UTF-8 character " } +const char *D2 = u8"À"; // { dg-warning "invalid UTF-8 character " } +const char *E2 = u8"Á"; // { dg-warning "invalid UTF-8 
character " } +const char *F2 = u8"õ"; // { dg-warning "invalid UTF-8 character " } +const char *G2 = u8"ÿ"; // { dg-warning "invalid UTF-8 character " } +const char *H2 = u8"Â"; // { dg-warning "invalid UTF-8 character " } +const char *I2 = u8"à"; // { dg-warning "invalid UTF-8 character " } +const char *J2 = u8"à€¿"; // { dg-warning "invalid UTF-8 character <80>" } +const char *K2 = u8"àŸ€"; // { dg-warning "invalid UTF-8 character <9f><80>" } +const char *L2 = u8"à¿"; // { dg-warning "invalid UTF-8 character " } +const char *M2 = u8"ì€"; // { dg-warning "invalid UTF-8 character <80>" } +const char *N2 = u8"í €"; // { dg-warning "invalid UTF-8 character <80>" } +const char *O2 = u8"ð€€€"; // { dg-warning "invalid UTF-8 character <80><80><80>" } +const char *P2 = u8"ð¿¿"; // { dg-warning "invalid UTF-8 character <8f>" } +const char *Q2 = u8"ô€€"; // { dg-warning "invalid UTF-8 character <90><80><80>" } +const char *R2 = u8"ý¿¿¿¿¿"; // { dg-warning "invalid UTF-8 character " } + // { dg-warning "invalid UTF-8 character " "" { target *-*-* } .-1 } diff --git a/gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-3.c b/gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-3.c new file mode 100644 index 0000000..4cb230f --- /dev/null +++ b/gcc/testsuite/c-c++-common/cpp/Winvalid-utf8-3.c @@ -0,0 +1,27 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess } +// { dg-options "-finput-charset=UTF-8 -Winvalid-utf8" } + +#define I(x) +I(€߿ࠀ퟿𐀀ô¿¿) // { dg-bogus "invalid UTF-8 character" } + // { dg-error "is not valid in an identifier" "" { target c++ } .-1 } +I(€) // { dg-warning "invalid UTF-8 character <80>" } +I(¿) // { dg-warning "invalid UTF-8 character " } +I(À) // { dg-warning "invalid UTF-8 character " } +I(Á) // { dg-warning "invalid UTF-8 character " } +I(õ) // { dg-warning "invalid UTF-8 character " } +I(ÿ) // { dg-warning "invalid UTF-8 character " } 
+I(Â) // { dg-warning "invalid UTF-8 character " } +I(à) // { dg-warning "invalid UTF-8 character " } +I(à€¿) // { dg-warning "invalid UTF-8 character <80>" } +I(àŸ€) // { dg-warning "invalid UTF-8 character <9f><80>" } +I(à¿) // { dg-warning "invalid UTF-8 character " } +I(ì€) // { dg-warning "invalid UTF-8 character <80>" } +I(í €) // { dg-warning "invalid UTF-8 character <80>" } +I(ð€€€) // { dg-warning "invalid UTF-8 character <80><80><80>" } +I(ð¿¿) // { dg-warning "invalid UTF-8 character <8f>" } +I(ô€€) // { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c } } + // { dg-error "is not valid in an identifier" "" { target c++ } .-1 } +I(ý¿¿¿¿¿) // { dg-warning "invalid UTF-8 character " "" { target c } } + // { dg-error "is not valid in an identifier" "" { target c++ } .-1 } diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-1.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-1.C new file mode 100644 index 0000000..95e3827 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-1.C @@ -0,0 +1,43 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess } +// { dg-options "-finput-charset=UTF-8" } + +// a€߿ࠀ퟿𐀀ô¿¿a { dg-bogus "invalid UTF-8 character" } +// a€a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +// a¿a { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aÀa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aÁa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aõa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aÿa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aÂa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aàa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aà€¿a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +// aàŸ€a { dg-warning "invalid UTF-8 
character <9f><80>" "" { target c++23 } } +// aà¿a { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aì€a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +// aí €a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +// að€€€a { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +// að¿¿a { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +// aô€€a { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +// aý¿¿¿¿¿a { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } +/* a€߿ࠀ퟿𐀀ô¿¿a { dg-bogus "invalid UTF-8 character" } */ +/* a€a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* a¿a { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aÀa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aÁa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aõa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aÿa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aÂa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aàa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aà€¿a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* aàŸ€a { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } */ +/* aà¿a { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aì€a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* aí €a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* að€€€a { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } */ +/* að¿¿a { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } */ +/* aô€€a { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } */ +/* aý¿¿¿¿¿a { dg-warning "invalid UTF-8 
character " "" { target c++23 } } */ +/* { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } */ diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-10.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-10.C new file mode 100644 index 0000000..4684b9d --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-10.C @@ -0,0 +1,25 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess } +// { dg-options "-finput-charset=UTF-8 -pedantic" } + +#define I(x) +I(€߿ࠀ퟿𐀀ô¿¿) // { dg-bogus "invalid UTF-8 character" } + // { dg-error "is not valid in an identifier" "" { target *-*-* } .-1 } +I(€) // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +I(¿) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(À) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(Á) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(õ) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(ÿ) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(Â) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(à) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(à€¿) // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +I(àŸ€) // { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +I(à¿) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(ì€) // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +I(í €) // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +I(ð€€€) // { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +I(ð¿¿) // { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +I(ô€€) // { dg-error "is not valid in an identifier" } +I(ý¿¿¿¿¿) // { dg-error "is not valid in an identifier" } diff --git 
a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-11.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-11.C new file mode 100644 index 0000000..85f04bf --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-11.C @@ -0,0 +1,25 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess } +// { dg-options "-finput-charset=UTF-8 -pedantic-errors" } + +#define I(x) +I(€߿ࠀ퟿𐀀ô¿¿) // { dg-bogus "invalid UTF-8 character" } + // { dg-error "is not valid in an identifier" "" { target *-*-* } .-1 } +I(€) // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +I(¿) // { dg-error "invalid UTF-8 character " "" { target c++23 } } +I(À) // { dg-error "invalid UTF-8 character " "" { target c++23 } } +I(Á) // { dg-error "invalid UTF-8 character " "" { target c++23 } } +I(õ) // { dg-error "invalid UTF-8 character " "" { target c++23 } } +I(ÿ) // { dg-error "invalid UTF-8 character " "" { target c++23 } } +I(Â) // { dg-error "invalid UTF-8 character " "" { target c++23 } } +I(à) // { dg-error "invalid UTF-8 character " "" { target c++23 } } +I(à€¿) // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +I(àŸ€) // { dg-error "invalid UTF-8 character <9f><80>" "" { target c++23 } } +I(à¿) // { dg-error "invalid UTF-8 character " "" { target c++23 } } +I(ì€) // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +I(í €) // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +I(ð€€€) // { dg-error "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +I(ð¿¿) // { dg-error "invalid UTF-8 character <8f>" "" { target c++23 } } +I(ô€€) // { dg-error "is not valid in an identifier" } +I(ý¿¿¿¿¿) // { dg-error "is not valid in an identifier" } diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-12.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-12.C new file mode 100644 index 0000000..6a4091f --- /dev/null +++ 
b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-12.C @@ -0,0 +1,25 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess } +// { dg-options "-finput-charset=UTF-8 -pedantic-errors -Wno-invalid-utf8" } + +#define I(x) +I(€߿ࠀ퟿𐀀ô¿¿) // { dg-bogus "invalid UTF-8 character" } + // { dg-error "is not valid in an identifier" "" { target *-*-* } .-1 } +I(€) // { dg-bogus "invalid UTF-8 character <80>" } +I(¿) // { dg-bogus "invalid UTF-8 character " } +I(À) // { dg-bogus "invalid UTF-8 character " } +I(Á) // { dg-bogus "invalid UTF-8 character " } +I(õ) // { dg-bogus "invalid UTF-8 character " } +I(ÿ) // { dg-bogus "invalid UTF-8 character " } +I(Â) // { dg-bogus "invalid UTF-8 character " } +I(à) // { dg-bogus "invalid UTF-8 character " } +I(à€¿) // { dg-bogus "invalid UTF-8 character <80>" } +I(àŸ€) // { dg-bogus "invalid UTF-8 character <9f><80>" } +I(à¿) // { dg-bogus "invalid UTF-8 character " } +I(ì€) // { dg-bogus "invalid UTF-8 character <80>" } +I(í €) // { dg-bogus "invalid UTF-8 character <80>" } +I(ð€€€) // { dg-bogus "invalid UTF-8 character <80><80><80>" } +I(ð¿¿) // { dg-bogus "invalid UTF-8 character <8f>" } +I(ô€€) // { dg-error "is not valid in an identifier" } +I(ý¿¿¿¿¿) // { dg-error "is not valid in an identifier" } diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-2.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-2.C new file mode 100644 index 0000000..70ab8e5 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-2.C @@ -0,0 +1,43 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess } +// { dg-options "-finput-charset=UTF-8 -pedantic" } + +// a€߿ࠀ퟿𐀀ô¿¿a { dg-bogus "invalid UTF-8 character" } +// a€a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +// a¿a { dg-warning "invalid 
UTF-8 character " "" { target c++23 } } +// aÀa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aÁa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aõa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aÿa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aÂa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aàa { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aà€¿a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +// aàŸ€a { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +// aà¿a { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// aì€a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +// aí €a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +// að€€€a { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +// að¿¿a { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +// aô€€a { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +// aý¿¿¿¿¿a { dg-warning "invalid UTF-8 character " "" { target c++23 } } +// { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } +/* a€߿ࠀ퟿𐀀ô¿¿a { dg-bogus "invalid UTF-8 character" } */ +/* a€a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* a¿a { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aÀa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aÁa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aõa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aÿa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aÂa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aàa { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aà€¿a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* aàŸ€a { dg-warning "invalid UTF-8 
character <9f><80>" "" { target c++23 } } */ +/* aà¿a { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* aì€a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* aí €a { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* að€€€a { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } */ +/* að¿¿a { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } */ +/* aô€€a { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } */ +/* aý¿¿¿¿¿a { dg-warning "invalid UTF-8 character " "" { target c++23 } } */ +/* { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } */ diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-3.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-3.C new file mode 100644 index 0000000..c0f748b --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-3.C @@ -0,0 +1,43 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess } +// { dg-options "-finput-charset=UTF-8 -pedantic-errors" } + +// a€߿ࠀ퟿𐀀ô¿¿a { dg-bogus "invalid UTF-8 character" } +// a€a { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +// a¿a { dg-error "invalid UTF-8 character " "" { target c++23 } } +// aÀa { dg-error "invalid UTF-8 character " "" { target c++23 } } +// aÁa { dg-error "invalid UTF-8 character " "" { target c++23 } } +// aõa { dg-error "invalid UTF-8 character " "" { target c++23 } } +// aÿa { dg-error "invalid UTF-8 character " "" { target c++23 } } +// aÂa { dg-error "invalid UTF-8 character " "" { target c++23 } } +// aàa { dg-error "invalid UTF-8 character " "" { target c++23 } } +// aà€¿a { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +// aàŸ€a { dg-error "invalid UTF-8 character <9f><80>" "" { target c++23 } } +// aà¿a { dg-error "invalid UTF-8 character " "" { target c++23 } } +// aì€a { dg-error 
"invalid UTF-8 character <80>" "" { target c++23 } } +// aí €a { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +// að€€€a { dg-error "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +// að¿¿a { dg-error "invalid UTF-8 character <8f>" "" { target c++23 } } +// aô€€a { dg-error "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +// aý¿¿¿¿¿a { dg-error "invalid UTF-8 character " "" { target c++23 } } +// { dg-error "invalid UTF-8 character " "" { target c++23 } .-1 } +/* a€߿ࠀ퟿𐀀ô¿¿a { dg-bogus "invalid UTF-8 character" } */ +/* a€a { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* a¿a { dg-error "invalid UTF-8 character " "" { target c++23 } } */ +/* aÀa { dg-error "invalid UTF-8 character " "" { target c++23 } } */ +/* aÁa { dg-error "invalid UTF-8 character " "" { target c++23 } } */ +/* aõa { dg-error "invalid UTF-8 character " "" { target c++23 } } */ +/* aÿa { dg-error "invalid UTF-8 character " "" { target c++23 } } */ +/* aÂa { dg-error "invalid UTF-8 character " "" { target c++23 } } */ +/* aàa { dg-error "invalid UTF-8 character " "" { target c++23 } } */ +/* aà€¿a { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* aàŸ€a { dg-error "invalid UTF-8 character <9f><80>" "" { target c++23 } } */ +/* aà¿a { dg-error "invalid UTF-8 character " "" { target c++23 } } */ +/* aì€a { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* aí €a { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } */ +/* að€€€a { dg-error "invalid UTF-8 character <80><80><80>" "" { target c++23 } } */ +/* að¿¿a { dg-error "invalid UTF-8 character <8f>" "" { target c++23 } } */ +/* aô€€a { dg-error "invalid UTF-8 character <90><80><80>" "" { target c++23 } } */ +/* aý¿¿¿¿¿a { dg-error "invalid UTF-8 character " "" { target c++23 } } */ +/* { dg-error "invalid UTF-8 character " "" { target c++23 } .-1 } */ diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-4.C 
b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-4.C new file mode 100644 index 0000000..1dc65e3 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-4.C @@ -0,0 +1,43 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess } +// { dg-options "-finput-charset=UTF-8 -pedantic-errors -Wno-invalid-utf8" } + +// a€߿ࠀ퟿𐀀ô¿¿a { dg-bogus "invalid UTF-8 character" } +// a€a { dg-bogus "invalid UTF-8 character <80>" } +// a¿a { dg-bogus "invalid UTF-8 character " } +// aÀa { dg-bogus "invalid UTF-8 character " } +// aÁa { dg-bogus "invalid UTF-8 character " } +// aõa { dg-bogus "invalid UTF-8 character " } +// aÿa { dg-bogus "invalid UTF-8 character " } +// aÂa { dg-bogus "invalid UTF-8 character " } +// aàa { dg-bogus "invalid UTF-8 character " } +// aà€¿a { dg-bogus "invalid UTF-8 character <80>" } +// aàŸ€a { dg-bogus "invalid UTF-8 character <9f><80>" } +// aà¿a { dg-bogus "invalid UTF-8 character " } +// aì€a { dg-bogus "invalid UTF-8 character <80>" } +// aí €a { dg-bogus "invalid UTF-8 character <80>" } +// að€€€a { dg-bogus "invalid UTF-8 character <80><80><80>" } +// að¿¿a { dg-bogus "invalid UTF-8 character <8f>" } +// aô€€a { dg-bogus "invalid UTF-8 character <90><80><80>" } +// aý¿¿¿¿¿a { dg-bogus "invalid UTF-8 character " } +// { dg-bogus "invalid UTF-8 character " "" { target *-*-* } .-1 } +/* a€߿ࠀ퟿𐀀ô¿¿a { dg-bogus "invalid UTF-8 character" } */ +/* a€a { dg-bogus "invalid UTF-8 character <80>" } */ +/* a¿a { dg-bogus "invalid UTF-8 character " } */ +/* aÀa { dg-bogus "invalid UTF-8 character " } */ +/* aÁa { dg-bogus "invalid UTF-8 character " } */ +/* aõa { dg-bogus "invalid UTF-8 character " } */ +/* aÿa { dg-bogus "invalid UTF-8 character " } */ +/* aÂa { dg-bogus "invalid UTF-8 character " } */ +/* aàa { dg-bogus "invalid UTF-8 character " } */ +/* aà€¿a { dg-bogus "invalid UTF-8 character <80>" } */ +/* aàŸ€a { dg-bogus 
"invalid UTF-8 character <9f><80>" } */ +/* aà¿a { dg-bogus "invalid UTF-8 character " } */ +/* aì€a { dg-bogus "invalid UTF-8 character <80>" } */ +/* aí €a { dg-bogus "invalid UTF-8 character <80>" } */ +/* að€€€a { dg-bogus "invalid UTF-8 character <80><80><80>" } */ +/* að¿¿a { dg-bogus "invalid UTF-8 character <8f>" } */ +/* aô€€a { dg-bogus "invalid UTF-8 character <90><80><80>" } */ +/* aý¿¿¿¿¿a { dg-bogus "invalid UTF-8 character " } */ +/* { dg-bogus "invalid UTF-8 character " "" { target *-*-* } .-1 } */ diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-5.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-5.C new file mode 100644 index 0000000..f0140ba --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-5.C @@ -0,0 +1,80 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess { target c++11 } } +// { dg-options "-finput-charset=UTF-8" } + +char32_t a = U'€'; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t b = U'¿'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t c = U'À'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t d = U'Á'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t e = U'õ'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t f = U'ÿ'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t g = U'Â'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t h = U'à'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t i = U'à€¿'; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t j = U'àŸ€'; // { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +char32_t k = U'à¿'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t l = U'ì€'; // { dg-warning "invalid 
UTF-8 character <80>" "" { target c++23 } } +char32_t m = U'í €'; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t n = U'ð€€€'; // { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +char32_t o = U'ð¿¿'; // { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +char32_t p = U'ô€€'; // { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +char32_t q = U'ý¿¿¿¿¿'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } + // { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A = U"€߿ࠀ퟿𐀀ô¿¿"; // { dg-bogus "invalid UTF-8 character" } +auto B = U"€"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto C = U"¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto D = U"À"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto E = U"Á"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto F = U"õ"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto G = U"ÿ"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto H = U"Â"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto I = U"à"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto J = U"à€¿"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto K = U"àŸ€"; // { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L = U"à¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto M = U"ì€"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto N = U"í €"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto O = U"ð€€€"; // { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P = U"ð¿¿"; // { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q = U"ô€€"; // { dg-warning "invalid UTF-8 character <90><80><80>" 
"" { target c++23 } } +auto R = U"ý¿¿¿¿¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } + // { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A1 = UR"(€߿ࠀ퟿𐀀ô¿¿)"; // { dg-bogus "invalid UTF-8 character" } +auto B1 = UR"(€)"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto C1 = UR"(¿)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto D1 = UR"(À)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto E1 = UR"(Á)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto F1 = UR"(õ)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto G1 = UR"(ÿ)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto H1 = UR"(Â)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto I1 = UR"(à)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto J1 = UR"(à€¿)"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto K1 = UR"(àŸ€)"; // { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L1 = UR"(à¿)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto M1 = UR"(ì€)"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto N1 = UR"(í €)"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto O1 = UR"(ð€€€)"; // { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P1 = UR"(ð¿¿)"; // { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q1 = UR"(ô€€)"; // { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R1 = UR"(ý¿¿¿¿¿)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } + // { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A2 = u8"€߿ࠀ퟿𐀀ô¿¿"; // { dg-bogus "invalid UTF-8 character" } +auto B2 = u8"€"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } 
+auto C2 = u8"¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto D2 = u8"À"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto E2 = u8"Á"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto F2 = u8"õ"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto G2 = u8"ÿ"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto H2 = u8"Â"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto I2 = u8"à"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto J2 = u8"à€¿"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto K2 = u8"àŸ€"; // { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L2 = u8"à¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto M2 = u8"ì€"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto N2 = u8"í €"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto O2 = u8"ð€€€"; // { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P2 = u8"ð¿¿"; // { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q2 = u8"ô€€"; // { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R2 = u8"ý¿¿¿¿¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } + // { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-6.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-6.C new file mode 100644 index 0000000..01023d3 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-6.C @@ -0,0 +1,80 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess { target c++11 } } +// { dg-options "-finput-charset=UTF-8 -pedantic" } + +char32_t a = U'€'; // { dg-warning "invalid 
UTF-8 character <80>" "" { target c++23 } } +char32_t b = U'¿'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t c = U'À'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t d = U'Á'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t e = U'õ'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t f = U'ÿ'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t g = U'Â'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t h = U'à'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t i = U'à€¿'; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t j = U'àŸ€'; // { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +char32_t k = U'à¿'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +char32_t l = U'ì€'; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t m = U'í €'; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t n = U'ð€€€'; // { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +char32_t o = U'ð¿¿'; // { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +char32_t p = U'ô€€'; // { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +char32_t q = U'ý¿¿¿¿¿'; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } + // { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A = U"€߿ࠀ퟿𐀀ô¿¿"; // { dg-bogus "invalid UTF-8 character" } +auto B = U"€"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto C = U"¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto D = U"À"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto E = U"Á"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto F = U"õ"; // { dg-warning "invalid UTF-8 character " 
"" { target c++23 } } +auto G = U"ÿ"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto H = U"Â"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto I = U"à"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto J = U"à€¿"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto K = U"àŸ€"; // { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L = U"à¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto M = U"ì€"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto N = U"í €"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto O = U"ð€€€"; // { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P = U"ð¿¿"; // { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q = U"ô€€"; // { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R = U"ý¿¿¿¿¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } + // { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A1 = UR"(€߿ࠀ퟿𐀀ô¿¿)"; // { dg-bogus "invalid UTF-8 character" } +auto B1 = UR"(€)"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto C1 = UR"(¿)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto D1 = UR"(À)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto E1 = UR"(Á)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto F1 = UR"(õ)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto G1 = UR"(ÿ)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto H1 = UR"(Â)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto I1 = UR"(à)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto J1 = UR"(à€¿)"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto K1 = 
UR"(àŸ€)"; // { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L1 = UR"(à¿)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto M1 = UR"(ì€)"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto N1 = UR"(í €)"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto O1 = UR"(ð€€€)"; // { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P1 = UR"(ð¿¿)"; // { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q1 = UR"(ô€€)"; // { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R1 = UR"(ý¿¿¿¿¿)"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } + // { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A2 = u8"€߿ࠀ퟿𐀀ô¿¿"; // { dg-bogus "invalid UTF-8 character" } +auto B2 = u8"€"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto C2 = u8"¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto D2 = u8"À"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto E2 = u8"Á"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto F2 = u8"õ"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto G2 = u8"ÿ"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto H2 = u8"Â"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto I2 = u8"à"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto J2 = u8"à€¿"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto K2 = u8"àŸ€"; // { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L2 = u8"à¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +auto M2 = u8"ì€"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto N2 = u8"í €"; // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +auto O2 = 
u8"ð€€€"; // { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P2 = u8"ð¿¿"; // { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q2 = u8"ô€€"; // { dg-warning "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R2 = u8"ý¿¿¿¿¿"; // { dg-warning "invalid UTF-8 character " "" { target c++23 } } + // { dg-warning "invalid UTF-8 character " "" { target c++23 } .-1 } diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-7.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-7.C new file mode 100644 index 0000000..7991a64 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-7.C @@ -0,0 +1,80 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess { target c++11 } } +// { dg-options "-finput-charset=UTF-8 -pedantic-errors" } + +char32_t a = U'€'; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t b = U'¿'; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +char32_t c = U'À'; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +char32_t d = U'Á'; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +char32_t e = U'õ'; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +char32_t f = U'ÿ'; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +char32_t g = U'Â'; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +char32_t h = U'à'; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +char32_t i = U'à€¿'; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t j = U'àŸ€'; // { dg-error "invalid UTF-8 character <9f><80>" "" { target c++23 } } +char32_t k = U'à¿'; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +char32_t l = U'ì€'; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t m = U'í €'; // { dg-error "invalid UTF-8 
character <80>" "" { target c++23 } } +char32_t n = U'ð€€€'; // { dg-error "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +char32_t o = U'ð¿¿'; // { dg-error "invalid UTF-8 character <8f>" "" { target c++23 } } +char32_t p = U'ô€€'; // { dg-error "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +char32_t q = U'ý¿¿¿¿¿'; // { dg-error "invalid UTF-8 character " "" { target c++23 } } + // { dg-error "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A = U"€߿ࠀ퟿𐀀ô¿¿"; // { dg-bogus "invalid UTF-8 character" } +auto B = U"€"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto C = U"¿"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto D = U"À"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto E = U"Á"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto F = U"õ"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto G = U"ÿ"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto H = U"Â"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto I = U"à"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto J = U"à€¿"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto K = U"àŸ€"; // { dg-error "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L = U"à¿"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto M = U"ì€"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto N = U"í €"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto O = U"ð€€€"; // { dg-error "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P = U"ð¿¿"; // { dg-error "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q = U"ô€€"; // { dg-error "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R = U"ý¿¿¿¿¿"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } + // { dg-error "invalid UTF-8 
character " "" { target c++23 } .-1 } +auto A1 = UR"(€߿ࠀ퟿𐀀ô¿¿)"; // { dg-bogus "invalid UTF-8 character" } +auto B1 = UR"(€)"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto C1 = UR"(¿)"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto D1 = UR"(À)"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto E1 = UR"(Á)"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto F1 = UR"(õ)"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto G1 = UR"(ÿ)"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto H1 = UR"(Â)"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto I1 = UR"(à)"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto J1 = UR"(à€¿)"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto K1 = UR"(àŸ€)"; // { dg-error "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L1 = UR"(à¿)"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto M1 = UR"(ì€)"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto N1 = UR"(í €)"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto O1 = UR"(ð€€€)"; // { dg-error "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P1 = UR"(ð¿¿)"; // { dg-error "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q1 = UR"(ô€€)"; // { dg-error "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R1 = UR"(ý¿¿¿¿¿)"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } + // { dg-error "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A2 = u8"€߿ࠀ퟿𐀀ô¿¿"; // { dg-bogus "invalid UTF-8 character" } +auto B2 = u8"€"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto C2 = u8"¿"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto D2 = u8"À"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto E2 = u8"Á"; // 
{ dg-error "invalid UTF-8 character " "" { target c++23 } } +auto F2 = u8"õ"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto G2 = u8"ÿ"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto H2 = u8"Â"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto I2 = u8"à"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto J2 = u8"à€¿"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto K2 = u8"àŸ€"; // { dg-error "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L2 = u8"à¿"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } +auto M2 = u8"ì€"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto N2 = u8"í €"; // { dg-error "invalid UTF-8 character <80>" "" { target c++23 } } +auto O2 = u8"ð€€€"; // { dg-error "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P2 = u8"ð¿¿"; // { dg-error "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q2 = u8"ô€€"; // { dg-error "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R2 = u8"ý¿¿¿¿¿"; // { dg-error "invalid UTF-8 character " "" { target c++23 } } + // { dg-error "invalid UTF-8 character " "" { target c++23 } .-1 } diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-8.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-8.C new file mode 100644 index 0000000..95c8a91 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-8.C @@ -0,0 +1,80 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess { target c++11 } } +// { dg-options "-finput-charset=UTF-8 -pedantic-errors -Wno-invalid-utf8" } + +char32_t a = U'€'; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t b = U'¿'; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +char32_t c = U'À'; // { dg-bogus "invalid UTF-8 character " "" { 
target c++23 } } +char32_t d = U'Á'; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +char32_t e = U'õ'; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +char32_t f = U'ÿ'; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +char32_t g = U'Â'; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +char32_t h = U'à'; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +char32_t i = U'à€¿'; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t j = U'àŸ€'; // { dg-bogus "invalid UTF-8 character <9f><80>" "" { target c++23 } } +char32_t k = U'à¿'; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +char32_t l = U'ì€'; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t m = U'í €'; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +char32_t n = U'ð€€€'; // { dg-bogus "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +char32_t o = U'ð¿¿'; // { dg-bogus "invalid UTF-8 character <8f>" "" { target c++23 } } +char32_t p = U'ô€€'; // { dg-bogus "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +char32_t q = U'ý¿¿¿¿¿'; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } + // { dg-bogus "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A = U"€߿ࠀ퟿𐀀ô¿¿"; // { dg-bogus "invalid UTF-8 character" } +auto B = U"€"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto C = U"¿"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto D = U"À"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto E = U"Á"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto F = U"õ"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto G = U"ÿ"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto H = U"Â"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto I = U"à"; // { dg-bogus "invalid UTF-8 character " 
"" { target c++23 } } +auto J = U"à€¿"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto K = U"àŸ€"; // { dg-bogus "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L = U"à¿"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto M = U"ì€"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto N = U"í €"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto O = U"ð€€€"; // { dg-bogus "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P = U"ð¿¿"; // { dg-bogus "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q = U"ô€€"; // { dg-bogus "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R = U"ý¿¿¿¿¿"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } + // { dg-bogus "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A1 = UR"(€߿ࠀ퟿𐀀ô¿¿)"; // { dg-bogus "invalid UTF-8 character" } +auto B1 = UR"(€)"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto C1 = UR"(¿)"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto D1 = UR"(À)"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto E1 = UR"(Á)"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto F1 = UR"(õ)"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto G1 = UR"(ÿ)"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto H1 = UR"(Â)"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto I1 = UR"(à)"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto J1 = UR"(à€¿)"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto K1 = UR"(àŸ€)"; // { dg-bogus "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L1 = UR"(à¿)"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto M1 = UR"(ì€)"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto N1 = UR"(í €)"; // { 
dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto O1 = UR"(ð€€€)"; // { dg-bogus "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P1 = UR"(ð¿¿)"; // { dg-bogus "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q1 = UR"(ô€€)"; // { dg-bogus "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R1 = UR"(ý¿¿¿¿¿)"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } + // { dg-bogus "invalid UTF-8 character " "" { target c++23 } .-1 } +auto A2 = u8"€߿ࠀ퟿𐀀ô¿¿"; // { dg-bogus "invalid UTF-8 character" } +auto B2 = u8"€"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto C2 = u8"¿"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto D2 = u8"À"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto E2 = u8"Á"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto F2 = u8"õ"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto G2 = u8"ÿ"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto H2 = u8"Â"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto I2 = u8"à"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto J2 = u8"à€¿"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto K2 = u8"àŸ€"; // { dg-bogus "invalid UTF-8 character <9f><80>" "" { target c++23 } } +auto L2 = u8"à¿"; // { dg-bogus "invalid UTF-8 character " "" { target c++23 } } +auto M2 = u8"ì€"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto N2 = u8"í €"; // { dg-bogus "invalid UTF-8 character <80>" "" { target c++23 } } +auto O2 = u8"ð€€€"; // { dg-bogus "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +auto P2 = u8"ð¿¿"; // { dg-bogus "invalid UTF-8 character <8f>" "" { target c++23 } } +auto Q2 = u8"ô€€"; // { dg-bogus "invalid UTF-8 character <90><80><80>" "" { target c++23 } } +auto R2 = u8"ý¿¿¿¿¿"; // { dg-bogus "invalid UTF-8 
character " "" { target c++23 } } + // { dg-bogus "invalid UTF-8 character " "" { target c++23 } .-1 } diff --git a/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-9.C b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-9.C new file mode 100644 index 0000000..0afc945 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp23/Winvalid-utf8-9.C @@ -0,0 +1,25 @@ +// P2295R6 - Support for UTF-8 as a portable source file encoding +// This test intentionally contains various byte sequences which are not valid UTF-8 +// { dg-do preprocess } +// { dg-options "-finput-charset=UTF-8" } + +#define I(x) +I(€߿ࠀ퟿𐀀ô¿¿) // { dg-bogus "invalid UTF-8 character" } + // { dg-error "is not valid in an identifier" "" { target *-*-* } .-1 } +I(€) // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +I(¿) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(À) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(Á) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(õ) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(ÿ) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(Â) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(à) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(à€¿) // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +I(àŸ€) // { dg-warning "invalid UTF-8 character <9f><80>" "" { target c++23 } } +I(à¿) // { dg-warning "invalid UTF-8 character " "" { target c++23 } } +I(ì€) // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +I(í €) // { dg-warning "invalid UTF-8 character <80>" "" { target c++23 } } +I(ð€€€) // { dg-warning "invalid UTF-8 character <80><80><80>" "" { target c++23 } } +I(ð¿¿) // { dg-warning "invalid UTF-8 character <8f>" "" { target c++23 } } +I(ô€€) // { dg-error "is not valid in an identifier" } +I(ý¿¿¿¿¿) // { dg-error "is not valid in an identifier" } diff --git a/libcpp/charset.cc b/libcpp/charset.cc index 
d3c07d6..c9656db 100644 --- a/libcpp/charset.cc +++ b/libcpp/charset.cc @@ -1742,9 +1742,9 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit, case, no diagnostic is emitted, and the return value of FALSE should cause a new token to be formed. - Unlike _cpp_valid_ucn, this will never be called when lexing a string; only - a potential identifier, or a CPP_OTHER token. NST is unused in the latter - case. + _cpp_valid_utf8 can be called when lexing a potential identifier, or a + CPP_OTHER token or for the purposes of -Winvalid-utf8 warning in string or + character literals. NST is unused when not in a potential identifier. As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of an identifier, or 2 otherwise. */ diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 810203d..a7600de 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -560,6 +560,13 @@ struct cpp_options cpp_bidirectional_level. */ unsigned char cpp_warn_bidirectional; + /* True if libcpp should warn about invalid UTF-8 characters in comments. + 2 if it should be a pedwarn. */ + unsigned char cpp_warn_invalid_utf8; + + /* True if -finput-charset= option has been used explicitly. */ + bool cpp_input_charset_explicit; + /* Dependency generation. 
*/ struct { @@ -666,7 +673,8 @@ enum cpp_warning_reason { CPP_W_CXX11_COMPAT, CPP_W_CXX20_COMPAT, CPP_W_EXPANSION_TO_DEFINED, - CPP_W_BIDIRECTIONAL + CPP_W_BIDIRECTIONAL, + CPP_W_INVALID_UTF8 }; /* Callback for header lookup for HEADER, which is the name of a diff --git a/libcpp/init.cc b/libcpp/init.cc index 39e7e75..41b10b3 100644 --- a/libcpp/init.cc +++ b/libcpp/init.cc @@ -227,6 +227,8 @@ cpp_create_reader (enum c_lang lang, cpp_hash_table *table, CPP_OPTION (pfile, ext_numeric_literals) = 1; CPP_OPTION (pfile, warn_date_time) = 0; CPP_OPTION (pfile, cpp_warn_bidirectional) = bidirectional_unpaired; + CPP_OPTION (pfile, cpp_warn_invalid_utf8) = 0; + CPP_OPTION (pfile, cpp_input_charset_explicit) = 0; /* Default CPP arithmetic to something sensible for the host for the benefit of dumb users like fix-header. */ diff --git a/libcpp/lex.cc b/libcpp/lex.cc index 528d598..41f905de 100644 --- a/libcpp/lex.cc +++ b/libcpp/lex.cc @@ -50,6 +50,9 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE }; #define TOKEN_SPELL(token) (token_spellings[(token)->type].category) #define TOKEN_NAME(token) (token_spellings[(token)->type].name) +/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */ +#define UCS_LIMIT 0x10FFFF + static void add_line_note (cpp_buffer *, const uchar *, unsigned int); static int skip_line_comment (cpp_reader *); static void skip_whitespace (cpp_reader *, cppchar_t); @@ -1704,6 +1707,120 @@ maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind, bidi::on_char (kind, ucn_p, loc); } +static const cppchar_t utf8_continuation = 0x80; +static const cppchar_t utf8_signifier = 0xC0; + +/* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting + at PFILE->buffer->cur. Return a pointer after the diagnosed + invalid character. 
*/ + +static const uchar * +_cpp_warn_invalid_utf8 (cpp_reader *pfile) +{ + cpp_buffer *buffer = pfile->buffer; + const uchar *cur = buffer->cur; + bool pedantic = (CPP_PEDANTIC (pfile) + && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2); + + if (cur[0] < utf8_signifier + || cur[1] < utf8_continuation || cur[1] >= utf8_signifier) + { + if (pedantic) + cpp_error_with_line (pfile, CPP_DL_PEDWARN, + pfile->line_table->highest_line, + CPP_BUF_COL (buffer), + "invalid UTF-8 character <%x>", + cur[0]); + else + cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8, + pfile->line_table->highest_line, + CPP_BUF_COL (buffer), + "invalid UTF-8 character <%x>", + cur[0]); + return cur + 1; + } + else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier) + { + if (pedantic) + cpp_error_with_line (pfile, CPP_DL_PEDWARN, + pfile->line_table->highest_line, + CPP_BUF_COL (buffer), + "invalid UTF-8 character <%x><%x>", + cur[0], cur[1]); + else + cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8, + pfile->line_table->highest_line, + CPP_BUF_COL (buffer), + "invalid UTF-8 character <%x><%x>", + cur[0], cur[1]); + return cur + 2; + } + else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier) + { + if (pedantic) + cpp_error_with_line (pfile, CPP_DL_PEDWARN, + pfile->line_table->highest_line, + CPP_BUF_COL (buffer), + "invalid UTF-8 character <%x><%x><%x>", + cur[0], cur[1], cur[2]); + else + cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8, + pfile->line_table->highest_line, + CPP_BUF_COL (buffer), + "invalid UTF-8 character <%x><%x><%x>", + cur[0], cur[1], cur[2]); + return cur + 3; + } + else + { + if (pedantic) + cpp_error_with_line (pfile, CPP_DL_PEDWARN, + pfile->line_table->highest_line, + CPP_BUF_COL (buffer), + "invalid UTF-8 character <%x><%x><%x><%x>", + cur[0], cur[1], cur[2], cur[3]); + else + cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8, + pfile->line_table->highest_line, + CPP_BUF_COL (buffer), + "invalid UTF-8 character <%x><%x><%x><%x>", + cur[0], cur[1], 
cur[2], cur[3]); + return cur + 4; + } +} + +/* Helper function of *skip_*_comment and lex*_string. For C, + character at CUR[-1] with MSB set handle -Wbidi-chars* and + -Winvalid-utf8 diagnostics and return pointer to first character + that should be processed next. */ + +static inline const uchar * +_cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c, + const uchar *cur, bool warn_bidi_p, + bool warn_invalid_utf8_p) +{ + /* If this is a beginning of a UTF-8 encoding, it might be + a bidirectional control character. */ + if (c == bidi::utf8_start && warn_bidi_p) + { + location_t loc; + bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc); + maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc); + } + if (!warn_invalid_utf8_p) + return cur; + if (c >= utf8_signifier) + { + cppchar_t s; + const uchar *pstr = cur - 1; + if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s) + && s <= UCS_LIMIT) + return pstr; + } + pfile->buffer->cur = cur - 1; + return _cpp_warn_invalid_utf8 (pfile); +} + /* Skip a C-style block comment. We find the end of the comment by seeing if an asterisk is before every '/' we encounter. Returns nonzero if comment terminated by EOF, zero otherwise. @@ -1716,6 +1833,8 @@ _cpp_skip_block_comment (cpp_reader *pfile) const uchar *cur = buffer->cur; uchar c; const bool warn_bidi_p = pfile->warn_bidi_p (); + const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8); + const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p; cur++; if (*cur == '/') @@ -1765,14 +1884,10 @@ _cpp_skip_block_comment (cpp_reader *pfile) cur = buffer->cur; } - /* If this is a beginning of a UTF-8 encoding, it might be - a bidirectional control character. 
*/ - else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p) - { - location_t loc; - bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc); - maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc); - } + else if (__builtin_expect (c >= utf8_continuation, 0) + && warn_bidi_or_invalid_utf8_p) + cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p, + warn_invalid_utf8_p); } buffer->cur = cur; @@ -1789,11 +1904,13 @@ skip_line_comment (cpp_reader *pfile) cpp_buffer *buffer = pfile->buffer; location_t orig_line = pfile->line_table->highest_line; const bool warn_bidi_p = pfile->warn_bidi_p (); + const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8); + const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p; - if (!warn_bidi_p) + if (!warn_bidi_or_invalid_utf8_p) while (*buffer->cur != '\n') buffer->cur++; - else + else if (!warn_invalid_utf8_p) { while (*buffer->cur != '\n' && *buffer->cur != bidi::utf8_start) @@ -1813,6 +1930,22 @@ skip_line_comment (cpp_reader *pfile) maybe_warn_bidi_on_close (pfile, buffer->cur); } } + else + { + while (*buffer->cur != '\n') + { + if (*buffer->cur < utf8_continuation) + { + buffer->cur++; + continue; + } + buffer->cur + = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1, + warn_bidi_p, warn_invalid_utf8_p); + } + if (warn_bidi_p) + maybe_warn_bidi_on_close (pfile, buffer->cur); + } _cpp_process_line_notes (pfile, true); return orig_line != pfile->line_table->highest_line; @@ -1919,8 +2052,6 @@ warn_about_normalization (cpp_reader *pfile, } } -static const cppchar_t utf8_signifier = 0xC0; - /* Returns TRUE if the sequence starting at buffer->cur is valid in an identifier. FIRST is TRUE if this starts an identifier. 
*/ @@ -2361,6 +2492,8 @@ lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base) { const uchar *pos = base; const bool warn_bidi_p = pfile->warn_bidi_p (); + const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8); + const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p; /* 'tis a pity this information isn't passed down from the lexer's initial categorization of the token. */ @@ -2597,13 +2730,10 @@ lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base) pos = base = pfile->buffer->cur; note = &pfile->buffer->notes[pfile->buffer->cur_note]; } - else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0) - && warn_bidi_p) - { - location_t loc; - bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc); - maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc); - } + else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0) + && warn_bidi_or_invalid_utf8_p) + pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p, + warn_invalid_utf8_p); } if (warn_bidi_p) @@ -2704,6 +2834,8 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base) terminator = '>', type = CPP_HEADER_NAME; const bool warn_bidi_p = pfile->warn_bidi_p (); + const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8); + const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p; for (;;) { cppchar_t c = *cur++; @@ -2745,12 +2877,10 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base) } else if (c == '\0') saw_NUL = true; - else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p) - { - location_t loc; - bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc); - maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc); - } + else if (__builtin_expect (c >= utf8_continuation, 0) + && warn_bidi_or_invalid_utf8_p) + cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p, + warn_invalid_utf8_p); } if (saw_NUL && 
!pfile->state.skipping) @@ -4052,6 +4182,7 @@ _cpp_lex_direct (cpp_reader *pfile) default: { const uchar *base = --buffer->cur; + static int no_warn_cnt; /* Check for an extended identifier ($ or UCN or UTF-8). */ struct normalize_state nst = INITIAL_NORMALIZE_STATE; @@ -4072,7 +4203,33 @@ _cpp_lex_direct (cpp_reader *pfile) const uchar *pstr = base; cppchar_t s; if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s)) - buffer->cur = pstr; + { + if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8)) + { + buffer->cur = base; + _cpp_warn_invalid_utf8 (pfile); + } + buffer->cur = pstr; + } + else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8)) + { + buffer->cur = base; + const uchar *end = _cpp_warn_invalid_utf8 (pfile); + buffer->cur = base + 1; + no_warn_cnt = end - buffer->cur; + } + } + else if (c >= utf8_continuation + && CPP_OPTION (pfile, cpp_warn_invalid_utf8)) + { + if (no_warn_cnt) + --no_warn_cnt; + else + { + buffer->cur = base; + _cpp_warn_invalid_utf8 (pfile); + buffer->cur = base + 1; + } } create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER); break; -- 2.7.4