From fd406fc7046f427385b644759265ae06ed741d6b Mon Sep 17 00:00:00 2001 From: Owen Avery Date: Sun, 8 Jan 2023 17:19:12 -0500 Subject: [PATCH] gccrs: Implemented UTF-8 checking for include_str!() gcc/rust/ChangeLog: * expand/rust-macro-builtins.cc (MacroBuiltin::include_str_handler): Add check for valid UTF-8. gcc/testsuite/ChangeLog: * rust/compile/builtin_macro_include_str.rs: Include test of invalid UTF-8. * rust/compile/invalid_utf8: File with invalid UTF-8. Signed-off-by: Owen Avery --- gcc/rust/expand/rust-macro-builtins.cc | 51 +++++++++++++++++++++- .../rust/compile/builtin_macro_include_str.rs | 1 + gcc/testsuite/rust/compile/invalid_utf8 | 1 + 3 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/rust/compile/invalid_utf8 diff --git a/gcc/rust/expand/rust-macro-builtins.cc b/gcc/rust/expand/rust-macro-builtins.cc index e594a25..3b6f69b 100644 --- a/gcc/rust/expand/rust-macro-builtins.cc +++ b/gcc/rust/expand/rust-macro-builtins.cc @@ -389,8 +389,55 @@ MacroBuiltin::include_str_handler (Location invoc_locus, std::vector bytes = load_file_bytes (target_filename.c_str ()); - /* FIXME: Enforce that the file contents are valid UTF-8. */ - std::string str ((const char *) &bytes[0], bytes.size ()); + /* FIXME: reuse lexer */ + int expect_single = 0; + for (uint8_t b : bytes) + { + if (expect_single) + { + if ((b & 0xC0) != 0x80) + /* character was truncated, exit with expect_single != 0 */ + break; + expect_single--; + } + else if (b & 0x80) + { + if (b >= 0xF8) + { + /* more than 4 leading 1s */ + expect_single = 1; + break; + } + else if (b >= 0xF0) + { + /* 4 leading 1s */ + expect_single = 3; + } + else if (b >= 0xE0) + { + /* 3 leading 1s */ + expect_single = 2; + } + else if (b >= 0xC0) + { + /* 2 leading 1s */ + expect_single = 1; + } + else + { + /* only 1 leading 1 */ + expect_single = 1; + break; + } + } + } + + std::string str; + if (expect_single) + rust_error_at (invoc_locus, "%s was not a valid utf-8 file", + target_filename.c_str ()); + else + str = std::string ((const char *) &bytes[0], bytes.size ()); auto node = AST::SingleASTNode (make_string (invoc_locus, str)); auto str_tok = make_token (Token::make_string (invoc_locus, std::move (str))); diff --git a/gcc/testsuite/rust/compile/builtin_macro_include_str.rs b/gcc/testsuite/rust/compile/builtin_macro_include_str.rs index 38f5e3b..8092193 100644 --- a/gcc/testsuite/rust/compile/builtin_macro_include_str.rs +++ b/gcc/testsuite/rust/compile/builtin_macro_include_str.rs @@ -10,4 +10,5 @@ fn main () { include_str! ("foo.txt", "bar.txt"); // { dg-error "macro takes 1 argument" "" } include_str! ("builtin_macro_include_str.rs"); // ok include_str! ("builtin_macro_include_str.rs",); // trailing comma ok + include_str! ("invalid_utf8"); // { dg-error "invalid_utf8 was not a valid utf-8 file" "" } } diff --git a/gcc/testsuite/rust/compile/invalid_utf8 b/gcc/testsuite/rust/compile/invalid_utf8 new file mode 100644 index 0000000..29e181e --- /dev/null +++ b/gcc/testsuite/rust/compile/invalid_utf8 @@ -0,0 +1 @@ +ΓΏ -- 2.7.4