From e7a21f763a44a5198b32f963842aea00d796017b Mon Sep 17 00:00:00 2001 From: peter klausler Date: Tue, 11 Jun 2019 10:34:58 -0700 Subject: [PATCH] [flang] Tests pass Original-commit: flang-compiler/f18@ae9d07a2ab3fa5c59480ed869a9cceacaf90f8af Reviewed-on: https://github.com/flang-compiler/f18/pull/496 Tree-same-pre-rewrite: false --- flang/documentation/Parsing.md | 3 +- flang/lib/parser/characters.cc | 268 +++++++++++++++++++++++------ flang/lib/parser/characters.h | 71 +++++--- flang/lib/parser/parse-state.h | 12 +- flang/lib/parser/parsing.h | 2 +- flang/lib/parser/prescan.cc | 38 ++-- flang/lib/parser/prescan.h | 2 +- flang/lib/parser/token-parsers.h | 51 ++---- flang/lib/parser/unparse.cc | 17 +- flang/lib/parser/unparse.h | 2 +- flang/lib/semantics/expression.cc | 39 +++-- flang/lib/semantics/unparse-with-symbols.h | 4 +- flang/tools/f18/f18-parse-demo.cc | 2 +- flang/tools/f18/f18.cc | 4 +- 14 files changed, 355 insertions(+), 160 deletions(-) diff --git a/flang/documentation/Parsing.md b/flang/documentation/Parsing.md index 716a1e8..9a0d1aa 100644 --- a/flang/documentation/Parsing.md +++ b/flang/documentation/Parsing.md @@ -1,5 +1,5 @@ The F18 Parser @@ -58,6 +58,7 @@ by a CookedSource class instance, in which: * except for the payload in character literals, Hollerith constants, and character and Hollerith edit descriptors, all letters have been normalized to lower case +* all non-ASCII characters have been re-encoded in UTF-8. Lines in the cooked character stream can be of arbitrary length. diff --git a/flang/lib/parser/characters.cc b/flang/lib/parser/characters.cc index e92c1d1..3345fa1 100644 --- a/flang/lib/parser/characters.cc +++ b/flang/lib/parser/characters.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,27 +13,28 @@ // limitations under the License. #include "characters.h" +#include "../common/idioms.h" #include #include namespace Fortran::parser { -std::optional UTF8CharacterBytes(const char *p) { +std::optional UTF_8CharacterBytes(const char *p) { if ((*p & 0x80) == 0) { - return {1}; + return 1; } if ((*p & 0xf8) == 0xf0) { if ((*p & 0x07) != 0 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80 && (p[3] & 0xc0) == 0x80) { - return {4}; + return 4; } } else if ((*p & 0xf0) == 0xe0) { if ((*p & 0x0f) != 0 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80) { - return {3}; + return 3; } } else if ((*p & 0xe0) == 0xc0) { if ((*p & 0x1f) != 0 && (p[1] & 0xc0) == 0x80) { - return {2}; + return 2; } } return std::nullopt; @@ -42,49 +43,68 @@ std::optional UTF8CharacterBytes(const char *p) { std::optional EUC_JPCharacterBytes(const char *p) { int b1 = *p & 0xff; if (b1 <= 0x7f) { - return {1}; + return 1; } if (b1 >= 0xa1 && b1 <= 0xfe) { int b2 = p[1] & 0xff; if (b2 >= 0xa1 && b2 <= 0xfe) { // JIS X 0208 (code set 1) - return {2}; + return 2; } } else if (b1 == 0x8e) { int b2 = p[1] & 0xff; if (b2 >= 0xa1 && b2 <= 0xdf) { // upper half JIS 0201 (half-width kana, code set 2) - return {2}; + return 2; } } else if (b1 == 0x8f) { int b2 = p[1] & 0xff; int b3 = p[2] & 0xff; if (b2 >= 0xa1 && b2 <= 0xfe && b3 >= 0xa1 && b3 <= 0xfe) { // JIS X 0212 (code set 3) - return {3}; + return 3; } } return std::nullopt; } -std::optional CountCharacters( - const char *p, std::size_t bytes, std::optional (*cbf)(const char *)) { +static std::optional One(const char *) { return 1; } + +static std::optional (*CharacterCounter(Encoding encoding))(const char *) { + switch (encoding) { + case Encoding::UTF_8: return UTF_8CharacterBytes; + case Encoding::EUC_JP: return EUC_JPCharacterBytes; + default: return One; + } +} + +std::optional CharacterBytes(const char *p, Encoding encoding) { + return CharacterCounter(encoding)(p); +} + +std::optional CountCharacters( + const char *p, std::size_t bytes, Encoding encoding) { std::size_t chars{0}; const char *limit{p + bytes}; + std::optional (*cbf)(const char *){CharacterCounter(encoding)}; while (p < limit) { - ++chars; - std::optional cb{cbf(p)}; - if (!cb.has_value()) { + if (std::optional cb{cbf(p)}) { + p += *cb; + ++chars; + } else { return std::nullopt; } - p += *cb; } - return {chars}; + if (p == limit) { + return chars; + } else { + return std::nullopt; + } } template -std::string QuoteCharacterLiteralHelper( - const STRING &str, bool doubleDoubleQuotes, bool doubleBackslash) { +std::string QuoteCharacterLiteralHelper(const STRING &str, + bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) { std::string result{'"'}; const auto emit{[&](char ch) { result += ch; }}; for (auto ch : str) { @@ -92,60 +112,198 @@ std::string QuoteCharacterLiteralHelper( if constexpr (std::is_same_v) { // char may be signed depending on host. char32_t ch32{static_cast(ch)}; - EmitQuotedChar(ch32, emit, emit, doubleDoubleQuotes, doubleBackslash); + EmitQuotedChar( + ch32, emit, emit, doubleDoubleQuotes, doubleBackslash, encoding); } else { char32_t ch32{ch}; - EmitQuotedChar(ch32, emit, emit, doubleDoubleQuotes, doubleBackslash); + EmitQuotedChar( + ch32, emit, emit, doubleDoubleQuotes, doubleBackslash, encoding); } } result += '"'; return result; } -std::string QuoteCharacterLiteral( - const std::string &str, bool doubleDoubleQuotes, bool doubleBackslash) { - return QuoteCharacterLiteralHelper(str, doubleDoubleQuotes, doubleBackslash); +std::string QuoteCharacterLiteral(const std::string &str, + bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) { + return QuoteCharacterLiteralHelper( + str, doubleDoubleQuotes, doubleBackslash, encoding); +} + +std::string QuoteCharacterLiteral(const std::u16string &str, + bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) { + return QuoteCharacterLiteralHelper( + str, doubleDoubleQuotes, doubleBackslash, encoding); +} + +std::string QuoteCharacterLiteral(const std::u32string &str, + bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) { + return QuoteCharacterLiteralHelper( + str, doubleDoubleQuotes, doubleBackslash, encoding); +} + +EncodedCharacter EncodeLATIN_1(char codepoint) { + CHECK(codepoint <= 0xff); + EncodedCharacter result; + result.buffer[0] = codepoint; + result.bytes = 1; + return result; +} + +EncodedCharacter EncodeUTF_8(char32_t codepoint) { + // N.B. char32_t is unsigned + EncodedCharacter result; + if (codepoint <= 0x7f) { + result.buffer[0] = codepoint; + result.bytes = 1; + } else if (codepoint <= 0x7ff) { + result.buffer[0] = 0xc0 | (codepoint >> 6); + result.buffer[1] = 0x80 | (codepoint & 0x3f); + result.bytes = 2; + } else if (codepoint <= 0xffff) { + result.buffer[0] = 0xe0 | (codepoint >> 12); + result.buffer[1] = 0x80 | ((codepoint >> 6) & 0x3f); + result.buffer[2] = 0x80 | (codepoint & 0x3f); + result.bytes = 3; + } else { + // UCS actually only goes up to 0x10ffff but the + // UTF-8 encoding handles 21 bits. + CHECK(codepoint <= 0x1fffff); + result.buffer[0] = 0xf0 | (codepoint >> 18); + result.buffer[1] = 0x80 | ((codepoint >> 12) & 0x3f); + result.buffer[2] = 0x80 | ((codepoint >> 6) & 0x3f); + result.buffer[3] = 0x80 | (codepoint & 0x3f); + result.bytes = 4; + } + return result; +} + +EncodedCharacter EncodeEUC_JP(char16_t codepoint) { + // Assume JIS X 0208 (TODO: others) + CHECK(codepoint <= 0x6e6e); + EncodedCharacter result; + if (codepoint <= 0x7f) { + result.buffer[0] = codepoint; + result.bytes = 1; + } else { + result.buffer[0] = 0x80 | (codepoint >> 8); + result.buffer[1] = 0x80 | (codepoint & 0x7f); + result.bytes = 2; + } + return result; +} + +EncodedCharacter EncodeCharacter(Encoding encoding, char32_t codepoint) { + switch (encoding) { + case Encoding::LATIN_1: return EncodeLATIN_1(codepoint); + case Encoding::UTF_8: return EncodeUTF_8(codepoint); + case Encoding::EUC_JP: return EncodeEUC_JP(codepoint); + default: CRASH_NO_CASE; + } +} + +DecodedCharacter DecodeUTF_8Character(const char *cp, std::size_t bytes) { + auto p{reinterpret_cast(cp)}; + char32_t ch{*p}; + int charBytes{1}; + if (ch >= 0x80) { + if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 && + ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) { + charBytes = 4; + ch = ((ch & 7) << 6) | (p[1] & 0x3f); + ch = (ch << 6) | (p[2] & 0x3f); + ch = (ch << 6) | (p[3] & 0x3f); + } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 && + ((p[1] | p[2]) & 0xc0) == 0x80) { + charBytes = 3; + ch = ((ch & 0xf) << 6) | (p[1] & 0x3f); + ch = (ch << 6) | (p[2] & 0x3f); + } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 && + (p[1] & 0xc0) == 0x80) { + charBytes = 2; + ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f); + } else { + return {}; // not valid UTF-8 + } + } + return {ch, charBytes}; } -std::string QuoteCharacterLiteral( - const std::u16string &str, bool doubleDoubleQuotes, bool doubleBackslash) { - return QuoteCharacterLiteralHelper(str, doubleDoubleQuotes, doubleBackslash); +DecodedCharacter DecodeEUC_JPCharacter(const char *cp, std::size_t bytes) { + auto p{reinterpret_cast(cp)}; + char32_t ch{*p}; + int charBytes{1}; + if (ch >= 0x80) { + if (bytes >= 2 && ch == 0x8e && p[1] >= 0xa1 && p[1] <= 0xdf) { + charBytes = 2; // JIS X 0201 + ch = p[1]; + } else if (bytes >= 3 && ch == 0x8f && p[1] >= 0xa1 && p[1] <= 0xfe && + p[2] >= 0xa1 && p[2] <= 0xfe) { + charBytes = 3; // JIS X 0212 + ch = (p[1] & 0x7f) << 8 | (p[1] & 0x7f); + } else if (bytes >= 2 && ch >= 0xa1 && ch <= 0xfe && p[1] >= 0x1 && + p[1] <= 0xfe) { + charBytes = 2; // JIS X 0208 + ch = ((ch & 0x7f) << 8) | (p[1] & 0x7f); + } else { + return {}; + } + } + return {ch, charBytes}; } -std::string QuoteCharacterLiteral( - const std::u32string &str, bool doubleDoubleQuotes, bool doubleBackslash) { - return QuoteCharacterLiteralHelper(str, doubleDoubleQuotes, doubleBackslash); +DecodedCharacter DecodeLATIN1Character(const char *cp) { + return {*reinterpret_cast(cp), 1}; } -std::optional DecodeUTF8(const std::string &s) { +DecodedCharacter DecodeCharacter( + Encoding encoding, const char *cp, std::size_t bytes) { + switch (encoding) { + case Encoding::LATIN_1: return DecodeLATIN1Character(cp); + case Encoding::UTF_8: return DecodeUTF_8Character(cp, bytes); + case Encoding::EUC_JP: return DecodeEUC_JPCharacter(cp, bytes); + default: CRASH_NO_CASE; + } +} + +std::u32string DecodeUTF_8(const std::string &s) { std::u32string result; - const std::uint8_t *p{reinterpret_cast(s.data())}; + const char *p{s.c_str()}; for (auto bytes{s.size()}; bytes != 0;) { - decltype(bytes) charBytes{1}; - char32_t ch{*p++}; - if ((ch & 0xc0) > 0x40) { - if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 && - ((p[0] | p[1] | p[2]) & 0xc0) == 0x80) { - charBytes = 4; - ch = ((ch & 7) << 6) | (*p++ & 0x3f); - ch = (ch << 6) | (*p++ & 0x3f); - ch = (ch << 6) | (*p++ & 0x3f); - } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 && - ((p[0] | p[1]) & 0xc0) == 0x80) { - charBytes = 3; - ch = ((ch & 0xf) << 6) | (*p++ & 0x3f); - ch = (ch << 6) | (*p++ & 0x3f); - } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 && - (*p & 0xc0) == 0x80) { - charBytes = 2; - ch = ((ch & 0x1f) << 6) | (*p++ & 0x3f); - } else { - return std::nullopt; // not valid UTF-8 + DecodedCharacter decoded{DecodeUTF_8Character(p, bytes)}; + if (decoded.bytes > 0) { + if (static_cast(decoded.bytes) <= bytes) { + result.append(1, decoded.unicode); + bytes -= decoded.bytes; + p += decoded.bytes; + continue; } } - result.append(1, ch); - bytes -= charBytes; + result.append(1, static_cast(*p)); + ++p; + --bytes; } - return {result}; + return result; } + +std::u16string DecodeEUC_JP(const std::string &s) { + std::u16string result; + const char *p{s.c_str()}; + for (auto bytes{s.size()}; bytes != 0;) { + DecodedCharacter decoded{DecodeEUC_JPCharacter(p, bytes)}; + if (decoded.bytes > 0) { + if (static_cast(decoded.bytes) <= bytes) { + result.append(1, decoded.unicode); + bytes -= decoded.bytes; + p += decoded.bytes; + continue; + } + } + result.append(1, static_cast(*p)); + ++p; + --bytes; + } + return result; +} + } diff --git a/flang/lib/parser/characters.h b/flang/lib/parser/characters.h index a54df93..47c3cc9 100644 --- a/flang/lib/parser/characters.h +++ b/flang/lib/parser/characters.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -26,7 +26,13 @@ namespace Fortran::parser { -enum class Encoding { UTF8, EUC_JP }; +// We can easily support Fortran program source in any character +// set whose first 128 code points correspond to ASCII codes 0-127 (ISO/IEC646). +// The specific encodings that we can handle include: +// LATIN_1: ISO 8859-1 Latin-1 +// UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646) +// EUC_JP: 1-3 byte encoding of JIS X 0208 / 0212 +enum class Encoding { LATIN_1, UTF_8, EUC_JP }; inline constexpr bool IsUpperCaseLetter(char ch) { return ch >= 'A' && ch <= 'Z'; @@ -133,9 +139,20 @@ inline constexpr std::optional BackslashEscapeChar(char ch) { } } +struct EncodedCharacter { + char buffer[4]; + int bytes{0}; +}; + +EncodedCharacter EncodeLATIN_1(char); +EncodedCharacter EncodeUTF_8(char32_t); +EncodedCharacter EncodeEUC_JP(char16_t); +EncodedCharacter EncodeCharacter(Encoding, char32_t); + template void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert, - bool doubleDoubleQuotes = true, bool doubleBackslash = true) { + bool doubleDoubleQuotes = true, bool doubleBackslash = true, + Encoding encoding = Encoding::UTF_8) { if (ch == '"') { if (doubleDoubleQuotes) { insert('"'); @@ -146,7 +163,7 @@ void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert, insert('\\'); } emit('\\'); - } else if (ch < ' ' || (ch >= 0x80 && ch <= 0xff)) { + } else if (ch < ' ' || (encoding == Encoding::LATIN_1 && ch >= 0x7f)) { insert('\\'); if (std::optional escape{BackslashEscapeChar(ch)}) { emit(*escape); @@ -156,34 +173,40 @@ void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert, insert('0' + ((ch >> 3) & 7)); insert('0' + (ch & 7)); } - } else if (ch <= 0x7f) { - emit(ch); - } else if (ch <= 0x7ff) { - emit(0xc0 | ((ch >> 6) & 0x1f)); - emit(0x80 | (ch & 0x3f)); - } else if (ch <= 0xffff) { - emit(0xe0 | ((ch >> 12) & 0x0f)); - emit(0x80 | ((ch >> 6) & 0x3f)); - emit(0x80 | (ch & 0x3f)); } else { - emit(0xf0 | ((ch >> 18) & 0x07)); - emit(0x80 | ((ch >> 12) & 0x3f)); - emit(0x80 | ((ch >> 6) & 0x3f)); - emit(0x80 | (ch & 0x3f)); + EncodedCharacter encoded{EncodeCharacter(encoding, ch)}; + for (int j{0}; j < encoded.bytes; ++j) { + emit(encoded.buffer[j]); + } } } std::string QuoteCharacterLiteral(const std::string &, - bool doubleDoubleQuotes = true, bool doubleBackslash = true); + bool doubleDoubleQuotes = true, bool doubleBackslash = true, + Encoding = Encoding::LATIN_1); std::string QuoteCharacterLiteral(const std::u16string &, - bool doubleDoubleQuotes = true, bool doubleBackslash = true); + bool doubleDoubleQuotes = true, bool doubleBackslash = true, + Encoding = Encoding::EUC_JP); std::string QuoteCharacterLiteral(const std::u32string &, - bool doubleDoubleQuotes = true, bool doubleBackslash = true); + bool doubleDoubleQuotes = true, bool doubleBackslash = true, + Encoding = Encoding::UTF_8); -std::optional UTF8CharacterBytes(const char *); +std::optional UTF_8CharacterBytes(const char *); std::optional EUC_JPCharacterBytes(const char *); -std::optional CountCharacters( - const char *, std::size_t bytes, std::optional (*)(const char *)); -std::optional DecodeUTF8(const std::string &); +std::optional CharacterBytes(const char *, Encoding); +std::optional CountCharacters(const char *, std::size_t bytes, Encoding); + +struct DecodedCharacter { + char32_t unicode{0}; + int bytes{0}; // signifying failure +}; + +DecodedCharacter DecodeUTF_8Character(const char *, std::size_t); +DecodedCharacter DecodeEUC_JPCharacter(const char *, std::size_t); +DecodedCharacter DecodeLATIN1Character(const char *); +DecodedCharacter DecodeCharacter(Encoding, const char *, std::size_t); + +std::u32string DecodeUTF_8(const std::string &); +std::u16string DecodeEUC_JP(const std::string &); } #endif // FORTRAN_PARSER_CHARACTERS_H_ diff --git a/flang/lib/parser/parse-state.h b/flang/lib/parser/parse-state.h index 65ec9ec..973b200 100644 --- a/flang/lib/parser/parse-state.h +++ b/flang/lib/parser/parse-state.h @@ -185,17 +185,19 @@ public: } std::optional GetNextChar() { - if (p_ >= limit_) { + if (p_ < limit_) { + return UncheckedAdvance(); + } else { return std::nullopt; } - return {UncheckedAdvance()}; } std::optional PeekAtNextChar() const { - if (p_ >= limit_) { + if (p_ < limit_) { + return p_; + } else { return std::nullopt; } - return {p_}; } std::size_t BytesRemaining() const { @@ -229,7 +231,7 @@ private: UserState *userState_{nullptr}; bool inFixedForm_{false}; - Encoding encoding_{Encoding::UTF8}; + Encoding encoding_{Encoding::UTF_8}; bool anyErrorRecovery_{false}; bool anyConformanceViolation_{false}; bool deferMessages_{false}; diff --git a/flang/lib/parser/parsing.h b/flang/lib/parser/parsing.h index 3bb77ac..be470eb 100644 --- a/flang/lib/parser/parsing.h +++ b/flang/lib/parser/parsing.h @@ -37,7 +37,7 @@ struct Options { bool isFixedForm{false}; int fixedFormColumns{72}; LanguageFeatureControl features; - Encoding encoding{Encoding::UTF8}; + Encoding encoding{Encoding::UTF_8}; std::vector searchDirectories; std::vector predefinitions; bool instrumentedParse{false}; diff --git a/flang/lib/parser/prescan.cc b/flang/lib/parser/prescan.cc index b7ac18e..7a6ffe4 100644 --- a/flang/lib/parser/prescan.cc +++ b/flang/lib/parser/prescan.cc @@ -550,7 +550,14 @@ void Prescanner::QuotedCharacterLiteral( bool escape{false}; bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)}; while (true) { - char32_t ch{static_cast(*at_)}; + DecodedCharacter decoded{DecodeCharacter( + encoding_, at_, static_cast(limit_ - at_))}; + if (decoded.bytes <= 0) { + Say(GetProvenanceRange(start, end), + "Bad character in character literal"_err_en_US); + break; + } + char32_t ch{decoded.unicode}; escape = !escape && ch == '\\' && escapesEnabled; EmitQuotedChar(ch, emit, insert, false, !escapesEnabled); while (PadOutCharacterLiteral(tokens)) { @@ -562,6 +569,7 @@ void Prescanner::QuotedCharacterLiteral( } break; } + at_ += decoded.bytes - 1; end = at_ + 1; NextChar(); if (*at_ == quote && !escape) { @@ -592,24 +600,24 @@ void Prescanner::Hollerith( if (PadOutCharacterLiteral(tokens)) { } else if (*at_ == '\n') { Say(GetProvenanceRange(start, at_), - "possible truncated Hollerith literal"_en_US); + "Possible truncated Hollerith literal"_en_US); break; } else { NextChar(); - EmitChar(tokens, *at_); - // Multi-byte character encodings should count as single characters. - int bytes{1}; - if (encoding_ == Encoding::EUC_JP) { - if (std::optional chBytes{EUC_JPCharacterBytes(at_)}) { - bytes = *chBytes; - } - } else if (encoding_ == Encoding::UTF8) { - if (std::optional chBytes{UTF8CharacterBytes(at_)}) { - bytes = *chBytes; + // Multi-byte character encodings each count as single characters. + DecodedCharacter decoded{DecodeCharacter( + encoding_, at_, static_cast(limit_ - at_))}; + if (decoded.bytes > 0) { + // The cooked character stream we emit is always in UTF-8. + EncodedCharacter utf8{EncodeUTF_8(decoded.unicode)}; + for (int j{0}; j < utf8.bytes; ++j) { + EmitChar(tokens, utf8.buffer[j]); } - } - while (bytes-- > 1) { - EmitChar(tokens, *++at_); + at_ += decoded.bytes; + } else { + Say(GetProvenanceRange(start, at_), + "Bad character in Hollerith literal"_err_en_US); + break; } } } diff --git a/flang/lib/parser/prescan.h b/flang/lib/parser/prescan.h index abc1ed9..0e492f7 100644 --- a/flang/lib/parser/prescan.h +++ b/flang/lib/parser/prescan.h @@ -186,7 +186,7 @@ private: LanguageFeatureControl features_; bool inFixedForm_{false}; int fixedFormColumnLimit_{72}; - Encoding encoding_{Encoding::UTF8}; + Encoding encoding_{Encoding::UTF_8}; int delimiterNesting_{0}; int prescannerNesting_{0}; diff --git a/flang/lib/parser/token-parsers.h b/flang/lib/parser/token-parsers.h index 46cdcdb..3de491d 100644 --- a/flang/lib/parser/token-parsers.h +++ b/flang/lib/parser/token-parsers.h @@ -228,7 +228,7 @@ struct CharLiteralChar { return std::nullopt; } if (ch != '\\') { - return {Result::Bare(ch)}; + return Result::Bare(ch); } if (!(och = nextCh.Parse(state)).has_value()) { return std::nullopt; @@ -240,7 +240,7 @@ struct CharLiteralChar { return std::nullopt; } if (std::optional escChar{BackslashEscapeValue(ch)}) { - return {Result::Escaped(*escChar)}; + return Result::Escaped(*escChar); } if (IsOctalDigit(ch)) { ch -= '0'; @@ -283,7 +283,7 @@ template struct CharLiteral { if (ch->ch == quote && !ch->wasEscaped) { static constexpr auto doubled{attempt(AnyOfChars{SetOfChars{quote}})}; if (!doubled.Parse(state).has_value()) { - return {str}; + return str; } } str += ch->ch; @@ -543,39 +543,26 @@ struct HollerithLiteral { } std::string content; for (auto j{*charCount}; j-- > 0;) { - int bytes{1}; - const char *p{state.GetLocation()}; - if (state.encoding() == Encoding::EUC_JP) { - std::optional chBytes{EUC_JPCharacterBytes(p)}; - if (!chBytes.has_value()) { - state.Say(start, "bad EUC_JP characters in Hollerith"_err_en_US); - return std::nullopt; - } - bytes = *chBytes; - } else if (state.encoding() == Encoding::UTF8) { - std::optional chBytes{UTF8CharacterBytes(p)}; - if (!chBytes.has_value()) { - state.Say(start, "bad UTF-8 characters in Hollerith"_err_en_US); - return std::nullopt; - } - bytes = *chBytes; - } - if (bytes == 1) { - std::optional at{nextCh.Parse(state)}; - if (!at.has_value() || !isprint(**at)) { - state.Say( - start, "insufficient or bad characters in Hollerith"_err_en_US); - return std::nullopt; + if (std::optional chBytes{ + CharacterBytes(state.GetLocation(), state.encoding())}) { + for (int bytes{*chBytes}; bytes > 0; --bytes) { + if (std::optional at{nextCh.Parse(state)}) { + if (*chBytes == 1 && !isprint(**at)) { + state.Say(start, "Bad character in Hollerith"_err_en_US); + return std::nullopt; + } + content += **at; + } else { + state.Say(start, "Insufficient characters in Hollerith"_err_en_US); + return std::nullopt; + } } - content += **at; } else { - // Multi-byte character - while (bytes-- > 0) { - content += *nextCh.Parse(state).value(); - } + state.Say(start, "Bad multi-byte character in Hollerith"_err_en_US); + return std::nullopt; } } - return {content}; + return content; } }; diff --git a/flang/lib/parser/unparse.cc b/flang/lib/parser/unparse.cc index 4c03a79..9e6f19c 100644 --- a/flang/lib/parser/unparse.cc +++ b/flang/lib/parser/unparse.cc @@ -189,18 +189,21 @@ public: if (const auto &k{std::get>(x.t)}) { if (std::holds_alternative(k->u)) { Word("NC"); + Put(QuoteCharacterLiteral(std::get(x.t), true, + backslashEscapes_, Encoding::EUC_JP)); } else { Walk(*k), Put('_'); + Put(QuoteCharacterLiteral( + std::get(x.t), true, backslashEscapes_)); } + } else { + Put(QuoteCharacterLiteral( + std::get(x.t), true, backslashEscapes_)); } - Put(QuoteCharacterLiteral( - std::get(x.t), true, backslashEscapes_)); } void Before(const HollerithLiteralConstant &x) { - std::optional chars{CountCharacters(x.v.data(), x.v.size(), - encoding_ == Encoding::EUC_JP ? EUC_JPCharacterBytes - : UTF8CharacterBytes)}; - if (chars.has_value()) { + if (std::optional chars{ + CountCharacters(x.v.data(), x.v.size(), encoding_)}) { Unparse(*chars); } else { Unparse(x.v.size()); @@ -2575,7 +2578,7 @@ private: int column_{1}; const int maxColumns_{80}; std::set structureComponents_; - Encoding encoding_{Encoding::UTF8}; + Encoding encoding_{Encoding::UTF_8}; bool capitalizeKeywords_{true}; bool openmpDirective_{false}; bool backslashEscapes_{false}; diff --git a/flang/lib/parser/unparse.h b/flang/lib/parser/unparse.h index 6ff8388..070c758 100644 --- a/flang/lib/parser/unparse.h +++ b/flang/lib/parser/unparse.h @@ -39,7 +39,7 @@ using TypedExprAsFortran = /// Convert parsed program to out as Fortran. void Unparse(std::ostream &out, const Program &program, - Encoding encoding = Encoding::UTF8, bool capitalizeKeywords = true, + Encoding encoding = Encoding::UTF_8, bool capitalizeKeywords = true, bool backslashEscapes = true, preStatementType *preStatement = nullptr, TypedExprAsFortran *expr = nullptr); } diff --git a/flang/lib/semantics/expression.cc b/flang/lib/semantics/expression.cc index 2b6dd0b..a3fc057 100644 --- a/flang/lib/semantics/expression.cc +++ b/flang/lib/semantics/expression.cc @@ -519,26 +519,37 @@ MaybeExpr ExpressionAnalyzer::AnalyzeString(std::string &&string, int kind) { if (!CheckIntrinsicKind(TypeCategory::Character, kind)) { return std::nullopt; } + std::u32string unicode{parser::DecodeUTF_8(string)}; if (kind == 1) { - return {AsGenericExpr( - Constant>{std::move(string)})}; - } else if (std::optional unicode{ - parser::DecodeUTF8(string)}) { - if (kind == 4) { - return {AsGenericExpr( - Constant>{std::move(*unicode)})}; + std::string result; + for (const char32_t &ch : unicode) { + if (ch <= 0xff) { + result += static_cast(ch); + } else { + // Original literal in UTF-8 source contained a byte sequence + // that looked like UTF-8 and got decoded as such. Reconstruct. + parser::EncodedCharacter encoded{parser::EncodeUTF_8(ch)}; + result += std::string{ + encoded.buffer, static_cast(encoded.bytes)}; + } } - CHECK(kind == 2); - // TODO: better Kanji support + return AsGenericExpr( + Constant>{std::move(result)}); + } else if (kind == 2) { std::u16string result; - for (const char32_t &ch : *unicode) { + for (const char32_t &ch : unicode) { + if (ch > 0xffff) { + Say("Bad character in CHARACTER(KIND=2) literal"_err_en_US); + return std::nullopt; + } result += static_cast(ch); } - return {AsGenericExpr( - Constant>{std::move(result)})}; + return AsGenericExpr( + Constant>{std::move(result)}); } else { - Say("bad UTF-8 encoding of CHARACTER(KIND=%d) literal"_err_en_US, kind); - return std::nullopt; + CHECK(kind == 4); + return AsGenericExpr( + Constant>{std::move(unicode)}); } } diff --git a/flang/lib/semantics/unparse-with-symbols.h b/flang/lib/semantics/unparse-with-symbols.h index 4d4d9d4..07729ad 100644 --- a/flang/lib/semantics/unparse-with-symbols.h +++ b/flang/lib/semantics/unparse-with-symbols.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ struct Program; namespace Fortran::semantics { void UnparseWithSymbols(std::ostream &, const parser::Program &, - parser::Encoding encoding = parser::Encoding::UTF8); + parser::Encoding encoding = parser::Encoding::UTF_8); } #endif // FORTRAN_SEMANTICS_UNPARSE_WITH_SYMBOLS_H_ diff --git a/flang/tools/f18/f18-parse-demo.cc b/flang/tools/f18/f18-parse-demo.cc index 57e68dd..5030345 100644 --- a/flang/tools/f18/f18-parse-demo.cc +++ b/flang/tools/f18/f18-parse-demo.cc @@ -92,7 +92,7 @@ struct DriverOptions { bool forcedForm{false}; // -Mfixed or -Mfree appeared bool warnOnNonstandardUsage{false}; // -Mstandard bool warningsAreErrors{false}; // -Werror - Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF8}; + Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF_8}; bool parseOnly{false}; bool dumpProvenance{false}; bool dumpCookedChars{false}; diff --git a/flang/tools/f18/f18.cc b/flang/tools/f18/f18.cc index 5a1d34b..cde2389 100644 --- a/flang/tools/f18/f18.cc +++ b/flang/tools/f18/f18.cc @@ -87,7 +87,7 @@ struct DriverOptions { bool forcedForm{false}; // -Mfixed or -Mfree appeared bool warnOnNonstandardUsage{false}; // -Mstandard bool warningsAreErrors{false}; // -Werror - Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF8}; + Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF_8}; bool parseOnly{false}; bool dumpProvenance{false}; bool dumpCookedChars{false}; @@ -451,6 +451,8 @@ int main(int argc, char *const argv[]) { } else if (arg == "-module-suffix") { driver.moduleFileSuffix = args.front(); args.pop_front(); + } else if (arg == "-fno-utf-8") { + options.encoding = Fortran::parser::Encoding::LATIN_1; } else if (arg == "-help" || arg == "--help" || arg == "-?") { std::cerr << "f18 options:\n" -- 2.7.4