From 8971f7f2b3595b19cf19a3961c6ade237279114d Mon Sep 17 00:00:00 2001 From: peter klausler Date: Fri, 14 Jun 2019 15:44:32 -0700 Subject: [PATCH] [flang] More debugging vs. regression tests Original-commit: flang-compiler/f18@d4fd4ad1eca91336cdd2fb1add93d97ecb303720 Reviewed-on: https://github.com/flang-compiler/f18/pull/496 Tree-same-pre-rewrite: false --- flang/lib/parser/characters.cc | 179 +++++++++++++++---------------------- flang/lib/parser/characters.h | 11 ++- flang/lib/parser/prescan.cc | 13 +-- flang/lib/parser/token-parsers.h | 23 ++--- flang/lib/parser/token-sequence.cc | 10 +++ flang/lib/parser/token-sequence.h | 2 + flang/lib/parser/unparse.cc | 29 +++--- flang/lib/parser/unparse.h | 2 +- flang/tools/f18/f18-parse-demo.cc | 2 +- flang/tools/f18/f18.cc | 11 ++- 10 files changed, 130 insertions(+), 152 deletions(-) diff --git a/flang/lib/parser/characters.cc b/flang/lib/parser/characters.cc index bc60252..bc0e569 100644 --- a/flang/lib/parser/characters.cc +++ b/flang/lib/parser/characters.cc @@ -20,82 +20,19 @@ namespace Fortran::parser { -std::optional UTF_8CharacterBytes(const char *p) { +int UTF_8CharacterBytes(const char *p) { if ((*p & 0x80) == 0) { return 1; - } - if ((*p & 0xf8) == 0xf0) { - if ((*p & 0x07) != 0 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80 && - (p[3] & 0xc0) == 0x80) { - return 4; - } - } else if ((*p & 0xf0) == 0xe0) { - if ((*p & 0x0f) != 0 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80) { - return 3; - } } else if ((*p & 0xe0) == 0xc0) { - if ((*p & 0x1f) != 0 && (p[1] & 0xc0) == 0x80) { - return 2; - } - } - return std::nullopt; -} - -std::optional EUC_JPCharacterBytes(const char *p) { - int b1 = *p & 0xff; - if (b1 <= 0x7f) { - return 1; - } - if (b1 >= 0xa0 && b1 <= 0xfe) { - int b2 = p[1] & 0xff; - if (b2 >= 0xa0 && b2 <= 0xfe) { - // JIS X 0208 (code set 1) - return 2; - } - } else if (b1 == 0x8e) { - int b2 = p[1] & 0xff; - if (b2 >= 0xa0 && b2 <= 0xdf) { - // upper half JIS 0201 (half-width kana, code set 2) - return 2; - } - } else if (b1 == 0x8f) { - int b2 = p[1] & 0xff; - int b3 = p[2] & 0xff; - if (b2 >= 0xa0 && b2 <= 0xfe && b3 >= 0xa0 && b3 <= 0xfe) { - // JIS X 0212 (code set 3) - return 3; - } - } - return std::nullopt; -} - -static std::optional One(const char *) { return 1; } - -static std::optional (*CharacterCounter(Encoding encoding))(const char *) { - switch (encoding) { - case Encoding::UTF_8: return UTF_8CharacterBytes; - case Encoding::EUC_JP: return EUC_JPCharacterBytes; - default: return One; - } -} - -std::optional CountCharacters( - const char *p, std::size_t bytes, Encoding encoding) { - std::size_t chars{0}; - const char *limit{p + bytes}; - std::optional (*cbf)(const char *){CharacterCounter(encoding)}; - while (p < limit) { - if (std::optional cb{cbf(p)}) { - p += *cb; - ++chars; - } else { - return std::nullopt; - } - } - if (p == limit) { - return chars; + return 2; + } else if ((*p & 0xf0) == 0xe0) { + return 3; + } else if ((*p & 0xf8) == 0xf0) { + return 4; + } else if ((*p & 0xfc) == 0xf8) { + return 5; } else { - return std::nullopt; + return 6; } } @@ -131,10 +68,10 @@ std::string QuoteCharacterLiteral( return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding); } -EncodedCharacter EncodeLATIN_1(char32_t codepoint) { - CHECK(codepoint <= 0xff); +EncodedCharacter EncodeLATIN_1(char32_t ucs) { + CHECK(ucs <= 0xff); EncodedCharacter result; - result.buffer[0] = codepoint; + result.buffer[0] = ucs; result.bytes = 1; return result; } @@ -154,39 +91,68 @@ EncodedCharacter EncodeUTF_8(char32_t codepoint) { result.buffer[1] = 0x80 | ((codepoint >> 6) & 0x3f); result.buffer[2] = 0x80 | (codepoint & 0x3f); result.bytes = 3; - } else { - // UCS actually only goes up to 0x10ffff but the - // UTF-8 encoding handles 21 bits. - CHECK(codepoint <= 0x1fffff); + } else if (codepoint <= 0x1fffff) { + // UCS actually only goes up to 0x10ffff, but the + // UTF-8 encoding can handle 32 bits. result.buffer[0] = 0xf0 | (codepoint >> 18); result.buffer[1] = 0x80 | ((codepoint >> 12) & 0x3f); result.buffer[2] = 0x80 | ((codepoint >> 6) & 0x3f); result.buffer[3] = 0x80 | (codepoint & 0x3f); result.bytes = 4; + } else if (codepoint <= 0x3ffffff) { + result.buffer[0] = 0xf8 | (codepoint >> 24); + result.buffer[1] = 0x80 | ((codepoint >> 18) & 0x3f); + result.buffer[2] = 0x80 | ((codepoint >> 12) & 0x3f); + result.buffer[3] = 0x80 | ((codepoint >> 6) & 0x3f); + result.buffer[4] = 0x80 | (codepoint & 0x3f); + result.bytes = 5; + } else { + result.buffer[0] = 0xfc | (codepoint >> 30); + result.buffer[1] = 0x80 | ((codepoint >> 24) & 0x3f); + result.buffer[2] = 0x80 | ((codepoint >> 18) & 0x3f); + result.buffer[3] = 0x80 | ((codepoint >> 12) & 0x3f); + result.buffer[4] = 0x80 | ((codepoint >> 6) & 0x3f); + result.buffer[5] = 0x80 | (codepoint & 0x3f); + result.bytes = 6; } return result; } -EncodedCharacter EncodeEUC_JP(char32_t codepoint) { - // Assume JIS X 0208 (TODO: others) - CHECK(codepoint <= 0x6e6e); +// These are placeholders; the actual mapping is complicated. +static char32_t JIS_0208ToUCS(char32_t jis) { return jis | 0x80000; } +static char32_t JIS_0212ToUCS(char32_t jis) { return jis | 0x90000; } +static bool IsUCSJIS_0212(char32_t ucs) { return (ucs & 0x90000) == 0x90000; } +static char32_t UCSToJIS(char32_t ucs) { return ucs & 0xffff; } + +EncodedCharacter EncodeEUC_JP(char32_t ucs) { EncodedCharacter result; - if (codepoint <= 0x7f) { - result.buffer[0] = codepoint; + if (ucs <= 0x7f) { + result.buffer[0] = ucs; result.bytes = 1; - } else { - result.buffer[0] = 0x80 | (codepoint >> 8); - result.buffer[1] = 0x80 | (codepoint & 0x7f); + } else if (ucs <= 0xff) { + result.buffer[0] = 0x8e; // JIS X 0201 + result.buffer[1] = ucs; + result.bytes = 2; + } else if (IsUCSJIS_0212(ucs)) { // JIS X 0212 + char32_t jis{UCSToJIS(ucs)}; + result.buffer[0] = 0x8f; + result.buffer[1] = 0x80 ^ (jis >> 8); + result.buffer[2] = 0x80 ^ jis; + result.bytes = 3; + } else { // JIS X 0208 + char32_t jis{UCSToJIS(ucs)}; + result.buffer[0] = 0x80 ^ (jis >> 8); + result.buffer[1] = 0x80 ^ jis; result.bytes = 2; } return result; } -EncodedCharacter EncodeCharacter(Encoding encoding, char32_t codepoint) { +EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) { switch (encoding) { - case Encoding::LATIN_1: return EncodeLATIN_1(codepoint); - case Encoding::UTF_8: return EncodeUTF_8(codepoint); - case Encoding::EUC_JP: return EncodeEUC_JP(codepoint); + case Encoding::LATIN_1: return EncodeLATIN_1(ucs); + case Encoding::UTF_8: return EncodeUTF_8(ucs); + case Encoding::EUC_JP: return EncodeEUC_JP(ucs); default: CRASH_NO_CASE; } } @@ -221,19 +187,18 @@ DecodedCharacter DecodeEUC_JPCharacter(const char *cp, std::size_t bytes) { char32_t ch{*p}; if (ch <= 0x7f) { return {ch, 1}; - } else if (ch >= 0xa0 && ch <= 0xfe && bytes >= 2 && p[1] >= 0xa0 && - p[1] <= 0xfe) { - ch = ((ch << 8) | p[1]) & 0x7f7f; // JIS X 0208 - return {ch, 2}; - } else if (ch == 0x8e && bytes >= 2 && p[1] >= 0xa0 && p[1] <= 0xdf) { - return {p[1], 2}; // JIS X 0201 - } else if (ch == 0x8f && bytes >= 3 && p[1] >= 0xa0 && p[1] <= 0xfe && - p[2] >= 0xa0 && p[2] <= 0xfe) { - ch = ((p[1] << 8) | p[1]) & 0x7f7f; // JIS X 0212 - return {ch, 3}; - } else { - return {}; // not valid EUC_JP + } else if (ch == 0x8e) { + if (bytes >= 2) { + return {p[1], 2}; // JIS X 0201 + } + } else if (ch == 0x8f) { // JIS X 0212 + if (bytes >= 3) { + return {JIS_0212ToUCS(((p[1] << 8) | p[2]) ^ 0x8080), 3}; + } + } else if (bytes >= 2) { // assume JIS X 0208 + return {JIS_0208ToUCS(((ch << 8) | p[1]) ^ 0x8080), 2}; } + return {}; } DecodedCharacter DecodeLATIN1Character(const char *cp) { @@ -267,10 +232,10 @@ static DecodedCharacter DecodeEscapedCharacter( static DecodedCharacter DecodeEscapedCharacters( Encoding encoding, const char *cp, std::size_t bytes) { - char buffer[4]; - int count[4]; + char buffer[EncodedCharacter::maxEncodingBytes]; + int count[EncodedCharacter::maxEncodingBytes]; std::size_t at{0}, len{0}; - for (; len < 4 && at < bytes; ++len) { + for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) { DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)}; buffer[len] = code.codepoint; at += code.bytes; @@ -280,7 +245,7 @@ static DecodedCharacter DecodeEscapedCharacters( if (code.bytes > 0) { code.bytes = count[code.bytes - 1]; } else { - code.codepoint = static_cast(buffer[0]); + code.codepoint = buffer[0] & 0xff; code.bytes = count[0]; } return code; diff --git a/flang/lib/parser/characters.h b/flang/lib/parser/characters.h index 4947ae6..eade306 100644 --- a/flang/lib/parser/characters.h +++ b/flang/lib/parser/characters.h @@ -125,7 +125,7 @@ inline constexpr std::optional BackslashEscapeValue(char ch) { inline constexpr std::optional BackslashEscapeChar(char ch) { switch (ch) { - case '\a': return 'a'; + // case '\a': return 'a'; // PGF90 doesn't know \a case '\b': return 'b'; case '\f': return 'f'; case '\n': return 'n'; @@ -140,7 +140,8 @@ inline constexpr std::optional BackslashEscapeChar(char ch) { } struct EncodedCharacter { - char buffer[4]; + static constexpr int maxEncodingBytes{6}; + char buffer[maxEncodingBytes]; int bytes{0}; }; @@ -155,7 +156,7 @@ template void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert, bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) { auto emitOneChar{[&](std::uint8_t ch) { - if (ch < ' ' || ch == '\\' || (backslashEscapes && ch >= 0x7f)) { + if (ch < ' ' || (backslashEscapes && (ch == '\\' || ch >= 0x7f))) { insert('\\'); if (std::optional escape{BackslashEscapeChar(ch)}) { emit(*escape); @@ -190,9 +191,7 @@ std::string QuoteCharacterLiteral(const std::u16string &, std::string QuoteCharacterLiteral(const std::u32string &, bool backslashEscapes = true, Encoding = Encoding::UTF_8); -std::optional UTF_8CharacterBytes(const char *); -std::optional EUC_JPCharacterBytes(const char *); -std::optional CountCharacters(const char *, std::size_t bytes, Encoding); +int UTF_8CharacterBytes(const char *); struct DecodedCharacter { char32_t codepoint{0}; diff --git a/flang/lib/parser/prescan.cc b/flang/lib/parser/prescan.cc index 26a507a..fadf07b1 100644 --- a/flang/lib/parser/prescan.cc +++ b/flang/lib/parser/prescan.cc @@ -21,6 +21,7 @@ #include "../common/idioms.h" #include #include +#include // TODO pmk rm #include #include #include @@ -168,12 +169,12 @@ void Prescanner::Statement() { Provenance newlineProvenance{GetCurrentProvenance()}; if (std::optional preprocessed{ preprocessor_.MacroReplacement(tokens, *this)}) { - // Reprocess the preprocessed line. + // Reprocess the preprocessed line. Append a newline temporarily. preprocessed->PutNextTokenChar('\n', newlineProvenance); preprocessed->CloseToken(); const char *ppd{preprocessed->ToCharBlock().begin()}; LineClassification ppl{ClassifyLine(ppd)}; - preprocessed->ReopenLastToken(); // remove the newline + preprocessed->RemoveLastToken(); // remove the newline switch (ppl.kind) { case LineClassification::Kind::Comment: break; case LineClassification::Kind::IncludeLine: @@ -183,7 +184,7 @@ void Prescanner::Statement() { case LineClassification::Kind::IncludeDirective: case LineClassification::Kind::PreprocessorDirective: Say(preprocessed->GetProvenanceRange(), - "preprocessed line resembles a preprocessor directive"_en_US); + "Preprocessed line resembles a preprocessor directive"_en_US); preprocessed->ToLowerCase().Emit(cooked_); break; case LineClassification::Kind::CompilerDirective: @@ -483,12 +484,12 @@ bool Prescanner::NextToken(TokenSequence &tokens) { preventHollerith_ = false; } else if (IsLegalInIdentifier(*at_)) { do { - } while (IsLegalInIdentifier( - EmitCharAndAdvance(tokens, ToLowerCaseLetter(*at_)))); + } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))); if (*at_ == '\'' || *at_ == '"') { // Look for prefix of NC'...' legacy PGI "Kanji" NCHARACTER literal CharBlock prefix{tokens.CurrentOpenToken()}; - bool isKanji{prefix.size() == 2 && prefix[0] == 'n' && prefix[1] == 'c'}; + bool isKanji{prefix.size() == 2 && ToLowerCaseLetter(prefix[0]) == 'n' && + ToLowerCaseLetter(prefix[1]) == 'c'}; QuotedCharacterLiteral(tokens, start, isKanji); preventHollerith_ = false; } else { diff --git a/flang/lib/parser/token-parsers.h b/flang/lib/parser/token-parsers.h index 0c6c327..550a8c8 100644 --- a/flang/lib/parser/token-parsers.h +++ b/flang/lib/parser/token-parsers.h @@ -515,23 +515,18 @@ struct HollerithLiteral { } std::string content; for (auto j{*charCount}; j-- > 0;) { - if (std::optional chBytes{ - UTF_8CharacterBytes(state.GetLocation())}) { - for (int bytes{*chBytes}; bytes > 0; --bytes) { - if (std::optional at{nextCh.Parse(state)}) { - if (*chBytes == 1 && !isprint(**at)) { - state.Say(start, "Bad character in Hollerith"_err_en_US); - return std::nullopt; - } - content += **at; - } else { - state.Say(start, "Insufficient characters in Hollerith"_err_en_US); + int chBytes{UTF_8CharacterBytes(state.GetLocation())}; + for (int bytes{chBytes}; bytes > 0; --bytes) { + if (std::optional at{nextCh.Parse(state)}) { + if (chBytes == 1 && !isprint(**at)) { + state.Say(start, "Bad character in Hollerith"_err_en_US); return std::nullopt; } + content += **at; + } else { + state.Say(start, "Insufficient characters in Hollerith"_err_en_US); + return std::nullopt; } - } else { - state.Say(start, "Bad multi-byte character in Hollerith"_err_en_US); - return std::nullopt; } } return content; diff --git a/flang/lib/parser/token-sequence.cc b/flang/lib/parser/token-sequence.cc index 829d1f7..2c4424a 100644 --- a/flang/lib/parser/token-sequence.cc +++ b/flang/lib/parser/token-sequence.cc @@ -61,6 +61,16 @@ std::size_t TokenSequence::SkipBlanks(std::size_t at) const { return tokens; // even if at > tokens } +void TokenSequence::RemoveLastToken() { + CHECK(!start_.empty()); + CHECK(nextStart_ > start_.back()); + std::size_t bytes{nextStart_ - start_.back()}; + nextStart_ = start_.back(); + start_.pop_back(); + char_.erase(char_.begin() + nextStart_, char_.end()); + provenances_.RemoveLastBytes(bytes); +} + void TokenSequence::Put(const TokenSequence &that) { if (nextStart_ < char_.size()) { start_.push_back(nextStart_); diff --git a/flang/lib/parser/token-sequence.h b/flang/lib/parser/token-sequence.h index ea06184..19dff79 100644 --- a/flang/lib/parser/token-sequence.h +++ b/flang/lib/parser/token-sequence.h @@ -89,6 +89,8 @@ public: start_.pop_back(); } + void RemoveLastToken(); + void Put(const TokenSequence &); void Put(const TokenSequence &, ProvenanceRange); void Put(const TokenSequence &, std::size_t at, std::size_t tokens = 1); diff --git a/flang/lib/parser/unparse.cc b/flang/lib/parser/unparse.cc index b3ddfd3..e428bbce 100644 --- a/flang/lib/parser/unparse.cc +++ b/flang/lib/parser/unparse.cc @@ -186,31 +186,28 @@ public: x.u); } void Unparse(const CharLiteralConstant &x) { // R724 + Encoding encoding{encoding_}; if (const auto &k{std::get>(x.t)}) { if (std::holds_alternative(k->u)) { Word("NC"); - std::u16string jis; - for (char32_t ch : DecodeUTF_8(std::get(x.t))) { - jis += static_cast(ch); - } - Put(QuoteCharacterLiteral(jis, backslashEscapes_, Encoding::EUC_JP)); + encoding = Encoding::EUC_JP; } else { Walk(*k), Put('_'); - Put(QuoteCharacterLiteral( - std::get(x.t), backslashEscapes_)); } - } else { - Put(QuoteCharacterLiteral(std::get(x.t), backslashEscapes_)); } + Put(QuoteCharacterLiteral( + DecodeUTF_8(std::get(x.t)), backslashEscapes_, encoding)); } - void Before(const HollerithLiteralConstant &x) { - if (std::optional chars{ - CountCharacters(x.v.data(), x.v.size(), encoding_)}) { - Unparse(*chars); - } else { - Unparse(x.v.size()); - } + void Unparse(const HollerithLiteralConstant &x) { + std::u32string ucs{DecodeUTF_8(x.v)}; + Unparse(ucs.size()); Put('H'); + for (char32_t ch : DecodeUTF_8(x.v)) { + EncodedCharacter encoded{EncodeCharacter(encoding_, ch)}; + for (int j{0}; j < encoded.bytes; ++j) { + Put(encoded.buffer[j]); + } + } } void Unparse(const LogicalLiteralConstant &x) { // R725 Put(std::get(x.t) ? ".TRUE." : ".FALSE."); diff --git a/flang/lib/parser/unparse.h b/flang/lib/parser/unparse.h index 070c758..470ba9d 100644 --- a/flang/lib/parser/unparse.h +++ b/flang/lib/parser/unparse.h @@ -37,7 +37,7 @@ using preStatementType = using TypedExprAsFortran = std::function; -/// Convert parsed program to out as Fortran. +// Converts parsed program to out as Fortran. void Unparse(std::ostream &out, const Program &program, Encoding encoding = Encoding::UTF_8, bool capitalizeKeywords = true, bool backslashEscapes = true, preStatementType *preStatement = nullptr, diff --git a/flang/tools/f18/f18-parse-demo.cc b/flang/tools/f18/f18-parse-demo.cc index 1eb997a..1edb23a 100644 --- a/flang/tools/f18/f18-parse-demo.cc +++ b/flang/tools/f18/f18-parse-demo.cc @@ -92,7 +92,7 @@ struct DriverOptions { bool forcedForm{false}; // -Mfixed or -Mfree appeared bool warnOnNonstandardUsage{false}; // -Mstandard bool warningsAreErrors{false}; // -Werror - Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF_8}; + Fortran::parser::Encoding encoding{Fortran::parser::Encoding::LATIN_1}; bool parseOnly{false}; bool dumpProvenance{false}; bool dumpCookedChars{false}; diff --git a/flang/tools/f18/f18.cc b/flang/tools/f18/f18.cc index 922b2fa..9109c36 100644 --- a/flang/tools/f18/f18.cc +++ b/flang/tools/f18/f18.cc @@ -451,8 +451,13 @@ int main(int argc, char *const argv[]) { } else if (arg == "-module-suffix") { driver.moduleFileSuffix = args.front(); args.pop_front(); - } else if (arg == "-fno-utf-8") { + } else if (arg == "-futf-8") { + driver.encoding = Fortran::parser::Encoding::UTF_8; + } else if (arg == "-flatin") { driver.encoding = Fortran::parser::Encoding::LATIN_1; + } else if (arg == "-fkanji") { + driver.encoding = Fortran::parser::Encoding::EUC_JP; + driver.pgf90Args.push_back("-Mx,125,4"); // PGI "Kanji" mode } else if (arg == "-help" || arg == "--help" || arg == "-?") { std::cerr << "f18 options:\n" @@ -467,6 +472,10 @@ int main(int argc, char *const argv[]) { << " -ed enable fixed form D lines\n" << " -E prescan & preprocess only\n" << " -module dir module output directory (default .)\n" + << " -fkanji interpret source as EUC_JP rather than " + "UTF-8\n" + << " -flatin interpret source as Latin-1 (ISO 8859-1) " + "rather than UTF-8\n" << " -fparse-only parse only, no output except messages\n" << " -funparse parse & reformat only, no code " "generation\n" -- 2.7.4