<!--
-Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
-->
The F18 Parser
* except for the payload in character literals, Hollerith constants,
and character and Hollerith edit descriptors, all letters have been
normalized to lower case
+* all non-ASCII characters have been re-encoded in UTF-8.
Lines in the cooked character stream can be of arbitrary length.
-// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// limitations under the License.
#include "characters.h"
+#include "../common/idioms.h"
#include <cstddef>
#include <optional>
namespace Fortran::parser {
-std::optional<int> UTF8CharacterBytes(const char *p) {
+std::optional<int> UTF_8CharacterBytes(const char *p) {
if ((*p & 0x80) == 0) {
- return {1};
+ return 1;
}
if ((*p & 0xf8) == 0xf0) {
if ((*p & 0x07) != 0 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80 &&
(p[3] & 0xc0) == 0x80) {
- return {4};
+ return 4;
}
} else if ((*p & 0xf0) == 0xe0) {
if ((*p & 0x0f) != 0 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80) {
- return {3};
+ return 3;
}
} else if ((*p & 0xe0) == 0xc0) {
if ((*p & 0x1f) != 0 && (p[1] & 0xc0) == 0x80) {
- return {2};
+ return 2;
}
}
return std::nullopt;
std::optional<int> EUC_JPCharacterBytes(const char *p) {
int b1 = *p & 0xff;
if (b1 <= 0x7f) {
- return {1};
+ return 1;
}
if (b1 >= 0xa1 && b1 <= 0xfe) {
int b2 = p[1] & 0xff;
if (b2 >= 0xa1 && b2 <= 0xfe) {
// JIS X 0208 (code set 1)
- return {2};
+ return 2;
}
} else if (b1 == 0x8e) {
int b2 = p[1] & 0xff;
if (b2 >= 0xa1 && b2 <= 0xdf) {
// upper half JIS 0201 (half-width kana, code set 2)
- return {2};
+ return 2;
}
} else if (b1 == 0x8f) {
int b2 = p[1] & 0xff;
int b3 = p[2] & 0xff;
if (b2 >= 0xa1 && b2 <= 0xfe && b3 >= 0xa1 && b3 <= 0xfe) {
// JIS X 0212 (code set 3)
- return {3};
+ return 3;
}
}
return std::nullopt;
}
-std::optional<std::size_t> CountCharacters(
- const char *p, std::size_t bytes, std::optional<int> (*cbf)(const char *)) {
+// Width function used for any encoding without a dedicated byte counter:
+// it reports one byte per character regardless of the input.
+static std::optional<int> One(const char *) { return std::optional<int>{1}; }
+
+// Selects the function that measures the byte length of one character in
+// the given encoding.  UTF_8 and EUC_JP have dedicated counters; every
+// other encoding falls back to One.
+static std::optional<int> (*CharacterCounter(Encoding encoding))(const char *) {
+  std::optional<int> (*counter)(const char *){One};
+  if (encoding == Encoding::UTF_8) {
+    counter = UTF_8CharacterBytes;
+  } else if (encoding == Encoding::EUC_JP) {
+    counter = EUC_JPCharacterBytes;
+  }
+  return counter;
+}
+
+// Returns the number of bytes occupied by the character at p when the text
+// is in the given encoding, or an empty optional when the encoding-specific
+// counter rejects the byte sequence.
+std::optional<int> CharacterBytes(const char *p, Encoding encoding) {
+  const auto byteCounter{CharacterCounter(encoding)};
+  return byteCounter(p);
+}
+
+std::optional<int> CountCharacters(
+ const char *p, std::size_t bytes, Encoding encoding) {
std::size_t chars{0};
const char *limit{p + bytes};
+ std::optional<int> (*cbf)(const char *){CharacterCounter(encoding)};
while (p < limit) {
- ++chars;
- std::optional<int> cb{cbf(p)};
- if (!cb.has_value()) {
+ if (std::optional<int> cb{cbf(p)}) {
+ p += *cb;
+ ++chars;
+ } else {
return std::nullopt;
}
- p += *cb;
}
- return {chars};
+ if (p == limit) {
+ return chars;
+ } else {
+ return std::nullopt;
+ }
}
template<typename STRING>
-std::string QuoteCharacterLiteralHelper(
- const STRING &str, bool doubleDoubleQuotes, bool doubleBackslash) {
+std::string QuoteCharacterLiteralHelper(const STRING &str,
+ bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
std::string result{'"'};
const auto emit{[&](char ch) { result += ch; }};
for (auto ch : str) {
if constexpr (std::is_same_v<char, CharT>) {
// char may be signed depending on host.
char32_t ch32{static_cast<unsigned char>(ch)};
- EmitQuotedChar(ch32, emit, emit, doubleDoubleQuotes, doubleBackslash);
+ EmitQuotedChar(
+ ch32, emit, emit, doubleDoubleQuotes, doubleBackslash, encoding);
} else {
char32_t ch32{ch};
- EmitQuotedChar(ch32, emit, emit, doubleDoubleQuotes, doubleBackslash);
+ EmitQuotedChar(
+ ch32, emit, emit, doubleDoubleQuotes, doubleBackslash, encoding);
}
}
result += '"';
return result;
}
-std::string QuoteCharacterLiteral(
- const std::string &str, bool doubleDoubleQuotes, bool doubleBackslash) {
- return QuoteCharacterLiteralHelper(str, doubleDoubleQuotes, doubleBackslash);
+std::string QuoteCharacterLiteral(const std::string &str,
+ bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
+ return QuoteCharacterLiteralHelper(
+ str, doubleDoubleQuotes, doubleBackslash, encoding);
+}
+
+// Overload for 16-bit character strings; all of the work is done by the
+// shared QuoteCharacterLiteralHelper template.
+std::string QuoteCharacterLiteral(const std::u16string &str,
+    bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
+  std::string quoted{QuoteCharacterLiteralHelper(
+      str, doubleDoubleQuotes, doubleBackslash, encoding)};
+  return quoted;
+}
+
+// Overload for 32-bit character strings; all of the work is done by the
+// shared QuoteCharacterLiteralHelper template.
+std::string QuoteCharacterLiteral(const std::u32string &str,
+    bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
+  std::string quoted{QuoteCharacterLiteralHelper(
+      str, doubleDoubleQuotes, doubleBackslash, encoding)};
+  return quoted;
+}
+
+// Encodes one ISO 8859-1 (Latin-1) code point.  The result is always a
+// single byte holding the code point unchanged.
+EncodedCharacter EncodeLATIN_1(char codepoint) {
+  // NOTE(review): with a plain char parameter this check looks like it can
+  // never fail on hosts with 8-bit char -- confirm whether a wider
+  // parameter type was intended.
+  CHECK(codepoint <= 0xff);
+  EncodedCharacter encoded;
+  encoded.bytes = 1;
+  encoded.buffer[0] = codepoint;
+  return encoded;
+}
+
+// Encodes one code point as a 1- to 4-byte UTF-8 sequence.  Code points
+// above the 21-bit limit of the 4-byte form (0x1fffff) fail a CHECK.
+EncodedCharacter EncodeUTF_8(char32_t codepoint) {
+  // char32_t is unsigned, so only the upper bound needs checking.
+  EncodedCharacter encoded;
+  // Builds a continuation byte (10xxxxxx) from the six payload bits that
+  // start at bit position `shift` of the code point.
+  const auto trail{[codepoint](int shift) -> char32_t {
+    return 0x80 | ((codepoint >> shift) & 0x3f);
+  }};
+  if (codepoint > 0xffff) {
+    // UCS stops at 0x10ffff, but the 4-byte form has room for 21 bits.
+    CHECK(codepoint <= 0x1fffff);
+    encoded.buffer[0] = 0xf0 | (codepoint >> 18);
+    encoded.buffer[1] = trail(12);
+    encoded.buffer[2] = trail(6);
+    encoded.buffer[3] = trail(0);
+    encoded.bytes = 4;
+  } else if (codepoint > 0x7ff) {
+    encoded.buffer[0] = 0xe0 | (codepoint >> 12);
+    encoded.buffer[1] = trail(6);
+    encoded.buffer[2] = trail(0);
+    encoded.bytes = 3;
+  } else if (codepoint > 0x7f) {
+    encoded.buffer[0] = 0xc0 | (codepoint >> 6);
+    encoded.buffer[1] = trail(0);
+    encoded.bytes = 2;
+  } else {
+    // 7-bit ASCII is stored as-is.
+    encoded.buffer[0] = codepoint;
+    encoded.bytes = 1;
+  }
+  return encoded;
+}
+
+// Encodes one character in EUC-JP.  Values up to 0x7f are emitted as a
+// single byte; anything larger is treated as a JIS X 0208 code whose high
+// and low bytes are each emitted with the top bit set.
+EncodedCharacter EncodeEUC_JP(char16_t codepoint) {
+  // Assume JIS X 0208 (TODO: others)
+  // DecodeEUC_JPCharacter accepts lead and trail bytes up to 0xfe and masks
+  // each with 0x7f, so the largest two-byte value it can produce is 0x7e7e.
+  // The bound here must admit that whole range or a decode/encode round
+  // trip of a valid character would fail the check.
+  CHECK(codepoint <= 0x7e7e);
+  EncodedCharacter result;
+  if (codepoint <= 0x7f) {
+    result.buffer[0] = codepoint;
+    result.bytes = 1;
+  } else {
+    result.buffer[0] = 0x80 | (codepoint >> 8);
+    result.buffer[1] = 0x80 | (codepoint & 0x7f);
+    result.bytes = 2;
+  }
+  return result;
+}
+
+// Encodes one code point in the requested encoding by dispatching to the
+// encoding-specific routine.  The Latin-1 and EUC-JP encoders take narrower
+// parameter types than char32_t, so the range is verified here, before the
+// value is narrowed; checking after the implicit conversion would only ever
+// see the truncated value.
+EncodedCharacter EncodeCharacter(Encoding encoding, char32_t codepoint) {
+  switch (encoding) {
+  case Encoding::LATIN_1:
+    CHECK(codepoint <= 0xff);
+    return EncodeLATIN_1(static_cast<char>(codepoint));
+  case Encoding::UTF_8: return EncodeUTF_8(codepoint);
+  case Encoding::EUC_JP:
+    CHECK(codepoint <= 0xffff);
+    return EncodeEUC_JP(static_cast<char16_t>(codepoint));
+  default: CRASH_NO_CASE;
+  }
+}
+
+// Decodes the single UTF-8 character that begins at cp, looking at no more
+// than `bytes` bytes.  On success the result carries the code point and the
+// number of bytes consumed.  A default-constructed result (bytes == 0) is
+// returned when the input is empty, truncated, has a malformed trail byte,
+// or is an overlong encoding.
+DecodedCharacter DecodeUTF_8Character(const char *cp, std::size_t bytes) {
+  if (bytes == 0) {
+    return {};
+  }
+  auto p{reinterpret_cast<const std::uint8_t *>(cp)};
+  char32_t ch{*p};
+  if (ch < 0x80) {
+    return {ch, 1};  // 7-bit ASCII
+  }
+  // Every trail byte must individually have the form 10xxxxxx.  OR-ing the
+  // bytes together before masking would let a 00xxxxxx byte slip through
+  // whenever some other trail byte supplies the high bit.
+  const auto isTrail{[](std::uint8_t byte) { return (byte & 0xc0) == 0x80; }};
+  int charBytes{0};
+  char32_t minimum{0};  // smallest code point that needs this many bytes
+  if ((ch & 0xf8) == 0xf0 && bytes >= 4 && isTrail(p[1]) && isTrail(p[2]) &&
+      isTrail(p[3])) {
+    charBytes = 4;
+    minimum = 0x10000;
+    ch &= 7;
+  } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && isTrail(p[1]) &&
+      isTrail(p[2])) {
+    charBytes = 3;
+    minimum = 0x800;
+    ch &= 0xf;
+  } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && isTrail(p[1])) {
+    charBytes = 2;
+    minimum = 0x80;
+    ch &= 0x1f;
+  } else {
+    return {};  // not valid UTF-8
+  }
+  for (int j{1}; j < charBytes; ++j) {
+    ch = (ch << 6) | (p[j] & 0x3f);
+  }
+  // Overlong forms are detected from the decoded value rather than from the
+  // lead byte alone: lead bytes 0xe0 and 0xf0 begin valid sequences (for
+  // U+0800..U+0FFF and U+10000..U+3FFFF) and must not be rejected outright.
+  if (ch < minimum) {
+    return {};
+  }
+  return {ch, charBytes};
}
-std::string QuoteCharacterLiteral(
- const std::u16string &str, bool doubleDoubleQuotes, bool doubleBackslash) {
- return QuoteCharacterLiteralHelper(str, doubleDoubleQuotes, doubleBackslash);
+// Decodes the single EUC-JP character that begins at cp, looking at no more
+// than `bytes` bytes.  Bytes below 0x80 decode to themselves.  A
+// default-constructed result (bytes == 0) is returned when the input is
+// empty, truncated, or not a recognized EUC-JP sequence.
+DecodedCharacter DecodeEUC_JPCharacter(const char *cp, std::size_t bytes) {
+  if (bytes == 0) {
+    return {};
+  }
+  auto p{reinterpret_cast<const std::uint8_t *>(cp)};
+  char32_t ch{*p};
+  int charBytes{1};
+  if (ch >= 0x80) {
+    // Range shared by the lead and trail bytes of the two- and three-byte
+    // kanji forms; the same bounds are used by EUC_JPCharacterBytes.
+    const auto isKanjiByte{
+        [](std::uint8_t byte) { return byte >= 0xa1 && byte <= 0xfe; }};
+    if (bytes >= 2 && ch == 0x8e && p[1] >= 0xa1 && p[1] <= 0xdf) {
+      charBytes = 2;  // JIS X 0201
+      ch = p[1];
+    } else if (bytes >= 3 && ch == 0x8f && isKanjiByte(p[1]) &&
+        isKanjiByte(p[2])) {
+      charBytes = 3;  // JIS X 0212
+      // High byte comes from p[1], low byte from p[2].
+      ch = ((p[1] & 0x7f) << 8) | (p[2] & 0x7f);
+    } else if (bytes >= 2 && ch >= 0xa1 && ch <= 0xfe && isKanjiByte(p[1])) {
+      charBytes = 2;  // JIS X 0208
+      ch = ((ch & 0x7f) << 8) | (p[1] & 0x7f);
+    } else {
+      return {};
+    }
+  }
+  return {ch, charBytes};
}
-std::string QuoteCharacterLiteral(
- const std::u32string &str, bool doubleDoubleQuotes, bool doubleBackslash) {
- return QuoteCharacterLiteralHelper(str, doubleDoubleQuotes, doubleBackslash);
+// Decodes one Latin-1 character: the byte at cp, read as an unsigned value,
+// is the code point, and exactly one byte is consumed.
+DecodedCharacter DecodeLATIN1Character(const char *cp) {
+  const auto byte{static_cast<std::uint8_t>(*cp)};
+  return {byte, 1};
}
-std::optional<std::u32string> DecodeUTF8(const std::string &s) {
+// Decodes the character at cp in the given encoding by handing off to the
+// encoding-specific decoder.  `bytes` is the number of bytes available at
+// cp; the Latin-1 decoder does not use it.
+DecodedCharacter DecodeCharacter(
+    Encoding encoding, const char *cp, std::size_t bytes) {
+  if (encoding == Encoding::LATIN_1) {
+    return DecodeLATIN1Character(cp);
+  } else if (encoding == Encoding::UTF_8) {
+    return DecodeUTF_8Character(cp, bytes);
+  } else if (encoding == Encoding::EUC_JP) {
+    return DecodeEUC_JPCharacter(cp, bytes);
+  }
+  CRASH_NO_CASE;
+}
+
+std::u32string DecodeUTF_8(const std::string &s) {
std::u32string result;
- const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(s.data())};
+ const char *p{s.c_str()};
for (auto bytes{s.size()}; bytes != 0;) {
- decltype(bytes) charBytes{1};
- char32_t ch{*p++};
- if ((ch & 0xc0) > 0x40) {
- if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
- ((p[0] | p[1] | p[2]) & 0xc0) == 0x80) {
- charBytes = 4;
- ch = ((ch & 7) << 6) | (*p++ & 0x3f);
- ch = (ch << 6) | (*p++ & 0x3f);
- ch = (ch << 6) | (*p++ & 0x3f);
- } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
- ((p[0] | p[1]) & 0xc0) == 0x80) {
- charBytes = 3;
- ch = ((ch & 0xf) << 6) | (*p++ & 0x3f);
- ch = (ch << 6) | (*p++ & 0x3f);
- } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
- (*p & 0xc0) == 0x80) {
- charBytes = 2;
- ch = ((ch & 0x1f) << 6) | (*p++ & 0x3f);
- } else {
- return std::nullopt; // not valid UTF-8
+ DecodedCharacter decoded{DecodeUTF_8Character(p, bytes)};
+ if (decoded.bytes > 0) {
+ if (static_cast<std::size_t>(decoded.bytes) <= bytes) {
+ result.append(1, decoded.unicode);
+ bytes -= decoded.bytes;
+ p += decoded.bytes;
+ continue;
}
}
- result.append(1, ch);
- bytes -= charBytes;
+ result.append(1, static_cast<uint8_t>(*p));
+ ++p;
+ --bytes;
}
- return {result};
+ return result;
}
+
+// Decodes an EUC-JP byte string into 16-bit code units.  Any byte that does
+// not begin a decodable character is copied through unchanged as a single
+// unit, so the function always consumes the whole input.
+std::u16string DecodeEUC_JP(const std::string &s) {
+  std::u16string result;
+  result.reserve(s.size());  // upper bound: at most one unit per input byte
+  const char *p{s.c_str()};
+  for (auto bytes{s.size()}; bytes != 0;) {
+    DecodedCharacter decoded{DecodeEUC_JPCharacter(p, bytes)};
+    if (decoded.bytes > 0 &&
+        static_cast<std::size_t>(decoded.bytes) <= bytes) {
+      result.append(1, decoded.unicode);
+      bytes -= decoded.bytes;
+      p += decoded.bytes;
+    } else {
+      // Not decodable here: pass the raw byte through and resynchronize
+      // on the next one.
+      result.append(1, static_cast<std::uint8_t>(*p));
+      ++p;
+      --bytes;
+    }
+  }
+  return result;
+}
+
}
-// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
namespace Fortran::parser {
-enum class Encoding { UTF8, EUC_JP };
+// We can easily support Fortran program source in any character
+// set whose first 128 code points correspond to ASCII codes 0-127 (ISO/IEC646).
+// The specific encodings that we can handle include:
+// LATIN_1: ISO 8859-1 Latin-1
+// UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646)
+// EUC_JP: 1-3 byte encoding of JIS X 0208 / 0212
+enum class Encoding { LATIN_1, UTF_8, EUC_JP };
inline constexpr bool IsUpperCaseLetter(char ch) {
return ch >= 'A' && ch <= 'Z';
}
}
+struct EncodedCharacter {
+ char buffer[4];
+ int bytes{0};
+};
+
+EncodedCharacter EncodeLATIN_1(char);
+EncodedCharacter EncodeUTF_8(char32_t);
+EncodedCharacter EncodeEUC_JP(char16_t);
+EncodedCharacter EncodeCharacter(Encoding, char32_t);
+
template<typename NORMAL, typename INSERTED>
void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
- bool doubleDoubleQuotes = true, bool doubleBackslash = true) {
+ bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+ Encoding encoding = Encoding::UTF_8) {
if (ch == '"') {
if (doubleDoubleQuotes) {
insert('"');
insert('\\');
}
emit('\\');
- } else if (ch < ' ' || (ch >= 0x80 && ch <= 0xff)) {
+ } else if (ch < ' ' || (encoding == Encoding::LATIN_1 && ch >= 0x7f)) {
insert('\\');
if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
emit(*escape);
insert('0' + ((ch >> 3) & 7));
insert('0' + (ch & 7));
}
- } else if (ch <= 0x7f) {
- emit(ch);
- } else if (ch <= 0x7ff) {
- emit(0xc0 | ((ch >> 6) & 0x1f));
- emit(0x80 | (ch & 0x3f));
- } else if (ch <= 0xffff) {
- emit(0xe0 | ((ch >> 12) & 0x0f));
- emit(0x80 | ((ch >> 6) & 0x3f));
- emit(0x80 | (ch & 0x3f));
} else {
- emit(0xf0 | ((ch >> 18) & 0x07));
- emit(0x80 | ((ch >> 12) & 0x3f));
- emit(0x80 | ((ch >> 6) & 0x3f));
- emit(0x80 | (ch & 0x3f));
+ EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
+ for (int j{0}; j < encoded.bytes; ++j) {
+ emit(encoded.buffer[j]);
+ }
}
}
std::string QuoteCharacterLiteral(const std::string &,
- bool doubleDoubleQuotes = true, bool doubleBackslash = true);
+ bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+ Encoding = Encoding::LATIN_1);
std::string QuoteCharacterLiteral(const std::u16string &,
- bool doubleDoubleQuotes = true, bool doubleBackslash = true);
+ bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+ Encoding = Encoding::EUC_JP);
std::string QuoteCharacterLiteral(const std::u32string &,
- bool doubleDoubleQuotes = true, bool doubleBackslash = true);
+ bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+ Encoding = Encoding::UTF_8);
-std::optional<int> UTF8CharacterBytes(const char *);
+std::optional<int> UTF_8CharacterBytes(const char *);
std::optional<int> EUC_JPCharacterBytes(const char *);
-std::optional<std::size_t> CountCharacters(
- const char *, std::size_t bytes, std::optional<int> (*)(const char *));
-std::optional<std::u32string> DecodeUTF8(const std::string &);
+std::optional<int> CharacterBytes(const char *, Encoding);
+std::optional<int> CountCharacters(const char *, std::size_t bytes, Encoding);
+
+struct DecodedCharacter {
+ char32_t unicode{0};
+ int bytes{0}; // signifying failure
+};
+
+DecodedCharacter DecodeUTF_8Character(const char *, std::size_t);
+DecodedCharacter DecodeEUC_JPCharacter(const char *, std::size_t);
+DecodedCharacter DecodeLATIN1Character(const char *);
+DecodedCharacter DecodeCharacter(Encoding, const char *, std::size_t);
+
+std::u32string DecodeUTF_8(const std::string &);
+std::u16string DecodeEUC_JP(const std::string &);
}
#endif // FORTRAN_PARSER_CHARACTERS_H_
}
std::optional<const char *> GetNextChar() {
- if (p_ >= limit_) {
+ if (p_ < limit_) {
+ return UncheckedAdvance();
+ } else {
return std::nullopt;
}
- return {UncheckedAdvance()};
}
std::optional<const char *> PeekAtNextChar() const {
- if (p_ >= limit_) {
+ if (p_ < limit_) {
+ return p_;
+ } else {
return std::nullopt;
}
- return {p_};
}
std::size_t BytesRemaining() const {
UserState *userState_{nullptr};
bool inFixedForm_{false};
- Encoding encoding_{Encoding::UTF8};
+ Encoding encoding_{Encoding::UTF_8};
bool anyErrorRecovery_{false};
bool anyConformanceViolation_{false};
bool deferMessages_{false};
bool isFixedForm{false};
int fixedFormColumns{72};
LanguageFeatureControl features;
- Encoding encoding{Encoding::UTF8};
+ Encoding encoding{Encoding::UTF_8};
std::vector<std::string> searchDirectories;
std::vector<Predefinition> predefinitions;
bool instrumentedParse{false};
bool escape{false};
bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
while (true) {
- char32_t ch{static_cast<unsigned char>(*at_)};
+ DecodedCharacter decoded{DecodeCharacter(
+ encoding_, at_, static_cast<std::size_t>(limit_ - at_))};
+ if (decoded.bytes <= 0) {
+ Say(GetProvenanceRange(start, end),
+ "Bad character in character literal"_err_en_US);
+ break;
+ }
+ char32_t ch{decoded.unicode};
escape = !escape && ch == '\\' && escapesEnabled;
EmitQuotedChar(ch, emit, insert, false, !escapesEnabled);
while (PadOutCharacterLiteral(tokens)) {
}
break;
}
+ at_ += decoded.bytes - 1;
end = at_ + 1;
NextChar();
if (*at_ == quote && !escape) {
if (PadOutCharacterLiteral(tokens)) {
} else if (*at_ == '\n') {
Say(GetProvenanceRange(start, at_),
- "possible truncated Hollerith literal"_en_US);
+ "Possible truncated Hollerith literal"_en_US);
break;
} else {
NextChar();
- EmitChar(tokens, *at_);
- // Multi-byte character encodings should count as single characters.
- int bytes{1};
- if (encoding_ == Encoding::EUC_JP) {
- if (std::optional<int> chBytes{EUC_JPCharacterBytes(at_)}) {
- bytes = *chBytes;
- }
- } else if (encoding_ == Encoding::UTF8) {
- if (std::optional<int> chBytes{UTF8CharacterBytes(at_)}) {
- bytes = *chBytes;
+ // Multi-byte character encodings each count as single characters.
+ DecodedCharacter decoded{DecodeCharacter(
+ encoding_, at_, static_cast<std::size_t>(limit_ - at_))};
+ if (decoded.bytes > 0) {
+ // The cooked character stream we emit is always in UTF-8.
+ EncodedCharacter utf8{EncodeUTF_8(decoded.unicode)};
+ for (int j{0}; j < utf8.bytes; ++j) {
+ EmitChar(tokens, utf8.buffer[j]);
}
- }
- while (bytes-- > 1) {
- EmitChar(tokens, *++at_);
+ // Leave at_ on the last byte of this character: the NextChar() call
+ // that precedes the next decode supplies the final advance, just as
+ // in the character literal path.
+ at_ += decoded.bytes - 1;
+ } else {
+ Say(GetProvenanceRange(start, at_),
+ "Bad character in Hollerith literal"_err_en_US);
+ break;
}
}
}
LanguageFeatureControl features_;
bool inFixedForm_{false};
int fixedFormColumnLimit_{72};
- Encoding encoding_{Encoding::UTF8};
+ Encoding encoding_{Encoding::UTF_8};
int delimiterNesting_{0};
int prescannerNesting_{0};
return std::nullopt;
}
if (ch != '\\') {
- return {Result::Bare(ch)};
+ return Result::Bare(ch);
}
if (!(och = nextCh.Parse(state)).has_value()) {
return std::nullopt;
return std::nullopt;
}
if (std::optional<char> escChar{BackslashEscapeValue(ch)}) {
- return {Result::Escaped(*escChar)};
+ return Result::Escaped(*escChar);
}
if (IsOctalDigit(ch)) {
ch -= '0';
if (ch->ch == quote && !ch->wasEscaped) {
static constexpr auto doubled{attempt(AnyOfChars{SetOfChars{quote}})};
if (!doubled.Parse(state).has_value()) {
- return {str};
+ return str;
}
}
str += ch->ch;
}
std::string content;
for (auto j{*charCount}; j-- > 0;) {
- int bytes{1};
- const char *p{state.GetLocation()};
- if (state.encoding() == Encoding::EUC_JP) {
- std::optional<int> chBytes{EUC_JPCharacterBytes(p)};
- if (!chBytes.has_value()) {
- state.Say(start, "bad EUC_JP characters in Hollerith"_err_en_US);
- return std::nullopt;
- }
- bytes = *chBytes;
- } else if (state.encoding() == Encoding::UTF8) {
- std::optional<int> chBytes{UTF8CharacterBytes(p)};
- if (!chBytes.has_value()) {
- state.Say(start, "bad UTF-8 characters in Hollerith"_err_en_US);
- return std::nullopt;
- }
- bytes = *chBytes;
- }
- if (bytes == 1) {
- std::optional<const char *> at{nextCh.Parse(state)};
- if (!at.has_value() || !isprint(**at)) {
- state.Say(
- start, "insufficient or bad characters in Hollerith"_err_en_US);
- return std::nullopt;
+ if (std::optional<int> chBytes{
+ CharacterBytes(state.GetLocation(), state.encoding())}) {
+ for (int bytes{*chBytes}; bytes > 0; --bytes) {
+ if (std::optional<const char *> at{nextCh.Parse(state)}) {
+ if (*chBytes == 1 && !isprint(**at)) {
+ state.Say(start, "Bad character in Hollerith"_err_en_US);
+ return std::nullopt;
+ }
+ content += **at;
+ } else {
+ state.Say(start, "Insufficient characters in Hollerith"_err_en_US);
+ return std::nullopt;
+ }
}
- content += **at;
} else {
- // Multi-byte character
- while (bytes-- > 0) {
- content += *nextCh.Parse(state).value();
- }
+ state.Say(start, "Bad multi-byte character in Hollerith"_err_en_US);
+ return std::nullopt;
}
}
- return {content};
+ return content;
}
};
if (const auto &k{std::get<std::optional<KindParam>>(x.t)}) {
if (std::holds_alternative<KindParam::Kanji>(k->u)) {
Word("NC");
+ Put(QuoteCharacterLiteral(std::get<std::string>(x.t), true,
+ backslashEscapes_, Encoding::EUC_JP));
} else {
Walk(*k), Put('_');
+ Put(QuoteCharacterLiteral(
+ std::get<std::string>(x.t), true, backslashEscapes_));
}
+ } else {
+ Put(QuoteCharacterLiteral(
+ std::get<std::string>(x.t), true, backslashEscapes_));
}
- Put(QuoteCharacterLiteral(
- std::get<std::string>(x.t), true, backslashEscapes_));
}
void Before(const HollerithLiteralConstant &x) {
- std::optional<std::size_t> chars{CountCharacters(x.v.data(), x.v.size(),
- encoding_ == Encoding::EUC_JP ? EUC_JPCharacterBytes
- : UTF8CharacterBytes)};
- if (chars.has_value()) {
+ if (std::optional<std::size_t> chars{
+ CountCharacters(x.v.data(), x.v.size(), encoding_)}) {
Unparse(*chars);
} else {
Unparse(x.v.size());
int column_{1};
const int maxColumns_{80};
std::set<CharBlock> structureComponents_;
- Encoding encoding_{Encoding::UTF8};
+ Encoding encoding_{Encoding::UTF_8};
bool capitalizeKeywords_{true};
bool openmpDirective_{false};
bool backslashEscapes_{false};
/// Convert parsed program to out as Fortran.
void Unparse(std::ostream &out, const Program &program,
- Encoding encoding = Encoding::UTF8, bool capitalizeKeywords = true,
+ Encoding encoding = Encoding::UTF_8, bool capitalizeKeywords = true,
bool backslashEscapes = true, preStatementType *preStatement = nullptr,
TypedExprAsFortran *expr = nullptr);
}
if (!CheckIntrinsicKind(TypeCategory::Character, kind)) {
return std::nullopt;
}
+ std::u32string unicode{parser::DecodeUTF_8(string)};
if (kind == 1) {
- return {AsGenericExpr(
- Constant<Type<TypeCategory::Character, 1>>{std::move(string)})};
- } else if (std::optional<std::u32string> unicode{
- parser::DecodeUTF8(string)}) {
- if (kind == 4) {
- return {AsGenericExpr(
- Constant<Type<TypeCategory::Character, 4>>{std::move(*unicode)})};
+ std::string result;
+ for (const char32_t &ch : unicode) {
+ if (ch <= 0xff) {
+ result += static_cast<char>(ch);
+ } else {
+ // Original literal in UTF-8 source contained a byte sequence
+ // that looked like UTF-8 and got decoded as such. Reconstruct.
+ parser::EncodedCharacter encoded{parser::EncodeUTF_8(ch)};
+ result += std::string{
+ encoded.buffer, static_cast<std::size_t>(encoded.bytes)};
+ }
}
- CHECK(kind == 2);
- // TODO: better Kanji support
+ return AsGenericExpr(
+ Constant<Type<TypeCategory::Character, 1>>{std::move(result)});
+ } else if (kind == 2) {
std::u16string result;
- for (const char32_t &ch : *unicode) {
+ for (const char32_t &ch : unicode) {
+ if (ch > 0xffff) {
+ Say("Bad character in CHARACTER(KIND=2) literal"_err_en_US);
+ return std::nullopt;
+ }
result += static_cast<char16_t>(ch);
}
- return {AsGenericExpr(
- Constant<Type<TypeCategory::Character, 2>>{std::move(result)})};
+ return AsGenericExpr(
+ Constant<Type<TypeCategory::Character, 2>>{std::move(result)});
} else {
- Say("bad UTF-8 encoding of CHARACTER(KIND=%d) literal"_err_en_US, kind);
- return std::nullopt;
+ CHECK(kind == 4);
+ return AsGenericExpr(
+ Constant<Type<TypeCategory::Character, 4>>{std::move(unicode)});
}
}
-// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
namespace Fortran::semantics {
void UnparseWithSymbols(std::ostream &, const parser::Program &,
- parser::Encoding encoding = parser::Encoding::UTF8);
+ parser::Encoding encoding = parser::Encoding::UTF_8);
}
#endif // FORTRAN_SEMANTICS_UNPARSE_WITH_SYMBOLS_H_
bool forcedForm{false}; // -Mfixed or -Mfree appeared
bool warnOnNonstandardUsage{false}; // -Mstandard
bool warningsAreErrors{false}; // -Werror
- Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF8};
+ Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF_8};
bool parseOnly{false};
bool dumpProvenance{false};
bool dumpCookedChars{false};
bool forcedForm{false}; // -Mfixed or -Mfree appeared
bool warnOnNonstandardUsage{false}; // -Mstandard
bool warningsAreErrors{false}; // -Werror
- Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF8};
+ Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF_8};
bool parseOnly{false};
bool dumpProvenance{false};
bool dumpCookedChars{false};
} else if (arg == "-module-suffix") {
driver.moduleFileSuffix = args.front();
args.pop_front();
+ } else if (arg == "-fno-utf-8") {
+ options.encoding = Fortran::parser::Encoding::LATIN_1;
} else if (arg == "-help" || arg == "--help" || arg == "-?") {
std::cerr
<< "f18 options:\n"