* except for the payload in character literals, Hollerith constants,
and character and Hollerith edit descriptors, all letters have been
normalized to lower case
-* all original non-ASCII characters in character literals have been
- decoded, converted to UTF-8, and then formatted with escape sequences
+* all original non-ASCII characters in Hollerith constants have been
+ decoded and re-encoded into UTF-8
Lines in the cooked character stream can be of arbitrary length.
} else if (Rank() == 0) {
o << Result::kind << '_';
}
- o << parser::QuoteCharacterLiteral(value,
- false /* avoid backslash escapes */,
- parser::Encoding::UTF_8 /* module files are UTF-8 */);
+ o << parser::QuoteCharacterLiteral(value);
}
if (Rank() > 0) {
o << ']';
if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
emit(*escape);
} else {
- // octal escape sequence
- if (ch > 077) {
- insert('0' + (ch >> 6));
- }
- if (ch > 07) {
- insert('0' + ((ch >> 3) & 7));
- }
+ // octal escape sequence; always emit 3 digits to avoid ambiguity
+ insert('0' + (ch >> 6));
+ insert('0' + ((ch >> 3) & 7));
insert('0' + (ch & 7));
}
} else {
inCharLiteral_ = true;
const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
+ bool isEscaped{false};
bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
- Encoding encoding{encoding_};
- if (isKanji) {
- // NC'...' - the contents are always decoded as EUC_JP
- encoding = Encoding::EUC_JP;
- } else if (encoding == Encoding::EUC_JP) {
- encoding = Encoding::LATIN_1; // for compatibility with tests
- }
while (true) {
- DecodedCharacter decoded{DecodeCharacter(
- encoding, at_, static_cast<std::size_t>(limit_ - at_), escapesEnabled)};
- if (decoded.bytes <= 0) {
- Say(GetProvenanceRange(start, at_),
- "Bad character in character literal"_en_US);
- // Just eat a byte and press on.
- decoded.codepoint = static_cast<unsigned char>(*at_);
- decoded.bytes = 1;
- }
- char32_t ch{decoded.codepoint};
- if (ch == quote) {
- if (decoded.bytes > 1) {
- EmitChar(tokens, '\\');
- }
- }
- EmitQuotedChar(
- ch, emit, insert, true /* use backslash escapes */, Encoding::UTF_8);
+ EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
+ Encoding::LATIN_1);
while (PadOutCharacterLiteral(tokens)) {
}
if (*at_ == '\n') {
}
break;
}
- at_ += decoded.bytes - 1;
+ isEscaped = !isEscaped && *at_ == '\\';
end = at_ + 1;
NextChar();
if (*at_ == quote) {
- // A doubled quote mark becomes a single instance of the quote character
- // in the literal (later). There can be spaces between the quotes in
- // fixed form source.
+ if (isEscaped) {
+ if (escapesEnabled) {
+ continue;
+ } else {
+ insert('\\');
+ }
+ }
+ // A doubled unescaped quote mark becomes a single instance of that
+ // quote character in the literal (later). There can be spaces between
+ // the quotes in fixed form source.
EmitChar(tokens, quote);
inCharLiteral_ = false; // for cases like print *, '...'!comment
NextChar();
} else {
NextChar();
// Multi-byte character encodings each count as single characters.
+ // The cooked character stream always uses UTF-8 for Hollerith.
DecodedCharacter decoded{DecodeCharacter(
encoding_, at_, static_cast<std::size_t>(limit_ - at_))};
if (decoded.bytes > 0) {
- // The cooked character stream we emit is always in UTF-8.
EncodedCharacter utf8{EncodeUTF_8(decoded.codepoint)};
for (int j{0}; j < utf8.bytes; ++j) {
EmitChar(tokens, utf8.buffer[j]);
} else if (inPreprocessorDirective_) {
return false;
} else if (lineClass.kind ==
- LineClassification::Kind::ConditionalCompilationDirective) {
+ LineClassification::Kind::ConditionalCompilationDirective ||
+ lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
// Allow conditional compilation directives (e.g., #ifdef) to affect
// continuation lines.
+ // Allow other preprocessor directives, too, except #include.
preprocessor_.Directive(TokenizePreprocessorDirective(), this);
return true;
} else if (afterAmpersand &&
NextLine();
return true;
}
- } while (SkipCommentLine(false /* not after & */));
+ } while (SkipCommentLine(false /* not after ampersand */));
return false;
}
return std::make_pair(*escChar, true);
} else if (IsOctalDigit(**cp)) {
char result{static_cast<char>(**cp - '0')};
- for (int j = (result > 3 ? 1 : 2); j-- > 0;) {
+ for (int j{0}; j < 2 && static_cast<unsigned char>(result) <= 037;
+ ++j) {
static constexpr auto octalDigit{attempt("01234567"_ch)};
if (std::optional<const char *> oct{octalDigit.Parse(state)}) {
- result = 8 * result + **oct - '0';
+ result = 8 * result + DecimalDigitValue(**oct);
} else {
break;
}
}
return std::make_pair(result, true);
- } else {
- // unknown escape - ignore the '\' (PGI compatibility)
- return std::make_pair(static_cast<char>(1 [*cp]), true);
+ } else if (**cp == 'x' || **cp == 'X') { // \xHH: exactly two hex digits
+ char result{0};
+ for (int j{0}; j < 2; ++j) {
+ static constexpr auto hexadecimalDigit{
+ attempt("0123456789abcdefABCDEF"_ch)}; // all 22 hexadecimal digit characters
+ if (std::optional<const char *> hex{hexadecimalDigit.Parse(state)}) {
+ result = 16 * result + HexadecimalDigitValue(**hex);
+ } else {
+ return std::nullopt; // fewer than two hex digits: not a valid escape
+ }
+ }
+ return std::make_pair(result, true);
+ } else if (IsLetter(**cp)) {
+ // Unknown escape - ignore the '\' (PGI compatibility)
+ return std::make_pair(**cp, true);
}
}
return std::nullopt;
x.u);
}
void Unparse(const CharLiteralConstant &x) { // R724
- Encoding encoding{Encoding::LATIN_1};
if (const auto &k{std::get<std::optional<KindParam>>(x.t)}) {
if (std::holds_alternative<KindParam::Kanji>(k->u)) {
Word("NC");
- encoding = Encoding::EUC_JP;
} else {
Walk(*k), Put('_');
}
}
Put(QuoteCharacterLiteral(
- DecodeUTF_8(std::get<std::string>(x.t)), backslashEscapes_, encoding));
+ std::get<std::string>(x.t), backslashEscapes_, Encoding::LATIN_1));
}
void Unparse(const HollerithLiteralConstant &x) {
std::u32string ucs{DecodeUTF_8(x.v)};
if (!CheckIntrinsicKind(TypeCategory::Character, kind)) {
return std::nullopt;
}
- std::u32string codepoint{parser::DecodeUTF_8(string)};
if (kind == 1) {
- std::string result;
- for (char32_t ch : codepoint) {
- if (ch <= 0xff) {
- result += static_cast<char>(ch);
- } else {
- // Original literal in UTF-8 source contained a byte sequence
- // that looked like UTF-8 and got decoded as such. Reconstruct.
- parser::EncodedCharacter encoded{parser::EncodeUTF_8(ch)};
- result += std::string{
- encoded.buffer, static_cast<std::size_t>(encoded.bytes)};
- }
- }
return AsGenericExpr(
- Constant<Type<TypeCategory::Character, 1>>{std::move(result)});
+ Constant<Type<TypeCategory::Character, 1>>{std::move(string)});
} else if (kind == 2) {
- std::u16string result;
- for (char32_t ch : codepoint) {
- if (ch > 0xffff) {
- Say("Bad character in CHARACTER(KIND=2) literal"_err_en_US);
- return std::nullopt;
- }
- result += static_cast<char16_t>(ch);
- }
- return AsGenericExpr(
- Constant<Type<TypeCategory::Character, 2>>{std::move(result)});
+ return AsGenericExpr(Constant<Type<TypeCategory::Character, 2>>{
+ parser::DecodeEUC_JP(string)});
} else {
CHECK(kind == 4);
- return AsGenericExpr(
- Constant<Type<TypeCategory::Character, 4>>{std::move(codepoint)});
+ return AsGenericExpr(Constant<Type<TypeCategory::Character, 4>>{
+ parser::DecodeUTF_8(string)});
}
}
!Expect: m.mod
!module m
-!character(:,4),parameter::c4=4_"Hi! 你好!"
-!character(:,1),parameter::c1=1_"Hi! ä½ å¥½!"
-!character(:,4),parameter::c4a(1_8:)=[CHARACTER(KIND=4,LEN=1)::"一","二","三","四","五"]
+!character(:,4),parameter::c4=4_"Hi! \344\275\240\345\245\275!"
+!character(:,1),parameter::c1=1_"Hi! \344\275\240\345\245\275!"
+!character(:,4),parameter::c4a(1_8:)=[CHARACTER(KIND=4,LEN=1)::"\344\270\200","\344\272\214","\344\270\211","\345\233\233","\344\272\224"]
!integer(4),parameter::lc4=7_4
!integer(4),parameter::lc1=11_4
!end