IostatShortRead,
IostatMissingTerminator,
IostatBadUnformattedRecord,
+ IostatUTF8Decoding,
};
const char *IostatErrorString(int);
type-info.cpp
unit.cpp
unit-map.cpp
+ utf.cpp
LINK_LIBS
FortranDecimal
for (std::size_t j{0}; j < numElements; ++j) {
A *x{&ExtractElement<A>(io, descriptor, subscripts)};
if (listOutput) {
- if (!ListDirectedDefaultCharacterOutput(io, *listOutput, x, length)) {
+ if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
return false;
}
} else if (auto edit{io.GetNextDataEdit()}) {
if constexpr (DIR == Direction::Output) {
- if (!EditDefaultCharacterOutput(io, *edit, x, length)) {
+ if (!EditCharacterOutput(io, *edit, x, length)) {
return false;
}
} else {
if (edit->descriptor != DataEdit::ListDirectedNullValue) {
- if (EditDefaultCharacterInput(io, *edit, x, length)) {
+ if (EditCharacterInput(io, *edit, x, length)) {
anyInput = true;
} else {
return anyInput && edit->IsNamelist();
switch (kind) {
case 1:
return FormattedCharacterIO<char, DIR>(io, descriptor);
- // TODO cases 2, 4
+ case 2:
+ return FormattedCharacterIO<char16_t, DIR>(io, descriptor);
+ case 4:
+ return FormattedCharacterIO<char32_t, DIR>(io, descriptor);
default:
handler.Crash(
"DescriptorIO: Unimplemented CHARACTER kind (%d) in descriptor",
#include "edit-input.h"
#include "namelist.h"
+#include "utf.h"
#include "flang/Common/real.h"
#include "flang/Common/uint128.h"
#include <algorithm>
if (next) {
negative = *next == '-';
if (negative || *next == '+') {
- io.GotChar();
io.SkipSpaces(remaining);
next = io.NextInField(remaining, edit);
}
case 'Z':
return EditBOZInput(io, edit, n, 16, kind << 3);
case 'A': // legacy extension
- return EditDefaultCharacterInput(
- io, edit, reinterpret_cast<char *>(n), kind);
+ return EditCharacterInput(io, edit, reinterpret_cast<char *>(n), kind);
default:
io.GetIoErrorHandler().SignalError(IostatErrorInFormat,
"Data edit descriptor '%c' may not be used with an INTEGER data item",
next = io.NextInField(remaining, edit);
}
if (!next) { // NextInField fails on separators like ')'
- next = io.GetCurrentChar();
+ std::size_t byteCount{0};
+ next = io.GetCurrentChar(byteCount);
if (next && *next == ')') {
- io.HandleRelativePosition(1);
+ io.HandleRelativePosition(byteCount);
}
}
} else if (remaining) {
return EditBOZInput(
io, edit, n, 16, common::BitsForBinaryPrecision(binaryPrecision));
case 'A': // legacy extension
- return EditDefaultCharacterInput(
- io, edit, reinterpret_cast<char *>(n), KIND);
+ return EditCharacterInput(io, edit, reinterpret_cast<char *>(n), KIND);
default:
io.GetIoErrorHandler().SignalError(IostatErrorInFormat,
"Data edit descriptor '%c' may not be used for REAL input",
}
// See 13.10.3.1 paragraphs 7-9 in Fortran 2018
+template <typename CHAR>
static bool EditDelimitedCharacterInput(
- IoStatementState &io, char *x, std::size_t length, char32_t delimiter) {
+ IoStatementState &io, CHAR *x, std::size_t length, char32_t delimiter) {
bool result{true};
while (true) {
- auto ch{io.GetCurrentChar()};
+ std::size_t byteCount{0};
+ auto ch{io.GetCurrentChar(byteCount)};
if (!ch) {
if (io.AdvanceRecord()) {
continue;
break;
}
}
- io.HandleRelativePosition(1);
+ io.HandleRelativePosition(byteCount);
if (*ch == delimiter) {
- auto next{io.GetCurrentChar()};
+ auto next{io.GetCurrentChar(byteCount)};
if (next && *next == delimiter) {
// Repeated delimiter: use as character value
- io.HandleRelativePosition(1);
+ io.HandleRelativePosition(byteCount);
} else {
break; // closing delimiter
}
return result;
}
-static bool EditListDirectedDefaultCharacterInput(
- IoStatementState &io, char *x, std::size_t length, const DataEdit &edit) {
- auto ch{io.GetCurrentChar()};
+template <typename CHAR>
+static bool EditListDirectedCharacterInput(
+ IoStatementState &io, CHAR *x, std::size_t length, const DataEdit &edit) {
+ std::size_t byteCount{0};
+ auto ch{io.GetCurrentChar(byteCount)};
if (ch && (*ch == '\'' || *ch == '"')) {
- io.HandleRelativePosition(1);
+ io.HandleRelativePosition(byteCount);
return EditDelimitedCharacterInput(io, x, length, *ch);
}
if (IsNamelistName(io) || io.GetConnectionState().IsAtEOF()) {
return false;
}
// Undelimited list-directed character input: stop at a value separator
- // or the end of the current record.
- std::optional<int> remaining{length};
+ // or the end of the current record. Subtlety: the "remaining" count
+ // here is a dummy that's used to avoid the interpretation of separators
+ // in NextInField.
+ std::optional<int> remaining{maxUTF8Bytes};
while (std::optional<char32_t> next{io.NextInField(remaining, edit)}) {
switch (*next) {
case ' ':
default:
*x++ = *next;
--length;
+ remaining = maxUTF8Bytes;
}
}
std::fill_n(x, length, ' ');
return true;
}
-bool EditDefaultCharacterInput(
- IoStatementState &io, const DataEdit &edit, char *x, std::size_t length) {
+template <typename CHAR>
+bool EditCharacterInput(
+ IoStatementState &io, const DataEdit &edit, CHAR *x, std::size_t length) {
switch (edit.descriptor) {
case DataEdit::ListDirected:
- return EditListDirectedDefaultCharacterInput(io, x, length, edit);
+ return EditListDirectedCharacterInput(io, x, length, edit);
case 'A':
case 'G':
break;
edit.descriptor);
return false;
}
- if (io.GetConnectionState().IsAtEOF()) {
+ const ConnectionState &connection{io.GetConnectionState()};
+ if (connection.IsAtEOF()) {
return false;
}
std::size_t remaining{length};
const char *input{nullptr};
std::size_t ready{0};
bool hitEnd{false};
- if (remaining > length) {
- // Discard leading bytes.
- // These bytes don't count towards INQUIRE(IOLENGTH=).
- std::size_t skip{remaining - length};
- do {
- if (ready == 0) {
- ready = io.GetNextInputBytes(input);
- if (ready == 0) {
- hitEnd = true;
- break;
- }
- }
- std::size_t chunk{std::min<std::size_t>(skip, ready)};
- io.HandleRelativePosition(chunk);
- ready -= chunk;
- input += chunk;
- skip -= chunk;
- } while (skip > 0);
- remaining = length;
- }
+ // Skip leading bytes.
+ // These bytes don't count towards INQUIRE(IOLENGTH=).
+ std::size_t skip{remaining > length ? remaining - length : 0};
// Transfer payload bytes; these do count.
while (remaining > 0) {
if (ready == 0) {
break;
}
}
- std::size_t chunk{std::min<std::size_t>(remaining, ready)};
- std::memcpy(x, input, chunk);
- x += chunk;
+ std::size_t chunk;
+ bool skipping{skip > 0};
+ if (connection.isUTF8) {
+ chunk = MeasureUTF8Bytes(*input);
+ if (skipping) {
+ --skip;
+ } else if (auto ucs{DecodeUTF8(input)}) {
+ *x++ = *ucs;
+ --length;
+ } else if (chunk == 0) {
+ // error recovery: skip bad encoding
+ chunk = 1;
+ }
+ --remaining;
+ } else {
+ if (skipping) {
+ chunk = std::min<std::size_t>(skip, ready);
+ skip -= chunk;
+ } else {
+ chunk = std::min<std::size_t>(remaining, ready);
+ std::memcpy(x, input, chunk);
+ x += chunk;
+ length -= chunk;
+ }
+ remaining -= chunk;
+ }
input += chunk;
- io.GotChar(chunk);
+ if (!skipping) {
+ io.GotChar(chunk);
+ }
io.HandleRelativePosition(chunk);
ready -= chunk;
- remaining -= chunk;
- length -= chunk;
}
// Pad the remainder of the input variable, if any.
- std::memset(x, ' ', length);
+ std::fill_n(x, length, ' ');
if (hitEnd) {
io.CheckForEndOfRecord(); // signal any needed error
}
template bool EditRealInput<10>(IoStatementState &, const DataEdit &, void *);
// TODO: double/double
template bool EditRealInput<16>(IoStatementState &, const DataEdit &, void *);
+
+template bool EditCharacterInput(
+ IoStatementState &, const DataEdit &, char *, std::size_t);
+template bool EditCharacterInput(
+ IoStatementState &, const DataEdit &, char16_t *, std::size_t);
+template bool EditCharacterInput(
+ IoStatementState &, const DataEdit &, char32_t *, std::size_t);
+
} // namespace Fortran::runtime::io
bool EditRealInput(IoStatementState &, const DataEdit &, void *);
bool EditLogicalInput(IoStatementState &, const DataEdit &, bool &);
-bool EditDefaultCharacterInput(
- IoStatementState &, const DataEdit &, char *, std::size_t);
+
+template <typename CHAR>
+bool EditCharacterInput(
+ IoStatementState &, const DataEdit &, CHAR *, std::size_t);
extern template bool EditRealInput<2>(
IoStatementState &, const DataEdit &, void *);
// TODO: double/double
extern template bool EditRealInput<16>(
IoStatementState &, const DataEdit &, void *);
+
+extern template bool EditCharacterInput(
+ IoStatementState &, const DataEdit &, char *, std::size_t);
+extern template bool EditCharacterInput(
+ IoStatementState &, const DataEdit &, char16_t *, std::size_t);
+extern template bool EditCharacterInput(
+ IoStatementState &, const DataEdit &, char32_t *, std::size_t);
+
} // namespace Fortran::runtime::io
#endif // FORTRAN_RUNTIME_EDIT_INPUT_H_
//===----------------------------------------------------------------------===//
#include "edit-output.h"
+#include "utf.h"
#include "flang/Common/uint128.h"
#include <algorithm>
}
break;
case 'A': // legacy extension
- return EditDefaultCharacterOutput(
+ return EditCharacterOutput(
io, edit, reinterpret_cast<char *>(&n), sizeof n);
default:
io.GetIoErrorHandler().Crash(
case 'G':
return Edit(EditForGOutput(edit));
case 'A': // legacy extension
- return EditDefaultCharacterOutput(
+ return EditCharacterOutput(
io_, edit, reinterpret_cast<char *>(&x_), sizeof x_);
default:
if (edit.IsListDirected()) {
}
}
-bool ListDirectedDefaultCharacterOutput(IoStatementState &io,
- ListDirectedStatementState<Direction::Output> &list, const char *x,
+template <typename CHAR>
+bool ListDirectedCharacterOutput(IoStatementState &io,
+ ListDirectedStatementState<Direction::Output> &list, const CHAR *x,
std::size_t length) {
bool ok{true};
MutableModes &modes{io.mutableModes()};
ok = ok && list.EmitLeadingSpaceOrAdvance(io);
// Value is delimited with ' or " marks, and interior
// instances of that character are doubled.
- auto EmitOne{[&](char ch) {
+ auto EmitOne{[&](CHAR ch) {
if (connection.NeedAdvance(1)) {
ok = ok && io.AdvanceRecord();
}
- ok = ok && io.Emit(&ch, 1);
+ ok = ok && io.EmitEncoded(&ch, 1);
}};
EmitOne(modes.delim);
for (std::size_t j{0}; j < length; ++j) {
// the same thing when tested with this case.
// This runtime splits the doubled delimiters across
// two records for lack of a better alternative.
- if (x[j] == modes.delim) {
+ if (x[j] == static_cast<CHAR>(modes.delim)) {
EmitOne(x[j]);
}
EmitOne(x[j]);
// Undelimited list-directed output
ok = ok && list.EmitLeadingSpaceOrAdvance(io, length > 0 ? 1 : 0, true);
std::size_t put{0};
+ std::size_t oneIfUTF8{connection.isUTF8 ? 1 : length};
while (ok && put < length) {
- auto chunk{std::min(length - put, connection.RemainingSpaceInRecord())};
- ok = ok && io.Emit(x + put, chunk);
- put += chunk;
- if (put < length) {
- ok = ok && io.AdvanceRecord() && io.Emit(" ", 1);
+ if (std::size_t chunk{std::min<std::size_t>(
+ std::min<std::size_t>(length - put, oneIfUTF8),
+ connection.RemainingSpaceInRecord())}) {
+ ok = io.EmitEncoded(x + put, chunk);
+ put += chunk;
+ } else {
+ ok = io.AdvanceRecord() && io.Emit(" ", 1);
}
}
list.set_lastWasUndelimitedCharacter(true);
return ok;
}
-bool EditDefaultCharacterOutput(IoStatementState &io, const DataEdit &edit,
- const char *x, std::size_t length) {
+template <typename CHAR>
+bool EditCharacterOutput(IoStatementState &io, const DataEdit &edit,
+ const CHAR *x, std::size_t length) {
switch (edit.descriptor) {
case 'A':
case 'G':
int len{static_cast<int>(length)};
int width{edit.width.value_or(len)};
return io.EmitRepeated(' ', std::max(0, width - len)) &&
- io.Emit(x, std::min(width, len));
+ io.EmitEncoded(x, std::min(width, len));
}
template bool EditIntegerOutput<1>(
template class RealOutputEditing<10>;
// TODO: double/double
template class RealOutputEditing<16>;
+
+template bool ListDirectedCharacterOutput(IoStatementState &,
+ ListDirectedStatementState<Direction::Output> &, const char *,
+ std::size_t chars);
+template bool ListDirectedCharacterOutput(IoStatementState &,
+ ListDirectedStatementState<Direction::Output> &, const char16_t *,
+ std::size_t chars);
+template bool ListDirectedCharacterOutput(IoStatementState &,
+ ListDirectedStatementState<Direction::Output> &, const char32_t *,
+ std::size_t chars);
+
+template bool EditCharacterOutput(
+ IoStatementState &, const DataEdit &, const char *, std::size_t chars);
+template bool EditCharacterOutput(
+ IoStatementState &, const DataEdit &, const char16_t *, std::size_t chars);
+template bool EditCharacterOutput(
+ IoStatementState &, const DataEdit &, const char32_t *, std::size_t chars);
+
} // namespace Fortran::runtime::io
bool ListDirectedLogicalOutput(
IoStatementState &, ListDirectedStatementState<Direction::Output> &, bool);
bool EditLogicalOutput(IoStatementState &, const DataEdit &, bool);
-bool ListDirectedDefaultCharacterOutput(IoStatementState &,
- ListDirectedStatementState<Direction::Output> &, const char *, std::size_t);
-bool EditDefaultCharacterOutput(
- IoStatementState &, const DataEdit &, const char *, std::size_t);
+
+template <typename CHAR>
+bool ListDirectedCharacterOutput(IoStatementState &,
+ ListDirectedStatementState<Direction::Output> &, const CHAR *,
+ std::size_t chars);
+extern template bool ListDirectedCharacterOutput(IoStatementState &,
+ ListDirectedStatementState<Direction::Output> &, const char *,
+ std::size_t chars);
+extern template bool ListDirectedCharacterOutput(IoStatementState &,
+ ListDirectedStatementState<Direction::Output> &, const char16_t *,
+ std::size_t chars);
+extern template bool ListDirectedCharacterOutput(IoStatementState &,
+ ListDirectedStatementState<Direction::Output> &, const char32_t *,
+ std::size_t chars);
+
+template <typename CHAR>
+bool EditCharacterOutput(
+ IoStatementState &, const DataEdit &, const CHAR *, std::size_t chars);
+extern template bool EditCharacterOutput(
+ IoStatementState &, const DataEdit &, const char *, std::size_t chars);
+extern template bool EditCharacterOutput(
+ IoStatementState &, const DataEdit &, const char16_t *, std::size_t chars);
+extern template bool EditCharacterOutput(
+ IoStatementState &, const DataEdit &, const char32_t *, std::size_t chars);
extern template bool EditIntegerOutput<1>(
IoStatementState &, const DataEdit &, std::int8_t);
}
}
+ if (auto *x{std::getenv("DEFAULT_UTF8")}) {
+ char *end;
+ auto n{std::strtol(x, &end, 10)};
+ if (n >= 0 && n <= 1 && *end == '\0') {
+ defaultUTF8 = n != 0;
+ } else {
+ std::fprintf(
+ stderr, "Fortran runtime: DEFAULT_UTF8=%s is invalid; ignored\n", x);
+ }
+ }
+
// TODO: Set RP/ROUND='PROCESSOR_DEFINED' from environment
}
std::optional<Convert> GetConvertFromString(const char *, std::size_t);
struct ExecutionEnvironment {
+ constexpr ExecutionEnvironment(){}; // empty body: the data members below carry their own default initializers
void Configure(int argc, const char *argv[], const char *envp[]);
const char *GetEnv(
const char *name, std::size_t name_length, const Terminator &terminator);
- int argc;
- const char **argv;
- const char **envp;
+ int argc{0};
+ const char **argv{nullptr};
+ const char **envp{nullptr};
- int listDirectedOutputLineLengthLimit; // FORT_FMT_RECL
- enum decimal::FortranRounding defaultOutputRoundingMode;
- Convert conversion; // FORT_CONVERT
- bool noStopMessage; // NO_STOP_MESSAGE=1 inhibits "Fortran STOP"
+ int listDirectedOutputLineLengthLimit{79}; // FORT_FMT_RECL
+ enum decimal::FortranRounding defaultOutputRoundingMode{
+ decimal::FortranRounding::RoundNearest}; // RP(==PN)
+ Convert conversion{Convert::Unknown}; // FORT_CONVERT
+ bool noStopMessage{false}; // NO_STOP_MESSAGE=1 inhibits "Fortran STOP"
+ bool defaultUTF8{false}; // DEFAULT_UTF8: accepted values are 0 and 1; copied into isUTF8 of each new ExternalFileUnit
};
+
extern ExecutionEnvironment executionEnvironment;
} // namespace Fortran::runtime
}
}
-template <Direction DIR>
-std::optional<char32_t> InternalDescriptorUnit<DIR>::GetCurrentChar(
- IoErrorHandler &handler) {
- const char *p{nullptr};
- std::size_t bytes{GetNextInputBytes(p, handler)};
- if (bytes == 0) {
- return std::nullopt;
- } else {
- if (isUTF8) {
- // TODO: UTF-8 decoding
- }
- return *p;
- }
-}
-
template <Direction DIR>
bool InternalDescriptorUnit<DIR>::AdvanceRecord(IoErrorHandler &handler) {
if (currentRecordNumber >= endfileRecordNumber.value_or(0)) {
bool Emit(const char *, std::size_t, IoErrorHandler &);
std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
- std::optional<char32_t> GetCurrentChar(IoErrorHandler &);
bool AdvanceRecord(IoErrorHandler &);
void BackspaceRecord(IoErrorHandler &);
#include "format.h"
#include "tools.h"
#include "unit.h"
+#include "utf.h"
#include "flang/Runtime/memory.h"
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <limits>
+#include <type_traits>
namespace Fortran::runtime::io {
Crash(
"ExternalIoStatementState::Emit(char16_t) called for input statement");
}
- // TODO: UTF-8 encoding
return unit().Emit(reinterpret_cast<const char *>(data), chars * sizeof *data,
sizeof *data, *this);
}
Crash(
"ExternalIoStatementState::Emit(char32_t) called for input statement");
}
- // TODO: UTF-8 encoding
return unit().Emit(reinterpret_cast<const char *>(data), chars * sizeof *data,
sizeof *data, *this);
}
return std::visit([=](auto &x) { return x.get().Emit(data, chars); }, u_);
}
+// Emits `chars` characters starting at data0. When the connection is in
+// UTF-8 mode each character is passed through EncodeUTF8 and the resulting
+// bytes are written out in batches; otherwise the characters are handed to
+// Emit() unchanged. Returns false as soon as an underlying Emit() fails.
+template <typename CHAR>
+bool IoStatementState::EmitEncoded(const CHAR *data0, std::size_t chars) {
+  // Characters are read through the unsigned counterpart of CHAR so that
+  // values with the top bit set are not sign-extended before encoding.
+  using UnsignedChar = std::make_unsigned_t<CHAR>;
+  if (!GetConnectionState().isUTF8) {
+    return Emit(data0, chars);
+  }
+  const UnsignedChar *source{reinterpret_cast<const UnsignedChar *>(data0)};
+  char staging[256];
+  std::size_t used{0};
+  for (std::size_t j{0}; j < chars; ++j) {
+    used += EncodeUTF8(staging + used, source[j]);
+    // Flush once the staging area might not have room for another
+    // encoding of up to maxUTF8Bytes bytes.
+    if (used + maxUTF8Bytes > sizeof staging) {
+      if (!Emit(staging, used)) {
+        return false;
+      }
+      used = 0;
+    }
+  }
+  // Write whatever is still staged, if anything.
+  return used == 0 || Emit(staging, used);
+}
+
+
bool IoStatementState::Receive(
char *data, std::size_t n, std::size_t elementBytes) {
return std::visit(
return std::visit([](auto &x) { return x.get().GetExternalFileUnit(); }, u_);
}
+// Returns the character at the current input position, or an empty optional
+// when GetNextInputBytes() reports that no bytes are available.
+// This function does not itself advance the position; byteCount receives the
+// number of input bytes that the returned character occupies (0 when the
+// result is empty) so that the caller can step over it.
+// On a UTF-8 connection, a sequence that fits within the available bytes and
+// decodes successfully is returned as one code point. Otherwise
+// IostatUTF8Decoding is signaled and, as error recovery, a single raw byte
+// is returned.
+std::optional<char32_t> IoStatementState::GetCurrentChar(
+    std::size_t &byteCount) {
+  const char *p{nullptr};
+  const std::size_t bytes{GetNextInputBytes(p)};
+  if (bytes == 0) {
+    byteCount = 0;
+    return std::nullopt;
+  }
+  if (GetConnectionState().isUTF8) {
+    const std::size_t length{MeasureUTF8Bytes(*p)};
+    if (length <= bytes) {
+      if (auto result{DecodeUTF8(p)}) {
+        byteCount = length;
+        return result;
+      }
+    }
+    GetIoErrorHandler().SignalError(IostatUTF8Decoding);
+    // Error recovery: fall through and return the next byte
+  }
+  byteCount = 1;
+  // Plain char may be a signed type; convert through unsigned char so that
+  // a byte >= 0x80 yields 0x80..0xFF on every platform instead of being
+  // sign-extended into a large char32_t value.
+  return static_cast<char32_t>(static_cast<unsigned char>(*p));
+}
+
+
bool IoStatementState::EmitRepeated(char ch, std::size_t n) {
return std::visit(
[=](auto &x) {
std::optional<char32_t> IoStatementState::NextInField(
std::optional<int> &remaining, const DataEdit &edit) {
+ std::size_t byteCount{0};
if (!remaining) { // Stream, list-directed, or NAMELIST
- if (auto next{GetCurrentChar()}) {
+ if (auto next{GetCurrentChar(byteCount)}) {
if (edit.IsListDirected()) {
// list-directed or NAMELIST: check for separators
switch (*next) {
break;
}
}
- HandleRelativePosition(1);
- GotChar();
+ HandleRelativePosition(byteCount);
+ GotChar(byteCount);
return next;
}
} else if (*remaining > 0) {
- if (auto next{GetCurrentChar()}) {
- --*remaining;
- HandleRelativePosition(1);
- GotChar();
+ if (auto next{GetCurrentChar(byteCount)}) {
+ if (byteCount > static_cast<std::size_t>(*remaining)) {
+ return std::nullopt;
+ }
+ *remaining -= byteCount;
+ HandleRelativePosition(byteCount);
+ GotChar(byteCount);
return next;
}
if (CheckForEndOfRecord()) { // do padding
if (edit.modes.editingFlags & decimalComma) {
comma = ';';
}
+ std::size_t byteCount{0};
if (remaining_ > 0 && !realPart_) { // "r*c" repetition in progress
RUNTIME_CHECK(io.GetIoErrorHandler(), repeatPosition_.has_value());
repeatPosition_.reset(); // restores the saved position
if (!imaginaryPart_) {
edit.repeat = std::min<int>(remaining_, maxRepeat);
- auto ch{io.GetCurrentChar()};
+ auto ch{io.GetCurrentChar(byteCount)};
if (!ch || *ch == ' ' || *ch == '\t' || *ch == comma) {
// "r*" repeated null
edit.descriptor = DataEdit::ListDirectedNullValue;
imaginaryPart_ = true;
edit.descriptor = DataEdit::ListDirectedImaginaryPart;
}
- auto ch{io.GetNextNonBlank()};
+ auto ch{io.GetNextNonBlank(byteCount)};
if (ch && *ch == comma && eatComma_) {
// Consume comma & whitespace after previous item.
// This includes the comma between real and imaginary components
// in list-directed/NAMELIST complex input.
// (When DECIMAL='COMMA', the comma is actually a semicolon.)
- io.HandleRelativePosition(1);
- ch = io.GetNextNonBlank();
+ io.HandleRelativePosition(byteCount);
+ ch = io.GetNextNonBlank(byteCount);
}
eatComma_ = true;
if (!ch) {
break;
}
r = 10 * r + (*ch - '0');
- io.HandleRelativePosition(1);
- ch = io.GetCurrentChar();
+ io.HandleRelativePosition(byteCount);
+ ch = io.GetCurrentChar(byteCount);
} while (ch && *ch >= '0' && *ch <= '9');
if (r > 0 && ch && *ch == '*') { // subtle: r must be nonzero
- io.HandleRelativePosition(1);
- ch = io.GetCurrentChar();
+ io.HandleRelativePosition(byteCount);
+ ch = io.GetCurrentChar(byteCount);
if (ch && *ch == '/') { // r*/
hitSlash_ = true;
edit.descriptor = DataEdit::ListDirectedNullValue;
}
if (!imaginaryPart_ && ch && *ch == '(') {
realPart_ = true;
- io.HandleRelativePosition(1);
+ io.HandleRelativePosition(byteCount);
edit.descriptor = DataEdit::ListDirectedRealPart;
}
return edit;
return IoStatementBase::EndIoStatement();
}
+template bool IoStatementState::EmitEncoded<char>(const char *, std::size_t);
+template bool IoStatementState::EmitEncoded<char16_t>(
+ const char16_t *, std::size_t);
+template bool IoStatementState::EmitEncoded<char32_t>(
+ const char32_t *, std::size_t);
+
} // namespace Fortran::runtime::io
bool Emit(const char *, std::size_t);
bool Emit(const char16_t *, std::size_t chars);
bool Emit(const char32_t *, std::size_t chars);
+ template <typename CHAR> bool EmitEncoded(const CHAR *, std::size_t);
bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
std::size_t GetNextInputBytes(const char *&);
bool AdvanceRecord(int = 1);
}
// Vacant after the end of the current record
- std::optional<char32_t> GetCurrentChar() {
- const char *p{nullptr};
- std::size_t bytes{GetNextInputBytes(p)};
- if (bytes == 0) {
- return std::nullopt;
- } else {
- // TODO: UTF-8 decoding; may have to get more bytes in a loop
- return *p;
- }
- }
+ std::optional<char32_t> GetCurrentChar(std::size_t &byteCount);
bool EmitRepeated(char, std::size_t);
bool EmitField(const char *, std::size_t length, std::size_t width);
const DataEdit &edit, std::optional<int> &remaining) {
remaining.reset();
if (edit.descriptor == DataEdit::ListDirected) {
- GetNextNonBlank();
+ std::size_t byteCount{0};
+ GetNextNonBlank(byteCount);
} else {
if (edit.width.value_or(0) > 0) {
remaining = *edit.width;
std::optional<char32_t> SkipSpaces(std::optional<int> &remaining) {
while (!remaining || *remaining > 0) {
- if (auto ch{GetCurrentChar()}) {
+ std::size_t byteCount{0};
+ if (auto ch{GetCurrentChar(byteCount)}) {
if (*ch != ' ' && *ch != '\t') {
return ch;
}
- HandleRelativePosition(1);
if (remaining) {
- GotChar();
- --*remaining;
+ if (static_cast<std::size_t>(*remaining) < byteCount) {
+ break;
+ }
+ GotChar(byteCount);
+ *remaining -= byteCount;
}
+ HandleRelativePosition(byteCount);
} else {
break;
}
bool CheckForEndOfRecord();
// Skips spaces, advances records, and ignores NAMELIST comments
- std::optional<char32_t> GetNextNonBlank() {
- auto ch{GetCurrentChar()};
+ std::optional<char32_t> GetNextNonBlank(std::size_t &byteCount) { // byteCount is overwritten by each GetCurrentChar() call made here
+ auto ch{GetCurrentChar(byteCount)};
bool inNamelist{mutableModes().inNamelist};
while (!ch || *ch == ' ' || *ch == '\t' || (inNamelist && *ch == '!')) {
if (ch && (*ch == ' ' || *ch == '\t')) {
- HandleRelativePosition(1);
+ HandleRelativePosition(byteCount); // step over the blank by the byte count just reported for it
} else if (!AdvanceRecord()) {
return std::nullopt;
}
- ch = GetCurrentChar();
+ ch = GetCurrentChar(byteCount); // on loop exit byteCount describes the character that is returned
}
return ch;
}
ConnectionState connection_;
};
+extern template bool IoStatementState::EmitEncoded<char>(
+ const char *, std::size_t);
+extern template bool IoStatementState::EmitEncoded<char16_t>(
+ const char16_t *, std::size_t);
+extern template bool IoStatementState::EmitEncoded<char32_t>(
+ const char32_t *, std::size_t);
+
} // namespace Fortran::runtime::io
#endif // FORTRAN_RUNTIME_IO_STMT_H_
return "Sequential record missing its terminator";
case IostatBadUnformattedRecord:
return "Erroneous unformatted sequential file record structure";
+ case IostatUTF8Decoding:
+ return "UTF-8 decoding error";
default:
return nullptr;
}
static bool GetLowerCaseName(
IoStatementState &io, char buffer[], std::size_t maxLength) {
- if (auto ch{io.GetNextNonBlank()}) {
+ std::size_t byteLength{0};
+ if (auto ch{io.GetNextNonBlank(byteLength)}) {
if (IsLegalIdStart(*ch)) {
std::size_t j{0};
do {
buffer[j] = NormalizeIdChar(*ch);
- io.HandleRelativePosition(1);
- ch = io.GetCurrentChar();
+ io.HandleRelativePosition(byteLength);
+ ch = io.GetCurrentChar(byteLength);
} while (++j < maxLength && ch && IsLegalIdChar(*ch));
buffer[j++] = '\0';
if (j <= maxLength) {
static std::optional<SubscriptValue> GetSubscriptValue(IoStatementState &io) {
std::optional<SubscriptValue> value;
- std::optional<char32_t> ch{io.GetCurrentChar()};
+ std::size_t byteCount{0};
+ std::optional<char32_t> ch{io.GetCurrentChar(byteCount)};
bool negate{ch && *ch == '-'};
if ((ch && *ch == '+') || negate) {
- io.HandleRelativePosition(1);
- ch = io.GetCurrentChar();
+ io.HandleRelativePosition(byteCount);
+ ch = io.GetCurrentChar(byteCount);
}
bool overflow{false};
while (ch && *ch >= '0' && *ch <= '9') {
SubscriptValue was{value.value_or(0)};
overflow |= was >= std::numeric_limits<SubscriptValue>::max() / 10;
value = 10 * was + *ch - '0';
- io.HandleRelativePosition(1);
- ch = io.GetCurrentChar();
+ io.HandleRelativePosition(byteCount);
+ ch = io.GetCurrentChar(byteCount);
}
if (overflow) {
io.GetIoErrorHandler().SignalError(
if (value) {
return -*value;
} else {
- io.HandleRelativePosition(-1); // give back '-' with no digits
+ io.HandleRelativePosition(-byteCount); // give back '-' with no digits
}
}
return value;
int j{0};
std::size_t contiguousStride{source.ElementBytes()};
bool ok{true};
- std::optional<char32_t> ch{io.GetNextNonBlank()};
+ std::size_t byteCount{0};
+ std::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
char32_t comma{GetComma(io)};
for (; ch && *ch != ')'; ++j) {
SubscriptValue dimLower{0}, dimUpper{0}, dimStride{0};
} else {
dimLower = *low;
}
- ch = io.GetNextNonBlank();
+ ch = io.GetNextNonBlank(byteCount);
}
if (ch && *ch == ':') {
- io.HandleRelativePosition(1);
- ch = io.GetNextNonBlank();
+ io.HandleRelativePosition(byteCount);
+ ch = io.GetNextNonBlank(byteCount);
if (auto high{GetSubscriptValue(io)}) {
if (*high > dimUpper) {
if (ok) {
} else {
dimUpper = *high;
}
- ch = io.GetNextNonBlank();
+ ch = io.GetNextNonBlank(byteCount);
}
if (ch && *ch == ':') {
- io.HandleRelativePosition(1);
- ch = io.GetNextNonBlank();
+ io.HandleRelativePosition(byteCount);
+ ch = io.GetNextNonBlank(byteCount);
if (auto str{GetSubscriptValue(io)}) {
dimStride = *str;
- ch = io.GetNextNonBlank();
+ ch = io.GetNextNonBlank(byteCount);
}
}
} else { // scalar
dimStride = 0;
}
if (ch && *ch == comma) {
- io.HandleRelativePosition(1);
- ch = io.GetNextNonBlank();
+ io.HandleRelativePosition(byteCount);
+ ch = io.GetNextNonBlank(byteCount);
}
if (ok) {
lower[j] = dimLower;
}
if (ok) {
if (ch && *ch == ')') {
- io.HandleRelativePosition(1);
+ io.HandleRelativePosition(byteCount);
if (desc.EstablishPointerSection(source, lower, upper, stride)) {
return true;
} else {
// ambiguous within the parentheses.
io.HandleRelativePosition(1); // skip '('
std::optional<SubscriptValue> lower, upper;
- std::optional<char32_t> ch{io.GetNextNonBlank()};
+ std::size_t byteCount{0};
+ std::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
if (ch) {
if (*ch == ':') {
lower = 1;
} else {
lower = GetSubscriptValue(io);
- ch = io.GetNextNonBlank();
+ ch = io.GetNextNonBlank(byteCount);
}
}
if (ch && ch == ':') {
- io.HandleRelativePosition(1);
- ch = io.GetNextNonBlank();
+ io.HandleRelativePosition(byteCount);
+ ch = io.GetNextNonBlank(byteCount);
if (ch) {
if (*ch == ')') {
upper = chars;
} else {
upper = GetSubscriptValue(io);
- ch = io.GetNextNonBlank();
+ ch = io.GetNextNonBlank(byteCount);
}
}
}
if (ch && *ch == ')') {
- io.HandleRelativePosition(1);
+ io.HandleRelativePosition(byteCount);
if (lower && upper) {
if (*lower > *upper) {
// An empty substring, whatever the values are
// Advance to the terminal '/' of a namelist group.
static void SkipNamelistGroup(IoStatementState &io) {
- while (auto ch{io.GetNextNonBlank()}) {
- io.HandleRelativePosition(1);
+ std::size_t byteCount{0};
+ while (auto ch{io.GetNextNonBlank(byteCount)}) {
+ io.HandleRelativePosition(byteCount);
if (*ch == '/') {
break;
} else if (*ch == '\'' || *ch == '"') {
// Skip quoted character literal
char32_t quote{*ch};
while (true) {
- if ((ch = io.GetCurrentChar())) {
- io.HandleRelativePosition(1);
+ if ((ch = io.GetCurrentChar(byteCount))) {
+ io.HandleRelativePosition(byteCount);
if (*ch == quote) {
break;
}
char name[nameBufferSize];
RUNTIME_CHECK(handler, group.groupName != nullptr);
char32_t comma{GetComma(io)};
+ std::size_t byteCount{0};
while (true) {
- next = io.GetNextNonBlank();
+ next = io.GetNextNonBlank(byteCount);
while (next && *next != '&') {
// Extension: comment lines without ! before namelist groups
if (!io.AdvanceRecord()) {
next.reset();
} else {
- next = io.GetNextNonBlank();
+ next = io.GetNextNonBlank(byteCount);
}
}
if (!next || *next != '&') {
"NAMELIST input group does not begin with '&' (at '%lc')", *next);
return false;
}
- io.HandleRelativePosition(1);
+ io.HandleRelativePosition(byteCount);
if (!GetLowerCaseName(io, name, sizeof name)) {
handler.SignalError("NAMELIST input group has no name");
return false;
}
// Read the group's items
while (true) {
- next = io.GetNextNonBlank();
+ next = io.GetNextNonBlank(byteCount);
if (!next || *next == '/') {
break;
}
const Descriptor *useDescriptor{&itemDescriptor};
StaticDescriptor<maxRank, true, 16> staticDesc[2];
int whichStaticDesc{0};
- next = io.GetCurrentChar();
+ next = io.GetCurrentChar(byteCount);
bool hadSubscripts{false};
bool hadSubstring{false};
if (next && (*next == '(' || *next == '%')) {
hadSubstring = false;
}
useDescriptor = &mutableDescriptor;
- next = io.GetCurrentChar();
+ next = io.GetCurrentChar(byteCount);
} while (next && (*next == '(' || *next == '%'));
}
// Skip the '='
- next = io.GetNextNonBlank();
+ next = io.GetNextNonBlank(byteCount);
if (!next || *next != '=') {
handler.SignalError("No '=' found after item '%s' in NAMELIST group '%s'",
name, group.groupName);
return false;
}
- io.HandleRelativePosition(1);
+ io.HandleRelativePosition(byteCount);
// Read the values into the descriptor. An array can be short.
listInput->ResetForNextNamelistItem();
if (!descr::DescriptorIO<Direction::Input>(io, *useDescriptor)) {
return false;
}
- next = io.GetNextNonBlank();
+ next = io.GetNextNonBlank(byteCount);
if (next && *next == comma) {
- io.HandleRelativePosition(1);
+ io.HandleRelativePosition(byteCount);
}
}
if (!next || *next != '/') {
if (io.get_if<ListDirectedStatementState<Direction::Input>>()) {
if (io.mutableModes().inNamelist) {
SavedPosition savedPosition{io};
- if (auto ch{io.GetNextNonBlank()}) {
+ std::size_t byteCount{0};
+ if (auto ch{io.GetNextNonBlank(byteCount)}) {
if (IsLegalIdStart(*ch)) {
do {
- io.HandleRelativePosition(1);
- ch = io.GetCurrentChar();
+ io.HandleRelativePosition(byteCount);
+ ch = io.GetCurrentChar(byteCount);
} while (ch && IsLegalIdChar(*ch));
- ch = io.GetNextNonBlank();
+ ch = io.GetNextNonBlank(byteCount);
// TODO: how to deal with NaN(...) ambiguity?
return ch && (*ch == '=' || *ch == '(' || *ch == '%');
}
//===----------------------------------------------------------------------===//
#include "unit.h"
-#include "environment.h"
#include "io-error.h"
#include "lock.h"
#include "unit-map.h"
error.isUnformatted = false;
errorOutput = &error;
- // TODO: Set UTF-8 mode from the environment
unitMap = newUnitMap;
return *unitMap;
}
return p ? length : 0;
}
-std::optional<char32_t> ExternalFileUnit::GetCurrentChar(
- IoErrorHandler &handler) {
- const char *p{nullptr};
- std::size_t bytes{GetNextInputBytes(p, handler)};
- if (bytes == 0) {
- return std::nullopt;
- } else {
- // TODO: UTF-8 decoding; may have to get more bytes in a loop
- return *p;
- }
-}
-
const char *ExternalFileUnit::FrameNextInput(
IoErrorHandler &handler, std::size_t bytes) {
RUNTIME_CHECK(handler, isUnformatted.has_value() && !*isUnformatted);
#include "buffer.h"
#include "connection.h"
+#include "environment.h"
#include "file.h"
#include "format.h"
#include "io-error.h"
public OpenFile,
public FileFrame<ExternalFileUnit> {
public:
- explicit ExternalFileUnit(int unitNumber) : unitNumber_{unitNumber} {}
+ // A newly constructed unit takes its initial isUTF8 setting from the
+ // execution environment's defaultUTF8 value.
+ explicit ExternalFileUnit(int unitNumber) : unitNumber_{unitNumber} {
+ isUTF8 = executionEnvironment.defaultUTF8;
+ }
~ExternalFileUnit() {}
int unitNumber() const { return unitNumber_; }
const char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
bool Receive(char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
- std::optional<char32_t> GetCurrentChar(IoErrorHandler &);
void SetLeftTabLimit();
bool BeginReadingRecord(IoErrorHandler &);
void FinishReadingRecord(IoErrorHandler &);
--- /dev/null
+//===-- runtime/utf.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "utf.h"
+
+namespace Fortran::runtime {
+
+// Indexed by the first byte of a UTF-8 sequence.  Each entry is the total
+// number of bytes in the encoding that the byte introduces, or 0 when the
+// byte cannot begin a sequence (the trailing-byte range 80 - BF, and FF).
+// clang-format off
+const std::uint8_t UTF8FirstByteTable[256]{
+ /* 00 - 7F: 7 bit payload in single byte */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* 80 - BF: invalid first byte, valid later byte */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* C0 - DF: 11 bit payload */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ /* E0 - EF: 16 bit payload */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4,
+ /* F8 - FB: 26 bit payload */ 5, 5, 5, 5,
+ /* FC - FD: 31 bit payload */ 6, 6,
+ /* FE: 32 bit payload */ 7,
+ /* FF: invalid */ 0
+};
+// clang-format on
+
+// Decodes the single UTF-8 sequence that starts at p0 and returns its value,
+// or std::nullopt when the first byte cannot start a sequence, a trailing
+// byte lies outside [80 .. BF], or the decoded value needs more than 32 bits.
+// Non-minimal encodings are accepted.
+std::optional<char32_t> DecodeUTF8(const char *p0) {
+  const auto *bytePtr{reinterpret_cast<const std::uint8_t *>(p0)};
+  const std::size_t length{MeasureUTF8Bytes(*p0)};
+  if (length == 0) {
+    return std::nullopt; // not a valid first byte
+  }
+  if (length == 1) {
+    return char32_t{bytePtr[0]};
+  }
+  // The first byte carries (7 - length) payload bits below its length marker.
+  std::uint64_t value{char32_t{bytePtr[0]} & (0x7f >> length)};
+  for (std::size_t at{1}; at != length; ++at) {
+    const std::uint8_t trailing{bytePtr[at]};
+    if ((trailing & 0xc0) != 0x80) { // must look like 10xxxxxx
+      return std::nullopt;
+    }
+    value = (value << 6) | (trailing & 0x3f);
+  }
+  if (value > 0xffffffff) {
+    return std::nullopt; // 7-byte forms can carry more than 32 bits
+  }
+  return static_cast<char32_t>(value);
+}
+
+// Writes the minimal UTF-8 encoding of `ucs` to the bytes starting at p0 and
+// returns the number of bytes written (1 through 7).  Forms longer than four
+// bytes are extensions that allow any 32-bit value to be represented.
+// No bounds checking is done here: the caller must guarantee that room for
+// the longest (7-byte) form remains in the output buffer.
+std::size_t EncodeUTF8(char *p0, char32_t ucs) {
+  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
+  if (ucs <= 0x7f) {
+    p[0] = ucs;
+    return 1;
+  } else if (ucs <= 0x7ff) {
+    p[0] = 0xc0 | (ucs >> 6);
+    p[1] = 0x80 | (ucs & 0x3f);
+    return 2;
+  } else if (ucs <= 0xffff) {
+    p[0] = 0xe0 | (ucs >> 12);
+    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[2] = 0x80 | (ucs & 0x3f);
+    return 3;
+  } else if (ucs <= 0x1fffff) {
+    p[0] = 0xf0 | (ucs >> 18);
+    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[3] = 0x80 | (ucs & 0x3f);
+    return 4;
+  } else if (ucs <= 0x3ffffff) {
+    p[0] = 0xf8 | (ucs >> 24);
+    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[4] = 0x80 | (ucs & 0x3f);
+    return 5;
+  } else if (ucs <= 0x7fffffff) {
+    // 31-bit payload: the six-byte form uses a 1111110x first byte (FC/FD).
+    // Using the five-byte marker (F8) here would make the decoder consume
+    // only five of the six bytes.
+    p[0] = 0xfc | (ucs >> 30);
+    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
+    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[5] = 0x80 | (ucs & 0x3f);
+    return 6;
+  } else {
+    // Only values with bit 31 set need the seven-byte form.
+    p[0] = 0xfe;
+    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
+    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
+    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[6] = 0x80 | (ucs & 0x3f);
+    return 7;
+  }
+}
+
+} // namespace Fortran::runtime
--- /dev/null
+//===-- runtime/utf.h -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UTF-8 is the variant-width standard encoding of Unicode (ISO 10646)
+// code points.
+//
+// 7-bit values in [00 .. 7F] represent themselves as single bytes, so true
+// 7-bit ASCII is also valid UTF-8.
+//
+// Larger values are encoded with a start byte in [C0 .. FE] that carries
+// the length of the encoding and some of the upper bits of the value, followed
+// by one or more bytes in the range [80 .. BF].
+//
+// Specifically, the first byte holds two or more uppermost set bits,
+// a zero bit, and some payload; the second and later bytes each start with
+// their uppermost bit set, the next bit clear, and six bits of payload.
+// Payload parcels are in big-endian order. All bytes must be present in a
+// valid sequence; i.e., low-order zero bits must be explicit. UTF-8 is
+// self-synchronizing on input, as no byte value can be both a valid
+// first byte and a valid trailing byte.
+//
+// 0xxxxxxx - 7 bit ASCII
+// 110xxxxx 10xxxxxx - 11-bit value
+// 1110xxxx 10xxxxxx 10xxxxxx - 16-bit value
+// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value
+// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value
+// 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value
+// 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value
+//
+// Canonical UTF-8 sequences should be minimal, and our output is so, but
+// we do not reject non-minimal sequences on input. Unicode only defines
+// code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual
+// standard maximum. However, we support extended forms up to 32 bits so that
+// CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data.
+
+#ifndef FORTRAN_RUNTIME_UTF_H_
+#define FORTRAN_RUNTIME_UTF_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+
+namespace Fortran::runtime {
+
+// Returns the total length in bytes of the UTF-8 character encoding that
+// begins with the byte `first`, as recorded in UTF8FirstByteTable.
+// A zero result signifies that `first` cannot begin a valid encoding.
+extern const std::uint8_t UTF8FirstByteTable[256];
+static inline std::size_t MeasureUTF8Bytes(char first) {
+  const auto index{static_cast<std::uint8_t>(first)};
+  return UTF8FirstByteTable[index];
+}
+
+static constexpr std::size_t maxUTF8Bytes{7};
+
+// Ensure that all bytes are present in sequence in the input buffer
+// before calling; use MeasureUTF8Bytes(first byte) to count them.
+std::optional<char32_t> DecodeUTF8(const char *);
+
+// Ensure that at least maxUTF8Bytes remain in the output
+// buffer before calling.
+std::size_t EncodeUTF8(char *, char32_t);
+
+} // namespace Fortran::runtime
+#endif // FORTRAN_RUNTIME_UTF_H_
<< "Input-item value after non advancing read " << j;
j++;
}
+ // CLOSE(UNIT=unit)
+ io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+ ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+ << "EndIoStatement() for Close";
}
TEST(ExternalIOTests, TestWriteAfterNonAvancingInput) {
<< "InputAscii() ";
ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
<< "EndIoStatement() for Read ";
-
ASSERT_EQ(resultRecord, expectedRecord)
<< "Record after non advancing read followed by write";
+ // CLOSE(UNIT=unit)
+ io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+ ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+ << "EndIoStatement() for Close";
}
TEST(ExternalIOTests, TestWriteAfterEndfile) {
ASSERT_FALSE(IONAME(InputInteger)(io, eof)) << "InputInteger(eof)";
ASSERT_EQ(eof, -1) << "READ(eof)";
ASSERT_EQ(IONAME(EndIoStatement)(io), IostatEnd) << "EndIoStatement for READ";
+ // CLOSE(UNIT=unit)
+ io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+ ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+ << "EndIoStatement() for Close";
+}
+
+// Writes default-kind CHARACTER data containing bytes above 0x7F to a unit
+// opened with ENCODING='UTF-8', reads it back through the same unit and
+// expects the original bytes, then reopens the file without ENCODING= and
+// expects to see the multi-byte UTF-8 sequences that were stored.
+TEST(ExternalIOTests, TestUTF8Encoding) {
+  // OPEN(FILE="utf8test",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='REPLACE',ENCODING='UTF-8')
+  auto *io{IONAME(BeginOpenNewUnit)(__FILE__, __LINE__)};
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "utf8test", 8)) << "SetFile(utf8test)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "REPLACE", 7)) << "SetStatus(REPLACE)";
+  ASSERT_TRUE(IONAME(SetEncoding)(io, "UTF-8", 5)) << "SetEncoding(UTF-8)";
+  int unit{-1};
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first OPEN";
+  char buffer[12];
+  // The literal is split so that "de" is not absorbed into the \xff escape.
+  std::memcpy(buffer,
+      "abc\x80\xff"
+      "de\0\0\0\0\0",
+      12);
+  // WRITE(unit, *) buffer
+  io = IONAME(BeginExternalListOutput)(unit, __FILE__, __LINE__);
+  StaticDescriptor<0> staticDescriptor;
+  Descriptor &desc{staticDescriptor.descriptor()};
+  // Only the first 7 characters of buffer are written.
+  desc.Establish(TypeCode{CFI_type_char}, 7, buffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for WRITE";
+  // REWIND(unit)
+  io = IONAME(BeginRewind)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement for REWIND";
+  // READ(unit, *) buffer
+  desc.Establish(TypeCode(CFI_type_char), sizeof buffer, buffer, 0);
+  desc.Check();
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first READ";
+  // 7 characters of data followed by 5 blanks of padding; the expected
+  // literal must supply all 12 bytes that memcmp examines.
+  ASSERT_EQ(std::memcmp(buffer,
+                "abc\x80\xff"
+                "de     ",
+                12),
+      0);
+  // CLOSE(UNIT=unit,STATUS='KEEP')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "KEEP", 4)) << "SetStatus(KEEP)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first CLOSE";
+  // OPEN(FILE="utf8test",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='OLD')
+  io = IONAME(BeginOpenNewUnit)(__FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "utf8test", 8)) << "SetFile(utf8test)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "OLD", 3)) << "SetStatus(OLD)";
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second OPEN";
+  // READ(unit, *) buffer
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second READ";
+  // 9 bytes of data (0x80 and 0xff each occupy two bytes in the file)
+  // followed by 3 blanks of padding.
+  ASSERT_EQ(std::memcmp(buffer,
+                "abc\xc2\x80\xc3\xbf"
+                "de   ",
+                12),
+      0);
+  // CLOSE(UNIT=unit,STATUS='DELETE')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "DELETE", 6)) << "SetStatus(DELETE)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second CLOSE";
+}
+
+// Writes CHARACTER(KIND=4) data to a unit opened with ENCODING='UTF-8',
+// reads it back as wide characters, then reopens the file without ENCODING=
+// and reads it as default-kind characters to inspect the UTF-8 bytes that
+// were stored.
+TEST(ExternalIOTests, TestUCS) {
+  // OPEN(FILE="ucstest",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='REPLACE',ENCODING='UTF-8')
+  auto *io{IONAME(BeginOpenNewUnit)(__FILE__, __LINE__)};
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "ucstest", 7)) << "SetFile(ucstest)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "REPLACE", 7)) << "SetStatus(REPLACE)";
+  ASSERT_TRUE(IONAME(SetEncoding)(io, "UTF-8", 5)) << "SetEncoding(UTF-8)";
+  int unit{-1};
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first OPEN";
+  // Seven wide characters plus the terminating NUL fill all eight elements.
+  char32_t wbuffer[8]{U"abc\u0080\uffff"
+                      "de"};
+  // WRITE(unit, *) wbuffer
+  io = IONAME(BeginExternalListOutput)(unit, __FILE__, __LINE__);
+  StaticDescriptor<0> staticDescriptor;
+  Descriptor &desc{staticDescriptor.descriptor()};
+  // The element length is given in bytes; the trailing NUL is not written.
+  desc.Establish(TypeCode{CFI_type_char32_t}, sizeof wbuffer - sizeof(char32_t),
+      wbuffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for WRITE";
+  // REWIND(unit)
+  io = IONAME(BeginRewind)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement for REWIND";
+  // READ(unit, *) wbuffer
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  desc.Establish(TypeCode{CFI_type_char32_t}, sizeof wbuffer, wbuffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first READ";
+  // Hex dump of what was read, attached to each expectation for diagnosis.
+  char dump[80];
+  dump[0] = '\0';
+  for (int j{0}; j < 8; ++j) {
+    std::size_t dumpLen{std::strlen(dump)};
+    std::snprintf(
+        dump + dumpLen, sizeof dump - dumpLen, " %x", (unsigned)wbuffer[j]);
+  }
+  EXPECT_EQ(wbuffer[0], U'a') << dump;
+  EXPECT_EQ(wbuffer[1], U'b') << dump;
+  EXPECT_EQ(wbuffer[2], U'c') << dump;
+  EXPECT_EQ(wbuffer[3], U'\u0080') << dump;
+  EXPECT_EQ(wbuffer[4], U'\uffff') << dump;
+  EXPECT_EQ(wbuffer[5], U'd') << dump;
+  EXPECT_EQ(wbuffer[6], U'e') << dump;
+  EXPECT_EQ(wbuffer[7], U' ') << dump;
+  // CLOSE(UNIT=unit,STATUS='KEEP')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "KEEP", 4)) << "SetStatus(KEEP)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first CLOSE";
+  // OPEN(FILE="ucstest",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='OLD')
+  io = IONAME(BeginOpenNewUnit)(__FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "ucstest", 7)) << "SetFile(ucstest)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "OLD", 3)) << "SetStatus(OLD)";
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second OPEN";
+  char buffer[12];
+  // READ(unit, *) buffer
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  desc.Establish(TypeCode{CFI_type_char}, sizeof buffer, buffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second READ";
+  dump[0] = '\0';
+  for (int j{0}; j < 12; ++j) {
+    std::size_t dumpLen{std::strlen(dump)};
+    std::snprintf(dump + dumpLen, sizeof dump - dumpLen, " %x",
+        (unsigned)(unsigned char)buffer[j]);
+  }
+  // 10 bytes of data (U+0080 is two bytes, U+FFFF is three) followed by
+  // 2 blanks of padding; the literal must supply all 12 compared bytes.
+  EXPECT_EQ(std::memcmp(buffer,
+                "abc\xc2\x80\xef\xbf\xbf"
+                "de  ",
+                12),
+      0)
+      << dump;
+  // CLOSE(UNIT=unit,STATUS='DELETE')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "DELETE", 6)) << "SetStatus(DELETE)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second CLOSE";
}