[flang] Initial UTF-8 support in runtime I/O

author Peter Klausler <pklausler@nvidia.com>

Wed, 16 Mar 2022 19:32:03 +0000 (12:32 -0700)

committer Peter Klausler <pklausler@nvidia.com>

Tue, 22 Mar 2022 18:48:14 +0000 (11:48 -0700)
author Peter Klausler <pklausler@nvidia.com>
Wed, 16 Mar 2022 19:32:03 +0000 (12:32 -0700)
committer Peter Klausler <pklausler@nvidia.com>
Tue, 22 Mar 2022 18:48:14 +0000 (11:48 -0700)
diff --git a/flang/include/flang/Runtime/iostat.h b/flang/include/flang/Runtime/iostat.h

index 0c0b3f4b3f7f396fb5dd96bf5f8f283a0a2464d2..d0e8ea7d65747c0d58ec7c262d3381ce7a2f9e5c 100644 (file)
--- a/flang/include/flang/Runtime/iostat.h
+++ b/flang/include/flang/Runtime/iostat.h
@@ -66,6 +66,7 @@ enum Iostat {
    IostatShortRead,
    IostatMissingTerminator,
    IostatBadUnformattedRecord,
+  IostatUTF8Decoding,
  };
  
  const char *IostatErrorString(int);
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt

index 6a80b65ba03429c6ed9612fb4a5cc313fb1fc959..62f251f7dbbb4a94591820562c09080d7118d6aa 100644 (file)
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -82,6 +82,7 @@ add_flang_library(FortranRuntime
    type-info.cpp
    unit.cpp
    unit-map.cpp
+  utf.cpp
  
    LINK_LIBS
    FortranDecimal
diff --git a/flang/runtime/descriptor-io.h b/flang/runtime/descriptor-io.h

index 7e098d8cfca99af773ba2ad93c4e0c08f18c0499..1ca659a39a53a24a2dd78091e963fc48b19c882b 100644 (file)
--- a/flang/runtime/descriptor-io.h
+++ b/flang/runtime/descriptor-io.h
@@ -168,17 +168,17 @@ inline bool FormattedCharacterIO(
    for (std::size_t j{0}; j < numElements; ++j) {
      A *x{&ExtractElement<A>(io, descriptor, subscripts)};
      if (listOutput) {
-      if (!ListDirectedDefaultCharacterOutput(io, *listOutput, x, length)) {
+      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
          return false;
        }
      } else if (auto edit{io.GetNextDataEdit()}) {
        if constexpr (DIR == Direction::Output) {
-        if (!EditDefaultCharacterOutput(io, *edit, x, length)) {
+        if (!EditCharacterOutput(io, *edit, x, length)) {
            return false;
          }
        } else {
          if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          if (EditDefaultCharacterInput(io, *edit, x, length)) {
+          if (EditCharacterInput(io, *edit, x, length)) {
              anyInput = true;
            } else {
              return anyInput && edit->IsNamelist();
@@ -456,7 +456,10 @@ static bool DescriptorIO(IoStatementState &io, const Descriptor &descriptor) {
        switch (kind) {
        case 1:
          return FormattedCharacterIO<char, DIR>(io, descriptor);
-      // TODO cases 2, 4
+      case 2:
+        return FormattedCharacterIO<char16_t, DIR>(io, descriptor);
+      case 4:
+        return FormattedCharacterIO<char32_t, DIR>(io, descriptor);
        default:
          handler.Crash(
              "DescriptorIO: Unimplemented CHARACTER kind (%d) in descriptor",
diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp

index ee35bd4c76cde440732cea1b65f9990476961440..aabe5df30f6d9af83d8a295b7f32d6476d1b7b05 100644 (file)
--- a/flang/runtime/edit-input.cpp
+++ b/flang/runtime/edit-input.cpp
@@ -8,6 +8,7 @@
  
  #include "edit-input.h"
  #include "namelist.h"
+#include "utf.h"
  #include "flang/Common/real.h"
  #include "flang/Common/uint128.h"
  #include <algorithm>
@@ -61,7 +62,6 @@ static bool ScanNumericPrefix(IoStatementState &io, const DataEdit &edit,
    if (next) {
      negative = *next == '-';
      if (negative || *next == '+') {
-      io.GotChar();
        io.SkipSpaces(remaining);
        next = io.NextInField(remaining, edit);
      }
@@ -88,8 +88,7 @@ bool EditIntegerInput(
    case 'Z':
      return EditBOZInput(io, edit, n, 16, kind << 3);
    case 'A': // legacy extension
-    return EditDefaultCharacterInput(
-        io, edit, reinterpret_cast<char *>(n), kind);
+    return EditCharacterInput(io, edit, reinterpret_cast<char *>(n), kind);
    default:
      io.GetIoErrorHandler().SignalError(IostatErrorInFormat,
          "Data edit descriptor '%c' may not be used with an INTEGER data item",
@@ -260,9 +259,10 @@ static int ScanRealInput(char *buffer, int bufferSize, IoStatementState &io,
        next = io.NextInField(remaining, edit);
      }
      if (!next) { // NextInField fails on separators like ')'
-      next = io.GetCurrentChar();
+      std::size_t byteCount{0};
+      next = io.GetCurrentChar(byteCount);
        if (next && *next == ')') {
-        io.HandleRelativePosition(1);
+        io.HandleRelativePosition(byteCount);
        }
      }
    } else if (remaining) {
@@ -427,8 +427,7 @@ bool EditRealInput(IoStatementState &io, const DataEdit &edit, void *n) {
      return EditBOZInput(
          io, edit, n, 16, common::BitsForBinaryPrecision(binaryPrecision));
    case 'A': // legacy extension
-    return EditDefaultCharacterInput(
-        io, edit, reinterpret_cast<char *>(n), KIND);
+    return EditCharacterInput(io, edit, reinterpret_cast<char *>(n), KIND);
    default:
      io.GetIoErrorHandler().SignalError(IostatErrorInFormat,
          "Data edit descriptor '%c' may not be used for REAL input",
@@ -487,11 +486,13 @@ bool EditLogicalInput(IoStatementState &io, const DataEdit &edit, bool &x) {
  }
  
  // See 13.10.3.1 paragraphs 7-9 in Fortran 2018
+template <typename CHAR>
  static bool EditDelimitedCharacterInput(
-    IoStatementState &io, char *x, std::size_t length, char32_t delimiter) {
+    IoStatementState &io, CHAR *x, std::size_t length, char32_t delimiter) {
    bool result{true};
    while (true) {
-    auto ch{io.GetCurrentChar()};
+    std::size_t byteCount{0};
+    auto ch{io.GetCurrentChar(byteCount)};
      if (!ch) {
        if (io.AdvanceRecord()) {
          continue;
@@ -500,12 +501,12 @@ static bool EditDelimitedCharacterInput(
          break;
        }
      }
-    io.HandleRelativePosition(1);
+    io.HandleRelativePosition(byteCount);
      if (*ch == delimiter) {
-      auto next{io.GetCurrentChar()};
+      auto next{io.GetCurrentChar(byteCount)};
        if (next && *next == delimiter) {
          // Repeated delimiter: use as character value
-        io.HandleRelativePosition(1);
+        io.HandleRelativePosition(byteCount);
        } else {
          break; // closing delimiter
        }
@@ -519,19 +520,23 @@ static bool EditDelimitedCharacterInput(
    return result;
  }
  
-static bool EditListDirectedDefaultCharacterInput(
-    IoStatementState &io, char *x, std::size_t length, const DataEdit &edit) {
-  auto ch{io.GetCurrentChar()};
+template <typename CHAR>
+static bool EditListDirectedCharacterInput(
+    IoStatementState &io, CHAR *x, std::size_t length, const DataEdit &edit) {
+  std::size_t byteCount{0};
+  auto ch{io.GetCurrentChar(byteCount)};
    if (ch && (*ch == '\'' || *ch == '"')) {
-    io.HandleRelativePosition(1);
+    io.HandleRelativePosition(byteCount);
      return EditDelimitedCharacterInput(io, x, length, *ch);
    }
    if (IsNamelistName(io) || io.GetConnectionState().IsAtEOF()) {
      return false;
    }
    // Undelimited list-directed character input: stop at a value separator
-  // or the end of the current record.
-  std::optional<int> remaining{length};
+  // or the end of the current record.  Subtlety: the "remaining" count
+  // here is a dummy that's used to avoid the interpretation of separators
+  // in NextInField.
+  std::optional<int> remaining{maxUTF8Bytes};
    while (std::optional<char32_t> next{io.NextInField(remaining, edit)}) {
      switch (*next) {
      case ' ':
@@ -544,17 +549,19 @@ static bool EditListDirectedDefaultCharacterInput(
      default:
        *x++ = *next;
        --length;
+      remaining = maxUTF8Bytes;
      }
    }
    std::fill_n(x, length, ' ');
    return true;
  }
  
-bool EditDefaultCharacterInput(
-    IoStatementState &io, const DataEdit &edit, char *x, std::size_t length) {
+template <typename CHAR>
+bool EditCharacterInput(
+    IoStatementState &io, const DataEdit &edit, CHAR *x, std::size_t length) {
    switch (edit.descriptor) {
    case DataEdit::ListDirected:
-    return EditListDirectedDefaultCharacterInput(io, x, length, edit);
+    return EditListDirectedCharacterInput(io, x, length, edit);
    case 'A':
    case 'G':
      break;
@@ -564,7 +571,8 @@ bool EditDefaultCharacterInput(
          edit.descriptor);
      return false;
    }
-  if (io.GetConnectionState().IsAtEOF()) {
+  const ConnectionState &connection{io.GetConnectionState()};
+  if (connection.IsAtEOF()) {
      return false;
    }
    std::size_t remaining{length};
@@ -577,26 +585,9 @@ bool EditDefaultCharacterInput(
    const char *input{nullptr};
    std::size_t ready{0};
    bool hitEnd{false};
-  if (remaining > length) {
-    // Discard leading bytes.
-    // These bytes don't count towards INQUIRE(IOLENGTH=).
-    std::size_t skip{remaining - length};
-    do {
-      if (ready == 0) {
-        ready = io.GetNextInputBytes(input);
-        if (ready == 0) {
-          hitEnd = true;
-          break;
-        }
-      }
-      std::size_t chunk{std::min<std::size_t>(skip, ready)};
-      io.HandleRelativePosition(chunk);
-      ready -= chunk;
-      input += chunk;
-      skip -= chunk;
-    } while (skip > 0);
-    remaining = length;
-  }
+  // Skip leading bytes.
+  // These bytes don't count towards INQUIRE(IOLENGTH=).
+  std::size_t skip{remaining > length ? remaining - length : 0};
    // Transfer payload bytes; these do count.
    while (remaining > 0) {
      if (ready == 0) {
@@ -606,18 +597,41 @@ bool EditDefaultCharacterInput(
          break;
        }
      }
-    std::size_t chunk{std::min<std::size_t>(remaining, ready)};
-    std::memcpy(x, input, chunk);
-    x += chunk;
+    std::size_t chunk;
+    bool skipping{skip > 0};
+    if (connection.isUTF8) {
+      chunk = MeasureUTF8Bytes(*input);
+      if (skipping) {
+        --skip;
+      } else if (auto ucs{DecodeUTF8(input)}) {
+        *x++ = *ucs;
+        --length;
+      } else if (chunk == 0) {
+        // error recovery: skip bad encoding
+        chunk = 1;
+      }
+      --remaining;
+    } else {
+      if (skipping) {
+        chunk = std::min<std::size_t>(skip, ready);
+        skip -= chunk;
+      } else {
+        chunk = std::min<std::size_t>(remaining, ready);
+        std::memcpy(x, input, chunk);
+        x += chunk;
+        length -= chunk;
+      }
+      remaining -= chunk;
+    }
      input += chunk;
-    io.GotChar(chunk);
+    if (!skipping) {
+      io.GotChar(chunk);
+    }
      io.HandleRelativePosition(chunk);
      ready -= chunk;
-    remaining -= chunk;
-    length -= chunk;
    }
    // Pad the remainder of the input variable, if any.
-  std::memset(x, ' ', length);
+  std::fill_n(x, length, ' ');
    if (hitEnd) {
      io.CheckForEndOfRecord(); // signal any needed error
    }
@@ -631,4 +645,12 @@ template bool EditRealInput<8>(IoStatementState &, const DataEdit &, void *);
  template bool EditRealInput<10>(IoStatementState &, const DataEdit &, void *);
  // TODO: double/double
  template bool EditRealInput<16>(IoStatementState &, const DataEdit &, void *);
+
+template bool EditCharacterInput(
+    IoStatementState &, const DataEdit &, char *, std::size_t);
+template bool EditCharacterInput(
+    IoStatementState &, const DataEdit &, char16_t *, std::size_t);
+template bool EditCharacterInput(
+    IoStatementState &, const DataEdit &, char32_t *, std::size_t);
+
  } // namespace Fortran::runtime::io
diff --git a/flang/runtime/edit-input.h b/flang/runtime/edit-input.h

index a8b0e76cfefd4f94f8bc16386a9396703c68db5d..61844a1199a748ebe74e873197826f987a212584 100644 (file)
--- a/flang/runtime/edit-input.h
+++ b/flang/runtime/edit-input.h
@@ -21,8 +21,10 @@ template <int KIND>
  bool EditRealInput(IoStatementState &, const DataEdit &, void *);
  
  bool EditLogicalInput(IoStatementState &, const DataEdit &, bool &);
-bool EditDefaultCharacterInput(
-    IoStatementState &, const DataEdit &, char *, std::size_t);
+
+template <typename CHAR>
+bool EditCharacterInput(
+    IoStatementState &, const DataEdit &, CHAR *, std::size_t);
  
  extern template bool EditRealInput<2>(
      IoStatementState &, const DataEdit &, void *);
@@ -37,5 +39,13 @@ extern template bool EditRealInput<10>(
  // TODO: double/double
  extern template bool EditRealInput<16>(
      IoStatementState &, const DataEdit &, void *);
+
+extern template bool EditCharacterInput(
+    IoStatementState &, const DataEdit &, char *, std::size_t);
+extern template bool EditCharacterInput(
+    IoStatementState &, const DataEdit &, char16_t *, std::size_t);
+extern template bool EditCharacterInput(
+    IoStatementState &, const DataEdit &, char32_t *, std::size_t);
+
  } // namespace Fortran::runtime::io
  #endif // FORTRAN_RUNTIME_EDIT_INPUT_H_
diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp

index aa5ef489d22e7bd3eb08c77ad9b43d06e7f344a2..e3bb5abb2bb985113118a997a42c2b58b3750e01 100644 (file)
--- a/flang/runtime/edit-output.cpp
+++ b/flang/runtime/edit-output.cpp
@@ -7,6 +7,7 @@
  //===----------------------------------------------------------------------===//
  
  #include "edit-output.h"
+#include "utf.h"
  #include "flang/Common/uint128.h"
  #include <algorithm>
  
@@ -53,7 +54,7 @@ bool EditIntegerOutput(IoStatementState &io, const DataEdit &edit,
      }
      break;
    case 'A': // legacy extension
-    return EditDefaultCharacterOutput(
+    return EditCharacterOutput(
          io, edit, reinterpret_cast<char *>(&n), sizeof n);
    default:
      io.GetIoErrorHandler().Crash(
@@ -434,7 +435,7 @@ template <int KIND> bool RealOutputEditing<KIND>::Edit(const DataEdit &edit) {
    case 'G':
      return Edit(EditForGOutput(edit));
    case 'A': // legacy extension
-    return EditDefaultCharacterOutput(
+    return EditCharacterOutput(
          io_, edit, reinterpret_cast<char *>(&x_), sizeof x_);
    default:
      if (edit.IsListDirected()) {
@@ -467,8 +468,9 @@ bool EditLogicalOutput(IoStatementState &io, const DataEdit &edit, bool truth) {
    }
  }
  
-bool ListDirectedDefaultCharacterOutput(IoStatementState &io,
-    ListDirectedStatementState<Direction::Output> &list, const char *x,
+template <typename CHAR>
+bool ListDirectedCharacterOutput(IoStatementState &io,
+    ListDirectedStatementState<Direction::Output> &list, const CHAR *x,
      std::size_t length) {
    bool ok{true};
    MutableModes &modes{io.mutableModes()};
@@ -477,11 +479,11 @@ bool ListDirectedDefaultCharacterOutput(IoStatementState &io,
      ok = ok && list.EmitLeadingSpaceOrAdvance(io);
      // Value is delimited with ' or " marks, and interior
      // instances of that character are doubled.
-    auto EmitOne{[&](char ch) {
+    auto EmitOne{[&](CHAR ch) {
        if (connection.NeedAdvance(1)) {
          ok = ok && io.AdvanceRecord();
        }
-      ok = ok && io.Emit(&ch, 1);
+      ok = ok && io.EmitEncoded(&ch, 1);
      }};
      EmitOne(modes.delim);
      for (std::size_t j{0}; j < length; ++j) {
@@ -494,7 +496,7 @@ bool ListDirectedDefaultCharacterOutput(IoStatementState &io,
        // the same thing when tested with this case.
        // This runtime splits the doubled delimiters across
        // two records for lack of a better alternative.
-      if (x[j] == modes.delim) {
+      if (x[j] == static_cast<CHAR>(modes.delim)) {
          EmitOne(x[j]);
        }
        EmitOne(x[j]);
@@ -504,12 +506,15 @@ bool ListDirectedDefaultCharacterOutput(IoStatementState &io,
      // Undelimited list-directed output
      ok = ok && list.EmitLeadingSpaceOrAdvance(io, length > 0 ? 1 : 0, true);
      std::size_t put{0};
+    std::size_t oneIfUTF8{connection.isUTF8 ? 1 : length};
      while (ok && put < length) {
-      auto chunk{std::min(length - put, connection.RemainingSpaceInRecord())};
-      ok = ok && io.Emit(x + put, chunk);
-      put += chunk;
-      if (put < length) {
-        ok = ok && io.AdvanceRecord() && io.Emit(" ", 1);
+      if (std::size_t chunk{std::min<std::size_t>(
+              std::min<std::size_t>(length - put, oneIfUTF8),
+              connection.RemainingSpaceInRecord())}) {
+        ok = io.EmitEncoded(x + put, chunk);
+        put += chunk;
+      } else {
+        ok = io.AdvanceRecord() && io.Emit(" ", 1);
        }
      }
      list.set_lastWasUndelimitedCharacter(true);
@@ -517,8 +522,9 @@ bool ListDirectedDefaultCharacterOutput(IoStatementState &io,
    return ok;
  }
  
-bool EditDefaultCharacterOutput(IoStatementState &io, const DataEdit &edit,
-    const char *x, std::size_t length) {
+template <typename CHAR>
+bool EditCharacterOutput(IoStatementState &io, const DataEdit &edit,
+    const CHAR *x, std::size_t length) {
    switch (edit.descriptor) {
    case 'A':
    case 'G':
@@ -532,7 +538,7 @@ bool EditDefaultCharacterOutput(IoStatementState &io, const DataEdit &edit,
    int len{static_cast<int>(length)};
    int width{edit.width.value_or(len)};
    return io.EmitRepeated(' ', std::max(0, width - len)) &&
-      io.Emit(x, std::min(width, len));
+      io.EmitEncoded(x, std::min(width, len));
  }
  
  template bool EditIntegerOutput<1>(
@@ -553,4 +559,22 @@ template class RealOutputEditing<8>;
  template class RealOutputEditing<10>;
  // TODO: double/double
  template class RealOutputEditing<16>;
+
+template bool ListDirectedCharacterOutput(IoStatementState &,
+    ListDirectedStatementState<Direction::Output> &, const char *,
+    std::size_t chars);
+template bool ListDirectedCharacterOutput(IoStatementState &,
+    ListDirectedStatementState<Direction::Output> &, const char16_t *,
+    std::size_t chars);
+template bool ListDirectedCharacterOutput(IoStatementState &,
+    ListDirectedStatementState<Direction::Output> &, const char32_t *,
+    std::size_t chars);
+
+template bool EditCharacterOutput(
+    IoStatementState &, const DataEdit &, const char *, std::size_t chars);
+template bool EditCharacterOutput(
+    IoStatementState &, const DataEdit &, const char16_t *, std::size_t chars);
+template bool EditCharacterOutput(
+    IoStatementState &, const DataEdit &, const char32_t *, std::size_t chars);
+
  } // namespace Fortran::runtime::io
diff --git a/flang/runtime/edit-output.h b/flang/runtime/edit-output.h

index bcb6fb0b6bfa7f6ced3e949de1229b913bb338d8..bd1377e3a18c4e3ed76970b53446fc038f8cc4c6 100644 (file)
--- a/flang/runtime/edit-output.h
+++ b/flang/runtime/edit-output.h
@@ -94,10 +94,30 @@ private:
  bool ListDirectedLogicalOutput(
      IoStatementState &, ListDirectedStatementState<Direction::Output> &, bool);
  bool EditLogicalOutput(IoStatementState &, const DataEdit &, bool);
-bool ListDirectedDefaultCharacterOutput(IoStatementState &,
-    ListDirectedStatementState<Direction::Output> &, const char *, std::size_t);
-bool EditDefaultCharacterOutput(
-    IoStatementState &, const DataEdit &, const char *, std::size_t);
+
+template <typename CHAR>
+bool ListDirectedCharacterOutput(IoStatementState &,
+    ListDirectedStatementState<Direction::Output> &, const CHAR *,
+    std::size_t chars);
+extern template bool ListDirectedCharacterOutput(IoStatementState &,
+    ListDirectedStatementState<Direction::Output> &, const char *,
+    std::size_t chars);
+extern template bool ListDirectedCharacterOutput(IoStatementState &,
+    ListDirectedStatementState<Direction::Output> &, const char16_t *,
+    std::size_t chars);
+extern template bool ListDirectedCharacterOutput(IoStatementState &,
+    ListDirectedStatementState<Direction::Output> &, const char32_t *,
+    std::size_t chars);
+
+template <typename CHAR>
+bool EditCharacterOutput(
+    IoStatementState &, const DataEdit &, const CHAR *, std::size_t chars);
+extern template bool EditCharacterOutput(
+    IoStatementState &, const DataEdit &, const char *, std::size_t chars);
+extern template bool EditCharacterOutput(
+    IoStatementState &, const DataEdit &, const char16_t *, std::size_t chars);
+extern template bool EditCharacterOutput(
+    IoStatementState &, const DataEdit &, const char32_t *, std::size_t chars);
  
  extern template bool EditIntegerOutput<1>(
      IoStatementState &, const DataEdit &, std::int8_t);
diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp

index 53af239facea2d79e1a26cf9548220782bd5bf42..7ecbdce6bf961b45dc411059f18368c47656950d 100644 (file)
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -78,6 +78,17 @@ void ExecutionEnvironment::Configure(
      }
    }
  
+  if (auto *x{std::getenv("DEFAULT_UTF8")}) { // DEFAULT_UTF8=0 or 1 sets defaultUTF8
+    char *end;
+    auto n{std::strtol(x, &end, 10)};
+    if (n >= 0 && n <= 1 && *end == '\0') { // only 0 or 1 with nothing after the number; NOTE(review): an empty value also passes (strtol yields 0 with end == x) - confirm intended
+      defaultUTF8 = n != 0;
+    } else { // anything else is reported on stderr and the setting is left unchanged
+      std::fprintf(
+          stderr, "Fortran runtime: DEFAULT_UTF8=%s is invalid; ignored\n", x);
+    }
+  }
+
    // TODO: Set RP/ROUND='PROCESSOR_DEFINED' from environment
  }
  
diff --git a/flang/runtime/environment.h b/flang/runtime/environment.h

index 7db6cf3f5723b76f12b99e41ff30d5ccddb0ee06..b6223a88446ceaa97e6da0e568452b9aa6b35dd6 100644 (file)
--- a/flang/runtime/environment.h
+++ b/flang/runtime/environment.h
@@ -30,19 +30,23 @@ enum class Convert { Unknown, Native, LittleEndian, BigEndian, Swap };
  std::optional<Convert> GetConvertFromString(const char *, std::size_t);
  
  struct ExecutionEnvironment {
+  constexpr ExecutionEnvironment(){};
    void Configure(int argc, const char *argv[], const char *envp[]);
    const char *GetEnv(
        const char *name, std::size_t name_length, const Terminator &terminator);
  
-  int argc;
-  const char **argv;
-  const char **envp;
+  int argc{0};
+  const char **argv{nullptr};
+  const char **envp{nullptr};
  
-  int listDirectedOutputLineLengthLimit; // FORT_FMT_RECL
-  enum decimal::FortranRounding defaultOutputRoundingMode;
-  Convert conversion; // FORT_CONVERT
-  bool noStopMessage; // NO_STOP_MESSAGE=1 inhibits "Fortran STOP"
+  int listDirectedOutputLineLengthLimit{79}; // FORT_FMT_RECL
+  enum decimal::FortranRounding defaultOutputRoundingMode{
+      decimal::FortranRounding::RoundNearest}; // RP(==PN)
+  Convert conversion{Convert::Unknown}; // FORT_CONVERT
+  bool noStopMessage{false}; // NO_STOP_MESSAGE=1 inhibits "Fortran STOP"
+  bool defaultUTF8{false}; // DEFAULT_UTF8
  };
+
  extern ExecutionEnvironment executionEnvironment;
  } // namespace Fortran::runtime
  
diff --git a/flang/runtime/internal-unit.cpp b/flang/runtime/internal-unit.cpp

index 0c833ba548ec725bd03d38520bb657240d5a0b3c..39a8e4b2c9c4e55a8b3cc6f280df958d94a5db17 100644 (file)
--- a/flang/runtime/internal-unit.cpp
+++ b/flang/runtime/internal-unit.cpp
@@ -102,21 +102,6 @@ std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
    }
  }
  
-template <Direction DIR>
-std::optional<char32_t> InternalDescriptorUnit<DIR>::GetCurrentChar(
-    IoErrorHandler &handler) {
-  const char *p{nullptr};
-  std::size_t bytes{GetNextInputBytes(p, handler)};
-  if (bytes == 0) {
-    return std::nullopt;
-  } else {
-    if (isUTF8) {
-      // TODO: UTF-8 decoding
-    }
-    return *p;
-  }
-}
-
  template <Direction DIR>
  bool InternalDescriptorUnit<DIR>::AdvanceRecord(IoErrorHandler &handler) {
    if (currentRecordNumber >= endfileRecordNumber.value_or(0)) {
diff --git a/flang/runtime/internal-unit.h b/flang/runtime/internal-unit.h

index ad52cc761de53b2ea1a4041d139c378485e761ec..e59866013188c3a166ec4d6618abfb93c6cf4968 100644 (file)
--- a/flang/runtime/internal-unit.h
+++ b/flang/runtime/internal-unit.h
@@ -32,7 +32,6 @@ public:
  
    bool Emit(const char *, std::size_t, IoErrorHandler &);
    std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
-  std::optional<char32_t> GetCurrentChar(IoErrorHandler &);
    bool AdvanceRecord(IoErrorHandler &);
    void BackspaceRecord(IoErrorHandler &);
  
diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp

index 1a8b06068802df8e55c9025947d8d4d1b4ce0992..ec824d9b3cdffde4c167649ef66c8d88ee31ce33 100644 (file)
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -11,11 +11,13 @@
  #include "format.h"
  #include "tools.h"
  #include "unit.h"
+#include "utf.h"
  #include "flang/Runtime/memory.h"
  #include <algorithm>
  #include <cstdio>
  #include <cstring>
  #include <limits>
+#include <type_traits>
  
  namespace Fortran::runtime::io {
  
@@ -357,7 +359,6 @@ bool ExternalIoStatementState<DIR>::Emit(
      Crash(
          "ExternalIoStatementState::Emit(char16_t) called for input statement");
    }
-  // TODO: UTF-8 encoding
    return unit().Emit(reinterpret_cast<const char *>(data), chars * sizeof *data,
        sizeof *data, *this);
  }
@@ -369,7 +370,6 @@ bool ExternalIoStatementState<DIR>::Emit(
      Crash(
          "ExternalIoStatementState::Emit(char32_t) called for input statement");
    }
-  // TODO: UTF-8 encoding
    return unit().Emit(reinterpret_cast<const char *>(data), chars * sizeof *data,
        sizeof *data, *this);
  }
@@ -472,6 +472,30 @@ bool IoStatementState::Emit(const char32_t *data, std::size_t chars) {
    return std::visit([=](auto &x) { return x.get().Emit(data, chars); }, u_);
  }
  
+template <typename CHAR> // Emits `chars` elements of `data0`; each is UTF-8 encoded first when the connection is UTF-8
+bool IoStatementState::EmitEncoded(const CHAR *data0, std::size_t chars) { // returns false as soon as an underlying Emit() fails
+  // Don't allow sign extension: elements are read through the unsigned counterpart of CHAR
+  using UnsignedChar = std::make_unsigned_t<CHAR>;
+  const UnsignedChar *data{reinterpret_cast<const UnsignedChar *>(data0)};
+  if (GetConnectionState().isUTF8) {
+    char buffer[256]; // staging area so encoded bytes are emitted in batches
+    std::size_t at{0}; // number of encoded bytes currently staged in buffer
+    while (chars-- > 0) {
+      auto len{EncodeUTF8(buffer + at, *data++)};
+      at += len;
+      if (at + maxUTF8Bytes > sizeof buffer) { // flush once fewer than maxUTF8Bytes bytes of headroom remain (presumably the longest encoding EncodeUTF8 writes; defined in utf.h, not shown)
+        if (!Emit(buffer, at)) {
+          return false;
+        }
+        at = 0;
+      }
+    }
+    return at == 0 || Emit(buffer, at); // emit whatever is still staged, if anything
+  } else {
+    return Emit(data0, chars); // not UTF-8: hand the original pointer and count to Emit() unchanged
+  }
+}
+
  bool IoStatementState::Receive(
      char *data, std::size_t n, std::size_t elementBytes) {
    return std::visit(
@@ -533,6 +557,30 @@ ExternalFileUnit *IoStatementState::GetExternalFileUnit() const {
    return std::visit([](auto &x) { return x.get().GetExternalFileUnit(); }, u_);
  }
  
+std::optional<char32_t> IoStatementState::GetCurrentChar( // Returns the character at the current input position, or nullopt when GetNextInputBytes() supplies no bytes
+    std::size_t &byteCount) { // byteCount: set to the number of input bytes the returned character occupies (0 when nullopt)
+  const char *p{nullptr};
+  std::size_t bytes{GetNextInputBytes(p)};
+  if (bytes == 0) {
+    byteCount = 0;
+    return std::nullopt;
+  } else {
+    if (GetConnectionState().isUTF8) {
+      std::size_t length{MeasureUTF8Bytes(*p)}; // sequence length as judged from the leading byte
+      if (length <= bytes) { // decode only when the whole sequence lies within the bytes just obtained
+        if (auto result{DecodeUTF8(p)}) {
+          byteCount = length;
+          return result;
+        }
+      }
+      GetIoErrorHandler().SignalError(IostatUTF8Decoding); // reached when the sequence is longer than the available bytes or DecodeUTF8 yields no value
+      // Error recovery: return the next byte
+    }
+    byteCount = 1; // non-UTF-8 connection, or fallback after a decoding error: consume a single byte
+    return *p; // NOTE(review): *p is a plain char; where char is signed, a byte >= 0x80 would sign-extend on conversion to char32_t - confirm intended
+  }
+}
+
  bool IoStatementState::EmitRepeated(char ch, std::size_t n) {
    return std::visit(
        [=](auto &x) {
@@ -561,8 +609,9 @@ bool IoStatementState::EmitField(
  
  std::optional<char32_t> IoStatementState::NextInField(
      std::optional<int> &remaining, const DataEdit &edit) {
+  std::size_t byteCount{0};
    if (!remaining) { // Stream, list-directed, or NAMELIST
-    if (auto next{GetCurrentChar()}) {
+    if (auto next{GetCurrentChar(byteCount)}) {
        if (edit.IsListDirected()) {
          // list-directed or NAMELIST: check for separators
          switch (*next) {
@@ -587,15 +636,18 @@ std::optional<char32_t> IoStatementState::NextInField(
            break;
          }
        }
-      HandleRelativePosition(1);
-      GotChar();
+      HandleRelativePosition(byteCount);
+      GotChar(byteCount);
        return next;
      }
    } else if (*remaining > 0) {
-    if (auto next{GetCurrentChar()}) {
-      --*remaining;
-      HandleRelativePosition(1);
-      GotChar();
+    if (auto next{GetCurrentChar(byteCount)}) {
+      if (byteCount > static_cast<std::size_t>(*remaining)) {
+        return std::nullopt;
+      }
+      *remaining -= byteCount;
+      HandleRelativePosition(byteCount);
+      GotChar(byteCount);
        return next;
      }
      if (CheckForEndOfRecord()) { // do padding
@@ -708,12 +760,13 @@ ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
    if (edit.modes.editingFlags & decimalComma) {
      comma = ';';
    }
+  std::size_t byteCount{0};
    if (remaining_ > 0 && !realPart_) { // "r*c" repetition in progress
      RUNTIME_CHECK(io.GetIoErrorHandler(), repeatPosition_.has_value());
      repeatPosition_.reset(); // restores the saved position
      if (!imaginaryPart_) {
        edit.repeat = std::min<int>(remaining_, maxRepeat);
-      auto ch{io.GetCurrentChar()};
+      auto ch{io.GetCurrentChar(byteCount)};
        if (!ch || *ch == ' ' || *ch == '\t' || *ch == comma) {
          // "r*" repeated null
          edit.descriptor = DataEdit::ListDirectedNullValue;
@@ -733,14 +786,14 @@ ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
      imaginaryPart_ = true;
      edit.descriptor = DataEdit::ListDirectedImaginaryPart;
    }
-  auto ch{io.GetNextNonBlank()};
+  auto ch{io.GetNextNonBlank(byteCount)};
    if (ch && *ch == comma && eatComma_) {
      // Consume comma & whitespace after previous item.
      // This includes the comma between real and imaginary components
      // in list-directed/NAMELIST complex input.
      // (When DECIMAL='COMMA', the comma is actually a semicolon.)
-    io.HandleRelativePosition(1);
-    ch = io.GetNextNonBlank();
+    io.HandleRelativePosition(byteCount);
+    ch = io.GetNextNonBlank(byteCount);
    }
    eatComma_ = true;
    if (!ch) {
@@ -768,12 +821,12 @@ ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
          break;
        }
        r = 10 * r + (*ch - '0');
-      io.HandleRelativePosition(1);
-      ch = io.GetCurrentChar();
+      io.HandleRelativePosition(byteCount);
+      ch = io.GetCurrentChar(byteCount);
      } while (ch && *ch >= '0' && *ch <= '9');
      if (r > 0 && ch && *ch == '*') { // subtle: r must be nonzero
-      io.HandleRelativePosition(1);
-      ch = io.GetCurrentChar();
+      io.HandleRelativePosition(byteCount);
+      ch = io.GetCurrentChar(byteCount);
        if (ch && *ch == '/') { // r*/
          hitSlash_ = true;
          edit.descriptor = DataEdit::ListDirectedNullValue;
@@ -793,7 +846,7 @@ ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
    }
    if (!imaginaryPart_ && ch && *ch == '(') {
      realPart_ = true;
-    io.HandleRelativePosition(1);
+    io.HandleRelativePosition(byteCount);
      edit.descriptor = DataEdit::ListDirectedRealPart;
    }
    return edit;
@@ -1445,4 +1498,10 @@ int ErroneousIoStatementState::EndIoStatement() {
    return IoStatementBase::EndIoStatement();
  }
  
+template bool IoStatementState::EmitEncoded<char>(const char *, std::size_t);
+template bool IoStatementState::EmitEncoded<char16_t>(
+    const char16_t *, std::size_t);
+template bool IoStatementState::EmitEncoded<char32_t>(
+    const char32_t *, std::size_t);
+
  } // namespace Fortran::runtime::io
diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h

index 2c43151296b8a63166a1229c4e89d0502451c9a4..0ed14e5ad6a4d057d3404e483d4f3e858498f3ff 100644 (file)
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -90,6 +90,7 @@ public:
    bool Emit(const char *, std::size_t);
    bool Emit(const char16_t *, std::size_t chars);
    bool Emit(const char32_t *, std::size_t chars);
+  template <typename CHAR> bool EmitEncoded(const CHAR *, std::size_t);
    bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
    std::size_t GetNextInputBytes(const char *&);
    bool AdvanceRecord(int = 1);
@@ -123,16 +124,7 @@ public:
    }
  
    // Vacant after the end of the current record
-  std::optional<char32_t> GetCurrentChar() {
-    const char *p{nullptr};
-    std::size_t bytes{GetNextInputBytes(p)};
-    if (bytes == 0) {
-      return std::nullopt;
-    } else {
-      // TODO: UTF-8 decoding; may have to get more bytes in a loop
-      return *p;
-    }
-  }
+  std::optional<char32_t> GetCurrentChar(std::size_t &byteCount);
  
    bool EmitRepeated(char, std::size_t);
    bool EmitField(const char *, std::size_t length, std::size_t width);
@@ -144,7 +136,8 @@ public:
        const DataEdit &edit, std::optional<int> &remaining) {
      remaining.reset();
      if (edit.descriptor == DataEdit::ListDirected) {
-      GetNextNonBlank();
+      std::size_t byteCount{0};
+      GetNextNonBlank(byteCount);
      } else {
        if (edit.width.value_or(0) > 0) {
          remaining = *edit.width;
@@ -156,15 +149,19 @@ public:
  
    std::optional<char32_t> SkipSpaces(std::optional<int> &remaining) {
      while (!remaining || *remaining > 0) {
-      if (auto ch{GetCurrentChar()}) {
+      std::size_t byteCount{0};
+      if (auto ch{GetCurrentChar(byteCount)}) {
          if (*ch != ' ' && *ch != '\t') {
            return ch;
          }
-        HandleRelativePosition(1);
          if (remaining) {
-          GotChar();
-          --*remaining;
+          if (static_cast<std::size_t>(*remaining) < byteCount) {
+            break;
+          }
+          GotChar(byteCount);
+          *remaining -= byteCount;
          }
+        HandleRelativePosition(byteCount);
        } else {
          break;
        }
@@ -182,16 +179,16 @@ public:
    bool CheckForEndOfRecord();
  
    // Skips spaces, advances records, and ignores NAMELIST comments
-  std::optional<char32_t> GetNextNonBlank() {
-    auto ch{GetCurrentChar()};
+  std::optional<char32_t> GetNextNonBlank(std::size_t &byteCount) {
+    auto ch{GetCurrentChar(byteCount)};
      bool inNamelist{mutableModes().inNamelist};
      while (!ch || *ch == ' ' || *ch == '\t' || (inNamelist && *ch == '!')) {
        if (ch && (*ch == ' ' || *ch == '\t')) {
-        HandleRelativePosition(1);
+        HandleRelativePosition(byteCount);
        } else if (!AdvanceRecord()) {
          return std::nullopt;
        }
-      ch = GetCurrentChar();
+      ch = GetCurrentChar(byteCount);
      }
      return ch;
    }
@@ -721,5 +718,12 @@ private:
    ConnectionState connection_;
  };
  
+extern template bool IoStatementState::EmitEncoded<char>(
+    const char *, std::size_t);
+extern template bool IoStatementState::EmitEncoded<char16_t>(
+    const char16_t *, std::size_t);
+extern template bool IoStatementState::EmitEncoded<char32_t>(
+    const char32_t *, std::size_t);
+
  } // namespace Fortran::runtime::io
  #endif // FORTRAN_RUNTIME_IO_STMT_H_
diff --git a/flang/runtime/iostat.cpp b/flang/runtime/iostat.cpp

index f6305eaca6559927615c16497a8a6af558fff6cc..73cf2b4e5800200631d0d3857991f0b82ed5f5f7 100644 (file)
--- a/flang/runtime/iostat.cpp
+++ b/flang/runtime/iostat.cpp
@@ -75,6 +75,8 @@ const char *IostatErrorString(int iostat) {
      return "Sequential record missing its terminator";
    case IostatBadUnformattedRecord:
      return "Erroneous unformatted sequential file record structure";
+  case IostatUTF8Decoding:
+    return "UTF-8 decoding error";
    default:
      return nullptr;
    }
diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp

index 762b885b56b3ba1a839804864e66ed5686330e2d..3e2c7a012bada3b41075e99c42e0f1c1217cd83b 100644 (file)
--- a/flang/runtime/namelist.cpp
+++ b/flang/runtime/namelist.cpp
@@ -86,13 +86,14 @@ static constexpr char NormalizeIdChar(char32_t ch) {
  
  static bool GetLowerCaseName(
      IoStatementState &io, char buffer[], std::size_t maxLength) {
-  if (auto ch{io.GetNextNonBlank()}) {
+  std::size_t byteLength{0};
+  if (auto ch{io.GetNextNonBlank(byteLength)}) {
      if (IsLegalIdStart(*ch)) {
        std::size_t j{0};
        do {
          buffer[j] = NormalizeIdChar(*ch);
-        io.HandleRelativePosition(1);
-        ch = io.GetCurrentChar();
+        io.HandleRelativePosition(byteLength);
+        ch = io.GetCurrentChar(byteLength);
        } while (++j < maxLength && ch && IsLegalIdChar(*ch));
        buffer[j++] = '\0';
        if (j <= maxLength) {
@@ -107,19 +108,20 @@ static bool GetLowerCaseName(
  
  static std::optional<SubscriptValue> GetSubscriptValue(IoStatementState &io) {
    std::optional<SubscriptValue> value;
-  std::optional<char32_t> ch{io.GetCurrentChar()};
+  std::size_t byteCount{0};
+  std::optional<char32_t> ch{io.GetCurrentChar(byteCount)};
    bool negate{ch && *ch == '-'};
    if ((ch && *ch == '+') || negate) {
-    io.HandleRelativePosition(1);
-    ch = io.GetCurrentChar();
+    io.HandleRelativePosition(byteCount);
+    ch = io.GetCurrentChar(byteCount);
    }
    bool overflow{false};
    while (ch && *ch >= '0' && *ch <= '9') {
      SubscriptValue was{value.value_or(0)};
      overflow |= was >= std::numeric_limits<SubscriptValue>::max() / 10;
      value = 10 * was + *ch - '0';
-    io.HandleRelativePosition(1);
-    ch = io.GetCurrentChar();
+    io.HandleRelativePosition(byteCount);
+    ch = io.GetCurrentChar(byteCount);
    }
    if (overflow) {
      io.GetIoErrorHandler().SignalError(
@@ -130,7 +132,7 @@ static std::optional<SubscriptValue> GetSubscriptValue(IoStatementState &io) {
      if (value) {
        return -*value;
      } else {
-      io.HandleRelativePosition(-1); // give back '-' with no digits
+      io.HandleRelativePosition(-1); // give back '-' (1 byte); byteCount is the next char's
      }
    }
    return value;
@@ -146,7 +148,8 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc,
    int j{0};
    std::size_t contiguousStride{source.ElementBytes()};
    bool ok{true};
-  std::optional<char32_t> ch{io.GetNextNonBlank()};
+  std::size_t byteCount{0};
+  std::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
    char32_t comma{GetComma(io)};
    for (; ch && *ch != ')'; ++j) {
      SubscriptValue dimLower{0}, dimUpper{0}, dimStride{0};
@@ -176,11 +179,11 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc,
        } else {
          dimLower = *low;
        }
-      ch = io.GetNextNonBlank();
+      ch = io.GetNextNonBlank(byteCount);
      }
      if (ch && *ch == ':') {
-      io.HandleRelativePosition(1);
-      ch = io.GetNextNonBlank();
+      io.HandleRelativePosition(byteCount);
+      ch = io.GetNextNonBlank(byteCount);
        if (auto high{GetSubscriptValue(io)}) {
          if (*high > dimUpper) {
            if (ok) {
@@ -194,14 +197,14 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc,
          } else {
            dimUpper = *high;
          }
-        ch = io.GetNextNonBlank();
+        ch = io.GetNextNonBlank(byteCount);
        }
        if (ch && *ch == ':') {
-        io.HandleRelativePosition(1);
-        ch = io.GetNextNonBlank();
+        io.HandleRelativePosition(byteCount);
+        ch = io.GetNextNonBlank(byteCount);
          if (auto str{GetSubscriptValue(io)}) {
            dimStride = *str;
-          ch = io.GetNextNonBlank();
+          ch = io.GetNextNonBlank(byteCount);
          }
        }
      } else { // scalar
@@ -209,8 +212,8 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc,
        dimStride = 0;
      }
      if (ch && *ch == comma) {
-      io.HandleRelativePosition(1);
-      ch = io.GetNextNonBlank();
+      io.HandleRelativePosition(byteCount);
+      ch = io.GetNextNonBlank(byteCount);
      }
      if (ok) {
        lower[j] = dimLower;
@@ -220,7 +223,7 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc,
    }
    if (ok) {
      if (ch && *ch == ')') {
-      io.HandleRelativePosition(1);
+      io.HandleRelativePosition(byteCount);
        if (desc.EstablishPointerSection(source, lower, upper, stride)) {
          return true;
        } else {
@@ -250,29 +253,30 @@ static bool HandleSubstring(
    // ambiguous within the parentheses.
    io.HandleRelativePosition(1); // skip '('
    std::optional<SubscriptValue> lower, upper;
-  std::optional<char32_t> ch{io.GetNextNonBlank()};
+  std::size_t byteCount{0};
+  std::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
    if (ch) {
      if (*ch == ':') {
        lower = 1;
      } else {
        lower = GetSubscriptValue(io);
-      ch = io.GetNextNonBlank();
+      ch = io.GetNextNonBlank(byteCount);
      }
    }
    if (ch && ch == ':') {
-    io.HandleRelativePosition(1);
-    ch = io.GetNextNonBlank();
+    io.HandleRelativePosition(byteCount);
+    ch = io.GetNextNonBlank(byteCount);
      if (ch) {
        if (*ch == ')') {
          upper = chars;
        } else {
          upper = GetSubscriptValue(io);
-        ch = io.GetNextNonBlank();
+        ch = io.GetNextNonBlank(byteCount);
        }
      }
    }
    if (ch && *ch == ')') {
-    io.HandleRelativePosition(1);
+    io.HandleRelativePosition(byteCount);
      if (lower && upper) {
        if (*lower > *upper) {
          // An empty substring, whatever the values are
@@ -335,16 +339,17 @@ static bool HandleComponent(IoStatementState &io, Descriptor &desc,
  
  // Advance to the terminal '/' of a namelist group.
  static void SkipNamelistGroup(IoStatementState &io) {
-  while (auto ch{io.GetNextNonBlank()}) {
-    io.HandleRelativePosition(1);
+  std::size_t byteCount{0};
+  while (auto ch{io.GetNextNonBlank(byteCount)}) {
+    io.HandleRelativePosition(byteCount);
      if (*ch == '/') {
        break;
      } else if (*ch == '\'' || *ch == '"') {
        // Skip quoted character literal
        char32_t quote{*ch};
        while (true) {
-        if ((ch = io.GetCurrentChar())) {
-          io.HandleRelativePosition(1);
+        if ((ch = io.GetCurrentChar(byteCount))) {
+          io.HandleRelativePosition(byteCount);
            if (*ch == quote) {
              break;
            }
@@ -369,14 +374,15 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
    char name[nameBufferSize];
    RUNTIME_CHECK(handler, group.groupName != nullptr);
    char32_t comma{GetComma(io)};
+  std::size_t byteCount{0};
    while (true) {
-    next = io.GetNextNonBlank();
+    next = io.GetNextNonBlank(byteCount);
      while (next && *next != '&') {
        // Extension: comment lines without ! before namelist groups
        if (!io.AdvanceRecord()) {
          next.reset();
        } else {
-        next = io.GetNextNonBlank();
+        next = io.GetNextNonBlank(byteCount);
        }
      }
      if (!next || *next != '&') {
@@ -384,7 +390,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
            "NAMELIST input group does not begin with '&' (at '%lc')", *next);
        return false;
      }
-    io.HandleRelativePosition(1);
+    io.HandleRelativePosition(byteCount);
      if (!GetLowerCaseName(io, name, sizeof name)) {
        handler.SignalError("NAMELIST input group has no name");
        return false;
@@ -396,7 +402,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
    }
    // Read the group's items
    while (true) {
-    next = io.GetNextNonBlank();
+    next = io.GetNextNonBlank(byteCount);
      if (!next || *next == '/') {
        break;
      }
@@ -423,7 +429,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
      const Descriptor *useDescriptor{&itemDescriptor};
      StaticDescriptor<maxRank, true, 16> staticDesc[2];
      int whichStaticDesc{0};
-    next = io.GetCurrentChar();
+    next = io.GetCurrentChar(byteCount);
      bool hadSubscripts{false};
      bool hadSubstring{false};
      if (next && (*next == '(' || *next == '%')) {
@@ -456,25 +462,25 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
            hadSubstring = false;
          }
          useDescriptor = &mutableDescriptor;
-        next = io.GetCurrentChar();
+        next = io.GetCurrentChar(byteCount);
        } while (next && (*next == '(' || *next == '%'));
      }
      // Skip the '='
-    next = io.GetNextNonBlank();
+    next = io.GetNextNonBlank(byteCount);
      if (!next || *next != '=') {
        handler.SignalError("No '=' found after item '%s' in NAMELIST group '%s'",
            name, group.groupName);
        return false;
      }
-    io.HandleRelativePosition(1);
+    io.HandleRelativePosition(byteCount);
      // Read the values into the descriptor.  An array can be short.
      listInput->ResetForNextNamelistItem();
      if (!descr::DescriptorIO<Direction::Input>(io, *useDescriptor)) {
        return false;
      }
-    next = io.GetNextNonBlank();
+    next = io.GetNextNonBlank(byteCount);
      if (next && *next == comma) {
-      io.HandleRelativePosition(1);
+      io.HandleRelativePosition(byteCount);
      }
    }
    if (!next || *next != '/') {
@@ -490,13 +496,14 @@ bool IsNamelistName(IoStatementState &io) {
    if (io.get_if<ListDirectedStatementState<Direction::Input>>()) {
      if (io.mutableModes().inNamelist) {
        SavedPosition savedPosition{io};
-      if (auto ch{io.GetNextNonBlank()}) {
+      std::size_t byteCount{0};
+      if (auto ch{io.GetNextNonBlank(byteCount)}) {
          if (IsLegalIdStart(*ch)) {
            do {
-            io.HandleRelativePosition(1);
-            ch = io.GetCurrentChar();
+            io.HandleRelativePosition(byteCount);
+            ch = io.GetCurrentChar(byteCount);
            } while (ch && IsLegalIdChar(*ch));
-          ch = io.GetNextNonBlank();
+          ch = io.GetNextNonBlank(byteCount);
            // TODO: how to deal with NaN(...) ambiguity?
            return ch && (*ch == '=' || *ch == '(' || *ch == '%');
          }
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp

index 23e5b6292621b24a60e81fab3210fa870ca25fcc..2ba4faf23dc3ffed33a784436198acc3ce62e556 100644 (file)
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -7,7 +7,6 @@
  //===----------------------------------------------------------------------===//
  
  #include "unit.h"
-#include "environment.h"
  #include "io-error.h"
  #include "lock.h"
  #include "unit-map.h"
@@ -233,7 +232,6 @@ UnitMap &ExternalFileUnit::GetUnitMap() {
    error.isUnformatted = false;
    errorOutput = &error;
  
-  // TODO: Set UTF-8 mode from the environment
    unitMap = newUnitMap;
    return *unitMap;
  }
@@ -374,18 +372,6 @@ std::size_t ExternalFileUnit::GetNextInputBytes(
    return p ? length : 0;
  }
  
-std::optional<char32_t> ExternalFileUnit::GetCurrentChar(
-    IoErrorHandler &handler) {
-  const char *p{nullptr};
-  std::size_t bytes{GetNextInputBytes(p, handler)};
-  if (bytes == 0) {
-    return std::nullopt;
-  } else {
-    // TODO: UTF-8 decoding; may have to get more bytes in a loop
-    return *p;
-  }
-}
-
  const char *ExternalFileUnit::FrameNextInput(
      IoErrorHandler &handler, std::size_t bytes) {
    RUNTIME_CHECK(handler, isUnformatted.has_value() && !*isUnformatted);
diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h

index 7be5e2f387f8d7789af3c37c0b41b4f06635a245..6e1a5ffbac7d8e7927d6bc211edca65d5d027e5b 100644 (file)
--- a/flang/runtime/unit.h
+++ b/flang/runtime/unit.h
@@ -13,6 +13,7 @@
  
  #include "buffer.h"
  #include "connection.h"
+#include "environment.h"
  #include "file.h"
  #include "format.h"
  #include "io-error.h"
@@ -34,7 +35,9 @@ class ExternalFileUnit : public ConnectionState,
                           public OpenFile,
                           public FileFrame<ExternalFileUnit> {
  public:
-  explicit ExternalFileUnit(int unitNumber) : unitNumber_{unitNumber} {}
+  explicit ExternalFileUnit(int unitNumber) : unitNumber_{unitNumber} {
+    isUTF8 = executionEnvironment.defaultUTF8;
+  }
    ~ExternalFileUnit() {}
  
    int unitNumber() const { return unitNumber_; }
@@ -80,7 +83,6 @@ public:
        const char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
    bool Receive(char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
    std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
-  std::optional<char32_t> GetCurrentChar(IoErrorHandler &);
    void SetLeftTabLimit();
    bool BeginReadingRecord(IoErrorHandler &);
    void FinishReadingRecord(IoErrorHandler &);
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp

new file mode 100644 (file)

index 0000000..8f59ddb
--- /dev/null
+++ b/flang/runtime/utf.cpp
@@ -0,0 +1,111 @@
+//===-- runtime/utf.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "utf.h"
+
+namespace Fortran::runtime {
+
+// clang-format off
+const std::uint8_t UTF8FirstByteTable[256]{ // length by first byte; 0: invalid
+  /* 00 - 7F:  7 bit payload in single byte */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  /* 80 - BF: invalid first byte, valid later byte */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  /* C0 - DF: 11 bit payload */
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  /* E0 - EF: 16 bit payload */
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4,
+  /* F8 - FB: 26 bit payload */ 5, 5, 5, 5,
+  /* FC - FD: 31 bit payload */ 6, 6,
+  /* FE:      36 bit payload (32 used) */ 7,
+  /* FF:      invalid */ 0
+};
+// clang-format on
+
+// Decode one sequence at p0 (nullopt if invalid); non-minimal forms accepted.
+std::optional<char32_t> DecodeUTF8(const char *p0) {
+  const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
+  std::size_t bytes{MeasureUTF8Bytes(*p0)}; // 0 when the first byte is invalid
+  if (bytes == 1) {
+    return char32_t{*p};
+  } else if (bytes > 1) {
+    std::uint64_t result{char32_t{*p} & (0x7f >> bytes)}; // first-byte payload
+    for (std::size_t j{1}; j < bytes; ++j) {
+      std::uint8_t next{p[j]};
+      if (next < 0x80 || next > 0xbf) { // not of the form 10xxxxxx
+        return std::nullopt;
+      }
+      result = (result << 6) | (next & 0x3f);
+    }
+    if (result <= 0xffffffff) { // 7-byte forms can carry more than 32 bits
+      return static_cast<char32_t>(result);
+    }
+  }
+  return std::nullopt;
+}
+
+std::size_t EncodeUTF8(char *p0, char32_t ucs) { // returns bytes written (1-7)
+  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
+  if (ucs <= 0x7f) { // 0xxxxxxx
+    p[0] = ucs;
+    return 1;
+  } else if (ucs <= 0x7ff) { // 110xxxxx + 1 trailing byte (11 bits)
+    p[0] = 0xc0 | (ucs >> 6);
+    p[1] = 0x80 | (ucs & 0x3f);
+    return 2;
+  } else if (ucs <= 0xffff) { // 1110xxxx + 2 trailing bytes (16 bits)
+    p[0] = 0xe0 | (ucs >> 12);
+    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[2] = 0x80 | (ucs & 0x3f);
+    return 3;
+  } else if (ucs <= 0x1fffff) { // 11110xxx + 3 trailing bytes (21 bits)
+    p[0] = 0xf0 | (ucs >> 18);
+    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[3] = 0x80 | (ucs & 0x3f);
+    return 4;
+  } else if (ucs <= 0x3ffffff) { // 111110xx + 4 trailing bytes (26 bits)
+    p[0] = 0xf8 | (ucs >> 24);
+    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[4] = 0x80 | (ucs & 0x3f);
+    return 5;
+  } else if (ucs <= 0x7fffffff) { // 1111110x + 5 trailing bytes (31 bits)
+    p[0] = 0xfc | (ucs >> 30);
+    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
+    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[5] = 0x80 | (ucs & 0x3f);
+    return 6;
+  } else { // 11111110 + 6 trailing bytes; only reached when bit 31 is set
+    p[0] = 0xfe;
+    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
+    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
+    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
+    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
+    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
+    p[6] = 0x80 | (ucs & 0x3f);
+    return 7;
+  }
+}
+
+} // namespace Fortran::runtime
diff --git a/flang/runtime/utf.h b/flang/runtime/utf.h

new file mode 100644 (file)

index 0000000..6d9943b
--- /dev/null
+++ b/flang/runtime/utf.h
@@ -0,0 +1,68 @@
+//===-- runtime/utf.h -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UTF-8 is the variant-width standard encoding of Unicode (ISO 10646)
+// code points.
+//
+// 7-bit values in [00 .. 7F] represent themselves as single bytes, so true
+// 7-bit ASCII is also valid UTF-8.
+//
+// Larger values are encoded with a start byte in [C0 .. FE] that carries
+// the length of the encoding and some of the upper bits of the value, followed
+// by one or more bytes in the range [80 .. BF].
+//
+// Specifically, the first byte holds two or more uppermost set bits,
+// a zero bit, and some payload; the second and later bytes each start with
+// their uppermost bit set, the next bit clear, and six bits of payload.
+// Payload parcels are in big-endian order.  All bytes must be present in a
+// valid sequence; i.e., low-order zero bits must be explicit.  UTF-8 is
+// self-synchronizing on input, as no byte value can be both a valid
+// first byte and a valid trailing byte.
+//
+// 0xxxxxxx - 7 bit ASCII
+// 110xxxxx 10xxxxxx - 11-bit value
+// 1110xxxx 10xxxxxx 10xxxxxx - 16-bit value
+// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value
+// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value
+// 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value
+// 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value
+//
+// Canonical UTF-8 sequences should be minimal, and our output is so, but
+// we do not reject non-minimal sequences on input.  Unicode only defines
+// code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual
+// standard maximum.  However, we support extended forms up to 32 bits so that
+// CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data.
+
+#ifndef FORTRAN_RUNTIME_UTF_H_
+#define FORTRAN_RUNTIME_UTF_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+
+namespace Fortran::runtime {
+
+// Derive the length in bytes (1 to 7) of a UTF-8 character encoding from its
+// first byte.  A zero result signifies an invalid first byte (80-BF or FF).
+extern const std::uint8_t UTF8FirstByteTable[256];
+static inline std::size_t MeasureUTF8Bytes(char first) {
+  return UTF8FirstByteTable[static_cast<std::uint8_t>(first)]; // index 0..255
+}
+
+static constexpr std::size_t maxUTF8Bytes{7};
+
+// Ensure that all bytes are present in sequence in the input buffer
+// before calling; use MeasureUTF8Bytes(first byte) to count them.
+std::optional<char32_t> DecodeUTF8(const char *);
+
+// Ensure that at least maxUTF8Bytes remain in the output
+// buffer before calling.
+std::size_t EncodeUTF8(char *, char32_t);
+
+} // namespace Fortran::runtime
+#endif // FORTRAN_RUNTIME_UTF_H_
diff --git a/flang/unittests/Runtime/ExternalIOTest.cpp b/flang/unittests/Runtime/ExternalIOTest.cpp

index fe88144bcff997aac3f20fea8750edf29102b6cc..d88a0e11d87d05a2be4e979176a375677f2a54dc 100644 (file)
--- a/flang/unittests/Runtime/ExternalIOTest.cpp
+++ b/flang/unittests/Runtime/ExternalIOTest.cpp
@@ -553,6 +553,10 @@ TEST(ExternalIOTests, TestNonAvancingInput) {
          << "Input-item value after non advancing read " << j;
      j++;
    }
+  // CLOSE(UNIT=unit)
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for Close";
  }
  
  TEST(ExternalIOTests, TestWriteAfterNonAvancingInput) {
@@ -645,9 +649,12 @@ TEST(ExternalIOTests, TestWriteAfterNonAvancingInput) {
        << "InputAscii() ";
    ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
        << "EndIoStatement() for Read ";
-
    ASSERT_EQ(resultRecord, expectedRecord)
        << "Record after non advancing read followed by write";
+  // CLOSE(UNIT=unit)
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for Close";
  }
  
  TEST(ExternalIOTests, TestWriteAfterEndfile) {
@@ -707,4 +714,184 @@ TEST(ExternalIOTests, TestWriteAfterEndfile) {
    ASSERT_FALSE(IONAME(InputInteger)(io, eof)) << "InputInteger(eof)";
    ASSERT_EQ(eof, -1) << "READ(eof)";
    ASSERT_EQ(IONAME(EndIoStatement)(io), IostatEnd) << "EndIoStatement for READ";
+  // CLOSE(UNIT=unit)
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for Close";
+}
+
+TEST(ExternalIOTests, TestUTF8Encoding) {
+  // OPEN(FILE="utf8test",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='REPLACE',ENCODING='UTF-8')
+  auto *io{IONAME(BeginOpenNewUnit)(__FILE__, __LINE__)};
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "utf8test", 8)) << "SetFile(utf8test)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "REPLACE", 7)) << "SetStatus(REPLACE)";
+  ASSERT_TRUE(IONAME(SetEncoding)(io, "UTF-8", 5)) << "SetEncoding(UTF-8)";
+  int unit{-1};
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first OPEN";
+  char buffer[12];
+  std::memcpy(buffer,
+      "abc\x80\xff"
+      "de\0\0\0\0\0",
+      12);
+  // WRITE(unit, *) buffer(1:7) -- includes the non-ASCII bytes \x80 and \xff
+  io = IONAME(BeginExternalListOutput)(unit, __FILE__, __LINE__);
+  StaticDescriptor<0> staticDescriptor;
+  Descriptor &desc{staticDescriptor.descriptor()};
+  desc.Establish(TypeCode{CFI_type_char}, 7, buffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for WRITE";
+  // REWIND(unit)
+  io = IONAME(BeginRewind)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement for REWIND";
+  // READ(unit, *) buffer -- same UTF-8 unit: expects the original 7 bytes back
+  desc.Establish(TypeCode(CFI_type_char), sizeof buffer, buffer, 0);
+  desc.Check();
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first READ";
+  ASSERT_EQ(std::memcmp(buffer,
+                "abc\x80\xff"
+                "de     ",
+                12),
+      0);
+  // CLOSE(UNIT=unit,STATUS='KEEP')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "KEEP", 4)) << "SetStatus(KEEP)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first CLOSE";
+  // OPEN(FILE="utf8test",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='OLD') -- note: no ENCODING='UTF-8' this time
+  io = IONAME(BeginOpenNewUnit)(__FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "utf8test", 8)) << "SetFile(utf8test)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "OLD", 3)) << "SetStatus(OLD)";
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second OPEN";
+  // READ(unit, *) buffer -- expects the file's UTF-8 bytes, undecoded
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second READ";
+  ASSERT_EQ(std::memcmp(buffer,
+                "abc\xc2\x80\xc3\xbf"
+                "de   ",
+                12),
+      0);
+  // CLOSE(UNIT=unit,STATUS='DELETE')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "DELETE", 6)) << "SetStatus(DELETE)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second CLOSE";
+}
+
+TEST(ExternalIOTests, TestUCS) {
+  // OPEN(FILE="ucstest",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='REPLACE',ENCODING='UTF-8')
+  auto *io{IONAME(BeginOpenNewUnit)(__FILE__, __LINE__)};
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "ucstest", 7)) << "SetFile(ucstest)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "REPLACE", 7)) << "SetStatus(REPLACE)";
+  ASSERT_TRUE(IONAME(SetEncoding)(io, "UTF-8", 5)) << "SetEncoding(UTF-8)";
+  int unit{-1};
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first OPEN";
+  char32_t wbuffer[8]{U"abc\u0080\uffff"
+                      "de"};
+  // WRITE(unit, *) wbuffer
+  io = IONAME(BeginExternalListOutput)(unit, __FILE__, __LINE__);
+  StaticDescriptor<0> staticDescriptor;
+  Descriptor &desc{staticDescriptor.descriptor()};
+  desc.Establish(TypeCode{CFI_type_char32_t}, sizeof wbuffer - sizeof(char32_t),
+      wbuffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for WRITE";
+  // REWIND(unit)
+  io = IONAME(BeginRewind)(unit, __FILE__, __LINE__);
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement for REWIND";
+  // READ(unit, *) wbuffer
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  desc.Establish(TypeCode{CFI_type_char32_t}, sizeof wbuffer, wbuffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first READ";
+  char dump[80];
+  dump[0] = '\0';
+  for (int j{0}; j < 8; ++j) {
+    std::size_t dumpLen{std::strlen(dump)};
+    std::snprintf(
+        dump + dumpLen, sizeof dump - dumpLen, " %x", (unsigned)wbuffer[j]);
+  }
+  EXPECT_EQ(wbuffer[0], U'a') << dump;
+  EXPECT_EQ(wbuffer[1], U'b') << dump;
+  EXPECT_EQ(wbuffer[2], U'c') << dump;
+  EXPECT_EQ(wbuffer[3], U'\u0080') << dump;
+  EXPECT_EQ(wbuffer[4], U'\uffff') << dump;
+  EXPECT_EQ(wbuffer[5], U'd') << dump;
+  EXPECT_EQ(wbuffer[6], U'e') << dump;
+  EXPECT_EQ(wbuffer[7], U' ') << dump;
+  // CLOSE(UNIT=unit,STATUS='KEEP')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "KEEP", 4)) << "SetStatus(KEEP)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for first CLOSE";
+  // OPEN(FILE="ucstest",NEWUNIT=unit,ACCESS='SEQUENTIAL',ACTION='READWRITE',&
+  //   FORM='FORMATTED',STATUS='OLD') -- note: no ENCODING='UTF-8' this time
+  io = IONAME(BeginOpenNewUnit)(__FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetAccess)(io, "SEQUENTIAL", 10))
+      << "SetAccess(SEQUENTIAL)";
+  ASSERT_TRUE(IONAME(SetAction)(io, "READWRITE", 9)) << "SetAction(READWRITE)";
+  ASSERT_TRUE(IONAME(SetFile)(io, "ucstest", 7)) << "SetFile(ucstest)";
+  ASSERT_TRUE(IONAME(SetForm)(io, "FORMATTED", 9)) << "SetForm(FORMATTED)";
+  ASSERT_TRUE(IONAME(SetStatus)(io, "OLD", 3)) << "SetStatus(OLD)";
+  ASSERT_TRUE(IONAME(GetNewUnit)(io, unit)) << "GetNewUnit()";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second OPEN";
+  char buffer[12];
+  // READ(unit, *) buffer -- expects the file's UTF-8 bytes, undecoded
+  io = IONAME(BeginExternalListInput)(unit, __FILE__, __LINE__);
+  desc.Establish(TypeCode{CFI_type_char}, sizeof buffer, buffer, 0);
+  desc.Check();
+  ASSERT_TRUE(IONAME(InputDescriptor)(io, desc));
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second READ";
+  dump[0] = '\0';
+  for (int j{0}; j < 12; ++j) {
+    std::size_t dumpLen{std::strlen(dump)};
+    std::snprintf(dump + dumpLen, sizeof dump - dumpLen, " %x",
+        (unsigned)(unsigned char)buffer[j]);
+  }
+  EXPECT_EQ(std::memcmp(buffer,
+                "abc\xc2\x80\xef\xbf\xbf"
+                "de  ",
+                12),
+      0)
+      << dump;
+  // CLOSE(UNIT=unit,STATUS='DELETE')
+  io = IONAME(BeginClose)(unit, __FILE__, __LINE__);
+  ASSERT_TRUE(IONAME(SetStatus)(io, "DELETE", 6)) << "SetStatus(DELETE)";
+  ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
+      << "EndIoStatement() for second CLOSE";
  }
author	Peter Klausler <pklausler@nvidia.com>
	Wed, 16 Mar 2022 19:32:03 +0000 (12:32 -0700)
committer	Peter Klausler <pklausler@nvidia.com>
	Tue, 22 Mar 2022 18:48:14 +0000 (11:48 -0700)
flang/include/flang/Runtime/iostat.h		patch \| blob \| history
flang/runtime/CMakeLists.txt		patch \| blob \| history
flang/runtime/descriptor-io.h		patch \| blob \| history
flang/runtime/edit-input.cpp		patch \| blob \| history
flang/runtime/edit-input.h		patch \| blob \| history
flang/runtime/edit-output.cpp		patch \| blob \| history
flang/runtime/edit-output.h		patch \| blob \| history
flang/runtime/environment.cpp		patch \| blob \| history
flang/runtime/environment.h		patch \| blob \| history
flang/runtime/internal-unit.cpp		patch \| blob \| history
flang/runtime/internal-unit.h		patch \| blob \| history
flang/runtime/io-stmt.cpp		patch \| blob \| history
flang/runtime/io-stmt.h		patch \| blob \| history
flang/runtime/iostat.cpp		patch \| blob \| history
flang/runtime/namelist.cpp		patch \| blob \| history
flang/runtime/unit.cpp		patch \| blob \| history
flang/runtime/unit.h		patch \| blob \| history
flang/runtime/utf.cpp	[new file with mode: 0644]	patch \| blob
flang/runtime/utf.h	[new file with mode: 0644]	patch \| blob
flang/unittests/Runtime/ExternalIOTest.cpp		patch \| blob \| history