[flang] Tests pass

author peter klausler <pklausler@nvidia.com>

Tue, 11 Jun 2019 17:34:58 +0000 (10:34 -0700)

committer peter klausler <pklausler@nvidia.com>

Mon, 17 Jun 2019 23:13:07 +0000 (16:13 -0700)
author peter klausler <pklausler@nvidia.com>
Tue, 11 Jun 2019 17:34:58 +0000 (10:34 -0700)
committer peter klausler <pklausler@nvidia.com>
Mon, 17 Jun 2019 23:13:07 +0000 (16:13 -0700)
diff --git a/flang/documentation/Parsing.md b/flang/documentation/Parsing.md

index 716a1e8..9a0d1aa 100644 (file)
--- a/flang/documentation/Parsing.md
+++ b/flang/documentation/Parsing.md
@@ -1,5 +1,5 @@
  <!--
-Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
  -->
  
  The F18 Parser
@@ -58,6 +58,7 @@ by a CookedSource class instance, in which:
  * except for the payload in character literals, Hollerith constants,
    and character and Hollerith edit descriptors, all letters have been
    normalized to lower case
+* all non-ASCII characters have been re-encoded in UTF-8.
  
  Lines in the cooked character stream can be of arbitrary length.
  
diff --git a/flang/lib/parser/characters.cc b/flang/lib/parser/characters.cc

index e92c1d1..3345fa1 100644 (file)
--- a/flang/lib/parser/characters.cc
+++ b/flang/lib/parser/characters.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
@@ -13,27 +13,28 @@
  // limitations under the License.
  
  #include "characters.h"
+#include "../common/idioms.h"
  #include <cstddef>
  #include <optional>
  
  namespace Fortran::parser {
  
-std::optional<int> UTF8CharacterBytes(const char *p) {
+std::optional<int> UTF_8CharacterBytes(const char *p) {
    if ((*p & 0x80) == 0) {
-    return {1};
+    return 1;
    }
    if ((*p & 0xf8) == 0xf0) {
      if ((*p & 0x07) != 0 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80 &&
          (p[3] & 0xc0) == 0x80) {
-      return {4};
+      return 4;
      }
    } else if ((*p & 0xf0) == 0xe0) {
      if ((*p & 0x0f) != 0 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80) {
-      return {3};
+      return 3;
      }
    } else if ((*p & 0xe0) == 0xc0) {
      if ((*p & 0x1f) != 0 && (p[1] & 0xc0) == 0x80) {
-      return {2};
+      return 2;
      }
    }
    return std::nullopt;
@@ -42,49 +43,68 @@ std::optional<int> UTF8CharacterBytes(const char *p) {
  std::optional<int> EUC_JPCharacterBytes(const char *p) {
    int b1 = *p & 0xff;
    if (b1 <= 0x7f) {
-    return {1};
+    return 1;
    }
    if (b1 >= 0xa1 && b1 <= 0xfe) {
      int b2 = p[1] & 0xff;
      if (b2 >= 0xa1 && b2 <= 0xfe) {
        // JIS X 0208 (code set 1)
-      return {2};
+      return 2;
      }
    } else if (b1 == 0x8e) {
      int b2 = p[1] & 0xff;
      if (b2 >= 0xa1 && b2 <= 0xdf) {
        // upper half JIS 0201 (half-width kana, code set 2)
-      return {2};
+      return 2;
      }
    } else if (b1 == 0x8f) {
      int b2 = p[1] & 0xff;
      int b3 = p[2] & 0xff;
      if (b2 >= 0xa1 && b2 <= 0xfe && b3 >= 0xa1 && b3 <= 0xfe) {
        // JIS X 0212 (code set 3)
-      return {3};
+      return 3;
      }
    }
    return std::nullopt;
  }
  
-std::optional<std::size_t> CountCharacters(
-    const char *p, std::size_t bytes, std::optional<int> (*cbf)(const char *)) {
+static std::optional<int> One(const char *) { return 1; }  // byte counter that reports one byte for every character
+
+// Picks the routine that measures the byte length of the character at a
+// given position for the requested encoding.  Encodings other than UTF_8
+// and EUC_JP are measured at one byte per character.
+static std::optional<int> (*CharacterCounter(Encoding encoding))(const char *) {
+  if (encoding == Encoding::UTF_8) {
+    return UTF_8CharacterBytes;
+  }
+  if (encoding == Encoding::EUC_JP) {
+    return EUC_JPCharacterBytes;
+  }
+  return One;
+}
+
+std::optional<int> CharacterBytes(const char *p, Encoding encoding) {  // byte length of the character at p, or nullopt if the counter rejects it
+  return CharacterCounter(encoding)(p);
+}
+
+std::optional<int> CountCharacters(
+    const char *p, std::size_t bytes, Encoding encoding) {
    std::size_t chars{0};
    const char *limit{p + bytes};
+  std::optional<int> (*cbf)(const char *){CharacterCounter(encoding)};
    while (p < limit) {
-    ++chars;
-    std::optional<int> cb{cbf(p)};
-    if (!cb.has_value()) {
+    if (std::optional<int> cb{cbf(p)}) {
+      p += *cb;
+      ++chars;
+    } else {
        return std::nullopt;
      }
-    p += *cb;
    }
-  return {chars};
+  if (p == limit) {
+    return chars;
+  } else {
+    return std::nullopt;
+  }
  }
  
  template<typename STRING>
-std::string QuoteCharacterLiteralHelper(
-    const STRING &str, bool doubleDoubleQuotes, bool doubleBackslash) {
+std::string QuoteCharacterLiteralHelper(const STRING &str,
+    bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
    std::string result{'"'};
    const auto emit{[&](char ch) { result += ch; }};
    for (auto ch : str) {
@@ -92,60 +112,198 @@ std::string QuoteCharacterLiteralHelper(
      if constexpr (std::is_same_v<char, CharT>) {
        // char may be signed depending on host.
        char32_t ch32{static_cast<unsigned char>(ch)};
-      EmitQuotedChar(ch32, emit, emit, doubleDoubleQuotes, doubleBackslash);
+      EmitQuotedChar(
+          ch32, emit, emit, doubleDoubleQuotes, doubleBackslash, encoding);
      } else {
        char32_t ch32{ch};
-      EmitQuotedChar(ch32, emit, emit, doubleDoubleQuotes, doubleBackslash);
+      EmitQuotedChar(
+          ch32, emit, emit, doubleDoubleQuotes, doubleBackslash, encoding);
      }
    }
    result += '"';
    return result;
  }
  
-std::string QuoteCharacterLiteral(
-    const std::string &str, bool doubleDoubleQuotes, bool doubleBackslash) {
-  return QuoteCharacterLiteralHelper(str, doubleDoubleQuotes, doubleBackslash);
+std::string QuoteCharacterLiteral(const std::string &str,
+    bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
+  return QuoteCharacterLiteralHelper(
+      str, doubleDoubleQuotes, doubleBackslash, encoding);
+}
+
+std::string QuoteCharacterLiteral(const std::u16string &str,
+    bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
+  return QuoteCharacterLiteralHelper(
+      str, doubleDoubleQuotes, doubleBackslash, encoding);
+}
+
+std::string QuoteCharacterLiteral(const std::u32string &str,
+    bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
+  return QuoteCharacterLiteralHelper(
+      str, doubleDoubleQuotes, doubleBackslash, encoding);
+}
+
+EncodedCharacter EncodeLATIN_1(char codepoint) {  // Latin-1: the single output byte is the code point itself
+  CHECK(codepoint <= 0xff);  // NOTE(review): always true for an 8-bit char; a real range check needs a wider parameter type
+  EncodedCharacter result;
+  result.buffer[0] = codepoint;
+  result.bytes = 1;
+  return result;
+}
+
+// Produces the UTF-8 byte sequence (1 to 4 bytes) for a code point.
+EncodedCharacter EncodeUTF_8(char32_t codepoint) {
+  // char32_t is unsigned, so only upper bounds need testing.
+  EncodedCharacter encoded;
+  if (codepoint <= 0x7f) {
+    // 7 bits: the byte is the code point itself.
+    encoded.bytes = 1;
+    encoded.buffer[0] = codepoint;
+    return encoded;
+  }
+  if (codepoint <= 0x7ff) {
+    // 11 bits: 110xxxxx 10xxxxxx
+    encoded.bytes = 2;
+    encoded.buffer[0] = 0xc0 | (codepoint >> 6);
+    encoded.buffer[1] = 0x80 | (codepoint & 0x3f);
+    return encoded;
+  }
+  if (codepoint <= 0xffff) {
+    // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
+    encoded.bytes = 3;
+    encoded.buffer[0] = 0xe0 | (codepoint >> 12);
+    encoded.buffer[1] = 0x80 | ((codepoint >> 6) & 0x3f);
+    encoded.buffer[2] = 0x80 | (codepoint & 0x3f);
+    return encoded;
+  }
+  // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.  UCS itself stops at
+  // 0x10ffff, but the four-byte form has room for 21 bits.
+  CHECK(codepoint <= 0x1fffff);
+  encoded.bytes = 4;
+  encoded.buffer[0] = 0xf0 | (codepoint >> 18);
+  encoded.buffer[1] = 0x80 | ((codepoint >> 12) & 0x3f);
+  encoded.buffer[2] = 0x80 | ((codepoint >> 6) & 0x3f);
+  encoded.buffer[3] = 0x80 | (codepoint & 0x3f);
+  return encoded;
+}
+
+// Encodes a JIS code as EUC-JP.  Codes up to 0x7f are emitted as a single
+// ASCII byte; anything larger is treated as a JIS X 0208 row/cell pair and
+// emitted as two bytes, each with its top bit set.
+// TODO: other code sets.  In particular, the 0xa1-0xdf values that
+// DecodeEUC_JPCharacter yields for JIS X 0201 kana are not given their
+// 0x8e prefix here.
+EncodedCharacter EncodeEUC_JP(char16_t codepoint) {
+  // JIS X 0208 rows and cells each run from 0x21 through 0x7e, so the
+  // largest two-byte code is 0x7e7e.
+  CHECK(codepoint <= 0x7e7e);
+  EncodedCharacter result;
+  if (codepoint <= 0x7f) {
+    result.buffer[0] = codepoint;
+    result.bytes = 1;
+  } else {
+    result.buffer[0] = 0x80 | (codepoint >> 8);
+    result.buffer[1] = 0x80 | (codepoint & 0x7f);
+    result.bytes = 2;
+  }
+  return result;
+}
+
+EncodedCharacter EncodeCharacter(Encoding encoding, char32_t codepoint) {  // encodes one code point in the requested encoding
+  switch (encoding) {
+  case Encoding::LATIN_1: return EncodeLATIN_1(codepoint);  // argument is implicitly narrowed to char
+  case Encoding::UTF_8: return EncodeUTF_8(codepoint);
+  case Encoding::EUC_JP: return EncodeEUC_JP(codepoint);  // argument is implicitly narrowed to char16_t
+  default: CRASH_NO_CASE;
+  }
+}
+
+// Decodes the UTF-8 character that begins at cp, reading at most `bytes`
+// bytes.  On success, returns the code point together with the number of
+// bytes it occupies (1 to 4).  On failure -- no input, a stray continuation
+// byte, a truncated or malformed sequence, or an overlong encoding --
+// returns a default DecodedCharacter, whose zero byte count marks the error.
+DecodedCharacter DecodeUTF_8Character(const char *cp, std::size_t bytes) {
+  if (bytes == 0) {
+    return {};  // nothing to read
+  }
+  auto p{reinterpret_cast<const std::uint8_t *>(cp)};
+  // Every trailing byte is tested on its own: OR-ing them together first
+  // would let a non-continuation byte hide behind a valid one.
+  const auto isTrail{[](std::uint8_t b) { return (b & 0xc0) == 0x80; }};
+  char32_t ch{*p};
+  int charBytes{1};
+  if (ch >= 0x80) {
+    char32_t least{0};  // smallest code point that needs charBytes bytes
+    if ((ch & 0xf8) == 0xf0 && bytes >= 4 && isTrail(p[1]) && isTrail(p[2]) &&
+        isTrail(p[3])) {
+      charBytes = 4;
+      least = 0x10000;
+      ch = ((ch & 7) << 6) | (p[1] & 0x3f);
+      ch = (ch << 6) | (p[2] & 0x3f);
+      ch = (ch << 6) | (p[3] & 0x3f);
+    } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && isTrail(p[1]) &&
+        isTrail(p[2])) {
+      charBytes = 3;
+      least = 0x800;
+      ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
+      ch = (ch << 6) | (p[2] & 0x3f);
+    } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && isTrail(p[1])) {
+      charBytes = 2;
+      least = 0x80;
+      ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
+    } else {
+      return {};  // not valid UTF-8
+    }
+    // Lead bytes such as 0xe0 and 0xf0 are legitimate, so overlong forms
+    // are detected from the decoded value rather than from the lead byte.
+    if (ch < least) {
+      return {};  // overlong encoding
+    }
+  }
+  return {ch, charBytes};
  }
  
-std::string QuoteCharacterLiteral(
-    const std::u16string &str, bool doubleDoubleQuotes, bool doubleBackslash) {
-  return QuoteCharacterLiteralHelper(str, doubleDoubleQuotes, doubleBackslash);
+// Decodes the EUC-JP character that begins at cp, reading at most `bytes`
+// bytes.  On success, returns the decoded code and its byte count:
+//   1 byte   0x00-0x7f, returned unchanged
+//   2 bytes  0x8e + kana byte (JIS X 0201), returned as the kana byte
+//   3 bytes  0x8f + two bytes in 0xa1-0xfe (JIS X 0212)
+//   2 bytes  two bytes in 0xa1-0xfe (JIS X 0208)
+// For the last two forms the result is (row << 8) | cell, with the top bit
+// of each byte removed.  On failure, returns a default DecodedCharacter,
+// whose zero byte count marks the error.
+DecodedCharacter DecodeEUC_JPCharacter(const char *cp, std::size_t bytes) {
+  if (bytes == 0) {
+    return {};  // nothing to read
+  }
+  auto p{reinterpret_cast<const std::uint8_t *>(cp)};
+  char32_t ch{*p};
+  int charBytes{1};
+  if (ch >= 0x80) {
+    if (bytes >= 2 && ch == 0x8e && p[1] >= 0xa1 && p[1] <= 0xdf) {
+      charBytes = 2;  // JIS X 0201
+      ch = p[1];
+    } else if (bytes >= 3 && ch == 0x8f && p[1] >= 0xa1 && p[1] <= 0xfe &&
+        p[2] >= 0xa1 && p[2] <= 0xfe) {
+      charBytes = 3;  // JIS X 0212
+      // Row comes from the second byte, cell from the third.
+      ch = ((p[1] & 0x7f) << 8) | (p[2] & 0x7f);
+    } else if (bytes >= 2 && ch >= 0xa1 && ch <= 0xfe && p[1] >= 0xa1 &&
+        p[1] <= 0xfe) {
+      charBytes = 2;  // JIS X 0208
+      ch = ((ch & 0x7f) << 8) | (p[1] & 0x7f);
+    } else {
+      return {};
+    }
+  }
+  return {ch, charBytes};
  }
  
-std::string QuoteCharacterLiteral(
-    const std::u32string &str, bool doubleDoubleQuotes, bool doubleBackslash) {
-  return QuoteCharacterLiteralHelper(str, doubleDoubleQuotes, doubleBackslash);
+DecodedCharacter DecodeLATIN1Character(const char *cp) {  // one byte, read as an unsigned value, is one character
+  return {*reinterpret_cast<const std::uint8_t *>(cp), 1};
  }
  
-std::optional<std::u32string> DecodeUTF8(const std::string &s) {
+DecodedCharacter DecodeCharacter(  // decodes the character at cp according to the given encoding
+    Encoding encoding, const char *cp, std::size_t bytes) {
+  switch (encoding) {
+  case Encoding::LATIN_1: return DecodeLATIN1Character(cp);  // bytes is not passed along here
+  case Encoding::UTF_8: return DecodeUTF_8Character(cp, bytes);
+  case Encoding::EUC_JP: return DecodeEUC_JPCharacter(cp, bytes);
+  default: CRASH_NO_CASE;
+  }
+}
+
+std::u32string DecodeUTF_8(const std::string &s) {
    std::u32string result;
-  const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(s.data())};
+  const char *p{s.c_str()};
    for (auto bytes{s.size()}; bytes != 0;) {
-    decltype(bytes) charBytes{1};
-    char32_t ch{*p++};
-    if ((ch & 0xc0) > 0x40) {
-      if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
-          ((p[0] | p[1] | p[2]) & 0xc0) == 0x80) {
-        charBytes = 4;
-        ch = ((ch & 7) << 6) | (*p++ & 0x3f);
-        ch = (ch << 6) | (*p++ & 0x3f);
-        ch = (ch << 6) | (*p++ & 0x3f);
-      } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
-          ((p[0] | p[1]) & 0xc0) == 0x80) {
-        charBytes = 3;
-        ch = ((ch & 0xf) << 6) | (*p++ & 0x3f);
-        ch = (ch << 6) | (*p++ & 0x3f);
-      } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
-          (*p & 0xc0) == 0x80) {
-        charBytes = 2;
-        ch = ((ch & 0x1f) << 6) | (*p++ & 0x3f);
-      } else {
-        return std::nullopt;  // not valid UTF-8
+    DecodedCharacter decoded{DecodeUTF_8Character(p, bytes)};
+    if (decoded.bytes > 0) {
+      if (static_cast<std::size_t>(decoded.bytes) <= bytes) {
+        result.append(1, decoded.unicode);
+        bytes -= decoded.bytes;
+        p += decoded.bytes;
+        continue;
        }
      }
-    result.append(1, ch);
-    bytes -= charBytes;
+    result.append(1, static_cast<uint8_t>(*p));
+    ++p;
+    --bytes;
    }
-  return {result};
+  return result;
  }
+
+// Converts an EUC-JP byte string into a string of 16-bit codes, one per
+// character.  A byte that does not begin a decodable character is copied
+// through on its own as an unsigned value.
+std::u16string DecodeEUC_JP(const std::string &s) {
+  std::u16string result;
+  const char *p{s.c_str()};
+  auto remaining{s.size()};
+  while (remaining != 0) {
+    const DecodedCharacter decoded{DecodeEUC_JPCharacter(p, remaining)};
+    const bool usable{decoded.bytes > 0 &&
+        static_cast<std::size_t>(decoded.bytes) <= remaining};
+    if (usable) {
+      result.append(1, decoded.unicode);
+      remaining -= decoded.bytes;
+      p += decoded.bytes;
+    } else {
+      // Undecodable: pass the single byte through and resynchronize.
+      result.append(1, static_cast<uint8_t>(*p));
+      ++p;
+      --remaining;
+    }
+  }
+  return result;
+}
+
  }
diff --git a/flang/lib/parser/characters.h b/flang/lib/parser/characters.h

index a54df93..47c3cc9 100644 (file)
--- a/flang/lib/parser/characters.h
+++ b/flang/lib/parser/characters.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
@@ -26,7 +26,13 @@
  
  namespace Fortran::parser {
  
-enum class Encoding { UTF8, EUC_JP };
+// We can easily support Fortran program source in any character
+// set whose first 128 code points correspond to ASCII codes 0-127 (ISO/IEC646).
+// The specific encodings that we can handle include:
+//   LATIN_1: ISO 8859-1 Latin-1
+//   UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646)
+//   EUC_JP: 1-3 byte encoding of JIS X 0208 / 0212
+enum class Encoding { LATIN_1, UTF_8, EUC_JP };
  
  inline constexpr bool IsUpperCaseLetter(char ch) {
    return ch >= 'A' && ch <= 'Z';
@@ -133,9 +139,20 @@ inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
    }
  }
  
+struct EncodedCharacter {
+  char buffer[4];  // encoded bytes; has no initializer, so only the first `bytes` entries are meaningful
+  int bytes{0};  // number of entries of buffer that are in use
+};
+
+EncodedCharacter EncodeLATIN_1(char);
+EncodedCharacter EncodeUTF_8(char32_t);
+EncodedCharacter EncodeEUC_JP(char16_t);
+EncodedCharacter EncodeCharacter(Encoding, char32_t);
+
  template<typename NORMAL, typename INSERTED>
  void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
-    bool doubleDoubleQuotes = true, bool doubleBackslash = true) {
+    bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+    Encoding encoding = Encoding::UTF_8) {
    if (ch == '"') {
      if (doubleDoubleQuotes) {
        insert('"');
@@ -146,7 +163,7 @@ void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
        insert('\\');
      }
      emit('\\');
-  } else if (ch < ' ' || (ch >= 0x80 && ch <= 0xff)) {
+  } else if (ch < ' ' || (encoding == Encoding::LATIN_1 && ch >= 0x7f)) {
      insert('\\');
      if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
        emit(*escape);
@@ -156,34 +173,40 @@ void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
        insert('0' + ((ch >> 3) & 7));
        insert('0' + (ch & 7));
      }
-  } else if (ch <= 0x7f) {
-    emit(ch);
-  } else if (ch <= 0x7ff) {
-    emit(0xc0 | ((ch >> 6) & 0x1f));
-    emit(0x80 | (ch & 0x3f));
-  } else if (ch <= 0xffff) {
-    emit(0xe0 | ((ch >> 12) & 0x0f));
-    emit(0x80 | ((ch >> 6) & 0x3f));
-    emit(0x80 | (ch & 0x3f));
    } else {
-    emit(0xf0 | ((ch >> 18) & 0x07));
-    emit(0x80 | ((ch >> 12) & 0x3f));
-    emit(0x80 | ((ch >> 6) & 0x3f));
-    emit(0x80 | (ch & 0x3f));
+    EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
+    for (int j{0}; j < encoded.bytes; ++j) {
+      emit(encoded.buffer[j]);
+    }
    }
  }
  
  std::string QuoteCharacterLiteral(const std::string &,
-    bool doubleDoubleQuotes = true, bool doubleBackslash = true);
+    bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+    Encoding = Encoding::LATIN_1);
  std::string QuoteCharacterLiteral(const std::u16string &,
-    bool doubleDoubleQuotes = true, bool doubleBackslash = true);
+    bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+    Encoding = Encoding::EUC_JP);
  std::string QuoteCharacterLiteral(const std::u32string &,
-    bool doubleDoubleQuotes = true, bool doubleBackslash = true);
+    bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+    Encoding = Encoding::UTF_8);
  
-std::optional<int> UTF8CharacterBytes(const char *);
+std::optional<int> UTF_8CharacterBytes(const char *);
  std::optional<int> EUC_JPCharacterBytes(const char *);
-std::optional<std::size_t> CountCharacters(
-    const char *, std::size_t bytes, std::optional<int> (*)(const char *));
-std::optional<std::u32string> DecodeUTF8(const std::string &);
+std::optional<int> CharacterBytes(const char *, Encoding);
+std::optional<int> CountCharacters(const char *, std::size_t bytes, Encoding);
+
+struct DecodedCharacter {
+  char32_t unicode{0};  // decoded code (a JIS code rather than Unicode when produced by the EUC_JP decoder)
+  int bytes{0};  // bytes consumed by the character; the default of zero signifies failure
+};
+
+DecodedCharacter DecodeUTF_8Character(const char *, std::size_t);
+DecodedCharacter DecodeEUC_JPCharacter(const char *, std::size_t);
+DecodedCharacter DecodeLATIN1Character(const char *);
+DecodedCharacter DecodeCharacter(Encoding, const char *, std::size_t);
+
+std::u32string DecodeUTF_8(const std::string &);
+std::u16string DecodeEUC_JP(const std::string &);
  }
  #endif  // FORTRAN_PARSER_CHARACTERS_H_
diff --git a/flang/lib/parser/parse-state.h b/flang/lib/parser/parse-state.h

index 65ec9ec..973b200 100644 (file)
--- a/flang/lib/parser/parse-state.h
+++ b/flang/lib/parser/parse-state.h
@@ -185,17 +185,19 @@ public:
    }
  
    std::optional<const char *> GetNextChar() {
-    if (p_ >= limit_) {
+    if (p_ < limit_) {
+      return UncheckedAdvance();
+    } else {
        return std::nullopt;
      }
-    return {UncheckedAdvance()};
    }
  
    std::optional<const char *> PeekAtNextChar() const {
-    if (p_ >= limit_) {
+    if (p_ < limit_) {
+      return p_;
+    } else {
        return std::nullopt;
      }
-    return {p_};
    }
  
    std::size_t BytesRemaining() const {
@@ -229,7 +231,7 @@ private:
    UserState *userState_{nullptr};
  
    bool inFixedForm_{false};
-  Encoding encoding_{Encoding::UTF8};
+  Encoding encoding_{Encoding::UTF_8};
    bool anyErrorRecovery_{false};
    bool anyConformanceViolation_{false};
    bool deferMessages_{false};
diff --git a/flang/lib/parser/parsing.h b/flang/lib/parser/parsing.h

index 3bb77ac..be470eb 100644 (file)
--- a/flang/lib/parser/parsing.h
+++ b/flang/lib/parser/parsing.h
@@ -37,7 +37,7 @@ struct Options {
    bool isFixedForm{false};
    int fixedFormColumns{72};
    LanguageFeatureControl features;
-  Encoding encoding{Encoding::UTF8};
+  Encoding encoding{Encoding::UTF_8};
    std::vector<std::string> searchDirectories;
    std::vector<Predefinition> predefinitions;
    bool instrumentedParse{false};
diff --git a/flang/lib/parser/prescan.cc b/flang/lib/parser/prescan.cc

index b7ac18e..7a6ffe4 100644 (file)
--- a/flang/lib/parser/prescan.cc
+++ b/flang/lib/parser/prescan.cc
@@ -550,7 +550,14 @@ void Prescanner::QuotedCharacterLiteral(
    bool escape{false};
    bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
    while (true) {
-    char32_t ch{static_cast<unsigned char>(*at_)};
+    DecodedCharacter decoded{DecodeCharacter(
+        encoding_, at_, static_cast<std::size_t>(limit_ - at_))};
+    if (decoded.bytes <= 0) {
+      Say(GetProvenanceRange(start, end),
+          "Bad character in character literal"_err_en_US);
+      break;
+    }
+    char32_t ch{decoded.unicode};
      escape = !escape && ch == '\\' && escapesEnabled;
      EmitQuotedChar(ch, emit, insert, false, !escapesEnabled);
      while (PadOutCharacterLiteral(tokens)) {
@@ -562,6 +569,7 @@ void Prescanner::QuotedCharacterLiteral(
        }
        break;
      }
+    at_ += decoded.bytes - 1;
      end = at_ + 1;
      NextChar();
      if (*at_ == quote && !escape) {
@@ -592,24 +600,24 @@ void Prescanner::Hollerith(
      if (PadOutCharacterLiteral(tokens)) {
      } else if (*at_ == '\n') {
        Say(GetProvenanceRange(start, at_),
-          "possible truncated Hollerith literal"_en_US);
+          "Possible truncated Hollerith literal"_en_US);
        break;
      } else {
        NextChar();
-      EmitChar(tokens, *at_);
-      // Multi-byte character encodings should count as single characters.
-      int bytes{1};
-      if (encoding_ == Encoding::EUC_JP) {
-        if (std::optional<int> chBytes{EUC_JPCharacterBytes(at_)}) {
-          bytes = *chBytes;
-        }
-      } else if (encoding_ == Encoding::UTF8) {
-        if (std::optional<int> chBytes{UTF8CharacterBytes(at_)}) {
-          bytes = *chBytes;
+      // Multi-byte character encodings each count as single characters.
+      DecodedCharacter decoded{DecodeCharacter(
+          encoding_, at_, static_cast<std::size_t>(limit_ - at_))};
+      if (decoded.bytes > 0) {
+        // The cooked character stream we emit is always in UTF-8.
+        EncodedCharacter utf8{EncodeUTF_8(decoded.unicode)};
+        for (int j{0}; j < utf8.bytes; ++j) {
+          EmitChar(tokens, utf8.buffer[j]);
          }
-      }
-      while (bytes-- > 1) {
-        EmitChar(tokens, *++at_);
+        at_ += decoded.bytes;
+      } else {
+        Say(GetProvenanceRange(start, at_),
+            "Bad character in Hollerith literal"_err_en_US);
+        break;
        }
      }
    }
diff --git a/flang/lib/parser/prescan.h b/flang/lib/parser/prescan.h

index abc1ed9..0e492f7 100644 (file)
--- a/flang/lib/parser/prescan.h
+++ b/flang/lib/parser/prescan.h
@@ -186,7 +186,7 @@ private:
    LanguageFeatureControl features_;
    bool inFixedForm_{false};
    int fixedFormColumnLimit_{72};
-  Encoding encoding_{Encoding::UTF8};
+  Encoding encoding_{Encoding::UTF_8};
    int delimiterNesting_{0};
    int prescannerNesting_{0};
  
diff --git a/flang/lib/parser/token-parsers.h b/flang/lib/parser/token-parsers.h

index 46cdcdb..3de491d 100644 (file)
--- a/flang/lib/parser/token-parsers.h
+++ b/flang/lib/parser/token-parsers.h
@@ -228,7 +228,7 @@ struct CharLiteralChar {
        return std::nullopt;
      }
      if (ch != '\\') {
-      return {Result::Bare(ch)};
+      return Result::Bare(ch);
      }
      if (!(och = nextCh.Parse(state)).has_value()) {
        return std::nullopt;
@@ -240,7 +240,7 @@ struct CharLiteralChar {
        return std::nullopt;
      }
      if (std::optional<char> escChar{BackslashEscapeValue(ch)}) {
-      return {Result::Escaped(*escChar)};
+      return Result::Escaped(*escChar);
      }
      if (IsOctalDigit(ch)) {
        ch -= '0';
@@ -283,7 +283,7 @@ template<char quote> struct CharLiteral {
        if (ch->ch == quote && !ch->wasEscaped) {
          static constexpr auto doubled{attempt(AnyOfChars{SetOfChars{quote}})};
          if (!doubled.Parse(state).has_value()) {
-          return {str};
+          return str;
          }
        }
        str += ch->ch;
@@ -543,39 +543,26 @@ struct HollerithLiteral {
      }
      std::string content;
      for (auto j{*charCount}; j-- > 0;) {
-      int bytes{1};
-      const char *p{state.GetLocation()};
-      if (state.encoding() == Encoding::EUC_JP) {
-        std::optional<int> chBytes{EUC_JPCharacterBytes(p)};
-        if (!chBytes.has_value()) {
-          state.Say(start, "bad EUC_JP characters in Hollerith"_err_en_US);
-          return std::nullopt;
-        }
-        bytes = *chBytes;
-      } else if (state.encoding() == Encoding::UTF8) {
-        std::optional<int> chBytes{UTF8CharacterBytes(p)};
-        if (!chBytes.has_value()) {
-          state.Say(start, "bad UTF-8 characters in Hollerith"_err_en_US);
-          return std::nullopt;
-        }
-        bytes = *chBytes;
-      }
-      if (bytes == 1) {
-        std::optional<const char *> at{nextCh.Parse(state)};
-        if (!at.has_value() || !isprint(**at)) {
-          state.Say(
-              start, "insufficient or bad characters in Hollerith"_err_en_US);
-          return std::nullopt;
+      if (std::optional<int> chBytes{
+              CharacterBytes(state.GetLocation(), state.encoding())}) {
+        for (int bytes{*chBytes}; bytes > 0; --bytes) {
+          if (std::optional<const char *> at{nextCh.Parse(state)}) {
+            if (*chBytes == 1 && !isprint(**at)) {
+              state.Say(start, "Bad character in Hollerith"_err_en_US);
+              return std::nullopt;
+            }
+            content += **at;
+          } else {
+            state.Say(start, "Insufficient characters in Hollerith"_err_en_US);
+            return std::nullopt;
+          }
          }
-        content += **at;
        } else {
-        // Multi-byte character
-        while (bytes-- > 0) {
-          content += *nextCh.Parse(state).value();
-        }
+        state.Say(start, "Bad multi-byte character in Hollerith"_err_en_US);
+        return std::nullopt;
        }
      }
-    return {content};
+    return content;
    }
  };
  
diff --git a/flang/lib/parser/unparse.cc b/flang/lib/parser/unparse.cc

index 4c03a79..9e6f19c 100644 (file)
--- a/flang/lib/parser/unparse.cc
+++ b/flang/lib/parser/unparse.cc
@@ -189,18 +189,21 @@ public:
      if (const auto &k{std::get<std::optional<KindParam>>(x.t)}) {
        if (std::holds_alternative<KindParam::Kanji>(k->u)) {
          Word("NC");
+        Put(QuoteCharacterLiteral(std::get<std::string>(x.t), true,
+            backslashEscapes_, Encoding::EUC_JP));
        } else {
          Walk(*k), Put('_');
+        Put(QuoteCharacterLiteral(
+            std::get<std::string>(x.t), true, backslashEscapes_));
        }
+    } else {
+      Put(QuoteCharacterLiteral(
+          std::get<std::string>(x.t), true, backslashEscapes_));
      }
-    Put(QuoteCharacterLiteral(
-        std::get<std::string>(x.t), true, backslashEscapes_));
    }
    void Before(const HollerithLiteralConstant &x) {
-    std::optional<std::size_t> chars{CountCharacters(x.v.data(), x.v.size(),
-        encoding_ == Encoding::EUC_JP ? EUC_JPCharacterBytes
-                                      : UTF8CharacterBytes)};
-    if (chars.has_value()) {
+    if (std::optional<std::size_t> chars{
+            CountCharacters(x.v.data(), x.v.size(), encoding_)}) {
        Unparse(*chars);
      } else {
        Unparse(x.v.size());
@@ -2575,7 +2578,7 @@ private:
    int column_{1};
    const int maxColumns_{80};
    std::set<CharBlock> structureComponents_;
-  Encoding encoding_{Encoding::UTF8};
+  Encoding encoding_{Encoding::UTF_8};
    bool capitalizeKeywords_{true};
    bool openmpDirective_{false};
    bool backslashEscapes_{false};
diff --git a/flang/lib/parser/unparse.h b/flang/lib/parser/unparse.h

index 6ff8388..070c758 100644 (file)
--- a/flang/lib/parser/unparse.h
+++ b/flang/lib/parser/unparse.h
@@ -39,7 +39,7 @@ using TypedExprAsFortran =
  
  /// Convert parsed program to out as Fortran.
  void Unparse(std::ostream &out, const Program &program,
-    Encoding encoding = Encoding::UTF8, bool capitalizeKeywords = true,
+    Encoding encoding = Encoding::UTF_8, bool capitalizeKeywords = true,
      bool backslashEscapes = true, preStatementType *preStatement = nullptr,
      TypedExprAsFortran *expr = nullptr);
  }
diff --git a/flang/lib/semantics/expression.cc b/flang/lib/semantics/expression.cc

index 2b6dd0b..a3fc057 100644 (file)
--- a/flang/lib/semantics/expression.cc
+++ b/flang/lib/semantics/expression.cc
@@ -519,26 +519,37 @@ MaybeExpr ExpressionAnalyzer::AnalyzeString(std::string &&string, int kind) {
    if (!CheckIntrinsicKind(TypeCategory::Character, kind)) {
      return std::nullopt;
    }
+  std::u32string unicode{parser::DecodeUTF_8(string)};
    if (kind == 1) {
-    return {AsGenericExpr(
-        Constant<Type<TypeCategory::Character, 1>>{std::move(string)})};
-  } else if (std::optional<std::u32string> unicode{
-                 parser::DecodeUTF8(string)}) {
-    if (kind == 4) {
-      return {AsGenericExpr(
-          Constant<Type<TypeCategory::Character, 4>>{std::move(*unicode)})};
+    std::string result;
+    for (const char32_t &ch : unicode) {
+      if (ch <= 0xff) {
+        result += static_cast<char>(ch);
+      } else {
+        // Original literal in UTF-8 source contained a byte sequence
+        // that looked like UTF-8 and got decoded as such.  Reconstruct.
+        parser::EncodedCharacter encoded{parser::EncodeUTF_8(ch)};
+        result += std::string{
+            encoded.buffer, static_cast<std::size_t>(encoded.bytes)};
+      }
      }
-    CHECK(kind == 2);
-    // TODO: better Kanji support
+    return AsGenericExpr(
+        Constant<Type<TypeCategory::Character, 1>>{std::move(result)});
+  } else if (kind == 2) {
      std::u16string result;
-    for (const char32_t &ch : *unicode) {
+    for (const char32_t &ch : unicode) {
+      if (ch > 0xffff) {
+        Say("Bad character in CHARACTER(KIND=2) literal"_err_en_US);
+        return std::nullopt;
+      }
        result += static_cast<char16_t>(ch);
      }
-    return {AsGenericExpr(
-        Constant<Type<TypeCategory::Character, 2>>{std::move(result)})};
+    return AsGenericExpr(
+        Constant<Type<TypeCategory::Character, 2>>{std::move(result)});
    } else {
-    Say("bad UTF-8 encoding of CHARACTER(KIND=%d) literal"_err_en_US, kind);
-    return std::nullopt;
+    CHECK(kind == 4);
+    return AsGenericExpr(
+        Constant<Type<TypeCategory::Character, 4>>{std::move(unicode)});
    }
  }
  
diff --git a/flang/lib/semantics/unparse-with-symbols.h b/flang/lib/semantics/unparse-with-symbols.h

index 4d4d9d4..07729ad 100644 (file)
--- a/flang/lib/semantics/unparse-with-symbols.h
+++ b/flang/lib/semantics/unparse-with-symbols.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ struct Program;
  
  namespace Fortran::semantics {
  void UnparseWithSymbols(std::ostream &, const parser::Program &,
-    parser::Encoding encoding = parser::Encoding::UTF8);
+    parser::Encoding encoding = parser::Encoding::UTF_8);
  }
  
  #endif  // FORTRAN_SEMANTICS_UNPARSE_WITH_SYMBOLS_H_
diff --git a/flang/tools/f18/f18-parse-demo.cc b/flang/tools/f18/f18-parse-demo.cc

index 57e68dd..5030345 100644 (file)
--- a/flang/tools/f18/f18-parse-demo.cc
+++ b/flang/tools/f18/f18-parse-demo.cc
@@ -92,7 +92,7 @@ struct DriverOptions {
    bool forcedForm{false};  // -Mfixed or -Mfree appeared
    bool warnOnNonstandardUsage{false};  // -Mstandard
    bool warningsAreErrors{false};  // -Werror
-  Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF8};
+  Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF_8};
    bool parseOnly{false};
    bool dumpProvenance{false};
    bool dumpCookedChars{false};
diff --git a/flang/tools/f18/f18.cc b/flang/tools/f18/f18.cc

index 5a1d34b..cde2389 100644 (file)
--- a/flang/tools/f18/f18.cc
+++ b/flang/tools/f18/f18.cc
@@ -87,7 +87,7 @@ struct DriverOptions {
    bool forcedForm{false};  // -Mfixed or -Mfree appeared
    bool warnOnNonstandardUsage{false};  // -Mstandard
    bool warningsAreErrors{false};  // -Werror
-  Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF8};
+  Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF_8};
    bool parseOnly{false};
    bool dumpProvenance{false};
    bool dumpCookedChars{false};
@@ -451,6 +451,8 @@ int main(int argc, char *const argv[]) {
      } else if (arg == "-module-suffix") {
        driver.moduleFileSuffix = args.front();
        args.pop_front();
+    } else if (arg == "-fno-utf-8") {
+      options.encoding = Fortran::parser::Encoding::LATIN_1;
      } else if (arg == "-help" || arg == "--help" || arg == "-?") {
        std::cerr
            << "f18 options:\n"
author	peter klausler <pklausler@nvidia.com>
	Tue, 11 Jun 2019 17:34:58 +0000 (10:34 -0700)
committer	peter klausler <pklausler@nvidia.com>
	Mon, 17 Jun 2019 23:13:07 +0000 (16:13 -0700)
flang/documentation/Parsing.md		patch \| blob \| history
flang/lib/parser/characters.cc		patch \| blob \| history
flang/lib/parser/characters.h		patch \| blob \| history
flang/lib/parser/parse-state.h		patch \| blob \| history
flang/lib/parser/parsing.h		patch \| blob \| history
flang/lib/parser/prescan.cc		patch \| blob \| history
flang/lib/parser/prescan.h		patch \| blob \| history
flang/lib/parser/token-parsers.h		patch \| blob \| history
flang/lib/parser/unparse.cc		patch \| blob \| history
flang/lib/parser/unparse.h		patch \| blob \| history
flang/lib/semantics/expression.cc		patch \| blob \| history
flang/lib/semantics/unparse-with-symbols.h		patch \| blob \| history
flang/tools/f18/f18-parse-demo.cc		patch \| blob \| history
flang/tools/f18/f18.cc		patch \| blob \| history