[flang] Character set and encoding changes

author peter klausler <pklausler@nvidia.com>

Wed, 12 Jun 2019 22:26:37 +0000 (15:26 -0700)

committer peter klausler <pklausler@nvidia.com>

Mon, 17 Jun 2019 23:13:07 +0000 (16:13 -0700)
author peter klausler <pklausler@nvidia.com>
Wed, 12 Jun 2019 22:26:37 +0000 (15:26 -0700)
committer peter klausler <pklausler@nvidia.com>
Mon, 17 Jun 2019 23:13:07 +0000 (16:13 -0700)
diff --git a/flang/lib/evaluate/formatting.cc b/flang/lib/evaluate/formatting.cc

index 58db40a..59b0d1c 100644 (file)
--- a/flang/lib/evaluate/formatting.cc
+++ b/flang/lib/evaluate/formatting.cc
@@ -56,7 +56,8 @@ std::ostream &ConstantBase<RESULT, VALUE>::AsFortran(std::ostream &o) const {
          Result::category == TypeCategory::Complex) {
        value.AsFortran(o, Result::kind);
      } else if constexpr (Result::category == TypeCategory::Character) {
-      o << Result::kind << '_' << parser::QuoteCharacterLiteral(value);
+      o << Result::kind << '_'
+        << parser::QuoteCharacterLiteral(value, true, false);
      } else if constexpr (Result::category == TypeCategory::Logical) {
        if (value.IsTrue()) {
          o << ".true.";
@@ -92,7 +93,9 @@ std::ostream &Constant<Type<TypeCategory::Character, KIND>>::AsFortran(
      } else if (Rank() == 0) {
        o << Result::kind << '_';
      }
-    o << parser::QuoteCharacterLiteral(value);
+    o << parser::QuoteCharacterLiteral(value, true /* double quotes */,
+        false /* avoid backslash escapes */,
+        parser::Encoding::UTF_8 /* module files are UTF-8 */);
    }
    if (Rank() > 0) {
      o << ']';
diff --git a/flang/lib/parser/characters.cc b/flang/lib/parser/characters.cc

index 3345fa1..23a422c 100644 (file)
--- a/flang/lib/parser/characters.cc
+++ b/flang/lib/parser/characters.cc
@@ -78,10 +78,6 @@ static std::optional<int> (*CharacterCounter(Encoding encoding))(const char *) {
    }
  }
  
-std::optional<int> CharacterBytes(const char *p, Encoding encoding) {
-  return CharacterCounter(encoding)(p);
-}
-
  std::optional<int> CountCharacters(
      const char *p, std::size_t bytes, Encoding encoding) {
    std::size_t chars{0};
@@ -104,7 +100,7 @@ std::optional<int> CountCharacters(
  
  template<typename STRING>
  std::string QuoteCharacterLiteralHelper(const STRING &str,
-    bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
+    bool doubleDoubleQuotes, bool backslashEscapes, Encoding encoding) {
    std::string result{'"'};
    const auto emit{[&](char ch) { result += ch; }};
    for (auto ch : str) {
@@ -113,11 +109,11 @@ std::string QuoteCharacterLiteralHelper(const STRING &str,
        // char may be signed depending on host.
        char32_t ch32{static_cast<unsigned char>(ch)};
        EmitQuotedChar(
-          ch32, emit, emit, doubleDoubleQuotes, doubleBackslash, encoding);
+          ch32, emit, emit, doubleDoubleQuotes, backslashEscapes, encoding);
      } else {
        char32_t ch32{ch};
        EmitQuotedChar(
-          ch32, emit, emit, doubleDoubleQuotes, doubleBackslash, encoding);
+          ch32, emit, emit, doubleDoubleQuotes, backslashEscapes, encoding);
      }
    }
    result += '"';
@@ -125,24 +121,24 @@ std::string QuoteCharacterLiteralHelper(const STRING &str,
  }
  
  std::string QuoteCharacterLiteral(const std::string &str,
-    bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
+    bool doubleDoubleQuotes, bool backslashEscapes, Encoding encoding) {
    return QuoteCharacterLiteralHelper(
-      str, doubleDoubleQuotes, doubleBackslash, encoding);
+      str, doubleDoubleQuotes, backslashEscapes, encoding);  // std::string overload: arguments forwarded unchanged
  }
  
  std::string QuoteCharacterLiteral(const std::u16string &str,
-    bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
+    bool doubleDoubleQuotes, bool backslashEscapes, Encoding encoding) {
    return QuoteCharacterLiteralHelper(
-      str, doubleDoubleQuotes, doubleBackslash, encoding);
+      str, doubleDoubleQuotes, backslashEscapes, encoding);  // std::u16string overload: same forwarding
  }
  
  std::string QuoteCharacterLiteral(const std::u32string &str,
-    bool doubleDoubleQuotes, bool doubleBackslash, Encoding encoding) {
+    bool doubleDoubleQuotes, bool backslashEscapes, Encoding encoding) {
    return QuoteCharacterLiteralHelper(
-      str, doubleDoubleQuotes, doubleBackslash, encoding);
+      str, doubleDoubleQuotes, backslashEscapes, encoding);  // std::u32string overload: same forwarding
  }
  
-EncodedCharacter EncodeLATIN_1(char codepoint) {
+EncodedCharacter EncodeLATIN_1(char32_t codepoint) {
    CHECK(codepoint <= 0xff);
    EncodedCharacter result;
    result.buffer[0] = codepoint;
@@ -178,7 +174,7 @@ EncodedCharacter EncodeUTF_8(char32_t codepoint) {
    return result;
  }
  
-EncodedCharacter EncodeEUC_JP(char16_t codepoint) {
+EncodedCharacter EncodeEUC_JP(char32_t codepoint) {
    // Assume JIS X 0208 (TODO: others)
    CHECK(codepoint <= 0x6e6e);
    EncodedCharacter result;
@@ -205,64 +201,111 @@ EncodedCharacter EncodeCharacter(Encoding encoding, char32_t codepoint) {
  DecodedCharacter DecodeUTF_8Character(const char *cp, std::size_t bytes) {
    auto p{reinterpret_cast<const std::uint8_t *>(cp)};
    char32_t ch{*p};
-  int charBytes{1};
-  if (ch >= 0x80) {
-    if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
-        ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) {
-      charBytes = 4;
-      ch = ((ch & 7) << 6) | (p[1] & 0x3f);
-      ch = (ch << 6) | (p[2] & 0x3f);
-      ch = (ch << 6) | (p[3] & 0x3f);
-    } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
-        ((p[1] | p[2]) & 0xc0) == 0x80) {
-      charBytes = 3;
-      ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
-      ch = (ch << 6) | (p[2] & 0x3f);
-    } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
-        (p[1] & 0xc0) == 0x80) {
-      charBytes = 2;
-      ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
-    } else {
-      return {};  // not valid UTF-8
-    }
+  if (ch <= 0x7f) {
+    return {ch, 1};  // 7-bit value: returned as-is, one byte consumed
+  } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
+      ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) {  // NOTE(review): ch > 0xf0 also rejects lead byte 0xf0 itself - confirm intended
+    ch = ((ch & 7) << 6) | (p[1] & 0x3f);
+    ch = (ch << 6) | (p[2] & 0x3f);
+    ch = (ch << 6) | (p[3] & 0x3f);
+    return {ch, 4};  // four-byte sequence: 3 + 6 + 6 + 6 payload bits
+  } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
+      ((p[1] | p[2]) & 0xc0) == 0x80) {  // NOTE(review): the OR-ed test does not check each trailing byte separately - confirm
+    ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
+    ch = (ch << 6) | (p[2] & 0x3f);
+    return {ch, 3};  // three-byte sequence: 4 + 6 + 6 payload bits
+  } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
+      (p[1] & 0xc0) == 0x80) {
+    ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
+    return {ch, 2};  // two-byte sequence: 5 + 6 payload bits
+  } else {
+    return {};  // not valid UTF-8
    }
-  return {ch, charBytes};
  }
  
  DecodedCharacter DecodeEUC_JPCharacter(const char *cp, std::size_t bytes) {
    auto p{reinterpret_cast<const std::uint8_t *>(cp)};
    char32_t ch{*p};
-  int charBytes{1};
-  if (ch >= 0x80) {
-    if (bytes >= 2 && ch == 0x8e && p[1] >= 0xa1 && p[1] <= 0xdf) {
-      charBytes = 2;  // JIS X 0201
-      ch = p[1];
-    } else if (bytes >= 3 && ch == 0x8f && p[1] >= 0xa1 && p[1] <= 0xfe &&
-        p[2] >= 0xa1 && p[2] <= 0xfe) {
-      charBytes = 3;  // JIS X 0212
-      ch = (p[1] & 0x7f) << 8 | (p[1] & 0x7f);
-    } else if (bytes >= 2 && ch >= 0xa1 && ch <= 0xfe && p[1] >= 0x1 &&
-        p[1] <= 0xfe) {
-      charBytes = 2;  // JIS X 0208
-      ch = ((ch & 0x7f) << 8) | (p[1] & 0x7f);
-    } else {
-      return {};
-    }
+  if (ch <= 0x7f) {
+    return {ch, 1};
+  } else if (ch >= 0xa1 && ch <= 0xfe && bytes >= 2 && p[1] >= 0xa1 &&
+      p[1] <= 0xfe) {
+    ch = ((ch & 0x7f) << 8) | (p[1] & 0x7f);  // JIS X 0208
+    return {ch, 2};
+  } else if (ch == 0x8e && bytes >= 2 && p[1] >= 0xa1 && p[1] <= 0xdf) {
+    return {p[1], 2};  // JIS X 0201
+  } else if (ch == 0x8f && bytes >= 3 && p[1] >= 0xa1 && p[1] <= 0xfe &&
+      p[2] >= 0xa1 && p[2] <= 0xfe) {
+    ch = (p[1] & 0x7f) << 8 | (p[1] & 0x7f);  // JIS X 0212
+    return {ch, 3};
+  } else {
+    return {};  // not valid EUC_JP
    }
-  return {ch, charBytes};
  }
  
  DecodedCharacter DecodeLATIN1Character(const char *cp) {
    return {*reinterpret_cast<const std::uint8_t *>(cp), 1};  // byte read as unsigned; always one byte consumed
  }
  
-DecodedCharacter DecodeCharacter(
+static DecodedCharacter DecodeEscapedCharacter(
+    const char *cp, std::size_t bytes) {
+  // Decodes one item at cp, interpreting a leading backslash escape when
+  // one is present: a letter/quote escape known to BackslashEscapeValue,
+  // an octal escape, or \xHH.  Anything else, including an unrecognized
+  // escape, yields the single byte at cp[0].
+  if (cp[0] == '\\' && bytes > 1) {
+    if (std::optional<char> escChar{BackslashEscapeValue(cp[1])}) {
+      return {static_cast<char32_t>(*escChar), 2};
+    }
+    if (IsOctalDigit(cp[1])) {
+      // A first digit above 3 permits only two digits in total.
+      std::size_t maxDigits{static_cast<std::size_t>(cp[1] > '3' ? 2 : 3)};
+      // Stop at the digit limit or at the end of the input, whichever
+      // comes first (the +1 accounts for the backslash).
+      std::size_t maxLen{std::min(maxDigits + 1, bytes)};
+      char32_t code{static_cast<char32_t>(cp[1] - '0')};
+      std::size_t len{2};  // so far
+      for (; len < maxLen && IsOctalDigit(cp[len]); ++len) {
+        code = 8 * code + DecimalDigitValue(cp[len]);
+      }
+      return {code, static_cast<int>(len)};
+    } else if (bytes >= 4 && ToLowerCaseLetter(cp[1]) == 'x' &&
+        IsHexadecimalDigit(cp[2]) && IsHexadecimalDigit(cp[3])) {
+      return {static_cast<char32_t>(16 * HexadecimalDigitValue(cp[2]) +
+                  HexadecimalDigitValue(cp[3])),
+          4};
+    }
+  }
+  return {static_cast<char32_t>(cp[0]), 1};
+}
+
+static DecodedCharacter DecodeEscapedCharacters(  // decodes one character whose bytes may be written as backslash escapes
      Encoding encoding, const char *cp, std::size_t bytes) {
-  switch (encoding) {
-  case Encoding::LATIN_1: return DecodeLATIN1Character(cp);
-  case Encoding::UTF_8: return DecodeUTF_8Character(cp, bytes);
-  case Encoding::EUC_JP: return DecodeEUC_JPCharacter(cp, bytes);
-  default: CRASH_NO_CASE;
+  char buffer[4];  // up to four bytes after escape processing
+  int count[4];  // count[j]: source bytes consumed through buffer[j]
+  std::size_t at{0}, len{0};
+  for (; len < 4 && at < bytes; ++len) {
+    DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)};
+    buffer[len] = code.unicode;
+    at += code.bytes;
+    count[len] = at;
+  }
+  DecodedCharacter code{DecodeCharacter(encoding, buffer, len, false)};  // decode the unescaped bytes in the given encoding
+  if (code.bytes > 0) {
+    code.bytes = count[code.bytes - 1];  // convert buffer bytes used back to source bytes used
+  }
+  return code;
+}
+
+DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp,  // decodes one character from at most "bytes" bytes at cp
+    std::size_t bytes, bool backslashEscapes) {
+  if (backslashEscapes && bytes >= 1 && *cp == '\\') {  // escape path is taken only when enabled and a backslash leads
+    return DecodeEscapedCharacters(encoding, cp, bytes);
+  } else {
+    switch (encoding) {
+    case Encoding::LATIN_1:
+      if (bytes >= 1) {
+        return DecodeLATIN1Character(cp);
+      } else {
+        return {};  // no input bytes available
+      }
+    case Encoding::UTF_8: return DecodeUTF_8Character(cp, bytes);
+    case Encoding::EUC_JP: return DecodeEUC_JPCharacter(cp, bytes);
+    default: CRASH_NO_CASE;
+    }
    }
  }
  
diff --git a/flang/lib/parser/characters.h b/flang/lib/parser/characters.h

index 47c3cc9..0a55e18 100644 (file)
--- a/flang/lib/parser/characters.h
+++ b/flang/lib/parser/characters.h
@@ -109,32 +109,32 @@ inline constexpr char HexadecimalDigitValue(char ch) {
  
  inline constexpr std::optional<char> BackslashEscapeValue(char ch) {  // maps the character after a backslash to the value it denotes
    switch (ch) {
-  // case 'a': return {'\a'};  // pgf90 has no \a
-  case 'b': return {'\b'};
-  case 'f': return {'\f'};
-  case 'n': return {'\n'};
-  case 'r': return {'\r'};
-  case 't': return {'\t'};
-  case 'v': return {'\v'};
+  // case 'a': return '\a';  // pgf90 has no \a
+  case 'b': return '\b';
+  case 'f': return '\f';
+  case 'n': return '\n';
+  case 'r': return '\r';
+  case 't': return '\t';
+  case 'v': return '\v';
    case '"':
    case '\'':
-  case '\\': return {ch};
+  case '\\': return ch;  // quotes and backslash denote themselves
    default: return std::nullopt;
    }
  }
  
  inline constexpr std::optional<char> BackslashEscapeChar(char ch) {  // maps a character value to the character written after a backslash
    switch (ch) {
-  // case '\a': return {'a'};  // pgf90 has no \a
-  case '\b': return {'b'};
-  case '\f': return {'f'};
-  case '\n': return {'n'};
-  case '\r': return {'r'};
-  case '\t': return {'t'};
-  case '\v': return {'v'};
+  // case '\a': return 'a';  // pgf90 has no \a
+  case '\b': return 'b';
+  case '\f': return 'f';
+  case '\n': return 'n';
+  case '\r': return 'r';
+  case '\t': return 't';
+  case '\v': return 'v';
    case '"':
    case '\'':
-  case '\\': return {ch};
+  case '\\': return ch;  // quotes and backslash are written as themselves
    default: return std::nullopt;
    }
  }
@@ -144,56 +144,63 @@ struct EncodedCharacter {
    int bytes{0};
  };
  
-EncodedCharacter EncodeLATIN_1(char);
+EncodedCharacter EncodeLATIN_1(char32_t);
  EncodedCharacter EncodeUTF_8(char32_t);
-EncodedCharacter EncodeEUC_JP(char16_t);
+EncodedCharacter EncodeEUC_JP(char32_t);
  EncodedCharacter EncodeCharacter(Encoding, char32_t);
  
+// EmitQuotedChar drives callbacks "emit" and "insert" to output the
+// bytes of an encoding for a codepoint.
+// A double quote is preceded by an inserted second quote when
+// doubleDoubleQuotes is set.  Bytes below ' ' are always written as
+// backslash escapes; a backslash and bytes >= 0x7f are written as escapes
+// only when backslashEscapes is set.  Codepoints above 0x7f are first
+// converted to bytes with EncodeCharacter(encoding, ch).
  template<typename NORMAL, typename INSERTED>
  void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
-    bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+    bool doubleDoubleQuotes = true, bool backslashEscapes = true,
      Encoding encoding = Encoding::UTF_8) {
+  auto emitOneChar{[&](std::uint8_t ch) {
+    if (ch < ' ' || (backslashEscapes && (ch == '\\' || ch >= 0x7f))) {
+      insert('\\');
+      if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
+        emit(*escape);
+      } else {
+        // Octal escape sequence.  Always write three digits: a shorter
+        // sequence would absorb a literal digit that follows it when the
+        // text is read back (0x01 then '2' must not become "\12").
+        insert('0' + (ch >> 6));
+        insert('0' + ((ch >> 3) & 7));
+        insert('0' + (ch & 7));
+      }
+    } else {
+      emit(ch);
+    }
+  }};
    if (ch == '"') {
      if (doubleDoubleQuotes) {
        insert('"');
      }
      emit('"');
-  } else if (ch == '\\') {
-    if (doubleBackslash) {
-      insert('\\');
-    }
-    emit('\\');
-  } else if (ch < ' ' || (encoding == Encoding::LATIN_1 && ch >= 0x7f)) {
-    insert('\\');
-    if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
-      emit(*escape);
-    } else {
-      // octal escape sequence
-      insert('0' + ((ch >> 6) & 3));
-      insert('0' + ((ch >> 3) & 7));
-      insert('0' + (ch & 7));
-    }
+  } else if (ch <= 0x7f) {
+    emitOneChar(ch);
    } else {
      EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
      for (int j{0}; j < encoded.bytes; ++j) {
-      emit(encoded.buffer[j]);
+      emitOneChar(encoded.buffer[j]);
      }
    }
  }
  
  std::string QuoteCharacterLiteral(const std::string &,
-    bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+    bool doubleDoubleQuotes = true, bool backslashEscapes = true,
      Encoding = Encoding::LATIN_1);
  std::string QuoteCharacterLiteral(const std::u16string &,
-    bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+    bool doubleDoubleQuotes = true, bool backslashEscapes = true,
      Encoding = Encoding::EUC_JP);
  std::string QuoteCharacterLiteral(const std::u32string &,
-    bool doubleDoubleQuotes = true, bool doubleBackslash = true,
+    bool doubleDoubleQuotes = true, bool backslashEscapes = true,
      Encoding = Encoding::UTF_8);
  
  std::optional<int> UTF_8CharacterBytes(const char *);
  std::optional<int> EUC_JPCharacterBytes(const char *);
-std::optional<int> CharacterBytes(const char *, Encoding);
  std::optional<int> CountCharacters(const char *, std::size_t bytes, Encoding);
  
  struct DecodedCharacter {
@@ -204,7 +211,8 @@ struct DecodedCharacter {
  DecodedCharacter DecodeUTF_8Character(const char *, std::size_t);
  DecodedCharacter DecodeEUC_JPCharacter(const char *, std::size_t);
  DecodedCharacter DecodeLATIN1Character(const char *);
-DecodedCharacter DecodeCharacter(Encoding, const char *, std::size_t);
+DecodedCharacter DecodeCharacter(
+    Encoding, const char *, std::size_t, bool backslashEscapes = false);
  
  std::u32string DecodeUTF_8(const std::string &);
  std::u16string DecodeEUC_JP(const std::string &);
diff --git a/flang/lib/parser/parse-state.h b/flang/lib/parser/parse-state.h

index 973b200..c85a179 100644 (file)
--- a/flang/lib/parser/parse-state.h
+++ b/flang/lib/parser/parse-state.h
@@ -44,7 +44,7 @@ public:
    ParseState(const ParseState &that)
      : p_{that.p_}, limit_{that.limit_}, context_{that.context_},
        userState_{that.userState_}, inFixedForm_{that.inFixedForm_},
-      encoding_{that.encoding_}, anyErrorRecovery_{that.anyErrorRecovery_},
+      anyErrorRecovery_{that.anyErrorRecovery_},
        anyConformanceViolation_{that.anyConformanceViolation_},
        deferMessages_{that.deferMessages_},
        anyDeferredMessages_{that.anyDeferredMessages_},
@@ -52,7 +52,7 @@ public:
    ParseState(ParseState &&that)
      : p_{that.p_}, limit_{that.limit_}, messages_{std::move(that.messages_)},
        context_{std::move(that.context_)}, userState_{that.userState_},
-      inFixedForm_{that.inFixedForm_}, encoding_{that.encoding_},
+      inFixedForm_{that.inFixedForm_},
        anyErrorRecovery_{that.anyErrorRecovery_},
        anyConformanceViolation_{that.anyConformanceViolation_},
        deferMessages_{that.deferMessages_},
@@ -61,7 +61,6 @@ public:
    ParseState &operator=(const ParseState &that) {
      p_ = that.p_, limit_ = that.limit_, context_ = that.context_;
      userState_ = that.userState_, inFixedForm_ = that.inFixedForm_;
-    encoding_ = that.encoding_;
      anyErrorRecovery_ = that.anyErrorRecovery_;
      anyConformanceViolation_ = that.anyConformanceViolation_;
      deferMessages_ = that.deferMessages_;
@@ -73,7 +72,6 @@ public:
      p_ = that.p_, limit_ = that.limit_, messages_ = std::move(that.messages_);
      context_ = std::move(that.context_);
      userState_ = that.userState_, inFixedForm_ = that.inFixedForm_;
-    encoding_ = that.encoding_;
      anyErrorRecovery_ = that.anyErrorRecovery_;
      anyConformanceViolation_ = that.anyConformanceViolation_;
      deferMessages_ = that.deferMessages_;
@@ -106,12 +104,6 @@ public:
      return *this;
    }
  
-  Encoding encoding() const { return encoding_; }
-  ParseState &set_encoding(Encoding encoding) {
-    encoding_ = encoding;
-    return *this;
-  }
-
    bool deferMessages() const { return deferMessages_; }
    ParseState &set_deferMessages(bool yes = true) {
      deferMessages_ = yes;
@@ -231,7 +223,6 @@ private:
    UserState *userState_{nullptr};
  
    bool inFixedForm_{false};
-  Encoding encoding_{Encoding::UTF_8};
    bool anyErrorRecovery_{false};
    bool anyConformanceViolation_{false};
    bool deferMessages_{false};
diff --git a/flang/lib/parser/parsing.cc b/flang/lib/parser/parsing.cc

index 38efb7d..0cf38b2 100644 (file)
--- a/flang/lib/parser/parsing.cc
+++ b/flang/lib/parser/parsing.cc
@@ -69,7 +69,6 @@ void Parsing::Prescan(const std::string &path, Options options) {
    Prescanner prescanner{messages_, cooked_, preprocessor, options.features};
    prescanner.set_fixedForm(options.isFixedForm)
        .set_fixedFormColumnLimit(options.fixedFormColumns)
-      .set_encoding(options.encoding)
        .AddCompilerDirectiveSentinel("dir$");
    if (options.features.IsEnabled(LanguageFeature::OpenMP)) {
      prescanner.AddCompilerDirectiveSentinel("$omp");
@@ -102,9 +101,7 @@ void Parsing::Parse(std::ostream *out) {
        .set_instrumentedParse(options_.instrumentedParse)
        .set_log(&log_);
    ParseState parseState{cooked_};
-  parseState.set_inFixedForm(options_.isFixedForm)
-      .set_encoding(options_.encoding)
-      .set_userState(&userState);
+  parseState.set_inFixedForm(options_.isFixedForm).set_userState(&userState);
    parseTree_ = program.Parse(parseState);
    CHECK(
        !parseState.anyErrorRecovery() || parseState.messages().AnyFatalError());
diff --git a/flang/lib/parser/parsing.h b/flang/lib/parser/parsing.h

index be470eb..ffd0670 100644 (file)
--- a/flang/lib/parser/parsing.h
+++ b/flang/lib/parser/parsing.h
@@ -37,7 +37,6 @@ struct Options {
    bool isFixedForm{false};
    int fixedFormColumns{72};
    LanguageFeatureControl features;
-  Encoding encoding{Encoding::UTF_8};
    std::vector<std::string> searchDirectories;
    std::vector<Predefinition> predefinitions;
    bool instrumentedParse{false};
diff --git a/flang/lib/parser/preprocessor.cc b/flang/lib/parser/preprocessor.cc

index e9df36f..36b455e 100644 (file)
--- a/flang/lib/parser/preprocessor.cc
+++ b/flang/lib/parser/preprocessor.cc
@@ -588,7 +588,9 @@ void Preprocessor::Directive(const TokenSequence &dir, Prescanner *prescanner) {
      } else if (included->bytes() > 0) {
        ProvenanceRange fileRange{
            allSources_.AddIncludedFile(*included, dir.GetProvenanceRange())};
-      Prescanner{*prescanner}.Prescan(fileRange);
+      Prescanner{*prescanner}
+          .set_encoding(included->encoding())
+          .Prescan(fileRange);
      }
    } else {
      prescanner->Say(dir.GetTokenProvenanceRange(dirOffset),
diff --git a/flang/lib/parser/prescan.cc b/flang/lib/parser/prescan.cc

index 7a6ffe4..dd0141a 100644 (file)
--- a/flang/lib/parser/prescan.cc
+++ b/flang/lib/parser/prescan.cc
@@ -31,8 +31,8 @@ static constexpr int maxPrescannerNesting{100};
  
  Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
      Preprocessor &preprocessor, LanguageFeatureControl lfc)
-  : messages_{messages}, cooked_{cooked},
-    preprocessor_{preprocessor}, features_{lfc} {}
+  : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
+    features_{lfc}, encoding_{cooked.allSources().encoding()} {}  // initial encoding is taken from the AllSources setting
  
  Prescanner::Prescanner(const Prescanner &that)
    : messages_{that.messages_}, cooked_{that.cooked_},
@@ -295,6 +295,11 @@ bool Prescanner::MustSkipToEndOfLine() const {
  void Prescanner::NextChar() {
    CHECK(*at_ != '\n');
    ++at_, ++column_;
+  while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
+    // UTF-8 byte order mark - treat this file as UTF-8
+    at_ += 3;
+    encoding_ = Encoding::UTF_8;
+  }
    if (inPreprocessorDirective_) {
      SkipCComments();
    } else {
@@ -477,10 +482,18 @@ bool Prescanner::NextToken(TokenSequence &tokens) {
      }
      preventHollerith_ = false;
    } else if (IsLegalInIdentifier(*at_)) {
-    while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
-    }
+    // Look for NC'...' prefix - legacy PGI "Kanji" NCHARACTER literal
+    char buffer[2];
+    int idChars{0};
+    do {
+      if (idChars < static_cast<int>(sizeof buffer)) {
+        buffer[idChars] = ToLowerCaseLetter(*at_);
+      }
+      ++idChars;
+    } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
      if (*at_ == '\'' || *at_ == '"') {
-      QuotedCharacterLiteral(tokens, start);
+      bool isKanji{idChars == 2 && buffer[0] == 'n' && buffer[1] == 'c'};
+      QuotedCharacterLiteral(tokens, start, isKanji);
        preventHollerith_ = false;
      } else {
        // Subtle: Don't misrecognize labeled DO statement label as Hollerith
@@ -522,7 +535,7 @@ bool Prescanner::NextToken(TokenSequence &tokens) {
  }
  
  bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
-  char ed = ToLowerCaseLetter(*at_);
+  char ed{ToLowerCaseLetter(*at_)};
    if (ed != 'e' && ed != 'd') {
      return false;
    }
@@ -541,7 +554,7 @@ bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
  }
  
  void Prescanner::QuotedCharacterLiteral(
-    TokenSequence &tokens, const char *start) {
+    TokenSequence &tokens, const char *start, bool isKanji) {
    char quote{*at_};
    const char *end{at_ + 1};
    inCharLiteral_ = true;
@@ -549,9 +562,14 @@ void Prescanner::QuotedCharacterLiteral(
    const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
    bool escape{false};
    bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
+  Encoding encoding{encoding_};
+  if (isKanji) {
+    // NC'...' - the contents are EUC_JP even if the context is not
+    encoding = Encoding::EUC_JP;
+  }
    while (true) {
      DecodedCharacter decoded{DecodeCharacter(
-        encoding_, at_, static_cast<std::size_t>(limit_ - at_))};
+        encoding, at_, static_cast<std::size_t>(limit_ - at_), escapesEnabled)};
      if (decoded.bytes <= 0) {
        Say(GetProvenanceRange(start, end),
            "Bad character in character literal"_err_en_US);
@@ -559,7 +577,9 @@ void Prescanner::QuotedCharacterLiteral(
      }
      char32_t ch{decoded.unicode};
      escape = !escape && ch == '\\' && escapesEnabled;
-    EmitQuotedChar(ch, emit, insert, false, !escapesEnabled);
+    EmitQuotedChar(ch, emit, insert, false /* don't double quotes */,
+        true /* use backslash escapes */,
+        Encoding::UTF_8 /* cooked char stream is UTF-8 only */);
      while (PadOutCharacterLiteral(tokens)) {
      }
      if (*at_ == '\n') {
@@ -613,7 +633,7 @@ void Prescanner::Hollerith(
          for (int j{0}; j < utf8.bytes; ++j) {
            EmitChar(tokens, utf8.buffer[j]);
          }
-        at_ += decoded.bytes;
+        at_ += decoded.bytes - 1;
        } else {
          Say(GetProvenanceRange(start, at_),
              "Bad character in Hollerith literal"_err_en_US);
@@ -746,7 +766,7 @@ void Prescanner::FortranInclude(const char *firstQuote) {
          provenance, static_cast<std::size_t>(p - nextLine_)};
      ProvenanceRange fileRange{
          allSources.AddIncludedFile(*included, includeLineRange)};
-    Prescanner{*this}.Prescan(fileRange);
+    Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
    }
  }
  
diff --git a/flang/lib/parser/prescan.h b/flang/lib/parser/prescan.h

index 0e492f7..d623852 100644 (file)
--- a/flang/lib/parser/prescan.h
+++ b/flang/lib/parser/prescan.h
@@ -158,7 +158,8 @@ private:
    const char *SkipCComment(const char *) const;
    bool NextToken(TokenSequence &);
    bool ExponentAndKind(TokenSequence &);
-  void QuotedCharacterLiteral(TokenSequence &, const char *start);
+  void QuotedCharacterLiteral(
+      TokenSequence &, const char *start, bool isKanji = false);
    void Hollerith(TokenSequence &, int count, const char *start);
    bool PadOutCharacterLiteral(TokenSequence &);
    bool SkipCommentLine(bool afterAmpersand);
diff --git a/flang/lib/parser/provenance.cc b/flang/lib/parser/provenance.cc

index 166104d..e0cd7b7 100644 (file)
--- a/flang/lib/parser/provenance.cc
+++ b/flang/lib/parser/provenance.cc
@@ -108,7 +108,7 @@ std::string AllSources::PopSearchPathDirectory() {
  }
  
  const SourceFile *AllSources::Open(std::string path, std::stringstream *error) {
-  std::unique_ptr<SourceFile> source{std::make_unique<SourceFile>()};
+  std::unique_ptr<SourceFile> source{std::make_unique<SourceFile>(encoding_)};
    if (source->Open(LocateSourceFile(path, searchPath_), error)) {
      return ownedSourceFiles_.emplace_back(std::move(source)).get();
    }
@@ -116,7 +116,7 @@ const SourceFile *AllSources::Open(std::string path, std::stringstream *error) {
  }
  
  const SourceFile *AllSources::ReadStandardInput(std::stringstream *error) {
-  std::unique_ptr<SourceFile> source{std::make_unique<SourceFile>()};
+  std::unique_ptr<SourceFile> source{std::make_unique<SourceFile>(encoding_)};
    if (source->ReadStandardInput(error)) {
      return ownedSourceFiles_.emplace_back(std::move(source)).get();
    }
diff --git a/flang/lib/parser/provenance.h b/flang/lib/parser/provenance.h

index 4efcf35..2562299 100644 (file)
--- a/flang/lib/parser/provenance.h
+++ b/flang/lib/parser/provenance.h
@@ -17,6 +17,7 @@
  
  #include "char-block.h"
  #include "char-buffer.h"
+#include "characters.h"
  #include "source.h"
  #include "../common/idioms.h"
  #include "../common/interval.h"
@@ -117,6 +118,11 @@ public:
  
    std::size_t size() const { return range_.size(); }
    const char &operator[](Provenance) const;
+  Encoding encoding() const { return encoding_; }  // encoding handed to SourceFile objects this AllSources creates
+  AllSources &set_encoding(Encoding e) {
+    encoding_ = e;
+    return *this;  // returns the object so calls can be chained
+  }
  
    void PushSearchPathDirectory(std::string);
    std::string PopSearchPathDirectory();
@@ -181,6 +187,7 @@ private:
    std::map<char, Provenance> compilerInsertionProvenance_;
    std::vector<std::unique_ptr<SourceFile>> ownedSourceFiles_;
    std::vector<std::string> searchPath_;
+  Encoding encoding_{Encoding::UTF_8};
  };
  
  class CookedSource {
diff --git a/flang/lib/parser/source.cc b/flang/lib/parser/source.cc

index 995fa26..f5e7259 100644 (file)
--- a/flang/lib/parser/source.cc
+++ b/flang/lib/parser/source.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
@@ -60,17 +60,18 @@ void SourceFile::RecordLineStarts() {
    lineStart_ = FindLineStarts(content_, bytes_);
  }
  
-// Cut down the contiguous content of a source file to skip
-// things like byte order marks.
+// Check for a Unicode byte order mark (BOM).
+// Module files all have one; so can source files.
  void SourceFile::IdentifyPayload() {
    content_ = address_;
    bytes_ = size_;
    if (content_ != nullptr) {
+    static constexpr int BOMBytes{3};  // bytes in the mark; sizeof UTF8_BOM would also count the terminating NUL
      static const char UTF8_BOM[]{"\xef\xbb\xbf"};
-    if (bytes_ >= sizeof UTF8_BOM &&
-        std::memcmp(content_, UTF8_BOM, sizeof UTF8_BOM) == 0) {
-      content_ += sizeof UTF8_BOM;
-      bytes_ -= sizeof UTF8_BOM;
+    if (bytes_ >= BOMBytes && std::memcmp(content_, UTF8_BOM, BOMBytes) == 0) {
+      content_ += BOMBytes;  // payload starts just after the mark
+      bytes_ -= BOMBytes;
+      encoding_ = Encoding::UTF_8;  // a leading UTF-8 BOM replaces the encoding given at construction
      }
    }
  }
diff --git a/flang/lib/parser/source.h b/flang/lib/parser/source.h

index 560b2ae..729e862 100644 (file)
--- a/flang/lib/parser/source.h
+++ b/flang/lib/parser/source.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
@@ -18,7 +18,9 @@
  // Source file content is lightly normalized when the file is read.
  //  - Line ending markers are converted to single newline characters
  //  - A newline character is added to the last line of the file if one is needed
+//  - A Unicode byte order mark is recognized if present.
  
+#include "characters.h"
  #include <cstddef>
  #include <sstream>
  #include <string>
@@ -33,12 +35,13 @@ std::string LocateSourceFile(
  
  class SourceFile {
  public:
-  SourceFile() {}
+  explicit SourceFile(Encoding e) : encoding_{e} {}
    ~SourceFile();
    std::string path() const { return path_; }
    const char *content() const { return content_; }
    std::size_t bytes() const { return bytes_; }
    std::size_t lines() const { return lineStart_.size(); }
+  Encoding encoding() const { return encoding_; }
  
    bool Open(std::string path, std::stringstream *error);
    bool ReadStandardInput(std::stringstream *error);
@@ -62,6 +65,7 @@ private:
    std::size_t bytes_{0};
    std::vector<std::size_t> lineStart_;
    std::string normalized_;
+  Encoding encoding_{Encoding::UTF_8};
  };
  }
  #endif  // FORTRAN_PARSER_SOURCE_H_
diff --git a/flang/lib/parser/token-parsers.h b/flang/lib/parser/token-parsers.h

index 3de491d..b4ca3de 100644 (file)
--- a/flang/lib/parser/token-parsers.h
+++ b/flang/lib/parser/token-parsers.h
@@ -207,70 +207,41 @@ template<class PA> inline constexpr auto bracketed(const PA &p) {
  
  // Quoted character literal constants.
  struct CharLiteralChar {
-  struct Result {
-    Result(char c, bool esc) : ch{c}, wasEscaped{esc} {}
-    static Result Bare(char c) { return Result{c, false}; }
-    static Result Escaped(char c) { return Result{c, true}; }
-    char ch;
-    bool wasEscaped;
-  };
-  using resultType = Result;
-  static std::optional<Result> Parse(ParseState &state) {
+  using resultType = char;
+  static std::optional<char> Parse(ParseState &state) {
      auto at{state.GetLocation()};
-    std::optional<const char *> och{nextCh.Parse(state)};
-    if (!och.has_value()) {
-      return std::nullopt;
-    }
-    char ch{**och};
-    if (ch == '\n') {
-      state.Say(CharBlock{at, state.GetLocation()},
-          "unclosed character constant"_err_en_US);
-      return std::nullopt;
-    }
-    if (ch != '\\') {
-      return Result::Bare(ch);
-    }
-    if (!(och = nextCh.Parse(state)).has_value()) {
-      return std::nullopt;
-    }
-    ch = **och;
-    if (ch == '\n') {
-      state.Say(CharBlock{at, state.GetLocation()},
-          "unclosed character constant"_err_en_US);
-      return std::nullopt;
-    }
-    if (std::optional<char> escChar{BackslashEscapeValue(ch)}) {
-      return Result::Escaped(*escChar);
-    }
-    if (IsOctalDigit(ch)) {
-      ch -= '0';
-      for (int j = (ch > 3 ? 1 : 2); j-- > 0;) {
-        static constexpr auto octalDigit{attempt("01234567"_ch)};
-        och = octalDigit.Parse(state);
-        if (och.has_value()) {
-          ch = 8 * ch + **och - '0';
-        } else {
-          break;
-        }
+    if (std::optional<const char *> cp{nextCh.Parse(state)}) {
+      if (**cp == '\n') {
+        state.Say(CharBlock{at, state.GetLocation()},
+            "Unclosed character constant"_err_en_US);
+        return std::nullopt;
        }
-    } else if (ch == 'x' || ch == 'X') {
-      ch = 0;
-      static constexpr auto hexDigit{"0123456789abcdefABCDEF"_ch};
-      och = hexDigit.Parse(state);
-      if (och.has_value()) {
-        ch = HexadecimalDigitValue(**och);
-        static constexpr auto hexDigit2{attempt("0123456789abcdefABCDEF"_ch)};
-        och = hexDigit2.Parse(state);
-        if (och.has_value()) {
-          ch = 16 * ch + HexadecimalDigitValue(**och);
-        }
-      } else {
+      if (**cp != '\\') {
+        return **cp;
+      }
+      if (!(cp = nextCh.Parse(state)).has_value()) {
+        state.Say(CharBlock{at, state.GetLocation()},
+            "Unclosed character constant"_err_en_US);
          return std::nullopt;
        }
-    } else {
-      state.Say(at, "bad escaped character"_en_US);
+      if (std::optional<char> escChar{BackslashEscapeValue(**cp)}) {
+        return escChar;
+      }
+      if (IsOctalDigit(**cp)) {
+        int result{**cp - '0'};
+        for (int j = (result > 3 ? 1 : 2); j-- > 0;) {
+          static constexpr auto octalDigit{attempt("01234567"_ch)};
+          if (std::optional<const char *> oct{octalDigit.Parse(state)}) {
+            result = 8 * result + **oct - '0';
+          } else {
+            break;
+          }
+        }
+        return result;
+      }
+      state.Say(at, "Bad escaped character"_err_en_US);
      }
-    return {Result::Escaped(ch)};
+    return std::nullopt;
    }
  };
  
@@ -279,14 +250,14 @@ template<char quote> struct CharLiteral {
    static std::optional<std::string> Parse(ParseState &state) {
      std::string str;
      static constexpr auto nextch{attempt(CharLiteralChar{})};
-    while (std::optional<CharLiteralChar::Result> ch{nextch.Parse(state)}) {
-      if (ch->ch == quote && !ch->wasEscaped) {
+    while (std::optional<char> ch{nextch.Parse(state)}) {
+      if (*ch == quote) {
          static constexpr auto doubled{attempt(AnyOfChars{SetOfChars{quote}})};
          if (!doubled.Parse(state).has_value()) {
            return str;
          }
        }
-      str += ch->ch;
+      str += *ch;
      }
      return std::nullopt;
    }
@@ -544,7 +515,7 @@ struct HollerithLiteral {
      std::string content;
      for (auto j{*charCount}; j-- > 0;) {
        if (std::optional<int> chBytes{
-              CharacterBytes(state.GetLocation(), state.encoding())}) {
+              UTF_8CharacterBytes(state.GetLocation())}) {
          for (int bytes{*chBytes}; bytes > 0; --bytes) {
            if (std::optional<const char *> at{nextCh.Parse(state)}) {
              if (*chBytes == 1 && !isprint(**at)) {
diff --git a/flang/lib/semantics/mod-file.cc b/flang/lib/semantics/mod-file.cc

index 851bcff..19afac3 100644 (file)
--- a/flang/lib/semantics/mod-file.cc
+++ b/flang/lib/semantics/mod-file.cc
@@ -31,7 +31,10 @@ namespace Fortran::semantics {
  using namespace parser::literals;
  
  // The initial characters of a file that identify it as a .mod file.
-static constexpr auto magic{"!mod$ v1 sum:"};
+// The first three bytes are a Unicode byte order mark that ensures
+// that the module file is decoded as UTF-8 even if source files
+// are using another encoding.
+static constexpr auto magic{"\xef\xbb\xbf!mod$ v1 sum:"};
  
  static const SourceName *GetSubmoduleParent(const parser::Program &);
  static std::string ModFilePath(const std::string &dir, const SourceName &,
diff --git a/flang/test/semantics/modfile28.f90 b/flang/test/semantics/modfile28.f90

index fa07ebf..9a6cd3d 100644 (file)
--- a/flang/test/semantics/modfile28.f90
+++ b/flang/test/semantics/modfile28.f90
@@ -15,9 +15,12 @@
  
  ! Test UTF-8 support in character literals
  ! TODO: test EUC-JP
+! Note: Module files are encoded in UTF-8.
  
  module m
  character(kind=4,len=:), parameter :: c4 = 4_"Hi! 你好!"
+! In CHARACTER(1) literals, codepoints > 0xff are serialized into UTF-8;
+! each of those bytes then gets encoded into UTF-8 for the module file.
  character(kind=1,len=:), parameter :: c1 = 1_"Hi! 你好!"
  character(kind=4,len=:), parameter :: c4a(:) = [4_"一", 4_"二", 4_"三", 4_"四", 4_"五"]
  integer, parameter :: lc4 = len(c4)
@@ -27,7 +30,7 @@ end module m
  !Expect: m.mod
  !module m
  !character(:,4),parameter::c4=4_"Hi! 你好!"
-!character(:,1),parameter::c1=1_"Hi! \344\275\240\345\245\275!"
+!character(:,1),parameter::c1=1_"Hi! ä½ å¥½!"
  !character(:,4),parameter::c4a(1_8:)=[CHARACTER(KIND=4,LEN=1)::"一","二","三","四","五"]
  !integer(4),parameter::lc4=7_4
  !integer(4),parameter::lc1=11_4
diff --git a/flang/test/semantics/test_modfile.sh b/flang/test/semantics/test_modfile.sh

index 939155d..f6e737f 100755 (executable)
--- a/flang/test/semantics/test_modfile.sh
+++ b/flang/test/semantics/test_modfile.sh
@@ -59,7 +59,8 @@ for src in "$@"; do
        echo FAIL
        exit 1
      fi
-    sed '/^!mod\$/d' $temp/$mod > $actual
+    # The file begins with a three-byte UTF-8 BOM; '.' matches it as one character (assumes a UTF-8 locale)
+    sed '/^.!mod\$/d' $temp/$mod > $actual
      sed '1,/^!Expect: '"$mod"'/d' $src | sed -e '/^$/,$d' -e 's/^! *//' > $expect
      if ! diff -U999999 $expect $actual > $diffs; then
        echo "Module file $mod differs from expected:"
diff --git a/flang/tools/f18/f18-parse-demo.cc b/flang/tools/f18/f18-parse-demo.cc

index 5030345..1eb997a 100644 (file)
--- a/flang/tools/f18/f18-parse-demo.cc
+++ b/flang/tools/f18/f18-parse-demo.cc
@@ -446,7 +446,6 @@ int main(int argc, char *const argv[]) {
        }
      }
    }
-  driver.encoding = options.encoding;
  
    if (driver.warnOnNonstandardUsage) {
      options.features.WarnOnAllNonstandard();
diff --git a/flang/tools/f18/f18.cc b/flang/tools/f18/f18.cc

index cde2389..922b2fa 100644 (file)
--- a/flang/tools/f18/f18.cc
+++ b/flang/tools/f18/f18.cc
@@ -452,7 +452,7 @@ int main(int argc, char *const argv[]) {
        driver.moduleFileSuffix = args.front();
        args.pop_front();
      } else if (arg == "-fno-utf-8") {
-      options.encoding = Fortran::parser::Encoding::LATIN_1;
+      driver.encoding = Fortran::parser::Encoding::LATIN_1;
      } else if (arg == "-help" || arg == "--help" || arg == "-?") {
        std::cerr
            << "f18 options:\n"
@@ -496,11 +496,10 @@ int main(int argc, char *const argv[]) {
        } else if (arg.substr(0, 2) == "-I") {
          driver.searchDirectories.push_back(arg.substr(2));
        } else if (arg == "-Mx,125,4") {  // PGI "all Kanji" mode
-        options.encoding = Fortran::parser::Encoding::EUC_JP;
+        driver.encoding = Fortran::parser::Encoding::EUC_JP;
        }
      }
    }
-  driver.encoding = options.encoding;
  
    if (driver.warnOnNonstandardUsage) {
      options.features.WarnOnAllNonstandard();
@@ -514,6 +513,7 @@ int main(int argc, char *const argv[]) {
    }
  
    Fortran::parser::AllSources allSources;
+  allSources.set_encoding(driver.encoding);
    Fortran::semantics::SemanticsContext semanticsContext{
        defaultKinds, options.features, allSources};
    semanticsContext.set_moduleDirectory(driver.moduleDirectory)
author	peter klausler <pklausler@nvidia.com>
	Wed, 12 Jun 2019 22:26:37 +0000 (15:26 -0700)
committer	peter klausler <pklausler@nvidia.com>
	Mon, 17 Jun 2019 23:13:07 +0000 (16:13 -0700)
flang/lib/evaluate/formatting.cc		patch \| blob \| history
flang/lib/parser/characters.cc		patch \| blob \| history
flang/lib/parser/characters.h		patch \| blob \| history
flang/lib/parser/parse-state.h		patch \| blob \| history
flang/lib/parser/parsing.cc		patch \| blob \| history
flang/lib/parser/parsing.h		patch \| blob \| history
flang/lib/parser/preprocessor.cc		patch \| blob \| history
flang/lib/parser/prescan.cc		patch \| blob \| history
flang/lib/parser/prescan.h		patch \| blob \| history
flang/lib/parser/provenance.cc		patch \| blob \| history
flang/lib/parser/provenance.h		patch \| blob \| history
flang/lib/parser/source.cc		patch \| blob \| history
flang/lib/parser/source.h		patch \| blob \| history
flang/lib/parser/token-parsers.h		patch \| blob \| history
flang/lib/semantics/mod-file.cc		patch \| blob \| history
flang/test/semantics/modfile28.f90		patch \| blob \| history
flang/test/semantics/test_modfile.sh		patch \| blob \| history
flang/tools/f18/f18-parse-demo.cc		patch \| blob \| history
flang/tools/f18/f18.cc		patch \| blob \| history