From 8971f7f2b3595b19cf19a3961c6ade237279114d Mon Sep 17 00:00:00 2001
From: peter klausler <pklausler@nvidia.com>
Date: Fri, 14 Jun 2019 15:44:32 -0700
Subject: [PATCH] [flang] More debugging vs. regression tests

Original-commit: flang-compiler/f18@d4fd4ad1eca91336cdd2fb1add93d97ecb303720
Reviewed-on: https://github.com/flang-compiler/f18/pull/496
Tree-same-pre-rewrite: false
---
 flang/lib/parser/characters.cc     | 179 +++++++++++++++----------------------
 flang/lib/parser/characters.h      |  11 ++-
 flang/lib/parser/prescan.cc        |  13 +--
 flang/lib/parser/token-parsers.h   |  23 ++---
 flang/lib/parser/token-sequence.cc |  10 +++
 flang/lib/parser/token-sequence.h  |   2 +
 flang/lib/parser/unparse.cc        |  29 +++---
 flang/lib/parser/unparse.h         |   2 +-
 flang/tools/f18/f18-parse-demo.cc  |   2 +-
 flang/tools/f18/f18.cc             |  11 ++-
 10 files changed, 130 insertions(+), 152 deletions(-)
diff --git a/flang/lib/parser/characters.cc b/flang/lib/parser/characters.cc
index bc60252..bc0e569 100644
--- a/flang/lib/parser/characters.cc
+++ b/flang/lib/parser/characters.cc
@@ -20,82 +20,19 @@
 
 namespace Fortran::parser {
 
-std::optional<int> UTF_8CharacterBytes(const char *p) {
+int UTF_8CharacterBytes(const char *p) {
   if ((*p & 0x80) == 0) {
     return 1;
-  }
-  if ((*p & 0xf8) == 0xf0) {
-    if ((*p & 0x07) != 0 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80 &&
-        (p[3] & 0xc0) == 0x80) {
-      return 4;
-    }
-  } else if ((*p & 0xf0) == 0xe0) {
-    if ((*p & 0x0f) != 0 && (p[1] & 0xc0) == 0x80 && (p[2] & 0xc0) == 0x80) {
-      return 3;
-    }
   } else if ((*p & 0xe0) == 0xc0) {
-    if ((*p & 0x1f) != 0 && (p[1] & 0xc0) == 0x80) {
-      return 2;
-    }
-  }
-  return std::nullopt;
-}
-
-std::optional<int> EUC_JPCharacterBytes(const char *p) {
-  int b1 = *p & 0xff;
-  if (b1 <= 0x7f) {
-    return 1;
-  }
-  if (b1 >= 0xa0 && b1 <= 0xfe) {
-    int b2 = p[1] & 0xff;
-    if (b2 >= 0xa0 && b2 <= 0xfe) {
-      // JIS X 0208 (code set 1)
-      return 2;
-    }
-  } else if (b1 == 0x8e) {
-    int b2 = p[1] & 0xff;
-    if (b2 >= 0xa0 && b2 <= 0xdf) {
-      // upper half JIS 0201 (half-width kana, code set 2)
-      return 2;
-    }
-  } else if (b1 == 0x8f) {
-    int b2 = p[1] & 0xff;
-    int b3 = p[2] & 0xff;
-    if (b2 >= 0xa0 && b2 <= 0xfe && b3 >= 0xa0 && b3 <= 0xfe) {
-      // JIS X 0212 (code set 3)
-      return 3;
-    }
-  }
-  return std::nullopt;
-}
-
-static std::optional<int> One(const char *) { return 1; }
-
-static std::optional<int> (*CharacterCounter(Encoding encoding))(const char *) {
-  switch (encoding) {
-  case Encoding::UTF_8: return UTF_8CharacterBytes;
-  case Encoding::EUC_JP: return EUC_JPCharacterBytes;
-  default: return One;
-  }
-}
-
-std::optional<int> CountCharacters(
-    const char *p, std::size_t bytes, Encoding encoding) {
-  std::size_t chars{0};
-  const char *limit{p + bytes};
-  std::optional<int> (*cbf)(const char *){CharacterCounter(encoding)};
-  while (p < limit) {
-    if (std::optional<int> cb{cbf(p)}) {
-      p += *cb;
-      ++chars;
-    } else {
-      return std::nullopt;
-    }
-  }
-  if (p == limit) {
-    return chars;
+    return 2;
+  } else if ((*p & 0xf0) == 0xe0) {
+    return 3;
+  } else if ((*p & 0xf8) == 0xf0) {
+    return 4;
+  } else if ((*p & 0xfc) == 0xf8) {
+    return 5;
   } else {
-    return std::nullopt;
+    return 6;
   }
 }
 
@@ -131,10 +68,10 @@ std::string QuoteCharacterLiteral(
   return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
 }
 
-EncodedCharacter EncodeLATIN_1(char32_t codepoint) {
-  CHECK(codepoint <= 0xff);
+EncodedCharacter EncodeLATIN_1(char32_t ucs) {
+  CHECK(ucs <= 0xff);
   EncodedCharacter result;
-  result.buffer[0] = codepoint;
+  result.buffer[0] = ucs;
   result.bytes = 1;
   return result;
 }
@@ -154,39 +91,68 @@ EncodedCharacter EncodeUTF_8(char32_t codepoint) {
     result.buffer[1] = 0x80 | ((codepoint >> 6) & 0x3f);
     result.buffer[2] = 0x80 | (codepoint & 0x3f);
     result.bytes = 3;
-  } else {
-    // UCS actually only goes up to 0x10ffff but the
-    // UTF-8 encoding handles 21 bits.
-    CHECK(codepoint <= 0x1fffff);
+  } else if (codepoint <= 0x1fffff) {
+    // UCS only goes up to 0x10ffff, but four bytes can carry 21 bits;
+    // the longer forms below extend this encoding to all 32 bits.
     result.buffer[0] = 0xf0 | (codepoint >> 18);
     result.buffer[1] = 0x80 | ((codepoint >> 12) & 0x3f);
     result.buffer[2] = 0x80 | ((codepoint >> 6) & 0x3f);
     result.buffer[3] = 0x80 | (codepoint & 0x3f);
     result.bytes = 4;
+  } else if (codepoint <= 0x3ffffff) {
+    result.buffer[0] = 0xf8 | (codepoint >> 24);
+    result.buffer[1] = 0x80 | ((codepoint >> 18) & 0x3f);
+    result.buffer[2] = 0x80 | ((codepoint >> 12) & 0x3f);
+    result.buffer[3] = 0x80 | ((codepoint >> 6) & 0x3f);
+    result.buffer[4] = 0x80 | (codepoint & 0x3f);
+    result.bytes = 5;
+  } else {
+    result.buffer[0] = 0xfc | (codepoint >> 30);
+    result.buffer[1] = 0x80 | ((codepoint >> 24) & 0x3f);
+    result.buffer[2] = 0x80 | ((codepoint >> 18) & 0x3f);
+    result.buffer[3] = 0x80 | ((codepoint >> 12) & 0x3f);
+    result.buffer[4] = 0x80 | ((codepoint >> 6) & 0x3f);
+    result.buffer[5] = 0x80 | (codepoint & 0x3f);
+    result.bytes = 6;
   }
   return result;
 }
 
-EncodedCharacter EncodeEUC_JP(char32_t codepoint) {
-  // Assume JIS X 0208 (TODO: others)
-  CHECK(codepoint <= 0x6e6e);
+// These are placeholders; the actual mapping is complicated.
+static char32_t JIS_0208ToUCS(char32_t jis) { return jis | 0x80000; }
+static char32_t JIS_0212ToUCS(char32_t jis) { return jis | 0x90000; }
+static bool IsUCSJIS_0212(char32_t ucs) { return (ucs & 0x90000) == 0x90000; }
+static char32_t UCSToJIS(char32_t ucs) { return ucs & 0xffff; }
+
+EncodedCharacter EncodeEUC_JP(char32_t ucs) {
   EncodedCharacter result;
-  if (codepoint <= 0x7f) {
-    result.buffer[0] = codepoint;
+  if (ucs <= 0x7f) {
+    result.buffer[0] = ucs;
     result.bytes = 1;
-  } else {
-    result.buffer[0] = 0x80 | (codepoint >> 8);
-    result.buffer[1] = 0x80 | (codepoint & 0x7f);
+  } else if (ucs <= 0xff) {
+    result.buffer[0] = 0x8e;  // JIS X 0201
+    result.buffer[1] = ucs;
+    result.bytes = 2;
+  } else if (IsUCSJIS_0212(ucs)) {  // JIS X 0212
+    char32_t jis{UCSToJIS(ucs)};
+    result.buffer[0] = 0x8f;
+    result.buffer[1] = 0x80 ^ (jis >> 8);
+    result.buffer[2] = 0x80 ^ jis;
+    result.bytes = 3;
+  } else {  // JIS X 0208
+    char32_t jis{UCSToJIS(ucs)};
+    result.buffer[0] = 0x80 ^ (jis >> 8);
+    result.buffer[1] = 0x80 ^ jis;
     result.bytes = 2;
   }
   return result;
 }
 
-EncodedCharacter EncodeCharacter(Encoding encoding, char32_t codepoint) {
+EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) {
   switch (encoding) {
-  case Encoding::LATIN_1: return EncodeLATIN_1(codepoint);
-  case Encoding::UTF_8: return EncodeUTF_8(codepoint);
-  case Encoding::EUC_JP: return EncodeEUC_JP(codepoint);
+  case Encoding::LATIN_1: return EncodeLATIN_1(ucs);
+  case Encoding::UTF_8: return EncodeUTF_8(ucs);
+  case Encoding::EUC_JP: return EncodeEUC_JP(ucs);
   default: CRASH_NO_CASE;
   }
 }
@@ -221,19 +187,18 @@ DecodedCharacter DecodeEUC_JPCharacter(const char *cp, std::size_t bytes) {
   char32_t ch{*p};
   if (ch <= 0x7f) {
     return {ch, 1};
-  } else if (ch >= 0xa0 && ch <= 0xfe && bytes >= 2 && p[1] >= 0xa0 &&
-      p[1] <= 0xfe) {
-    ch = ((ch << 8) | p[1]) & 0x7f7f;  // JIS X 0208
-    return {ch, 2};
-  } else if (ch == 0x8e && bytes >= 2 && p[1] >= 0xa0 && p[1] <= 0xdf) {
-    return {p[1], 2};  // JIS X 0201
-  } else if (ch == 0x8f && bytes >= 3 && p[1] >= 0xa0 && p[1] <= 0xfe &&
-      p[2] >= 0xa0 && p[2] <= 0xfe) {
-    ch = ((p[1] << 8) | p[1]) & 0x7f7f;  // JIS X 0212
-    return {ch, 3};
-  } else {
-    return {};  // not valid EUC_JP
+  } else if (ch == 0x8e) {
+    if (bytes >= 2) {
+      return {p[1], 2};  // JIS X 0201
+    }
+  } else if (ch == 0x8f) {  // JIS X 0212
+    if (bytes >= 3) {
+      return {JIS_0212ToUCS(((p[1] << 8) | p[2]) ^ 0x8080), 3};
+    }
+  } else if (bytes >= 2) {  // assume JIS X 0208
+    return {JIS_0208ToUCS(((ch << 8) | p[1]) ^ 0x8080), 2};
   }
+  return {};
 }
 
 DecodedCharacter DecodeLATIN1Character(const char *cp) {
@@ -267,10 +232,10 @@ static DecodedCharacter DecodeEscapedCharacter(
 
 static DecodedCharacter DecodeEscapedCharacters(
     Encoding encoding, const char *cp, std::size_t bytes) {
-  char buffer[4];
-  int count[4];
+  char buffer[EncodedCharacter::maxEncodingBytes];
+  int count[EncodedCharacter::maxEncodingBytes];
   std::size_t at{0}, len{0};
-  for (; len < 4 && at < bytes; ++len) {
+  for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) {
     DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)};
     buffer[len] = code.codepoint;
     at += code.bytes;
@@ -280,7 +245,7 @@ static DecodedCharacter DecodeEscapedCharacters(
   if (code.bytes > 0) {
     code.bytes = count[code.bytes - 1];
   } else {
-    code.codepoint = static_cast<unsigned char>(buffer[0]);
+    code.codepoint = buffer[0] & 0xff;
     code.bytes = count[0];
   }
   return code;
diff --git a/flang/lib/parser/characters.h b/flang/lib/parser/characters.h
index 4947ae6..eade306 100644
--- a/flang/lib/parser/characters.h
+++ b/flang/lib/parser/characters.h
@@ -125,7 +125,7 @@ inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
 
 inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
   switch (ch) {
-  case '\a': return 'a';
+    //  case '\a': return 'a';  // PGF90 doesn't know \a
   case '\b': return 'b';
   case '\f': return 'f';
   case '\n': return 'n';
@@ -140,7 +140,8 @@ inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
 }
 
 struct EncodedCharacter {
-  char buffer[4];
+  static constexpr int maxEncodingBytes{6};
+  char buffer[maxEncodingBytes];
   int bytes{0};
 };
 
@@ -155,7 +156,7 @@ template<typename NORMAL, typename INSERTED>
 void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
     bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
   auto emitOneChar{[&](std::uint8_t ch) {
-    if (ch < ' ' || ch == '\\' || (backslashEscapes && ch >= 0x7f)) {
+    if (ch < ' ' || (backslashEscapes && (ch == '\\' || ch >= 0x7f))) {
       insert('\\');
       if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
         emit(*escape);
@@ -190,9 +191,7 @@ std::string QuoteCharacterLiteral(const std::u16string &,
 std::string QuoteCharacterLiteral(const std::u32string &,
     bool backslashEscapes = true, Encoding = Encoding::UTF_8);
 
-std::optional<int> UTF_8CharacterBytes(const char *);
-std::optional<int> EUC_JPCharacterBytes(const char *);
-std::optional<int> CountCharacters(const char *, std::size_t bytes, Encoding);
+int UTF_8CharacterBytes(const char *);
 
 struct DecodedCharacter {
   char32_t codepoint{0};
diff --git a/flang/lib/parser/prescan.cc b/flang/lib/parser/prescan.cc
index 26a507a..fadf07b1 100644
--- a/flang/lib/parser/prescan.cc
+++ b/flang/lib/parser/prescan.cc
@@ -21,6 +21,7 @@
 #include "../common/idioms.h"
 #include <cstddef>
 #include <cstring>
+#include <iostream>  // TODO(pmk): remove
 #include <sstream>
 #include <utility>
 #include <vector>
@@ -168,12 +169,12 @@ void Prescanner::Statement() {
   Provenance newlineProvenance{GetCurrentProvenance()};
   if (std::optional<TokenSequence> preprocessed{
           preprocessor_.MacroReplacement(tokens, *this)}) {
-    // Reprocess the preprocessed line.
+    // Reprocess the preprocessed line.  Append a newline temporarily.
     preprocessed->PutNextTokenChar('\n', newlineProvenance);
     preprocessed->CloseToken();
     const char *ppd{preprocessed->ToCharBlock().begin()};
     LineClassification ppl{ClassifyLine(ppd)};
-    preprocessed->ReopenLastToken();  // remove the newline
+    preprocessed->RemoveLastToken();  // remove the newline
     switch (ppl.kind) {
     case LineClassification::Kind::Comment: break;
     case LineClassification::Kind::IncludeLine:
@@ -183,7 +184,7 @@ void Prescanner::Statement() {
     case LineClassification::Kind::IncludeDirective:
     case LineClassification::Kind::PreprocessorDirective:
       Say(preprocessed->GetProvenanceRange(),
-          "preprocessed line resembles a preprocessor directive"_en_US);
+          "Preprocessed line resembles a preprocessor directive"_en_US);
       preprocessed->ToLowerCase().Emit(cooked_);
       break;
     case LineClassification::Kind::CompilerDirective:
@@ -483,12 +484,12 @@ bool Prescanner::NextToken(TokenSequence &tokens) {
     preventHollerith_ = false;
   } else if (IsLegalInIdentifier(*at_)) {
     do {
-    } while (IsLegalInIdentifier(
-        EmitCharAndAdvance(tokens, ToLowerCaseLetter(*at_))));
+    } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
     if (*at_ == '\'' || *at_ == '"') {
       // Look for prefix of NC'...' legacy PGI "Kanji" NCHARACTER literal
       CharBlock prefix{tokens.CurrentOpenToken()};
-      bool isKanji{prefix.size() == 2 && prefix[0] == 'n' && prefix[1] == 'c'};
+      bool isKanji{prefix.size() == 2 && ToLowerCaseLetter(prefix[0]) == 'n' &&
+          ToLowerCaseLetter(prefix[1]) == 'c'};
       QuotedCharacterLiteral(tokens, start, isKanji);
       preventHollerith_ = false;
     } else {
diff --git a/flang/lib/parser/token-parsers.h b/flang/lib/parser/token-parsers.h
index 0c6c327..550a8c8 100644
--- a/flang/lib/parser/token-parsers.h
+++ b/flang/lib/parser/token-parsers.h
@@ -515,23 +515,18 @@ struct HollerithLiteral {
     }
     std::string content;
     for (auto j{*charCount}; j-- > 0;) {
-      if (std::optional<int> chBytes{
-              UTF_8CharacterBytes(state.GetLocation())}) {
-        for (int bytes{*chBytes}; bytes > 0; --bytes) {
-          if (std::optional<const char *> at{nextCh.Parse(state)}) {
-            if (*chBytes == 1 && !isprint(**at)) {
-              state.Say(start, "Bad character in Hollerith"_err_en_US);
-              return std::nullopt;
-            }
-            content += **at;
-          } else {
-            state.Say(start, "Insufficient characters in Hollerith"_err_en_US);
+      int chBytes{UTF_8CharacterBytes(state.GetLocation())};
+      for (int bytes{chBytes}; bytes > 0; --bytes) {
+        if (std::optional<const char *> at{nextCh.Parse(state)}) {
+          if (chBytes == 1 && !isprint(**at)) {
+            state.Say(start, "Bad character in Hollerith"_err_en_US);
             return std::nullopt;
           }
+          content += **at;
+        } else {
+          state.Say(start, "Insufficient characters in Hollerith"_err_en_US);
+          return std::nullopt;
         }
-      } else {
-        state.Say(start, "Bad multi-byte character in Hollerith"_err_en_US);
-        return std::nullopt;
       }
     }
     return content;
diff --git a/flang/lib/parser/token-sequence.cc b/flang/lib/parser/token-sequence.cc
index 829d1f7..2c4424a 100644
--- a/flang/lib/parser/token-sequence.cc
+++ b/flang/lib/parser/token-sequence.cc
@@ -61,6 +61,16 @@ std::size_t TokenSequence::SkipBlanks(std::size_t at) const {
   return tokens;  // even if at > tokens
 }
 
+void TokenSequence::RemoveLastToken() {
+  CHECK(!start_.empty());
+  CHECK(nextStart_ > start_.back());
+  std::size_t bytes{nextStart_ - start_.back()};
+  nextStart_ = start_.back();
+  start_.pop_back();
+  char_.erase(char_.begin() + nextStart_, char_.end());
+  provenances_.RemoveLastBytes(bytes);
+}
+
 void TokenSequence::Put(const TokenSequence &that) {
   if (nextStart_ < char_.size()) {
     start_.push_back(nextStart_);
diff --git a/flang/lib/parser/token-sequence.h b/flang/lib/parser/token-sequence.h
index ea06184..19dff79 100644
--- a/flang/lib/parser/token-sequence.h
+++ b/flang/lib/parser/token-sequence.h
@@ -89,6 +89,8 @@ public:
     start_.pop_back();
   }
 
+  void RemoveLastToken();
+
   void Put(const TokenSequence &);
   void Put(const TokenSequence &, ProvenanceRange);
   void Put(const TokenSequence &, std::size_t at, std::size_t tokens = 1);
diff --git a/flang/lib/parser/unparse.cc b/flang/lib/parser/unparse.cc
index b3ddfd3..e428bbce 100644
--- a/flang/lib/parser/unparse.cc
+++ b/flang/lib/parser/unparse.cc
@@ -186,31 +186,28 @@ public:
         x.u);
   }
   void Unparse(const CharLiteralConstant &x) {  // R724
+    Encoding encoding{encoding_};
     if (const auto &k{std::get<std::optional<KindParam>>(x.t)}) {
       if (std::holds_alternative<KindParam::Kanji>(k->u)) {
         Word("NC");
-        std::u16string jis;
-        for (char32_t ch : DecodeUTF_8(std::get<std::string>(x.t))) {
-          jis += static_cast<char16_t>(ch);
-        }
-        Put(QuoteCharacterLiteral(jis, backslashEscapes_, Encoding::EUC_JP));
+        encoding = Encoding::EUC_JP;
       } else {
         Walk(*k), Put('_');
-        Put(QuoteCharacterLiteral(
-            std::get<std::string>(x.t), backslashEscapes_));
       }
-    } else {
-      Put(QuoteCharacterLiteral(std::get<std::string>(x.t), backslashEscapes_));
     }
+    Put(QuoteCharacterLiteral(
+        DecodeUTF_8(std::get<std::string>(x.t)), backslashEscapes_, encoding));
   }
-  void Before(const HollerithLiteralConstant &x) {
-    if (std::optional<std::size_t> chars{
-            CountCharacters(x.v.data(), x.v.size(), encoding_)}) {
-      Unparse(*chars);
-    } else {
-      Unparse(x.v.size());
-    }
+  void Unparse(const HollerithLiteralConstant &x) {
+    std::u32string ucs{DecodeUTF_8(x.v)};
+    Unparse(ucs.size());
     Put('H');
+    for (char32_t ch : DecodeUTF_8(x.v)) {
+      EncodedCharacter encoded{EncodeCharacter(encoding_, ch)};
+      for (int j{0}; j < encoded.bytes; ++j) {
+        Put(encoded.buffer[j]);
+      }
+    }
   }
   void Unparse(const LogicalLiteralConstant &x) {  // R725
     Put(std::get<bool>(x.t) ? ".TRUE." : ".FALSE.");
diff --git a/flang/lib/parser/unparse.h b/flang/lib/parser/unparse.h
index 070c758..470ba9d 100644
--- a/flang/lib/parser/unparse.h
+++ b/flang/lib/parser/unparse.h
@@ -37,7 +37,7 @@ using preStatementType =
 using TypedExprAsFortran =
     std::function<void(std::ostream &, const evaluate::GenericExprWrapper &)>;
 
-/// Convert parsed program to out as Fortran.
+// Converts a parsed program to Fortran source text on the "out" stream.
 void Unparse(std::ostream &out, const Program &program,
     Encoding encoding = Encoding::UTF_8, bool capitalizeKeywords = true,
     bool backslashEscapes = true, preStatementType *preStatement = nullptr,
diff --git a/flang/tools/f18/f18-parse-demo.cc b/flang/tools/f18/f18-parse-demo.cc
index 1eb997a..1edb23a 100644
--- a/flang/tools/f18/f18-parse-demo.cc
+++ b/flang/tools/f18/f18-parse-demo.cc
@@ -92,7 +92,7 @@ struct DriverOptions {
   bool forcedForm{false};  // -Mfixed or -Mfree appeared
   bool warnOnNonstandardUsage{false};  // -Mstandard
   bool warningsAreErrors{false};  // -Werror
-  Fortran::parser::Encoding encoding{Fortran::parser::Encoding::UTF_8};
+  Fortran::parser::Encoding encoding{Fortran::parser::Encoding::LATIN_1};
   bool parseOnly{false};
   bool dumpProvenance{false};
   bool dumpCookedChars{false};
diff --git a/flang/tools/f18/f18.cc b/flang/tools/f18/f18.cc
index 922b2fa..9109c36 100644
--- a/flang/tools/f18/f18.cc
+++ b/flang/tools/f18/f18.cc
@@ -451,8 +451,13 @@ int main(int argc, char *const argv[]) {
     } else if (arg == "-module-suffix") {
       driver.moduleFileSuffix = args.front();
       args.pop_front();
-    } else if (arg == "-fno-utf-8") {
+    } else if (arg == "-futf-8") {
+      driver.encoding = Fortran::parser::Encoding::UTF_8;
+    } else if (arg == "-flatin") {
       driver.encoding = Fortran::parser::Encoding::LATIN_1;
+    } else if (arg == "-fkanji") {
+      driver.encoding = Fortran::parser::Encoding::EUC_JP;
+      driver.pgf90Args.push_back("-Mx,125,4");  // PGI "Kanji" mode
     } else if (arg == "-help" || arg == "--help" || arg == "-?") {
       std::cerr
           << "f18 options:\n"
@@ -467,6 +472,10 @@ int main(int argc, char *const argv[]) {
           << "  -ed                  enable fixed form D lines\n"
           << "  -E                   prescan & preprocess only\n"
           << "  -module dir          module output directory (default .)\n"
+          << "  -fkanji              interpret source as EUC_JP rather than "
+             "UTF-8\n"
+          << "  -flatin              interpret source as Latin-1 (ISO 8859-1) "
+             "rather than UTF-8\n"
           << "  -fparse-only         parse only, no output except messages\n"
           << "  -funparse            parse & reformat only, no code "
              "generation\n"
-- 
2.7.4