--- /dev/null
+#include "prescan.h"
+#include "char-buffer.h"
+#include "idioms.h"
+#include "source.h"
+#include <cctype>
+#include <cstring>
+#include <utility>
+#include <vector>
+
+namespace Fortran {
+
+CharBuffer Prescanner::Prescan(const SourceFile &source) {
+ lineStart_ = source.content();
+ limit_ = lineStart_ + source.bytes();
+ CommentLinesAndPreprocessorDirectives();
+ CharBuffer out;
+ TokenSequence tokens, preprocessed;
+ while (lineStart_ < limit_) {
+ BeginSourceLineAndAdvance();
+ if (inFixedForm_) {
+ LabelField(&out);
+ } else {
+ SkipSpaces();
+ }
+ while (NextToken(&tokens)) {
+ }
+ if (preprocessor_.MacroReplacement(tokens, &preprocessed)) {
+ // TODO: include label field
+ // TODO: recheck for comments, &c.; just retokenize?
+ preprocessed.Emit(&out);
+ preprocessed.clear();
+ } else {
+ tokens.Emit(&out);
+ }
+ tokens.clear();
+ out.Put('\n');
+ for (; newlineDebt_ > 0; --newlineDebt_) {
+ out.Put('\n');
+ }
+ }
+ return std::move(out);
+}
+
+std::optional<TokenSequence> Prescanner::NextTokenizedLine() {
+ if (lineStart_ >= limit_) {
+ return {};
+ }
+ bool wasInPreprocessorDirective{inPreprocessorDirective_};
+ inPreprocessorDirective_ = true;
+ BeginSourceLineAndAdvance();
+ TokenSequence tokens;
+ while (NextToken(&tokens)) {
+ }
+ inPreprocessorDirective_ = wasInPreprocessorDirective;
+ return {std::move(tokens)};
+}
+
+void Prescanner::NextLine() {
+ void *vstart{static_cast<void *>(const_cast<char *>(lineStart_))};
+ void *v{std::memchr(vstart, '\n', limit_ - lineStart_)};
+ if (v == nullptr) {
+ lineStart_ = limit_;
+ } else {
+ const char *nl{const_cast<const char *>(static_cast<char *>(v))};
+ lineStart_ = nl + 1;
+ }
+}
+
+void Prescanner::LabelField(CharBuffer *out) {
+ int outCol{1};
+ while (*at_ != '\n' && column_ <= 6) {
+ if (*at_ == '\t') {
+ NextChar();
+ break;
+ }
+ if (*at_ != ' ' &&
+ (*at_ != '0' || column_ != 6)) { // '0' in column 6 becomes space
+ out->Put(*at_);
+ ++outCol;
+ }
+ NextChar();
+ }
+ while (outCol < 7) {
+ out->Put(' ');
+ ++outCol;
+ }
+}
+
+void Prescanner::NextChar() {
+ // CHECK(*at_ != '\n');
+ ++at_;
+ ++column_;
+ if (inPreprocessorDirective_) {
+ while (*at_ == '/' && at_[1] == '*') {
+ char star{' '}, slash{' '};
+ for (at_ += 2, column_ += 2;
+ *at_ != '\n' && (star != '*' || slash != '/');
+ ++at_, ++column_) {
+ star = slash;
+ slash = *at_;
+ }
+ }
+ while (*at_ == '\\' && at_ + 2 < limit_ && at_[1] == '\n') {
+ BeginSourceLineAndAdvance();
+ ++newlineDebt_;
+ }
+ } else {
+ if ((inFixedForm_ && column_ > fixedFormColumnLimit_ &&
+ !tabInCurrentLine_) ||
+ (*at_ == '!' && !inCharLiteral_)) {
+ while (*at_ != '\n') {
+ ++at_;
+ }
+ }
+ while (*at_ == '\n' || *at_ == '&') {
+ if ((inFixedForm_ && !FixedFormContinuation()) ||
+ (!inFixedForm_ && !FreeFormContinuation())) {
+ return;
+ }
+ }
+ if (*at_ == '\t') {
+ tabInCurrentLine_ = true;
+ }
+ }
+}
+
+void Prescanner::SkipSpaces() {
+ while (*at_ == ' ' || *at_ == '\t') {
+ NextChar();
+ }
+}
+
+static inline bool IsNameChar(char ch) {
+ return isalnum(ch) || ch == '_' || ch == '$' || ch == '@';
+}
+
+bool Prescanner::NextToken(TokenSequence *tokens) {
+ if (inFixedForm_) {
+ SkipSpaces();
+ } else if (*at_ == ' ' || *at_ == '\t') {
+ NextChar();
+ SkipSpaces();
+ if (*at_ != '\n') {
+ tokens->AddChar(' ');
+ tokens->EndToken();
+ return true;
+ }
+ }
+ if (*at_ == '\n') {
+ return false;
+ }
+
+ if (*at_ == '\'' || *at_ == '"') {
+ QuotedCharacterLiteral(tokens);
+ preventHollerith_ = false;
+ } else if (isdigit(*at_)) {
+ int n{0};
+ static constexpr int maxHollerith = 256 * (132-6);
+ do {
+ if (n < maxHollerith) {
+ n = 10 * n + *at_ - '0';
+ }
+ EmitCharAndAdvance(tokens, *at_);
+ if (inFixedForm_) {
+ SkipSpaces();
+ }
+ } while (isdigit(*at_));
+ if ((*at_ == 'h' || *at_ == 'H') &&
+ n > 0 && n < maxHollerith &&
+ !preventHollerith_) {
+ EmitCharAndAdvance(tokens, 'h');
+ inCharLiteral_ = true;
+ while (n-- > 0) {
+ if (PadOutCharacterLiteral()) {
+ tokens->AddChar(' ');
+ } else {
+ if (*at_ == '\n') {
+ break; // TODO error
+ }
+ EmitCharAndAdvance(tokens, *at_);
+ }
+ }
+ inCharLiteral_ = false;
+ } else if (*at_ == '.') {
+ while (isdigit(EmitCharAndAdvance(tokens, *at_))) {
+ }
+ ExponentAndKind(tokens);
+ } else if (ExponentAndKind(tokens)) {
+ } else if (isalpha(*at_)) {
+ // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
+ // we don't misrecognize I9HOLLERITH as an identifier in the next case.
+ EmitCharAndAdvance(tokens, tolower(*at_));
+ }
+ preventHollerith_ = false;
+ } else if (*at_ == '.') {
+ if (isdigit(EmitCharAndAdvance(tokens, '.'))) {
+ while (isdigit(EmitCharAndAdvance(tokens, *at_))) {
+ }
+ ExponentAndKind(tokens);
+ } else if (isalpha(*at_)) {
+ while (IsNameChar(EmitCharAndAdvance(tokens, tolower(*at_)))) {
+ }
+ if (*at_ == '.') {
+ EmitCharAndAdvance(tokens, '.');
+ }
+ }
+ preventHollerith_ = false;
+ } else if (IsNameChar(*at_)) {
+ while (IsNameChar(EmitCharAndAdvance(tokens, tolower(*at_)))) {
+ }
+ if (*at_ == '\'' || *at_ == '"') {
+ QuotedCharacterLiteral(tokens);
+ }
+ preventHollerith_ = false;
+ } else if (*at_ == '*') {
+ if (EmitCharAndAdvance(tokens, '*') == '*') {
+ EmitCharAndAdvance(tokens, '*');
+ } else {
+ preventHollerith_ = true; // ambiguity: CHARACTER*2H
+ }
+ } else {
+ char ch{*at_}, nch{EmitCharAndAdvance(tokens, ch)};
+ preventHollerith_ = false;
+ if ((nch == '=' && (ch == '<' || ch == '>' || ch == '/' || ch == '=')) ||
+ (ch == nch && (ch == '/' || ch == ':' || ch == '#')) ||
+ (ch == '=' && nch == '>')) {
+ EmitCharAndAdvance(tokens, nch);
+ }
+ }
+ tokens->EndToken();
+ return true;
+}
+
+bool Prescanner::ExponentAndKind(TokenSequence *tokens) {
+ char ed = tolower(*at_);
+ if (ed != 'e' && ed != 'd') {
+ return false;
+ }
+ EmitCharAndAdvance(tokens, ed);
+ if (*at_ == '+' || *at_ == '-') {
+ EmitCharAndAdvance(tokens, *at_);
+ }
+ while (isdigit(*at_)) {
+ EmitCharAndAdvance(tokens, *at_);
+ }
+ if (*at_ == '_') {
+ while (IsNameChar(EmitCharAndAdvance(tokens, tolower(*at_)))) {
+ }
+ }
+ return true;
+}
+
+void Prescanner::QuotedCharacterLiteral(TokenSequence *tokens) {
+ char quote{*at_};
+ inCharLiteral_ = true;
+ do {
+ EmitCharAndAdvance(tokens, *at_);
+ while (PadOutCharacterLiteral()) {
+ tokens->AddChar(' ');
+ }
+ if (*at_ == '\\' && enableBackslashEscapesInCharLiterals_) {
+ EmitCharAndAdvance(tokens, '\\');
+ while (PadOutCharacterLiteral()) {
+ tokens->AddChar(' ');
+ }
+ } else if (*at_ == quote) {
+ // A doubled quote mark becomes a single instance of the quote character
+ // in the literal later.
+ EmitCharAndAdvance(tokens, quote);
+ if (inFixedForm_) {
+ SkipSpaces();
+ }
+ if (*at_ != quote) {
+ break;
+ }
+ }
+ } while (*at_ != '\n');
+ inCharLiteral_ = false;
+}
+
+bool Prescanner::PadOutCharacterLiteral() {
+ if (inFixedForm_ &&
+ !tabInCurrentLine_ &&
+ *at_ == '\n' &&
+ column_ < fixedFormColumnLimit_) {
+ ++column_;
+ return true;
+ }
+ return false;
+}
+
+bool Prescanner::IsFixedFormCommentLine(const char *start) {
+ if (start >= limit_ || !inFixedForm_) {
+ return false;
+ }
+ const char *p{start};
+ char ch{*p};
+ if (ch == '*' || ch == 'C' || ch == 'c' ||
+ ch == '%' || // VAX %list, %eject, &c.
+ ((ch == 'D' || ch == 'd') && !enableOldDebugLines_)) {
+ return true;
+ }
+ bool anyTabs{false};
+ while (true) {
+ ch = *p;
+ if (ch == ' ') {
+ ++p;
+ } else if (ch == '\t') {
+ anyTabs = true;
+ ++p;
+ } else if (ch == '0' && !anyTabs && p == start + 5) {
+ ++p; // 0 in column 6 must treated as a space
+ } else {
+ break;
+ }
+ }
+ if (!anyTabs && p >= start + fixedFormColumnLimit_) {
+ return true;
+ }
+ if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
+ return true;
+ }
+ return *p == '\n';
+}
+
+bool Prescanner::IsFreeFormComment(const char *p) {
+ if (p >= limit_ || inFixedForm_) {
+ return false;
+ }
+ while (*p == ' ' || *p == '\t') {
+ ++p;
+ }
+ return *p == '!' || *p == '\n';
+}
+
+bool Prescanner::IsPreprocessorDirectiveLine(const char *start) {
+ const char *p{start};
+ if (p >= limit_ || inPreprocessorDirective_) {
+ return false;
+ }
+ for (; *p == ' '; ++p) {
+ }
+ if (*p == '#') {
+ return !inFixedForm_ || p != start + 5;
+ }
+ for (; *p == ' ' || *p == '\t'; ++p) {
+ }
+ return *p == '#';
+}
+
+void Prescanner::CommentLinesAndPreprocessorDirectives() {
+ while (lineStart_ < limit_) {
+ if (IsFixedFormCommentLine(lineStart_) ||
+ IsFreeFormComment(lineStart_)) {
+ NextLine();
+ } else if (IsPreprocessorDirectiveLine(lineStart_)) {
+ const char *saveAt{at_};
+ if (std::optional<TokenSequence> tokens{NextTokenizedLine()}) {
+ std::string err{preprocessor_.Directive(*tokens)};
+ if (!err.empty()) {
+ *error_ << err << '\n';
+ }
+ }
+ at_ = saveAt;
+ } else {
+ break;
+ }
+ ++newlineDebt_;
+ }
+}
+
+const char *Prescanner::FixedFormContinuationLine() {
+ const char *p{lineStart_};
+ if (p >= limit_) {
+ return nullptr;
+ }
+ tabInCurrentLine_ = false;
+ if (*p == '&') {
+ return p + 1; // extension
+ }
+ if (*p == '\t' && p[1] >= '1' && p[1] <= '9') {
+ tabInCurrentLine_ = true;
+ return p + 2; // VAX extension
+ }
+ if (p[0] == ' ' && p[1] == ' ' && p[2] == ' ' &&
+ p[3] == ' ' && p[4] == ' ') {
+ char col6{p[5]};
+ if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
+ return p + 6;
+ }
+ }
+ return nullptr; // not a continuation line
+}
+
+bool Prescanner::FixedFormContinuation() {
+ CommentLinesAndPreprocessorDirectives();
+ const char *cont{FixedFormContinuationLine()};
+ if (cont == nullptr) {
+ return false;
+ }
+ BeginSourceLine(cont, 7);
+ ++newlineDebt_;
+ NextLine();
+ return true;
+}
+
+bool Prescanner::FreeFormContinuation() {
+ while (*at_ == ' ' || *at_ == '\t') {
+ ++at_;
+ }
+ const char *p{at_};
+ bool ampersand{*p == '&'};
+ if (ampersand) {
+ for (++p; *p == ' ' || *p == '\t'; ++p) {
+ }
+ }
+ if (*p != '\n' && (inCharLiteral_ || *p != '!')) {
+ return false;
+ }
+ CommentLinesAndPreprocessorDirectives();
+ p = lineStart_;
+ if (p >= limit_) {
+ return false;
+ }
+ int column{1};
+ for (; *p == ' ' || *p == '\t'; ++p) {
+ ++column;
+ }
+ if (*p == '&') {
+ ++p;
+ ++column;
+ } else if (ampersand) {
+ if (p > lineStart_) {
+ --p;
+ --column;
+ }
+ } else {
+ return false; // not a continuation
+ }
+ BeginSourceLine(p, column);
+ ++newlineDebt_;
+ NextLine();
+ return true;
+}
+} // namespace Fortran
--- /dev/null
+#ifndef FORTRAN_PRESCAN_H_
+#define FORTRAN_PRESCAN_H_
+
+// Defines a fast Fortran source prescanning phase that implements some
+// character-level features of the language that can be inefficient to
+// support directly in a backtracking parser. This phase handles Fortran
+// line continuation, comment removal, card image margins, padding out
+// fixed form character literals on truncated card images, and drives the
+// Fortran source preprocessor.
+//
+// It is possible to run the Fortran parser without running this prescan
+// phase, using only the parsers defined in cooked-chars.h, so long as
+// preprocessing and INCLUDE lines need not be handled.
+
+#include "char-buffer.h"
+#include "preprocessor.h"
+#include "source.h"
+#include <optional>
+#include <sstream>
+
+namespace Fortran {
+
+class Prescanner {
+ public:
+ explicit Prescanner(std::stringstream *err)
+ : error_{err}, preprocessor_{this} {}
+
+ Prescanner &set_fixedForm(bool yes) {
+ inFixedForm_ = yes;
+ return *this;
+ }
+ Prescanner &set_enableOldDebugLines(bool yes) {
+ enableOldDebugLines_ = yes;
+ return *this;
+ }
+ Prescanner &set_enableBackslashEscapesInCharLiterals(bool yes) {
+ enableBackslashEscapesInCharLiterals_ = yes;
+ return *this;
+ }
+ Prescanner &set_fixedFormColumnLimit(int limit) {
+ fixedFormColumnLimit_ = limit;
+ return *this;
+ }
+
+ CharBuffer Prescan(const SourceFile &source);
+ std::optional<TokenSequence> NextTokenizedLine();
+
+ private:
+ void BeginSourceLine(const char *at, int column = 1) {
+ at_ = at;
+ column_ = column;
+ tabInCurrentLine_ = false;
+ preventHollerith_ = false;
+ }
+
+ void BeginSourceLineAndAdvance() {
+ BeginSourceLine(lineStart_);
+ NextLine();
+ }
+
+ char EmitCharAndAdvance(TokenSequence *tokens, char ch) {
+ tokens->AddChar(ch);
+ NextChar();
+ return *at_;
+ }
+
+ void NextLine();
+ void LabelField(CharBuffer *);
+ void NextChar();
+ void SkipSpaces();
+ bool NextToken(TokenSequence *);
+ bool ExponentAndKind(TokenSequence *);
+ void QuotedCharacterLiteral(TokenSequence *);
+ bool PadOutCharacterLiteral();
+ void CommentLinesAndPreprocessorDirectives();
+ bool IsFixedFormCommentLine(const char *);
+ bool IsFreeFormComment(const char *);
+ bool IsPreprocessorDirectiveLine(const char *);
+ const char *FixedFormContinuationLine();
+ bool FixedFormContinuation();
+ bool FreeFormContinuation();
+
+ std::stringstream *error_;
+ const char *lineStart_{nullptr}; // next line to process; <= limit_
+ const char *at_{nullptr}; // next character to process; < lineStart_
+ int column_{1}; // card image column position of next character
+ const char *limit_{nullptr}; // first address after end of source
+ int newlineDebt_{0}; // newline characters consumed but not yet emitted
+ bool inCharLiteral_{false};
+ bool inPreprocessorDirective_{false};
+ bool inFixedForm_{true};
+ int fixedFormColumnLimit_{72};
+ bool tabInCurrentLine_{false};
+ bool preventHollerith_{false};
+ bool enableOldDebugLines_{false};
+ bool enableBackslashEscapesInCharLiterals_{true};
+ Preprocessor preprocessor_;
+};
+} // namespace Fortran
+#endif // FORTRAN_PRESCAN_H_