From e78861cba2f36bccd3452ebb44cd647e44f1eda9 Mon Sep 17 00:00:00 2001 From: peter klausler Date: Tue, 30 Jan 2018 11:54:47 -0800 Subject: [PATCH] [flang] Add prescan.{h,cc}. Original-commit: flang-compiler/f18@6437ba318c8761ad8206dc88d700717fdd4131b7 --- flang/prescan.cc | 445 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ flang/prescan.h | 100 +++++++++++++ 2 files changed, 545 insertions(+) create mode 100644 flang/prescan.cc create mode 100644 flang/prescan.h diff --git a/flang/prescan.cc b/flang/prescan.cc new file mode 100644 index 0000000..16c6af4 --- /dev/null +++ b/flang/prescan.cc @@ -0,0 +1,445 @@ +#include "prescan.h" +#include "char-buffer.h" +#include "idioms.h" +#include "source.h" +#include +#include +#include +#include + +namespace Fortran { + +CharBuffer Prescanner::Prescan(const SourceFile &source) { + lineStart_ = source.content(); + limit_ = lineStart_ + source.bytes(); + CommentLinesAndPreprocessorDirectives(); + CharBuffer out; + TokenSequence tokens, preprocessed; + while (lineStart_ < limit_) { + BeginSourceLineAndAdvance(); + if (inFixedForm_) { + LabelField(&out); + } else { + SkipSpaces(); + } + while (NextToken(&tokens)) { + } + if (preprocessor_.MacroReplacement(tokens, &preprocessed)) { + // TODO: include label field + // TODO: recheck for comments, &c.; just retokenize? + preprocessed.Emit(&out); + preprocessed.clear(); + } else { + tokens.Emit(&out); + } + tokens.clear(); + out.Put('\n'); + for (; newlineDebt_ > 0; --newlineDebt_) { + out.Put('\n'); + } + } + return std::move(out); +} + +std::optional Prescanner::NextTokenizedLine() { + if (lineStart_ >= limit_) { + return {}; + } + bool wasInPreprocessorDirective{inPreprocessorDirective_}; + inPreprocessorDirective_ = true; + BeginSourceLineAndAdvance(); + TokenSequence tokens; + while (NextToken(&tokens)) { + } + inPreprocessorDirective_ = wasInPreprocessorDirective; + return {std::move(tokens)}; +} + +void Prescanner::NextLine() { + void *vstart{static_cast(const_cast(lineStart_))}; + void *v{std::memchr(vstart, '\n', limit_ - lineStart_)}; + if (v == nullptr) { + lineStart_ = limit_; + } else { + const char *nl{const_cast(static_cast(v))}; + lineStart_ = nl + 1; + } +} + +void Prescanner::LabelField(CharBuffer *out) { + int outCol{1}; + while (*at_ != '\n' && column_ <= 6) { + if (*at_ == '\t') { + NextChar(); + break; + } + if (*at_ != ' ' && + (*at_ != '0' || column_ != 6)) { // '0' in column 6 becomes space + out->Put(*at_); + ++outCol; + } + NextChar(); + } + while (outCol < 7) { + out->Put(' '); + ++outCol; + } +} + +void Prescanner::NextChar() { + // CHECK(*at_ != '\n'); + ++at_; + ++column_; + if (inPreprocessorDirective_) { + while (*at_ == '/' && at_[1] == '*') { + char star{' '}, slash{' '}; + for (at_ += 2, column_ += 2; + *at_ != '\n' && (star != '*' || slash != '/'); + ++at_, ++column_) { + star = slash; + slash = *at_; + } + } + while (*at_ == '\\' && at_ + 2 < limit_ && at_[1] == '\n') { + BeginSourceLineAndAdvance(); + ++newlineDebt_; + } + } else { + if ((inFixedForm_ && column_ > fixedFormColumnLimit_ && + !tabInCurrentLine_) || + (*at_ == '!' && !inCharLiteral_)) { + while (*at_ != '\n') { + ++at_; + } + } + while (*at_ == '\n' || *at_ == '&') { + if ((inFixedForm_ && !FixedFormContinuation()) || + (!inFixedForm_ && !FreeFormContinuation())) { + return; + } + } + if (*at_ == '\t') { + tabInCurrentLine_ = true; + } + } +} + +void Prescanner::SkipSpaces() { + while (*at_ == ' ' || *at_ == '\t') { + NextChar(); + } +} + +static inline bool IsNameChar(char ch) { + return isalnum(ch) || ch == '_' || ch == '$' || ch == '@'; +} + +bool Prescanner::NextToken(TokenSequence *tokens) { + if (inFixedForm_) { + SkipSpaces(); + } else if (*at_ == ' ' || *at_ == '\t') { + NextChar(); + SkipSpaces(); + if (*at_ != '\n') { + tokens->AddChar(' '); + tokens->EndToken(); + return true; + } + } + if (*at_ == '\n') { + return false; + } + + if (*at_ == '\'' || *at_ == '"') { + QuotedCharacterLiteral(tokens); + preventHollerith_ = false; + } else if (isdigit(*at_)) { + int n{0}; + static constexpr int maxHollerith = 256 * (132-6); + do { + if (n < maxHollerith) { + n = 10 * n + *at_ - '0'; + } + EmitCharAndAdvance(tokens, *at_); + if (inFixedForm_) { + SkipSpaces(); + } + } while (isdigit(*at_)); + if ((*at_ == 'h' || *at_ == 'H') && + n > 0 && n < maxHollerith && + !preventHollerith_) { + EmitCharAndAdvance(tokens, 'h'); + inCharLiteral_ = true; + while (n-- > 0) { + if (PadOutCharacterLiteral()) { + tokens->AddChar(' '); + } else { + if (*at_ == '\n') { + break; // TODO error + } + EmitCharAndAdvance(tokens, *at_); + } + } + inCharLiteral_ = false; + } else if (*at_ == '.') { + while (isdigit(EmitCharAndAdvance(tokens, *at_))) { + } + ExponentAndKind(tokens); + } else if (ExponentAndKind(tokens)) { + } else if (isalpha(*at_)) { + // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that + // we don't misrecognize I9HOLLERITH as an identifier in the next case. + EmitCharAndAdvance(tokens, tolower(*at_)); + } + preventHollerith_ = false; + } else if (*at_ == '.') { + if (isdigit(EmitCharAndAdvance(tokens, '.'))) { + while (isdigit(EmitCharAndAdvance(tokens, *at_))) { + } + ExponentAndKind(tokens); + } else if (isalpha(*at_)) { + while (IsNameChar(EmitCharAndAdvance(tokens, tolower(*at_)))) { + } + if (*at_ == '.') { + EmitCharAndAdvance(tokens, '.'); + } + } + preventHollerith_ = false; + } else if (IsNameChar(*at_)) { + while (IsNameChar(EmitCharAndAdvance(tokens, tolower(*at_)))) { + } + if (*at_ == '\'' || *at_ == '"') { + QuotedCharacterLiteral(tokens); + } + preventHollerith_ = false; + } else if (*at_ == '*') { + if (EmitCharAndAdvance(tokens, '*') == '*') { + EmitCharAndAdvance(tokens, '*'); + } else { + preventHollerith_ = true; // ambiguity: CHARACTER*2H + } + } else { + char ch{*at_}, nch{EmitCharAndAdvance(tokens, ch)}; + preventHollerith_ = false; + if ((nch == '=' && (ch == '<' || ch == '>' || ch == '/' || ch == '=')) || + (ch == nch && (ch == '/' || ch == ':' || ch == '#')) || + (ch == '=' && nch == '>')) { + EmitCharAndAdvance(tokens, nch); + } + } + tokens->EndToken(); + return true; +} + +bool Prescanner::ExponentAndKind(TokenSequence *tokens) { + char ed = tolower(*at_); + if (ed != 'e' && ed != 'd') { + return false; + } + EmitCharAndAdvance(tokens, ed); + if (*at_ == '+' || *at_ == '-') { + EmitCharAndAdvance(tokens, *at_); + } + while (isdigit(*at_)) { + EmitCharAndAdvance(tokens, *at_); + } + if (*at_ == '_') { + while (IsNameChar(EmitCharAndAdvance(tokens, tolower(*at_)))) { + } + } + return true; +} + +void Prescanner::QuotedCharacterLiteral(TokenSequence *tokens) { + char quote{*at_}; + inCharLiteral_ = true; + do { + EmitCharAndAdvance(tokens, *at_); + while (PadOutCharacterLiteral()) { + tokens->AddChar(' '); + } + if (*at_ == '\\' && enableBackslashEscapesInCharLiterals_) { + EmitCharAndAdvance(tokens, '\\'); + while (PadOutCharacterLiteral()) { + tokens->AddChar(' '); + } + } else if (*at_ == quote) { + // A doubled quote mark becomes a single instance of the quote character + // in the literal later. + EmitCharAndAdvance(tokens, quote); + if (inFixedForm_) { + SkipSpaces(); + } + if (*at_ != quote) { + break; + } + } + } while (*at_ != '\n'); + inCharLiteral_ = false; +} + +bool Prescanner::PadOutCharacterLiteral() { + if (inFixedForm_ && + !tabInCurrentLine_ && + *at_ == '\n' && + column_ < fixedFormColumnLimit_) { + ++column_; + return true; + } + return false; +} + +bool Prescanner::IsFixedFormCommentLine(const char *start) { + if (start >= limit_ || !inFixedForm_) { + return false; + } + const char *p{start}; + char ch{*p}; + if (ch == '*' || ch == 'C' || ch == 'c' || + ch == '%' || // VAX %list, %eject, &c. + ((ch == 'D' || ch == 'd') && !enableOldDebugLines_)) { + return true; + } + bool anyTabs{false}; + while (true) { + ch = *p; + if (ch == ' ') { + ++p; + } else if (ch == '\t') { + anyTabs = true; + ++p; + } else if (ch == '0' && !anyTabs && p == start + 5) { + ++p; // 0 in column 6 must treated as a space + } else { + break; + } + } + if (!anyTabs && p >= start + fixedFormColumnLimit_) { + return true; + } + if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) { + return true; + } + return *p == '\n'; +} + +bool Prescanner::IsFreeFormComment(const char *p) { + if (p >= limit_ || inFixedForm_) { + return false; + } + while (*p == ' ' || *p == '\t') { + ++p; + } + return *p == '!' || *p == '\n'; +} + +bool Prescanner::IsPreprocessorDirectiveLine(const char *start) { + const char *p{start}; + if (p >= limit_ || inPreprocessorDirective_) { + return false; + } + for (; *p == ' '; ++p) { + } + if (*p == '#') { + return !inFixedForm_ || p != start + 5; + } + for (; *p == ' ' || *p == '\t'; ++p) { + } + return *p == '#'; +} + +void Prescanner::CommentLinesAndPreprocessorDirectives() { + while (lineStart_ < limit_) { + if (IsFixedFormCommentLine(lineStart_) || + IsFreeFormComment(lineStart_)) { + NextLine(); + } else if (IsPreprocessorDirectiveLine(lineStart_)) { + const char *saveAt{at_}; + if (std::optional tokens{NextTokenizedLine()}) { + std::string err{preprocessor_.Directive(*tokens)}; + if (!err.empty()) { + *error_ << err << '\n'; + } + } + at_ = saveAt; + } else { + break; + } + ++newlineDebt_; + } +} + +const char *Prescanner::FixedFormContinuationLine() { + const char *p{lineStart_}; + if (p >= limit_) { + return nullptr; + } + tabInCurrentLine_ = false; + if (*p == '&') { + return p + 1; // extension + } + if (*p == '\t' && p[1] >= '1' && p[1] <= '9') { + tabInCurrentLine_ = true; + return p + 2; // VAX extension + } + if (p[0] == ' ' && p[1] == ' ' && p[2] == ' ' && + p[3] == ' ' && p[4] == ' ') { + char col6{p[5]}; + if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') { + return p + 6; + } + } + return nullptr; // not a continuation line +} + +bool Prescanner::FixedFormContinuation() { + CommentLinesAndPreprocessorDirectives(); + const char *cont{FixedFormContinuationLine()}; + if (cont == nullptr) { + return false; + } + BeginSourceLine(cont, 7); + ++newlineDebt_; + NextLine(); + return true; +} + +bool Prescanner::FreeFormContinuation() { + while (*at_ == ' ' || *at_ == '\t') { + ++at_; + } + const char *p{at_}; + bool ampersand{*p == '&'}; + if (ampersand) { + for (++p; *p == ' ' || *p == '\t'; ++p) { + } + } + if (*p != '\n' && (inCharLiteral_ || *p != '!')) { + return false; + } + CommentLinesAndPreprocessorDirectives(); + p = lineStart_; + if (p >= limit_) { + return false; + } + int column{1}; + for (; *p == ' ' || *p == '\t'; ++p) { + ++column; + } + if (*p == '&') { + ++p; + ++column; + } else if (ampersand) { + if (p > lineStart_) { + --p; + --column; + } + } else { + return false; // not a continuation + } + BeginSourceLine(p, column); + ++newlineDebt_; + NextLine(); + return true; +} +} // namespace Fortran diff --git a/flang/prescan.h b/flang/prescan.h new file mode 100644 index 0000000..01fee1f --- /dev/null +++ b/flang/prescan.h @@ -0,0 +1,100 @@ +#ifndef FORTRAN_PRESCAN_H_ +#define FORTRAN_PRESCAN_H_ + +// Defines a fast Fortran source prescanning phase that implements some +// character-level features of the language that can be inefficient to +// support directly in a backtracking parser. This phase handles Fortran +// line continuation, comment removal, card image margins, padding out +// fixed form character literals on truncated card images, and drives the +// Fortran source preprocessor. +// +// It is possible to run the Fortran parser without running this prescan +// phase, using only the parsers defined in cooked-chars.h, so long as +// preprocessing and INCLUDE lines need not be handled. + +#include "char-buffer.h" +#include "preprocessor.h" +#include "source.h" +#include +#include + +namespace Fortran { + +class Prescanner { + public: + explicit Prescanner(std::stringstream *err) + : error_{err}, preprocessor_{this} {} + + Prescanner &set_fixedForm(bool yes) { + inFixedForm_ = yes; + return *this; + } + Prescanner &set_enableOldDebugLines(bool yes) { + enableOldDebugLines_ = yes; + return *this; + } + Prescanner &set_enableBackslashEscapesInCharLiterals(bool yes) { + enableBackslashEscapesInCharLiterals_ = yes; + return *this; + } + Prescanner &set_fixedFormColumnLimit(int limit) { + fixedFormColumnLimit_ = limit; + return *this; + } + + CharBuffer Prescan(const SourceFile &source); + std::optional NextTokenizedLine(); + + private: + void BeginSourceLine(const char *at, int column = 1) { + at_ = at; + column_ = column; + tabInCurrentLine_ = false; + preventHollerith_ = false; + } + + void BeginSourceLineAndAdvance() { + BeginSourceLine(lineStart_); + NextLine(); + } + + char EmitCharAndAdvance(TokenSequence *tokens, char ch) { + tokens->AddChar(ch); + NextChar(); + return *at_; + } + + void NextLine(); + void LabelField(CharBuffer *); + void NextChar(); + void SkipSpaces(); + bool NextToken(TokenSequence *); + bool ExponentAndKind(TokenSequence *); + void QuotedCharacterLiteral(TokenSequence *); + bool PadOutCharacterLiteral(); + void CommentLinesAndPreprocessorDirectives(); + bool IsFixedFormCommentLine(const char *); + bool IsFreeFormComment(const char *); + bool IsPreprocessorDirectiveLine(const char *); + const char *FixedFormContinuationLine(); + bool FixedFormContinuation(); + bool FreeFormContinuation(); + + std::stringstream *error_; + const char *lineStart_{nullptr}; // next line to process; <= limit_ + const char *at_{nullptr}; // next character to process; < lineStart_ + int column_{1}; // card image column position of next character + const char *limit_{nullptr}; // first address after end of source + int newlineDebt_{0}; // newline characters consumed but not yet emitted + bool inCharLiteral_{false}; + bool inPreprocessorDirective_{false}; + bool inFixedForm_{true}; + int fixedFormColumnLimit_{72}; + bool tabInCurrentLine_{false}; + bool preventHollerith_{false}; + bool enableOldDebugLines_{false}; + bool enableBackslashEscapesInCharLiterals_{true}; + Preprocessor preprocessor_; +}; +} // namespace Fortran +#endif // FORTRAN_PRESCAN_H_ -- 2.7.4