Add clang source minimizer that reduces source to directives
authorAlex Lorenz <arphaman@gmail.com>
Mon, 3 Jun 2019 22:59:17 +0000 (22:59 +0000)
committerAlex Lorenz <arphaman@gmail.com>
Mon, 3 Jun 2019 22:59:17 +0000 (22:59 +0000)
that might affect the dependency list for a compilation

This commit introduces a dependency directives source minimizer to clang
that minimizes header and source files to the minimum necessary preprocessor
directives for evaluating includes. It reduces the source down to #define, #include,
#import, @import, and any conditional preprocessor logic that contains one of those.

The source minimizer works by lexing the input with a custom fast lexer that recognizes
the preprocessor directives it cares about, and emitting those directives in the minimized source.
It ignores source code and comments, and normalizes whitespace. It gives up and fails if it sees
any directives that it doesn't recognize as valid (e.g. #define 0).

In addition to the source minimizer this patch adds a
-print-dependency-directives-minimized-source CC1 option that allows you to invoke the minimizer
from clang directly.

Differential Revision: https://reviews.llvm.org/D55463

llvm-svn: 362459

16 files changed:
clang/include/clang/Basic/DiagnosticLexKinds.td
clang/include/clang/Driver/CC1Options.td
clang/include/clang/Frontend/FrontendActions.h
clang/include/clang/Frontend/FrontendOptions.h
clang/include/clang/Lex/DependencyDirectivesSourceMinimizer.h [new file with mode: 0644]
clang/lib/Frontend/CompilerInvocation.cpp
clang/lib/Frontend/FrontendActions.cpp
clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
clang/lib/Lex/CMakeLists.txt
clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp [new file with mode: 0644]
clang/test/Frontend/minimize_source_to_dependency_directives.c [new file with mode: 0644]
clang/test/Lexer/minimize_source_to_dependency_directives_at_import_extra_tokens.m [new file with mode: 0644]
clang/test/Lexer/minimize_source_to_dependency_directives_at_import_missing_semi.m [new file with mode: 0644]
clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c [new file with mode: 0644]
clang/unittests/Lex/CMakeLists.txt
clang/unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp [new file with mode: 0644]

index dd5e2af..b64cbc2 100644 (file)
@@ -818,4 +818,13 @@ def err_pp_eof_in_assume_nonnull : Error<
 
 }
 
+let CategoryName = "Dependency Directive Source Minimization Issue" in {
+
+def err_dep_source_minimizer_missing_sema_after_at_import : Error<
+  "could not find ';' after @import">;
+def err_dep_source_minimizer_unexpected_tokens_at_import : Error<
+  "unexpected extra tokens at end of @import declaration">;
+
+}
+
 }
index 76b36a1..56ff05d 100644 (file)
@@ -612,6 +612,9 @@ def migrate : Flag<["-"], "migrate">,
   HelpText<"Migrate source code">;
 def compiler_options_dump : Flag<["-"], "compiler-options-dump">,
   HelpText<"Dump the compiler configuration options">;
+def print_dependency_directives_minimized_source : Flag<["-"],
+  "print-dependency-directives-minimized-source">,
+  HelpText<"Print the output of the dependency directives source minimizer">;
 }
 
 def emit_llvm_uselists : Flag<["-"], "emit-llvm-uselists">,
index e3b8b46..846b268 100644 (file)
@@ -240,6 +240,17 @@ protected:
   bool usesPreprocessorOnly() const override { return true; }
 };
 
+/// Frontend action that prints the output of the dependency directives source
+/// minimizer. It creates no AST consumer and reports itself as a
+/// preprocessor-only action.
+class PrintDependencyDirectivesSourceMinimizerAction : public FrontendAction {
+protected:
+  void ExecuteAction() override;
+  // No AST is consumed by this action, so there is no consumer to create.
+  std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &,
+                                                 StringRef) override {
+    return nullptr;
+  }
+
+  bool usesPreprocessorOnly() const override { return true; }
+};
+
 //===----------------------------------------------------------------------===//
 // Preprocessor Actions
 //===----------------------------------------------------------------------===//
index ce0b696..1bbd048 100644 (file)
@@ -128,7 +128,10 @@ enum ActionKind {
   MigrateSource,
 
   /// Just lex, no output.
-  RunPreprocessorOnly
+  RunPreprocessorOnly,
+
+  /// Print the output of the dependency directives source minimizer.
+  PrintDependencyDirectivesSourceMinimizerOutput
 };
 
 } // namespace frontend
diff --git a/clang/include/clang/Lex/DependencyDirectivesSourceMinimizer.h b/clang/include/clang/Lex/DependencyDirectivesSourceMinimizer.h
new file mode 100644 (file)
index 0000000..4164107
--- /dev/null
@@ -0,0 +1,88 @@
+//===- clang/Lex/DependencyDirectivesSourceMinimizer.h -  ----------*- C++ -*-//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This is the interface for minimizing header and source files to the
+/// minimum necessary preprocessor directives for evaluating includes. It
+/// reduces the source down to #define, #include, #import, @import, and any
+/// conditional preprocessor logic that contains one of those.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_DEPENDENCY_DIRECTIVES_SOURCE_MINIMIZER_H
+#define LLVM_CLANG_LEX_DEPENDENCY_DIRECTIVES_SOURCE_MINIMIZER_H
+
+#include "clang/Basic/SourceLocation.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace clang {
+
+class DiagnosticsEngine;
+
+namespace minimize_source_to_dependency_directives {
+
+/// Represents the kind of preprocessor directive or a module declaration that
+/// is tracked by the source minimizer in its token output.
+enum TokenKind {
+  pp_none, // Not a tracked directive; also the default kind of a Token.
+  pp_include,
+  pp___include_macros,
+  pp_define,
+  pp_undef,
+  pp_import,
+  pp_pragma_import, // "#pragma clang module import".
+  pp_include_next,
+  pp_if,
+  pp_ifdef,
+  pp_ifndef,
+  pp_elif,
+  pp_else,
+  pp_endif,
+  decl_at_import, // "@import" module declaration.
+  pp_eof, // Appended after the last token when minimization succeeds.
+};
+
+/// Represents a simplified token that's lexed as part of the source
+/// minimization. It's used to track the location of various preprocessor
+/// directives that could potentially have an effect on the dependencies.
+struct Token {
+  /// The kind of token.
+  TokenKind K = pp_none;
+
+  /// Offset into the output byte stream of where the directive begins.
+  int Offset = -1;
+
+  /// Creates a token of kind \p K that begins at byte \p Offset of the output.
+  Token(TokenKind K, int Offset) : K(K), Offset(Offset) {}
+};
+
+} // end namespace minimize_source_to_dependency_directives
+
+/// Minimize the input down to the preprocessor directives that might have
+/// an effect on the dependencies for a compilation unit.
+///
+/// This function deletes all non-preprocessor code, and strips anything that
+/// can't affect what gets included. It canonicalizes whitespace where
+/// convenient to stabilize the output against formatting changes in the input.
+///
+/// Clears the output vectors at the beginning of the call.
+///
+/// \returns false on success, true on error. If the diagnostic engine is not
+/// null, an appropriate error is reported using the given input location
+/// with the offset that corresponds to the minimizer's current buffer offset.
+bool minimizeSourceToDependencyDirectives(
+    llvm::StringRef Input, llvm::SmallVectorImpl<char> &Output,
+    llvm::SmallVectorImpl<minimize_source_to_dependency_directives::Token>
+        &Tokens,
+    DiagnosticsEngine *Diags = nullptr,
+    SourceLocation InputSourceLoc = SourceLocation());
+
+} // end namespace clang
+
+#endif // LLVM_CLANG_LEX_DEPENDENCY_DIRECTIVES_SOURCE_MINIMIZER_H
index 717278c..7ac58ae 100644 (file)
@@ -1696,6 +1696,10 @@ static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
       Opts.ProgramAction = frontend::MigrateSource; break;
     case OPT_Eonly:
       Opts.ProgramAction = frontend::RunPreprocessorOnly; break;
+    case OPT_print_dependency_directives_minimized_source:
+      Opts.ProgramAction =
+          frontend::PrintDependencyDirectivesSourceMinimizerOutput;
+      break;
     }
   }
 
@@ -3116,6 +3120,7 @@ static bool isStrictlyPreprocessorAction(frontend::ActionKind Action) {
   case frontend::PrintPreprocessedInput:
   case frontend::RewriteMacros:
   case frontend::RunPreprocessorOnly:
+  case frontend::PrintDependencyDirectivesSourceMinimizerOutput:
     return true;
   }
   llvm_unreachable("invalid frontend action");
index 9e86323..7d54d66 100644 (file)
@@ -14,6 +14,7 @@
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/MultiplexConsumer.h"
 #include "clang/Frontend/Utils.h"
+#include "clang/Lex/DependencyDirectivesSourceMinimizer.h"
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorOptions.h"
@@ -23,8 +24,8 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
 #include <memory>
 #include <system_error>
 
@@ -908,3 +909,33 @@ void DumpCompilerOptionsAction::ExecuteAction() {
 
   OS << "}";
 }
+
+/// Runs the dependency directives source minimizer over the buffer of the main
+/// file and writes the minimized source to llvm::outs(). Nothing is printed
+/// when the minimizer reports an error.
+void PrintDependencyDirectivesSourceMinimizerAction::ExecuteAction() {
+  CompilerInstance &CI = getCompilerInstance();
+  SourceManager &SM = CI.getPreprocessor().getSourceManager();
+  const llvm::MemoryBuffer *FromFile = SM.getBuffer(SM.getMainFileID());
+
+  llvm::SmallString<1024> Output;
+  llvm::SmallVector<minimize_source_to_dependency_directives::Token, 32> Toks;
+  // The minimizer returns true on error; diagnostics are located relative to
+  // the start of the main file.
+  if (minimizeSourceToDependencyDirectives(
+          FromFile->getBuffer(), Output, Toks, &CI.getDiagnostics(),
+          SM.getLocForStartOfFile(SM.getMainFileID()))) {
+    assert(CI.getDiagnostics().hasErrorOccurred() &&
+           "no errors reported for failure");
+
+    // Preprocess the source when verifying the diagnostics to capture the
+    // 'expected' comments.
+    if (CI.getDiagnosticOpts().VerifyDiagnostics) {
+      // Make sure we don't emit new diagnostics!
+      CI.getDiagnostics().setSuppressAllDiagnostics();
+      Preprocessor &PP = getCompilerInstance().getPreprocessor();
+      PP.EnterMainSourceFile();
+      Token Tok;
+      // Lex the whole file and discard the tokens.
+      do {
+        PP.Lex(Tok);
+      } while (Tok.isNot(tok::eof));
+    }
+    return;
+  }
+  llvm::outs() << Output;
+}
index da7aa7b..b6a20a7 100644 (file)
@@ -116,6 +116,8 @@ CreateFrontendBaseAction(CompilerInstance &CI) {
   case RunAnalysis:            Action = "RunAnalysis"; break;
 #endif
   case RunPreprocessorOnly:    return llvm::make_unique<PreprocessOnlyAction>();
+  case PrintDependencyDirectivesSourceMinimizerOutput:
+    return llvm::make_unique<PrintDependencyDirectivesSourceMinimizerAction>();
   }
 
 #if !CLANG_ENABLE_ARCMT || !CLANG_ENABLE_STATIC_ANALYZER \
index 7888b15..d77e6dd 100644 (file)
@@ -3,6 +3,7 @@
 set(LLVM_LINK_COMPONENTS support)
 
 add_clang_library(clangLex
+  DependencyDirectivesSourceMinimizer.cpp
   HeaderMap.cpp
   HeaderSearch.cpp
   Lexer.cpp
diff --git a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp
new file mode 100644 (file)
index 0000000..802b7ba
--- /dev/null
@@ -0,0 +1,756 @@
+//===- DependencyDirectivesSourceMinimizer.cpp -  -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This is the implementation for minimizing header and source files to the
+/// minimum necessary preprocessor directives for evaluating includes. It
+/// reduces the source down to #define, #include, #import, @import, and any
+/// conditional preprocessor logic that contains one of those.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/DependencyDirectivesSourceMinimizer.h"
+#include "clang/Basic/CharInfo.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Lex/LexDiagnostic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+using namespace clang;
+using namespace clang::minimize_source_to_dependency_directives;
+
+namespace {
+
+/// Implementation of the source minimizer: lexes \c Input and appends the
+/// minimized directives to \c Out, recording one entry in \c Tokens for each
+/// directive it emits.
+struct Minimizer {
+  /// Minimized output.
+  SmallVectorImpl<char> &Out;
+  /// The known tokens encountered during the minimization.
+  SmallVectorImpl<Token> &Tokens;
+
+  Minimizer(SmallVectorImpl<char> &Out, SmallVectorImpl<Token> &Tokens,
+            StringRef Input, DiagnosticsEngine *Diags,
+            SourceLocation InputSourceLoc)
+      : Out(Out), Tokens(Tokens), Input(Input), Diags(Diags),
+        InputSourceLoc(InputSourceLoc) {}
+
+  /// Lex the provided source and emit the minimized output.
+  ///
+  /// \returns True on error.
+  bool minimize();
+
+private:
+  /// Result of lexing an identifier.
+  struct IdInfo {
+    /// Points just past the last character of the identifier in the input.
+    const char *Last;
+    /// The spelling of the identifier, with any line continuations removed.
+    StringRef Name;
+  };
+
+  /// Lex an identifier.
+  ///
+  /// \pre First points at a valid identifier head.
+  LLVM_NODISCARD IdInfo lexIdentifier(const char *First, const char *const End);
+  LLVM_NODISCARD bool isNextIdentifier(StringRef Id, const char *&First,
+                                       const char *const End);
+  // The lex* functions below return true on error.
+  LLVM_NODISCARD bool minimizeImpl(const char *First, const char *const End);
+  LLVM_NODISCARD bool lexPPLine(const char *&First, const char *const End);
+  LLVM_NODISCARD bool lexAt(const char *&First, const char *const End);
+  LLVM_NODISCARD bool lexDefine(const char *&First, const char *const End);
+  LLVM_NODISCARD bool lexPragma(const char *&First, const char *const End);
+  LLVM_NODISCARD bool lexEndif(const char *&First, const char *const End);
+  LLVM_NODISCARD bool lexDefault(TokenKind Kind, StringRef Directive,
+                                 const char *&First, const char *const End);
+  /// Records a token of kind \p K that starts at the current end of the
+  /// output.
+  Token &makeToken(TokenKind K) {
+    Tokens.emplace_back(K, Out.size());
+    return Tokens.back();
+  }
+  /// Removes the most recent token and truncates the output back to where
+  /// that token started. Requires at least one recorded token.
+  void popToken() {
+    Out.resize(Tokens.back().Offset);
+    Tokens.pop_back();
+  }
+  /// Kind of the most recent token, or pp_none if there is none.
+  TokenKind top() const { return Tokens.empty() ? pp_none : Tokens.back().K; }
+
+  /// Appends a single byte to the output.
+  Minimizer &put(char Byte) {
+    Out.push_back(Byte);
+    return *this;
+  }
+  /// Appends a string to the output.
+  Minimizer &append(StringRef S) { return append(S.begin(), S.end()); }
+  /// Appends the character range [First, Last) to the output.
+  Minimizer &append(const char *First, const char *Last) {
+    Out.append(First, Last);
+    return *this;
+  }
+
+  void printToNewline(const char *&First, const char *const End);
+  void printAdjacentModuleNameParts(const char *&First, const char *const End);
+  LLVM_NODISCARD bool printAtImportBody(const char *&First,
+                                        const char *const End);
+  void printDirectiveBody(const char *&First, const char *const End);
+  void printAdjacentMacroArgs(const char *&First, const char *const End);
+  LLVM_NODISCARD bool printMacroArgs(const char *&First, const char *const End);
+
+  /// Reports a diagnostic if the diagnostic engine is provided. Always returns
+  /// true at the end.
+  bool reportError(const char *CurPtr, unsigned Err);
+
+  /// Owns the spelling of identifiers that were split over several lines;
+  /// IdInfo::Name refers to these keys in that case.
+  StringMap<char> SplitIds;
+  /// The source being minimized.
+  StringRef Input;
+  /// Optional diagnostics engine; may be null.
+  DiagnosticsEngine *Diags;
+  /// Location that corresponds to the start of Input, used for diagnostics.
+  SourceLocation InputSourceLoc;
+};
+
+} // end anonymous namespace
+
+/// Emits diagnostic \p Err at the input position \p CurPtr when a diagnostics
+/// engine was supplied.
+///
+/// \returns true unconditionally, so callers can write
+/// "return reportError(...)" on their error path.
+bool Minimizer::reportError(const char *CurPtr, unsigned Err) {
+  if (Diags) {
+    assert(CurPtr >= Input.data() && "invalid buffer ptr");
+    // Translate the buffer pointer into a location relative to the input.
+    const auto Loc = InputSourceLoc.getLocWithOffset(CurPtr - Input.data());
+    Diags->Report(Loc, Err);
+  }
+  return true;
+}
+
+/// Advances \p First past any run of horizontal whitespace, stopping at the
+/// first other character or at \p End.
+static void skipOverSpaces(const char *&First, const char *const End) {
+  for (; First != End; ++First)
+    if (!isHorizontalWhitespace(*First))
+      return;
+}
+
+/// Determines whether the double quote at \p Current opens a raw string
+/// literal, i.e. whether it is directly preceded by "R", optionally with one
+/// of the encoding prefixes "u", "U", "L" or "u8" in front of the "R", and
+/// that spelling is not merely the tail of a longer identifier.
+///
+/// \param First   Start of the range that may be inspected when backing up.
+/// \param Current The character to test; must not precede \p First.
+/// \returns true if \p Current is the opening quote of a raw string literal.
+LLVM_NODISCARD static bool isRawStringLiteral(const char *First,
+                                              const char *Current) {
+  assert(First <= Current);
+
+  // Check if we can even back up.
+  if (*Current != '\"' || First == Current)
+    return false;
+
+  // Check for an "R".
+  --Current;
+  if (*Current != 'R')
+    return false;
+  if (First == Current || !isIdentifierBody(*--Current))
+    return true;
+
+  // Check for a prefix of "u", "U", or "L".
+  if (*Current == 'u' || *Current == 'U' || *Current == 'L')
+    return First == Current || !isIdentifierBody(*--Current);
+
+  // Check for a prefix of "u8". Current points at the candidate '8' here, so
+  // step back *before* comparing with 'u'; comparing the '8' itself against
+  // 'u' would reject every u8R"..." literal.
+  if (*Current != '8' || First == Current || *--Current != 'u')
+    return false;
+  return First == Current || !isIdentifierBody(*--Current);
+}
+
+/// Skips a raw string literal.
+///
+/// \pre \p First points at the opening '"' and the preceding character is 'R'
+/// (both are asserted).
+///
+/// Advances \p First past the closing '"' of the literal, or to \p End if the
+/// literal is not terminated before the end of the input.
+static void skipRawString(const char *&First, const char *const End) {
+  assert(First[0] == '\"');
+  assert(First[-1] == 'R');
+
+  // Find the '(' that ends the delimiter.
+  const char *Last = ++First;
+  while (Last != End && *Last != '(')
+    ++Last;
+  if (Last == End) {
+    First = Last; // Hit the end... just give up.
+    return;
+  }
+
+  // The delimiter between the opening quote and the '('; it has to appear
+  // again between the closing ')' and the closing quote.
+  StringRef Terminator(First, Last - First);
+  for (;;) {
+    // Move First to just past the next ")".
+    First = Last;
+    while (First != End && *First != ')')
+      ++First;
+    if (First == End)
+      return;
+    ++First;
+
+    // Look ahead for the terminator sequence.
+    Last = First;
+    while (Last != End && size_t(Last - First) < Terminator.size() &&
+           Terminator[Last - First] == *Last)
+      ++Last;
+
+    // Check if we hit it (or the end of the file).
+    if (Last == End) {
+      First = Last;
+      return;
+    }
+    // Only part of the delimiter matched: keep looking for another ")".
+    if (size_t(Last - First) < Terminator.size())
+      continue;
+    // The delimiter matched but is not followed by the closing quote.
+    if (*Last != '\"')
+      continue;
+    First = Last + 1;
+    return;
+  }
+}
+
+/// Skips a string or character literal.
+///
+/// \pre \p First points at the opening quote (single or double).
+///
+/// Advances \p First past the matching closing quote, or to \p End if the
+/// literal is unterminated. A backslash escapes the character that follows.
+static void skipString(const char *&First, const char *const End) {
+  assert(*First == '\'' || *First == '\"');
+  const char Quote = *First++;
+  while (First != End && *First != Quote) {
+    // Step over the escaped character together with its backslash.
+    if (*First == '\\' && ++First == End)
+      return;
+    ++First;
+  }
+  // Consume the closing quote if there is one.
+  if (First != End)
+    ++First;
+}
+
+/// Skips one newline.
+///
+/// \pre \p First points at a vertical whitespace character.
+///
+/// The two-character sequences "\r\n" and "\n\r" are consumed as a single
+/// newline; a repeated character (e.g. "\n\n") counts as two newlines, of
+/// which only the first is consumed.
+static void skipNewline(const char *&First, const char *End) {
+  assert(isVerticalWhitespace(*First));
+  const char Consumed = *First++;
+  if (First == End)
+    return;
+
+  // Swallow the second half of a mixed pair.
+  if (LLVM_UNLIKELY(isVerticalWhitespace(*First) && Consumed != *First))
+    ++First;
+}
+
+/// Advances \p First to the newline that ends the current logical line (or to
+/// \p End), without interpreting quotes or comments.
+///
+/// A backslash immediately before a newline continues the line. The newline
+/// itself is left unconsumed on return.
+static void skipToNewlineRaw(const char *&First, const char *const End) {
+  for (;;) {
+    if (First == End)
+      return;
+
+    if (isVerticalWhitespace(*First))
+      return;
+
+    while (!isVerticalWhitespace(*First))
+      if (++First == End)
+        return;
+
+    if (First[-1] != '\\')
+      return;
+
+    // Line continuation: consume the whole newline. Stepping over a single
+    // character would stop in the middle of a "\r\n" (or "\n\r") pair and end
+    // the scan one line too early.
+    skipNewline(First, End); // Keep going...
+  }
+}
+
+/// Trims trailing horizontal whitespace from the range [First, Last).
+///
+/// \returns the new end of the range; never before \p First.
+static const char *reverseOverSpaces(const char *First, const char *Last) {
+  assert(First <= Last);
+  for (; Last != First; --Last)
+    if (!isHorizontalWhitespace(*(Last - 1)))
+      break;
+  return Last;
+}
+
+/// Skips a "//" comment up to, but not including, the newline that ends it.
+///
+/// \pre \p First points at the two slashes that start the comment.
+static void skipLineComment(const char *&First, const char *const End) {
+  assert(First[0] == '/' && First[1] == '/');
+  First = First + 2; // Step over the "//" itself.
+  skipToNewlineRaw(First, End);
+}
+
+/// Skips a "/* ... */" comment.
+///
+/// \pre \p First points at the "/*" that starts the comment.
+///
+/// Advances \p First past the closing "*/", or to \p End if the comment is
+/// not terminated.
+static void skipBlockComment(const char *&First, const char *const End) {
+  assert(First[0] == '/' && First[1] == '*');
+  // "/**/" is the shortest complete comment; anything shorter cannot close.
+  if (End - First < 4) {
+    First = End;
+    return;
+  }
+  // Start at the earliest position where the closing '/' may appear.
+  First += 3;
+  while (First != End) {
+    const bool ClosesComment = First[-1] == '*' && First[0] == '/';
+    ++First;
+    if (ClosesComment)
+      return;
+  }
+}
+
+/// Tells a C++14 digit separator apart from the start of a character literal.
+///
+/// \param Start Beginning of the text scanned on this line.
+/// \param Cur   Position of the single quote being examined.
+/// \param End   End of the input.
+/// \returns True if the quote at \p Cur is a digit separator.
+static bool isQuoteCppDigitSeparator(const char *const Start,
+                                     const char *const Cur,
+                                     const char *const End) {
+  assert(*Cur == '\'' && "expected quotation character");
+  // A separator needs a preprocessing-number character right before it.
+  // Nothing before Start is inspected: skipLine is called in places where no
+  // number body is expected there, so a quote at Start never qualifies.
+  const bool FollowsNumberBody =
+      Start != Cur && isPreprocessingNumberBody(Cur[-1]);
+  if (!FollowsNumberBody)
+    return false;
+  // It must also be followed by an identifier body character.
+  const char *const Next = Cur + 1;
+  return Next < End && isIdentifierBody(*Next);
+}
+
+/// Skips the rest of the current logical line, including the newline that
+/// ends it.
+///
+/// String and character literals and comments are stepped over as units so
+/// that quotes, comment markers and newlines inside them are not
+/// misinterpreted. A backslash directly before the newline continues the
+/// line, whichever newline spelling ("\n", "\r", "\r\n" or "\n\r") is used.
+static void skipLine(const char *&First, const char *const End) {
+  for (;;) {
+    assert(First <= End);
+    if (First == End)
+      return;
+
+    if (isVerticalWhitespace(*First)) {
+      skipNewline(First, End);
+      return;
+    }
+    const char *Start = First;
+    while (First != End && !isVerticalWhitespace(*First)) {
+      // Iterate over strings correctly to avoid comments and newlines.
+      if (*First == '\"' ||
+          (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) {
+        if (isRawStringLiteral(Start, First))
+          skipRawString(First, End);
+        else
+          skipString(First, End);
+        continue;
+      }
+
+      // Iterate over comments correctly.
+      if (*First != '/' || End - First < 2) {
+        ++First;
+        continue;
+      }
+
+      if (First[1] == '/') {
+        // "//...".
+        skipLineComment(First, End);
+        continue;
+      }
+
+      if (First[1] != '*') {
+        ++First;
+        continue;
+      }
+
+      // "/*...*/".
+      skipBlockComment(First, End);
+    }
+    if (First == End)
+      return;
+
+    // Decide whether this is a line continuation *before* consuming the
+    // newline. Looking two characters back afterwards only works for
+    // one-character newlines and misses a backslash followed by "\r\n".
+    // At least one character of this line has been consumed above, so
+    // First[-1] is within the line.
+    assert(isVerticalWhitespace(*First));
+    const bool IsLineContinuation = First[-1] == '\\';
+
+    // Skip over the newline.
+    skipNewline(First, End);
+    if (!IsLineContinuation)
+      return;
+    // Continue past line-continuations.
+  }
+}
+
+/// Skips the body of a directive that the minimizer does not keep.
+///
+/// \param Name The directive name that was just lexed.
+static void skipDirective(StringRef Name, const char *&First,
+                          const char *const End) {
+  const bool IsFreeFormText = Name == "warning" || Name == "error";
+  if (IsFreeFormText) {
+    // Do not process quotes or comments.
+    skipToNewlineRaw(First, End);
+    return;
+  }
+  skipLine(First, End);
+}
+
+/// Appends the rest of the current logical line to the output.
+///
+/// Comments are dropped: a "//" comment ends the printed text, and a block
+/// comment is replaced by a single space. Horizontal whitespace before a
+/// comment, before a line continuation and at the end of the printed text is
+/// trimmed, and backslash-continued lines are joined.
+///
+/// NOTE(review): on the plain end-of-line path the function returns without
+/// moving \p First up to the newline, so callers should not assume that
+/// \p First points at the end of the line afterwards.
+void Minimizer::printToNewline(const char *&First, const char *const End) {
+  while (First != End && !isVerticalWhitespace(*First)) {
+    // [First, Last) is the text gathered so far that has not been printed.
+    const char *Last = First;
+    do {
+      // Iterate over strings correctly to avoid comments and newlines.
+      if (*Last == '\"' || *Last == '\'') {
+        if (LLVM_UNLIKELY(isRawStringLiteral(First, Last)))
+          skipRawString(Last, End);
+        else
+          skipString(Last, End);
+        continue;
+      }
+      if (*Last != '/' || End - Last < 2) {
+        ++Last;
+        continue; // Gather the rest up to print verbatim.
+      }
+
+      if (Last[1] != '/' && Last[1] != '*') {
+        ++Last;
+        continue;
+      }
+
+      // Deal with "//..." and "/*...*/".
+      append(First, reverseOverSpaces(First, Last));
+      First = Last;
+
+      if (Last[1] == '/') {
+        // Nothing after a line comment is printed.
+        skipLineComment(First, End);
+        return;
+      }
+
+      // A block comment separates tokens, so it is replaced by one space.
+      put(' ');
+      skipBlockComment(First, End);
+      skipOverSpaces(First, End);
+      Last = First;
+    } while (Last != End && !isVerticalWhitespace(*Last));
+
+    // Print out the string.
+    if (Last == End || Last == First || Last[-1] != '\\') {
+      append(First, reverseOverSpaces(First, Last));
+      return;
+    }
+
+    // Print up to the backslash, backing up over spaces.
+    append(First, reverseOverSpaces(First, Last - 1));
+
+    // Continue with the next physical line, ignoring its leading spaces.
+    First = Last;
+    skipNewline(First, End);
+    skipOverSpaces(First, End);
+  }
+}
+
+/// Skips horizontal whitespace, backslash-newline line continuations and
+/// comments.
+///
+/// A plain newline is never consumed. After a "//" comment the function
+/// returns with \p First wherever skipLineComment left it. Scanning stops as
+/// soon as fewer than two characters remain after skipping spaces.
+static void skipWhitespace(const char *&First, const char *const End) {
+  for (;;) {
+    assert(First <= End);
+    skipOverSpaces(First, End);
+
+    // Everything recognized below is at least two characters long.
+    if (End - First < 2)
+      return;
+
+    // A backslash followed by a newline joins two lines.
+    if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
+      skipNewline(++First, End);
+      continue;
+    }
+
+    // Check for a non-comment character.
+    if (First[0] != '/')
+      return;
+
+    // "// ...".
+    if (First[1] == '/') {
+      skipLineComment(First, End);
+      return;
+    }
+
+    // Cannot be a comment.
+    if (First[1] != '*')
+      return;
+
+    // "/*...*/".
+    skipBlockComment(First, End);
+  }
+}
+
+/// Prints the character at \p First together with the run of identifier
+/// characters and '.' that directly follows it, and advances \p First past
+/// what was printed.
+void Minimizer::printAdjacentModuleNameParts(const char *&First,
+                                             const char *const End) {
+  // The first character is accepted unconditionally; the caller vetted it.
+  const char *Cursor = First + 1;
+  while (Cursor != End && (isIdentifierBody(*Cursor) || *Cursor == '.'))
+    ++Cursor;
+  append(First, Cursor);
+  First = Cursor;
+}
+
+/// Prints the module name of an "@import" declaration up to and including the
+/// terminating ';', followed by a newline. Whitespace, comments and newlines
+/// between the parts of the name are dropped.
+///
+/// \returns true on error: the input ended before a ';' was found, or a
+/// character that cannot be part of a module name was encountered.
+bool Minimizer::printAtImportBody(const char *&First, const char *const End) {
+  while (true) {
+    skipWhitespace(First, End);
+    if (First == End)
+      return true;
+
+    const char C = *First;
+    if (isVerticalWhitespace(C)) {
+      skipNewline(First, End);
+    } else if (C == ';') {
+      // Found a semicolon.
+      put(C).put('\n');
+      ++First;
+      return false;
+    } else if (isIdentifierBody(C) || C == '.') {
+      printAdjacentModuleNameParts(First, End);
+    } else {
+      // Don't handle macro expansions inside @import for now.
+      return true;
+    }
+  }
+}
+
+/// Prints the rest of a directive: leading whitespace is skipped, the line is
+/// printed, spaces at the end of the output are removed and a newline is
+/// appended.
+///
+/// \pre The output is not empty (the directive name has been printed).
+void Minimizer::printDirectiveBody(const char *&First, const char *const End) {
+  skipWhitespace(First, End); // Skip initial whitespace.
+  printToNewline(First, End);
+
+  // Drop the spaces left at the end of the output.
+  size_t Size = Out.size();
+  while (Out[Size - 1] == ' ')
+    --Size;
+  Out.resize(Size);
+
+  put('\n');
+}
+
+/// Finds the end of the run of identifier characters that starts at
+/// \p First, without looking through line continuations.
+///
+/// \pre \p First points at an identifier character.
+/// \returns a pointer just past the last identifier character.
+LLVM_NODISCARD static const char *lexRawIdentifier(const char *First,
+                                                   const char *const End) {
+  assert(isIdentifierBody(*First) && "invalid identifer");
+  const char *Cursor = First;
+  do
+    ++Cursor;
+  while (Cursor != End && isIdentifierBody(*Cursor));
+  return Cursor;
+}
+
+/// Checks whether an identifier that ends at \p First is continued on the
+/// next line through a backslash-newline.
+///
+/// \returns a pointer to the first identifier character after the line
+/// continuation, or nullptr if the identifier is not continued.
+LLVM_NODISCARD static const char *
+getIdentifierContinuation(const char *First, const char *const End) {
+  // Needs a backslash, a newline and at least one more character.
+  const bool StartsWithContinuation =
+      End - First >= 3 && First[0] == '\\' && isVerticalWhitespace(First[1]);
+  if (!StartsWithContinuation)
+    return nullptr;
+
+  const char *Next = First + 1;
+  skipNewline(Next, End);
+  if (Next == End || !isIdentifierBody(*Next))
+    return nullptr;
+  return Next;
+}
+
+/// Lexes the identifier that starts at \p First, joining pieces that are
+/// split over lines with backslash-newline.
+///
+/// \returns the end of the identifier in the input and its spelling. The
+/// spelling refers to the input when the identifier is contiguous and to
+/// storage owned by SplitIds when it had to be pieced together.
+Minimizer::IdInfo Minimizer::lexIdentifier(const char *First,
+                                           const char *const End) {
+  const char *Last = lexRawIdentifier(First, End);
+  const char *Next = getIdentifierContinuation(Last, End);
+  // Fast path: the identifier is not split.
+  if (LLVM_LIKELY(!Next))
+    return IdInfo{Last, StringRef(First, Last - First)};
+
+  // Slow path, where identifiers are split over lines.
+  SmallVector<char, 64> Id(First, Last);
+  do {
+    Last = lexRawIdentifier(Next, End);
+    Id.append(Next, Last);
+    Next = getIdentifierContinuation(Last, End);
+  } while (Next);
+
+  // Intern the joined spelling so that the returned name outlives Id.
+  const StringRef Spelling(Id.begin(), Id.size());
+  const auto Interned = SplitIds.try_emplace(Spelling, 0).first;
+  return IdInfo{Last, Interned->first()};
+}
+
+/// Prints the character at \p First together with the run of identifier
+/// characters, '.' and ',' that directly follows it, and advances \p First
+/// past what was printed.
+void Minimizer::printAdjacentMacroArgs(const char *&First,
+                                       const char *const End) {
+  // The first character is accepted unconditionally; the caller vetted it.
+  const char *Cursor = First + 1;
+  for (; Cursor != End; ++Cursor) {
+    const bool IsArgChar =
+        isIdentifierBody(*Cursor) || *Cursor == '.' || *Cursor == ',';
+    if (!IsArgChar)
+      break;
+  }
+  append(First, Cursor);
+  First = Cursor;
+}
+
+/// Prints the parameter list of a function-like macro, from the opening '('
+/// through the closing ')', dropping whitespace and comments between the
+/// parts.
+///
+/// \pre \p First points at the opening '('.
+/// \returns true on error: the input ended before ')' was found, or a
+/// character that is not accepted in a parameter list was encountered.
+bool Minimizer::printMacroArgs(const char *&First, const char *const End) {
+  assert(*First == '(');
+  put(*First);
+  ++First;
+  while (true) {
+    skipWhitespace(First, End);
+    if (First == End)
+      return true;
+
+    const char C = *First;
+    if (C == ')') {
+      put(C);
+      ++First;
+      return false;
+    }
+
+    // This is intentionally fairly liberal.
+    const bool IsArgChar = isIdentifierBody(C) || C == '.' || C == ',';
+    if (!IsArgChar)
+      return true;
+
+    printAdjacentMacroArgs(First, End);
+  }
+}
+
+/// Looks for an identifier starting from \p First, after skipping whitespace
+/// and comments.
+///
+/// Updates \p First to just past the next identifier, if any; when there is
+/// none, \p First is left after the skipped whitespace.
+///
+/// \returns true iff an identifier was found and it matches \p Id.
+bool Minimizer::isNextIdentifier(StringRef Id, const char *&First,
+                                 const char *const End) {
+  skipWhitespace(First, End);
+  const bool AtIdentifier = First != End && isIdentifierHead(*First);
+  if (!AtIdentifier)
+    return false;
+
+  const IdInfo Found = lexIdentifier(First, End);
+  First = Found.Last;
+  return Id == Found.Name;
+}
+
+/// Handles a line that starts with '@'.
+///
+/// An "@import" declaration is recorded as a decl_at_import token and printed
+/// in normalized form; any other line is skipped.
+///
+/// \pre \p First points at the '@'.
+/// \returns true on error: the declaration has no terminating ';', or other
+/// tokens follow the ';' on the same line. Both errors are reported at the
+/// location of the '@'.
+bool Minimizer::lexAt(const char *&First, const char *const End) {
+  // Handle "@import".
+  const char *ImportLoc = First++;
+  if (!isNextIdentifier("import", First, End)) {
+    // Some other '@' construct; it is not tracked.
+    skipLine(First, End);
+    return false;
+  }
+  makeToken(decl_at_import);
+  append("@import ");
+  if (printAtImportBody(First, End))
+    return reportError(
+        ImportLoc, diag::err_dep_source_minimizer_missing_sema_after_at_import);
+  // Only whitespace and comments may follow the ';' on this line.
+  skipWhitespace(First, End);
+  if (First == End)
+    return false;
+  if (!isVerticalWhitespace(*First))
+    return reportError(
+        ImportLoc, diag::err_dep_source_minimizer_unexpected_tokens_at_import);
+  skipNewline(First, End);
+  return false;
+}
+
+/// Lexes the remainder of a "#define" directive and prints it in normalized
+/// form: "#define NAME", an optional parameter list, and the replacement
+/// text separated from the name by a single space.
+///
+/// \pre \p First points just past the "define" keyword.
+/// \returns true on error, which is reported when the macro name is missing
+/// or is not an identifier. A malformed parameter list is not an error; it is
+/// replaced by a placeholder and the rest of the line is skipped.
+bool Minimizer::lexDefine(const char *&First, const char *const End) {
+  makeToken(pp_define);
+  append("#define ");
+  skipWhitespace(First, End);
+
+  // A "#define" that runs into the end of the input has no macro name. Check
+  // for that before looking at *First so that nothing at End is read.
+  if (First == End || !isIdentifierHead(*First))
+    return reportError(First, diag::err_pp_macro_not_identifier);
+
+  IdInfo Id = lexIdentifier(First, End);
+  const char *Last = Id.Last;
+  append(Id.Name);
+  if (Last == End)
+    return false;
+  // A '(' directly after the name starts the parameter list of a
+  // function-like macro.
+  if (*Last == '(') {
+    size_t Size = Out.size();
+    if (printMacroArgs(Last, End)) {
+      // Be robust to bad macro arguments, since they can show up in disabled
+      // code.
+      Out.resize(Size);
+      append("(/* invalid */\n");
+      skipLine(Last, End);
+      return false;
+    }
+  }
+  skipWhitespace(Last, End);
+  if (Last == End)
+    return false;
+  // Separate the replacement text, if any, from the name with one space.
+  if (!isVerticalWhitespace(*Last))
+    put(' ');
+  printDirectiveBody(Last, End);
+  First = Last;
+  return false;
+}
+
+/// Handles a "#pragma" directive. Only "#pragma clang module import" is kept
+/// (as a pp_pragma_import token); every other pragma line is skipped.
+///
+/// \pre \p First points just past the "pragma" keyword.
+/// \returns false; pragmas never cause an error.
+bool Minimizer::lexPragma(const char *&First, const char *const End) {
+  // The identifiers that must follow "#pragma", in order.
+  static const char *const Keywords[] = {"clang", "module", "import"};
+  for (const char *Keyword : Keywords) {
+    if (!isNextIdentifier(Keyword, First, End)) {
+      skipLine(First, End);
+      return false;
+    }
+  }
+
+  // #pragma clang module import.
+  makeToken(pp_pragma_import);
+  append("#pragma clang module import ");
+  printDirectiveBody(First, End);
+  return false;
+}
+
+/// Handles "#endif". Conditional branches that ended up empty are removed
+/// from the output; if the whole conditional is empty, its opening directive
+/// is removed as well and the "#endif" itself is not printed.
+///
+/// \returns false, or whatever lexDefault returns when "#endif" is printed.
+bool Minimizer::lexEndif(const char *&First, const char *const End) {
+  // Strip out "#else" if it's empty.
+  if (top() == pp_else)
+    popToken();
+
+  // Strip out "#elif" if they're empty.
+  while (top() == pp_elif)
+    popToken();
+
+  switch (top()) {
+  case pp_if:
+  case pp_ifdef:
+  case pp_ifndef:
+    // If "#if" is empty, strip it and skip the "#endif".
+    popToken();
+    skipLine(First, End);
+    return false;
+  default:
+    return lexDefault(pp_endif, "endif", First, End);
+  }
+}
+
+/// Records a token of kind \p Kind and prints "#<Directive> " followed by the
+/// rest of the directive line.
+///
+/// \returns false always.
+bool Minimizer::lexDefault(TokenKind Kind, StringRef Directive,
+                           const char *&First, const char *const End) {
+  makeToken(Kind);
+  put('#');
+  append(Directive);
+  put(' ');
+  printDirectiveBody(First, End);
+  return false;
+}
+
+/// Lexes one line of the input and prints it to the output if it is a
+/// directive or declaration that the minimizer tracks.
+///
+/// Lines that start with neither '#' nor '@' are skipped, as are directives
+/// whose name is not recognized.
+///
+/// \pre \p First is not at \p End.
+/// \returns true on error.
+bool Minimizer::lexPPLine(const char *&First, const char *const End) {
+  assert(First != End);
+
+  skipWhitespace(First, End);
+  assert(First <= End);
+  if (First == End)
+    return false;
+
+  // Ordinary source code is of no interest.
+  if (*First != '#' && *First != '@') {
+    skipLine(First, End);
+    assert(First <= End);
+    return false;
+  }
+
+  // Handle "@import".
+  if (*First == '@')
+    return lexAt(First, End);
+
+  // Handle preprocessing directives.
+  ++First; // Skip over '#'.
+  skipWhitespace(First, End);
+
+  // A '#' with nothing but whitespace up to the end of the input.
+  if (First == End)
+    return reportError(First, diag::err_pp_expected_eol);
+
+  // No directive name follows the '#'; skip the line.
+  if (!isIdentifierHead(*First)) {
+    skipLine(First, End);
+    return false;
+  }
+
+  // Figure out the token.
+  IdInfo Id = lexIdentifier(First, End);
+  First = Id.Last;
+  // Every "#pragma" maps to pp_pragma_import here; lexPragma decides whether
+  // it really is a module import.
+  auto Kind = llvm::StringSwitch<TokenKind>(Id.Name)
+                  .Case("include", pp_include)
+                  .Case("__include_macros", pp___include_macros)
+                  .Case("define", pp_define)
+                  .Case("undef", pp_undef)
+                  .Case("import", pp_import)
+                  .Case("include_next", pp_include_next)
+                  .Case("if", pp_if)
+                  .Case("ifdef", pp_ifdef)
+                  .Case("ifndef", pp_ifndef)
+                  .Case("elif", pp_elif)
+                  .Case("else", pp_else)
+                  .Case("endif", pp_endif)
+                  .Case("pragma", pp_pragma_import)
+                  .Default(pp_none);
+  // Unknown directive: skip it without printing anything.
+  if (Kind == pp_none) {
+    skipDirective(Id.Name, First, End);
+    return false;
+  }
+
+  if (Kind == pp_endif)
+    return lexEndif(First, End);
+
+  if (Kind == pp_define)
+    return lexDefine(First, End);
+
+  if (Kind == pp_pragma_import)
+    return lexPragma(First, End);
+
+  // Everything else.
+  return lexDefault(Kind, Id.Name, First, End);
+}
+
+/// Lexes the range [First, End) line by line until the end is reached or a
+/// line fails to lex.
+///
+/// \returns true on error.
+bool Minimizer::minimizeImpl(const char *First, const char *const End) {
+  for (const char *Cursor = First; Cursor != End;) {
+    const bool Failed = lexPPLine(Cursor, End);
+    if (Failed)
+      return true;
+  }
+  return false;
+}
+
+/// Minimizes the whole input.
+///
+/// On success the output ends with a newline (unless it is empty) and a
+/// pp_eof token is recorded. In either case a '\0' is stored just past the
+/// end of the output without being counted in its size.
+///
+/// \returns true on error.
+bool Minimizer::minimize() {
+  const bool Failed = minimizeImpl(Input.begin(), Input.end());
+
+  if (!Failed) {
+    // Add a trailing newline and an EOF on success.
+    const bool NeedsNewline = !Out.empty() && Out.back() != '\n';
+    if (NeedsNewline)
+      put('\n');
+    makeToken(pp_eof);
+  }
+
+  // Null-terminate the output. This way the memory buffer that's passed to
+  // Clang will not have to worry about the terminating '\0'.
+  Out.push_back('\0');
+  Out.pop_back();
+  return Failed;
+}
+
+/// Public entry point: clears both output vectors, then minimizes \p Input
+/// into them.
+///
+/// \returns false on success, true on error.
+bool clang::minimizeSourceToDependencyDirectives(
+    StringRef Input, SmallVectorImpl<char> &Output,
+    SmallVectorImpl<Token> &Tokens, DiagnosticsEngine *Diags,
+    SourceLocation InputSourceLoc) {
+  Output.clear();
+  Tokens.clear();
+  Minimizer M(Output, Tokens, Input, Diags, InputSourceLoc);
+  return M.minimize();
+}
diff --git a/clang/test/Frontend/minimize_source_to_dependency_directives.c b/clang/test/Frontend/minimize_source_to_dependency_directives.c
new file mode 100644 (file)
index 0000000..39f608b
--- /dev/null
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -print-dependency-directives-minimized-source %s > %t
+// RUN: echo END. >> %t
+// RUN: FileCheck < %t %s
+
+#ifdef FOO
+#include "a.h"
+#else
+void skipThisCode();
+#endif
+
+// CHECK:      #ifdef FOO
+// CHECK-NEXT: #include "a.h"
+// CHECK-NEXT: #endif
+// CHECK-NEXT: END.
diff --git a/clang/test/Lexer/minimize_source_to_dependency_directives_at_import_extra_tokens.m b/clang/test/Lexer/minimize_source_to_dependency_directives_at_import_extra_tokens.m
new file mode 100644 (file)
index 0000000..ef210af
--- /dev/null
@@ -0,0 +1,3 @@
+// RUN: %clang_cc1 -verify -print-dependency-directives-minimized-source %s 2>&1
+
+@import x; a // expected-error {{unexpected extra tokens at end of @import declaration}}
diff --git a/clang/test/Lexer/minimize_source_to_dependency_directives_at_import_missing_semi.m b/clang/test/Lexer/minimize_source_to_dependency_directives_at_import_missing_semi.m
new file mode 100644 (file)
index 0000000..8962e31
--- /dev/null
@@ -0,0 +1,3 @@
+// RUN: %clang_cc1 -verify -print-dependency-directives-minimized-source %s 2>&1
+
+@import x // expected-error {{could not find ';' after @import}}
diff --git a/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c b/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_macro_name.c
new file mode 100644 (file)
index 0000000..fa4ff7d
--- /dev/null
@@ -0,0 +1,3 @@
+// RUN: %clang_cc1 -verify -print-dependency-directives-minimized-source %s 2>&1
+
+#define 0 0 // expected-error {{macro name must be an identifier}}
index bb0f66d..dbc8328 100644 (file)
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
   )
 
 add_clang_unittest(LexTests
+  DependencyDirectivesSourceMinimizerTest.cpp
   HeaderMapTest.cpp
   HeaderSearchTest.cpp
   LexerTest.cpp
diff --git a/clang/unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp b/clang/unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp
new file mode 100644 (file)
index 0000000..7feb6c9
--- /dev/null
@@ -0,0 +1,508 @@
+//===- unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/DependencyDirectivesSourceMinimizer.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace clang;
+using namespace clang::minimize_source_to_dependency_directives;
+
+namespace clang {
+
+/// Test-only overload for checks that look at the minimized text alone; the
+/// tokens are collected into a local vector and thrown away.
+bool minimizeSourceToDependencyDirectives(StringRef Input,
+                                          SmallVectorImpl<char> &Out) {
+  SmallVector<minimize_source_to_dependency_directives::Token, 32> Discarded;
+  const bool Failed =
+      minimizeSourceToDependencyDirectives(Input, Out, Discarded);
+  return Failed;
+}
+
+} // end namespace clang
+
+namespace {
+
+// Sources without any directive produce no text and only the EOF token.
+TEST(MinimizeSourceToDependencyDirectivesTest, Empty) {
+  SmallVector<char, 128> Text;
+  SmallVector<Token, 4> Toks;
+
+  for (const char *Source : {"", "abc def\nxyz"}) {
+    ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Text, Toks));
+    EXPECT_TRUE(Text.empty());
+    ASSERT_EQ(1u, Toks.size());
+    ASSERT_EQ(pp_eof, Toks.back().K);
+  }
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, AllTokens) {
+  SmallVector<char, 128> Out;
+  SmallVector<Token, 4> Tokens;
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("#define A\n"
+                                           "#undef A\n"
+                                           "#endif\n"
+                                           "#if A\n"
+                                           "#ifdef A\n"
+                                           "#ifndef A\n"
+                                           "#elif A\n"
+                                           "#else\n"
+                                           "#include <A>\n"
+                                           "#include_next <A>\n"
+                                           "#__include_macros <A>\n"
+                                           "#import <A>\n"
+                                           "@import A;\n"
+                                           "#pragma clang module import A\n",
+                                           Out, Tokens));
+  EXPECT_EQ(pp_define, Tokens[0].K);
+  EXPECT_EQ(pp_undef, Tokens[1].K);
+  EXPECT_EQ(pp_endif, Tokens[2].K);
+  EXPECT_EQ(pp_if, Tokens[3].K);
+  EXPECT_EQ(pp_ifdef, Tokens[4].K);
+  EXPECT_EQ(pp_ifndef, Tokens[5].K);
+  EXPECT_EQ(pp_elif, Tokens[6].K);
+  EXPECT_EQ(pp_else, Tokens[7].K);
+  EXPECT_EQ(pp_include, Tokens[8].K);
+  EXPECT_EQ(pp_include_next, Tokens[9].K);
+  EXPECT_EQ(pp___include_macros, Tokens[10].K);
+  EXPECT_EQ(pp_import, Tokens[11].K);
+  EXPECT_EQ(decl_at_import, Tokens[12].K);
+  EXPECT_EQ(pp_pragma_import, Tokens[13].K);
+  EXPECT_EQ(pp_eof, Tokens[14].K);
+}
+
+// A lone "#define" is echoed with a trailing newline and yields two tokens,
+// the first of which is pp_define.
+TEST(MinimizeSourceToDependencyDirectivesTest, Define) {
+  SmallVector<char, 128> Text;
+  SmallVector<Token, 4> Toks;
+  const char *Source = "#define MACRO";
+
+  const bool Failed = minimizeSourceToDependencyDirectives(Source, Text, Toks);
+  ASSERT_FALSE(Failed);
+  EXPECT_STREQ("#define MACRO\n", Text.data());
+  ASSERT_EQ(2u, Toks.size());
+  ASSERT_EQ(pp_define, Toks.front().K);
+}
+
+// Blank lines after a #define and surplus spaces around its name/body are
+// not carried over into the minimized text.
+TEST(MinimizeSourceToDependencyDirectivesTest, DefineSpacing) {
+  SmallVector<char, 128> Text;
+
+  const struct {
+    const char *Source;
+    const char *Expected;
+  } Cases[] = {
+      {"#define MACRO\n\n\n", "#define MACRO\n"},
+      {"#define MACRO \n\n\n", "#define MACRO\n"},
+      {"#define MACRO a \n\n\n", "#define MACRO a\n"},
+      {"#define   MACRO\n\n\n", "#define MACRO\n"},
+  };
+
+  for (const auto &Case : Cases) {
+    ASSERT_FALSE(minimizeSourceToDependencyDirectives(Case.Source, Text));
+    EXPECT_STREQ(Case.Expected, Text.data());
+  }
+}
+
+// Function-like and object-like macros: spaces inside the parameter list and
+// around the body are trimmed, spaces inside the body are preserved.
+TEST(MinimizeSourceToDependencyDirectivesTest, DefineMacroArguments) {
+  SmallVector<char, 128> Text;
+
+  const struct {
+    const char *Source;
+    const char *Expected;
+  } Cases[] = {
+      {"#define MACRO()", "#define MACRO()\n"},
+      {"#define MACRO(a, b...)", "#define MACRO(a,b...)\n"},
+      {"#define MACRO content", "#define MACRO content\n"},
+      {"#define MACRO   con  tent   ", "#define MACRO con  tent\n"},
+      {"#define MACRO()   con  tent   ", "#define MACRO() con  tent\n"},
+  };
+
+  for (const auto &Case : Cases) {
+    ASSERT_FALSE(minimizeSourceToDependencyDirectives(Case.Source, Text));
+    EXPECT_STREQ(Case.Expected, Text.data());
+  }
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, DefineInvalidMacroArguments) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO((a))", Out));
+  EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO(", Out));
+  EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("#define MACRO(a * b)", Out));
+  EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data());
+}
+
+// Tab, form feed and vertical tab are trimmed at the edges of a macro body
+// like spaces are, but kept verbatim inside the body.
+TEST(MinimizeSourceToDependencyDirectivesTest, DefineHorizontalWhitespace) {
+  SmallVector<char, 128> Text;
+
+  const struct {
+    const char *Source;
+    const char *Expected;
+  } Cases[] = {
+      {"#define MACRO(\t)\tcon \t tent\t", "#define MACRO() con \t tent\n"},
+      {"#define MACRO(\f)\fcon \f tent\f", "#define MACRO() con \f tent\n"},
+      {"#define MACRO(\v)\vcon \v tent\v", "#define MACRO() con \v tent\n"},
+      {"#define MACRO \t\v\f\v\t con\f\t\vtent\v\f \v",
+       "#define MACRO con\f\t\vtent\n"},
+  };
+
+  for (const auto &Case : Cases) {
+    ASSERT_FALSE(minimizeSourceToDependencyDirectives(Case.Source, Text));
+    EXPECT_STREQ(Case.Expected, Text.data());
+  }
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, DefineMultilineArgs) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("#define MACRO(a        \\\n"
+                                           "              )",
+                                           Out));
+  EXPECT_STREQ("#define MACRO(a)\n", Out.data());
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("#define MACRO(a,       \\\n"
+                                           "              b)       \\\n"
+                                           "        call((a),      \\\n"
+                                           "             (b))",
+                                           Out));
+  EXPECT_STREQ("#define MACRO(a,b) call((a),(b))\n", Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest,
+     DefineMultilineArgsCarriageReturn) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("#define MACRO(a,       \\\r"
+                                           "              b)       \\\r"
+                                           "        call((a),      \\\r"
+                                           "             (b))",
+                                           Out));
+  EXPECT_STREQ("#define MACRO(a,b) call((a),(b))\n", Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest,
+     DefineMultilineArgsCarriageReturnNewline) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("#define MACRO(a,       \\\r\n"
+                                           "              b)       \\\r\n"
+                                           "        call((a),      \\\r\n"
+                                           "             (b))",
+                                           Out));
+  EXPECT_STREQ("#define MACRO(a,b) call((a),(b))\n", Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest,
+     DefineMultilineArgsNewlineCarriageReturn) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("#define MACRO(a,       \\\n\r"
+                                           "              b)       \\\n\r"
+                                           "        call((a),      \\\n\r"
+                                           "             (b))",
+                                           Out));
+  EXPECT_STREQ("#define MACRO(a,b) call((a),(b))\n", Out.data());
+}
+
+// A number in the macro-name position makes the minimizer report failure.
+TEST(MinimizeSourceToDependencyDirectivesTest, DefineNumber) {
+  SmallVector<char, 128> Text;
+  const bool Failed =
+      minimizeSourceToDependencyDirectives("#define 0\n", Text);
+  ASSERT_TRUE(Failed);
+}
+
+// Punctuation in the macro-name position makes the minimizer report failure.
+TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoName) {
+  SmallVector<char, 128> Text;
+  const bool Failed =
+      minimizeSourceToDependencyDirectives("#define &\n", Text);
+  ASSERT_TRUE(Failed);
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoWhitespace) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND&\n", Out));
+  EXPECT_STREQ("#define AND &\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND\\\n"
+                                                    "&\n",
+                                                    Out));
+  EXPECT_STREQ("#define AND &\n", Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, MultilineComment) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives("#define MACRO a/*\n"
+                                           "  /*\n"
+                                           "#define MISSING abc\n"
+                                           "  /*\n"
+                                           "  /* something */ \n"
+                                           "#include  /* \"def\" */ <abc> \n",
+                                           Out));
+  EXPECT_STREQ("#define MACRO a\n"
+               "#include <abc>\n",
+               Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, MultilineCommentInStrings) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO1 \"/*\"\n"
+                                                    "#define MACRO2 \"*/\"\n",
+                                                    Out));
+  EXPECT_STREQ("#define MACRO1 \"/*\"\n"
+               "#define MACRO2 \"*/\"\n",
+               Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, Ifdef) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n"
+                                                    "#define B\n"
+                                                    "#endif\n",
+                                                    Out));
+  EXPECT_STREQ("#ifdef A\n"
+               "#define B\n"
+               "#endif\n",
+               Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n"
+                                                    "#define B\n"
+                                                    "#elif B\n"
+                                                    "#define C\n"
+                                                    "#elif C\n"
+                                                    "#define D\n"
+                                                    "#else\n"
+                                                    "#define E\n"
+                                                    "#endif\n",
+                                                    Out));
+  EXPECT_STREQ("#ifdef A\n"
+               "#define B\n"
+               "#elif B\n"
+               "#define C\n"
+               "#elif C\n"
+               "#define D\n"
+               "#else\n"
+               "#define E\n"
+               "#endif\n",
+               Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, EmptyIfdef) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n"
+                                                    "#elif B\n"
+                                                    "#elif C\n"
+                                                    "#else D\n"
+                                                    "#endif\n",
+                                                    Out));
+  EXPECT_STREQ("", Out.data());
+}
+
+// Only "#pragma clang module import" survives; every other pragma spelling
+// tried here is dropped without an error.
+TEST(MinimizeSourceToDependencyDirectivesTest, Pragma) {
+  SmallVector<char, 128> Text;
+
+  for (const char *Dropped : {
+           "#pragma A\n",
+           "#pragma clang\n",
+           "#pragma clang module\n",
+           "#pragma clang module impor\n",
+       }) {
+    ASSERT_FALSE(minimizeSourceToDependencyDirectives(Dropped, Text));
+    EXPECT_STREQ("", Text.data());
+  }
+
+  const char *Kept = "#pragma clang module import\n";
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(Kept, Text));
+  EXPECT_STREQ("#pragma clang module import\n", Text.data());
+}
+
+// Every include-like directive is expected to pass through unchanged.
+TEST(MinimizeSourceToDependencyDirectivesTest, Include) {
+  SmallVector<char, 128> Text;
+
+  for (const char *Source : {
+           "#include \"A\"\n",
+           "#include <A>\n",
+           "#include_next <A>\n",
+           "#import <A>\n",
+           "#__include_macros <A>\n",
+       }) {
+    ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Text));
+    EXPECT_STREQ(Source, Text.data());
+  }
+}
+
+// "@import" declarations are normalized: whitespace, newlines and comments
+// between the pieces are removed, dotted module paths are kept.
+TEST(MinimizeSourceToDependencyDirectivesTest, AtImport) {
+  SmallVector<char, 128> Text;
+
+  const struct {
+    const char *Source;
+    const char *Expected;
+  } Cases[] = {
+      {"@import A;\n", "@import A;\n"},
+      {" @ import  A;\n", "@import A;\n"},
+      {"@import A\n;", "@import A;\n"},
+      {"@import A.B;\n", "@import A.B;\n"},
+      {"@import /*x*/ A /*x*/ . /*x*/ B /*x*/ \n /*x*/ ; /*x*/",
+       "@import A.B;\n"},
+  };
+
+  for (const auto &Case : Cases) {
+    ASSERT_FALSE(minimizeSourceToDependencyDirectives(Case.Source, Text));
+    EXPECT_STREQ(Case.Expected, Text.data());
+  }
+}
+
+// Malformed "@import" declarations make the minimizer report failure.
+TEST(MinimizeSourceToDependencyDirectivesTest, AtImportFailures) {
+  SmallVector<char, 128> Text;
+
+  for (const char *Source : {
+           "@import A\n",
+           "@import MACRO(A);\n",
+           "@import \" \";\n",
+       }) {
+    ASSERT_TRUE(minimizeSourceToDependencyDirectives(Source, Text));
+  }
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, RawStringLiteral) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifndef GUARD\n"
+                                                    "#define GUARD\n"
+                                                    "R\"()\"\n"
+                                                    "#endif\n",
+                                                    Out));
+  EXPECT_STREQ("#ifndef GUARD\n"
+               "#define GUARD\n"
+               "#endif\n",
+               Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      "#ifndef GUARD\n"
+      "#define GUARD\n"
+      R"raw(static constexpr char bytes[] = R"(-?:\,[]{}#&*!|>'"%@`)";)raw"
+      "\n"
+      "#endif\n",
+      Out));
+  EXPECT_STREQ("#ifndef GUARD\n"
+               "#define GUARD\n"
+               "#endif\n",
+               Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(
+      "#ifndef GUARD\n"
+      "#define GUARD\n"
+      R"raw(static constexpr char bytes[] = R"abc(-?:\,[]{}#&*!|>'"%@`)abc";)raw"
+      "\n"
+      "#endif\n",
+      Out));
+  EXPECT_STREQ("#ifndef GUARD\n"
+               "#define GUARD\n"
+               "#endif\n",
+               Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, SplitIdentifier) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#if\\\n"
+                                                    "ndef GUARD\n"
+                                                    "#define GUARD\n"
+                                                    "#endif\n",
+                                                    Out));
+  EXPECT_STREQ("#ifndef GUARD\n"
+               "#define GUARD\n"
+               "#endif\n",
+               Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\n"
+                                                    "RD\n",
+                                                    Out));
+  EXPECT_STREQ("#define GUARD\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\r"
+                                                    "RD\n",
+                                                    Out));
+  EXPECT_STREQ("#define GUARD\n", Out.data());
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\n"
+                                                    "           RD\n",
+                                                    Out));
+  EXPECT_STREQ("#define GUA RD\n", Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest, PoundWarningAndError) {
+  SmallVector<char, 128> Out;
+
+  for (auto Source : {
+           "#warning '\n#include <t.h>\n",
+           "#warning \"\n#include <t.h>\n",
+           "#warning /*\n#include <t.h>\n",
+           "#warning \\\n#include <t.h>\n#include <t.h>\n",
+           "#error '\n#include <t.h>\n",
+           "#error \"\n#include <t.h>\n",
+           "#error /*\n#include <t.h>\n",
+           "#error \\\n#include <t.h>\n#include <t.h>\n",
+       }) {
+    ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out));
+    EXPECT_STREQ("#include <t.h>\n", Out.data());
+  }
+
+  for (auto Source : {
+           "#warning \\\n#include <t.h>\n",
+           "#error \\\n#include <t.h>\n",
+           "#if MACRO\n#warning '\n#endif\n",
+           "#if MACRO\n#warning \"\n#endif\n",
+           "#if MACRO\n#warning /*\n#endif\n",
+           "#if MACRO\n#error '\n#endif\n",
+           "#if MACRO\n#error \"\n#endif\n",
+           "#if MACRO\n#error /*\n#endif\n",
+       }) {
+    ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out));
+    EXPECT_STREQ("", Out.data());
+  }
+}
+
+// Single quotes used as digit separators or as character literals in
+// ordinary code must not hide the #include that follows them.
+TEST(MinimizeSourceToDependencyDirectivesTest, CharacterLiteral) {
+  SmallVector<char, 128> Text;
+
+  // Same bytes as a raw literal that starts with a newline.
+  const StringRef Source = "\n"
+                           "#include <bob>\n"
+                           "int a = 0'1;\n"
+                           "int b = 0xfa'af'fa;\n"
+                           "int c = 12 ' ';\n"
+                           "#include <foo>\n";
+  const char *Expected = "#include <bob>\n#include <foo>\n";
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Text));
+  EXPECT_STREQ(Expected, Text.data());
+}
+
+} // end anonymous namespace