[include-cleaner] Record main-file macro occurences and includes
authorSam McCall <sam.mccall@gmail.com>
Wed, 26 Oct 2022 00:21:06 +0000 (02:21 +0200)
committerSam McCall <sam.mccall@gmail.com>
Tue, 8 Nov 2022 14:20:40 +0000 (15:20 +0100)
The occurrences are roots for finding used headers, like walkAST.
Includes are the targets we're matching used headers against.

Differential Revision: https://reviews.llvm.org/D136723

clang-tools-extra/include-cleaner/include/clang-include-cleaner/Record.h
clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h
clang-tools-extra/include-cleaner/lib/Record.cpp
clang-tools-extra/include-cleaner/lib/Types.cpp
clang-tools-extra/include-cleaner/unittests/RecordTest.cpp

index 258d361..f3fdeee 100644 (file)
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/FileSystem/UniqueID.h"
+#include "clang-include-cleaner/Types.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
 #include <memory>
 #include <vector>
 
@@ -29,6 +33,8 @@ class ASTContext;
 class CompilerInstance;
 class Decl;
 class FileEntry;
+class Preprocessor;
+class PPCallbacks;
 
 namespace include_cleaner {
 
@@ -75,19 +81,55 @@ private:
   // FIXME: add selfcontained file.
 };
 
-// Contains recorded parser events relevant to include-cleaner.
+/// Recorded main-file parser events relevant to include-cleaner.
 struct RecordedAST {
-  // The consumer (when installed into clang) tracks declarations in this.
+  /// The consumer (when installed into clang) tracks declarations in `*this`.
   std::unique_ptr<ASTConsumer> record();
 
   ASTContext *Ctx = nullptr;
-  // The set of declarations written at file scope inside the main file.
-  //
-  // These are the roots of the subtrees that should be traversed to find uses.
-  // (Traversing the TranslationUnitDecl would find uses inside headers!)
+  /// The set of declarations written at file scope inside the main file.
+  ///
+  /// These are the roots of the subtrees that should be traversed to find uses.
+  /// (Traversing the TranslationUnitDecl would find uses inside headers!)
   std::vector<Decl *> Roots;
 };
 
+/// Recorded main-file preprocessor events relevant to include-cleaner.
+///
+/// This doesn't include facts that we record globally for the whole TU, even
+/// when they occur in the main file (e.g. IWYU pragmas).
+struct RecordedPP {
+  /// The callback (when installed into clang) tracks macros/includes in this.
+  std::unique_ptr<PPCallbacks> record(const Preprocessor &PP);
+
+  /// Describes where macros were used in the main file.
+  std::vector<SymbolReference> MacroReferences;
+
+  /// A container for all includes present in the main file.
+  /// Supports efficiently hit-testing Headers against Includes.
+  /// FIXME: is there a more natural header for this class?
+  class RecordedIncludes {
+  public:
+    void add(const Include &);
+
+    /// All #includes seen, in the order they appear.
+    llvm::ArrayRef<Include> all() const { return All; }
+
+    /// Determine #includes that match a header (that provides a used symbol).
+    ///
+    /// Matching is based on the type of Header specified:
+    ///  - for a physical file like /path/to/foo.h, we check Resolved
+    ///  - for a logical file like <vector>, we check Spelled
+    llvm::SmallVector<const Include *> match(Header H) const;
+
+  private:
+    std::vector<Include> All;
+    // Lookup structures for match(), values are index into All.
+    llvm::StringMap<llvm::SmallVector<unsigned>> BySpelling;
+    llvm::DenseMap<const FileEntry *, llvm::SmallVector<unsigned>> ByFile;
+  } Includes;
+};
+
 } // namespace include_cleaner
 } // namespace clang
 
index e33693c..135883d 100644 (file)
@@ -22,6 +22,7 @@
 #ifndef CLANG_INCLUDE_CLEANER_TYPES_H
 #define CLANG_INCLUDE_CLEANER_TYPES_H
 
+#include "clang/Basic/SourceLocation.h"
 #include "clang/Tooling/Inclusions/StandardLibrary.h"
 #include <memory>
 #include <vector>
@@ -32,35 +33,60 @@ class raw_ostream;
 namespace clang {
 class Decl;
 class FileEntry;
+class IdentifierInfo;
 namespace include_cleaner {
 
+/// We consider a macro to be a different symbol each time it is defined.
+struct Macro {
+  IdentifierInfo *Name;
+  /// The location of the Name where the macro is defined.
+  SourceLocation Definition;
+
+  bool operator==(const Macro &S) const {
+    return Definition == S.Definition;
+  }
+};
+
 /// An entity that can be referenced in the code.
 struct Symbol {
   enum Kind {
     /// A canonical clang declaration.
     Declaration,
+    /// A preprocessor macro, as defined in a specific location.
+    Macro,
     /// A recognized symbol from the standard library, like std::string.
     Standard,
   };
 
   Symbol(const Decl &D) : Storage(&D) {}
+  Symbol(struct Macro M) : Storage(M) {}
   Symbol(tooling::stdlib::Symbol S) : Storage(S) {}
 
   Kind kind() const { return static_cast<Kind>(Storage.index()); }
   bool operator==(const Symbol &RHS) const { return Storage == RHS.Storage; }
 
+  const Decl &declaration() const { return *std::get<Declaration>(Storage); }
+  struct Macro macro() const { return std::get<Macro>(Storage); }
   tooling::stdlib::Symbol standard() const {
     return std::get<Standard>(Storage);
   }
-  const Decl &declaration() const { return *std::get<Declaration>(Storage); }
 
 private:
   // FIXME: Add support for macros.
   // Order must match Kind enum!
-  std::variant<const Decl *, tooling::stdlib::Symbol> Storage;
+  std::variant<const Decl *, struct Macro, tooling::stdlib::Symbol> Storage;
 };
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Symbol &);
 
+/// Indicates that a piece of code refers to a symbol.
+struct SymbolReference {
+  /// The symbol referred to.
+  Symbol Symbol;
+  /// The point in the code that refers to the symbol.
+  SourceLocation RefLocation;
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const SymbolReference &);
+
 /// Represents a file that provides some symbol. Might not be includeable, e.g.
 /// built-in or main-file itself.
 struct Header {
@@ -89,8 +115,17 @@ private:
 };
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Header &);
 
+/// A single #include directive written in the main file.
+struct Include {
+  llvm::StringRef Spelled;             // e.g. vector
+  const FileEntry *Resolved = nullptr; // e.g. /path/to/c++/v1/vector
+                                       // nullptr if the header was not found
+  SourceLocation HashLocation;         // of hash in #include <vector>
+  unsigned Line = 0;                   // 1-based line number for #include
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Include &);
+
 } // namespace include_cleaner
 } // namespace clang
 
 #endif
-
index ba691f4..7ce22fc 100644 (file)
 #include "clang/AST/DeclGroup.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Frontend/CompilerInstance.h"
+#include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Lex/Preprocessor.h"
 
 namespace clang::include_cleaner {
+namespace {
+
+class PPRecorder : public PPCallbacks {
+public:
+  PPRecorder(RecordedPP &Recorded, const Preprocessor &PP)
+      : Recorded(Recorded), PP(PP), SM(PP.getSourceManager()) {}
+
+  void FileChanged(SourceLocation Loc, FileChangeReason Reason,
+                   SrcMgr::CharacteristicKind FileType,
+                   FileID PrevFID) override {
+    Active = SM.isWrittenInMainFile(Loc);
+  }
+
+  void InclusionDirective(SourceLocation Hash, const Token &IncludeTok,
+                          StringRef SpelledFilename, bool IsAngled,
+                          CharSourceRange FilenameRange,
+                          llvm::Optional<FileEntryRef> File,
+                          StringRef SearchPath, StringRef RelativePath,
+                          const Module *, SrcMgr::CharacteristicKind) override {
+    if (!Active)
+      return;
+
+    Include I;
+    I.HashLocation = Hash;
+    I.Resolved = File ? &File->getFileEntry() : nullptr;
+    I.Line = SM.getSpellingLineNumber(Hash);
+    I.Spelled = SpelledFilename;
+    Recorded.Includes.add(I);
+  }
+
+  void MacroExpands(const Token &MacroName, const MacroDefinition &MD,
+                    SourceRange Range, const MacroArgs *Args) override {
+    if (!Active)
+      return;
+    recordMacroRef(MacroName, *MD.getMacroInfo());
+  }
+
+  void MacroDefined(const Token &MacroName, const MacroDirective *MD) override {
+    if (!Active)
+      return;
+
+    const auto *MI = MD->getMacroInfo();
+    // The tokens of a macro definition could refer to a macro.
+    // Formally this reference isn't resolved until this macro is expanded,
+    // but we want to treat it as a reference anyway.
+    for (const auto &Tok : MI->tokens()) {
+      auto *II = Tok.getIdentifierInfo();
+      // Could this token be a reference to a macro? (Not param to this macro).
+      if (!II || !II->hadMacroDefinition() ||
+          llvm::is_contained(MI->params(), II))
+        continue;
+      if (const MacroInfo *MI = PP.getMacroInfo(II))
+        recordMacroRef(Tok, *MI);
+    }
+  }
+
+  void MacroUndefined(const Token &MacroName, const MacroDefinition &MD,
+                      const MacroDirective *) override {
+    if (!Active)
+      return;
+    if (const auto *MI = MD.getMacroInfo())
+      recordMacroRef(MacroName, *MI);
+  }
+
+private:
+  void recordMacroRef(const Token &Tok, const MacroInfo &MI) {
+    if (MI.isBuiltinMacro())
+      return; // __FILE__ is not a reference.
+    Recorded.MacroReferences.push_back(
+        SymbolReference{Macro{Tok.getIdentifierInfo(), MI.getDefinitionLoc()},
+                        Tok.getLocation()});
+  }
+
+  bool Active = false;
+  RecordedPP &Recorded;
+  const Preprocessor &PP;
+  const SourceManager &SM;
+};
+
+} // namespace
 
 // FIXME: this is a mirror of clang::clangd::parseIWYUPragma, move to libTooling
 // to share the code?
@@ -142,4 +223,36 @@ std::unique_ptr<ASTConsumer> RecordedAST::record() {
   return std::make_unique<Recorder>(this);
 }
 
+void RecordedPP::RecordedIncludes::add(const Include &I) {
+  unsigned Index = All.size();
+  All.push_back(I);
+  auto BySpellingIt = BySpelling.try_emplace(I.Spelled).first;
+  All.back().Spelled = BySpellingIt->first(); // Now we own the backing string.
+
+  BySpellingIt->second.push_back(Index);
+  if (I.Resolved)
+    ByFile[I.Resolved].push_back(Index);
+}
+
+llvm::SmallVector<const Include *>
+RecordedPP::RecordedIncludes::match(Header H) const {
+  llvm::SmallVector<const Include *> Result;
+  switch (H.kind()) {
+  case Header::Physical:
+    for (unsigned I : ByFile.lookup(H.physical()))
+      Result.push_back(&All[I]);
+    break;
+  case Header::Standard:
+    for (unsigned I : BySpelling.lookup(H.standard().name().trim("<>")))
+      Result.push_back(&All[I]);
+    break;
+  }
+  return Result;
+}
+
+std::unique_ptr<PPCallbacks> RecordedPP::record(const Preprocessor &PP) {
+  return std::make_unique<PPRecorder>(*this, PP);
+}
+
+
 } // namespace clang::include_cleaner
index 1b673ab..6a0755e 100644 (file)
@@ -9,6 +9,7 @@
 #include "clang-include-cleaner/Types.h"
 #include "clang/AST/Decl.h"
 #include "clang/Basic/FileEntry.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace clang::include_cleaner {
@@ -19,6 +20,8 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Symbol &S) {
     if (const auto *ND = llvm::dyn_cast<NamedDecl>(&S.declaration()))
       return OS << ND->getNameAsString();
     return OS << S.declaration().getDeclKindName();
+  case Symbol::Macro:
+    return OS << S.macro().Name;
   case Symbol::Standard:
     return OS << S.standard().scope() << S.standard().name();
   }
@@ -35,4 +38,18 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Header &H) {
   llvm_unreachable("Unhandled Header kind");
 }
 
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Include &I) {
+  return OS << I.Line << ": " << I.Spelled << " => "
+            << (I.Resolved ? I.Resolved->getName() : "<missing>");
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SymbolReference &R) {
+  // We can't decode the Location without SourceManager. Its raw representation
+  // isn't completely useless (and distinguishes SymbolReference from Symbol).
+  return OS << R.Symbol << "@0x"
+            << llvm::utohexstr(R.RefLocation.getRawEncoding(),
+                               /*Width=*/CHAR_BIT *
+                                   sizeof(SourceLocation::UIntTy));
+}
+
 } // namespace clang::include_cleaner
index c1f1562..4e6b054 100644 (file)
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Testing/TestAST.h"
+#include "clang/Tooling/Inclusions/StandardLibrary.h"
+#include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Testing/Support/Annotations.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
 namespace clang::include_cleaner {
 namespace {
+using testing::ElementsAre;
+using testing::ElementsAreArray;
+using testing::IsEmpty;
 
 // Matches a Decl* if it is a NamedDecl with the given name.
-MATCHER_P(Named, N, "") {
+MATCHER_P(named, N, "") {
   if (const NamedDecl *ND = llvm::dyn_cast<NamedDecl>(arg)) {
     if (N == ND->getNameAsString())
       return true;
@@ -64,7 +70,7 @@ TEST_F(RecordASTTest, Namespace) {
       }
     )cpp";
   auto AST = build();
-  EXPECT_THAT(Recorded.Roots, testing::ElementsAre(Named("ns")));
+  EXPECT_THAT(Recorded.Roots, testing::ElementsAre(named("ns")));
 }
 
 // Decl in included file is not a root.
@@ -75,7 +81,7 @@ TEST_F(RecordASTTest, Inclusion) {
     void mainFunc();
   )cpp";
   auto AST = build();
-  EXPECT_THAT(Recorded.Roots, testing::ElementsAre(Named("mainFunc")));
+  EXPECT_THAT(Recorded.Roots, testing::ElementsAre(named("mainFunc")));
 }
 
 // Decl from macro expanded into the main file is a root.
@@ -86,7 +92,147 @@ TEST_F(RecordASTTest, Macros) {
     X
   )cpp";
   auto AST = build();
-  EXPECT_THAT(Recorded.Roots, testing::ElementsAre(Named("x")));
+  EXPECT_THAT(Recorded.Roots, testing::ElementsAre(named("x")));
+}
+
+class RecordPPTest : public ::testing::Test {
+protected:
+  TestInputs Inputs;
+  RecordedPP Recorded;
+
+  RecordPPTest() {
+    struct RecordAction : public PreprocessOnlyAction {
+      RecordedPP &Out;
+      RecordAction(RecordedPP &Out) : Out(Out) {}
+
+      void ExecuteAction() override {
+        auto &PP = getCompilerInstance().getPreprocessor();
+        PP.addPPCallbacks(Out.record(PP));
+        PreprocessOnlyAction::ExecuteAction();
+      }
+    };
+    Inputs.MakeAction = [this] {
+      return std::make_unique<RecordAction>(Recorded);
+    };
+  }
+
+  TestAST build() { return TestAST(Inputs); }
+};
+
+// Matches an Include with a particular spelling.
+MATCHER_P(spelled, S, "") { return arg.Spelled == S; }
+
+TEST_F(RecordPPTest, CapturesIncludes) {
+  llvm::Annotations MainFile(R"cpp(
+    $H^#include "./header.h"
+    $M^#include "missing.h"
+  )cpp");
+  Inputs.Code = MainFile.code();
+  Inputs.ExtraFiles["header.h"] = "";
+  Inputs.ErrorOK = true; // missing header
+  auto AST = build();
+
+  ASSERT_THAT(
+      Recorded.Includes.all(),
+      testing::ElementsAre(spelled("./header.h"), spelled("missing.h")));
+
+  auto &H = Recorded.Includes.all().front();
+  EXPECT_EQ(H.Line, 2u);
+  EXPECT_EQ(H.HashLocation,
+            AST.sourceManager().getComposedLoc(
+                AST.sourceManager().getMainFileID(), MainFile.point("H")));
+  EXPECT_EQ(H.Resolved, AST.fileManager().getFile("header.h").get());
+
+  auto &M = Recorded.Includes.all().back();
+  EXPECT_EQ(M.Line, 3u);
+  EXPECT_EQ(M.HashLocation,
+            AST.sourceManager().getComposedLoc(
+                AST.sourceManager().getMainFileID(), MainFile.point("M")));
+  EXPECT_EQ(M.Resolved, nullptr);
+}
+
+TEST_F(RecordPPTest, CapturesMacroRefs) {
+  llvm::Annotations Header(R"cpp(
+    #define $def^X 1
+
+    // Refs, but not in main file.
+    #define Y X
+    int one = X;
+  )cpp");
+  llvm::Annotations MainFile(R"cpp(
+    #define EARLY X // not a ref, no definition
+    #include "header.h"
+    #define LATE ^X
+    #define LATE2 ^X // a ref even if not expanded
+
+    int uno = ^X;
+    int jeden = $exp^LATE; // a ref in LATE's expansion
+
+    #define IDENT(X) X // not a ref, shadowed
+    int eins = IDENT(^X);
+
+    #undef ^X
+    // Not refs, rather a new macro with the same name.
+    #define X 2
+    int two = X;
+  )cpp");
+  Inputs.Code = MainFile.code();
+  Inputs.ExtraFiles["header.h"] = Header.code();
+  auto AST = build();
+  const auto &SM = AST.sourceManager();
+
+  SourceLocation Def = SM.getComposedLoc(
+      SM.translateFile(AST.fileManager().getFile("header.h").get()),
+      Header.point("def"));
+  ASSERT_THAT(Recorded.MacroReferences, Not(IsEmpty()));
+  Symbol OrigX = Recorded.MacroReferences.front().Symbol;
+  EXPECT_EQ("X", OrigX.macro().Name->getName());
+  EXPECT_EQ(Def, OrigX.macro().Definition);
+
+  std::vector<unsigned> RefOffsets;
+  std::vector<unsigned> ExpOffsets; // Expansion locs of refs in macro locs.
+  std::vector<SourceLocation> RefMacroLocs;
+  for (const auto &Ref : Recorded.MacroReferences) {
+    if (Ref.Symbol == OrigX) {
+      auto [FID, Off] = SM.getDecomposedLoc(Ref.RefLocation);
+      if (FID == SM.getMainFileID()) {
+        RefOffsets.push_back(Off);
+      } else if (Ref.RefLocation.isMacroID() &&
+                 SM.isWrittenInMainFile(SM.getExpansionLoc(Ref.RefLocation))) {
+        ExpOffsets.push_back(
+            SM.getDecomposedExpansionLoc(Ref.RefLocation).second);
+      } else {
+        ADD_FAILURE() << Ref.RefLocation.printToString(SM);
+      }
+    }
+  }
+  EXPECT_THAT(RefOffsets, ElementsAreArray(MainFile.points()));
+  EXPECT_THAT(ExpOffsets, ElementsAreArray(MainFile.points("exp")));
+}
+
+// Matches an Include* on the specified line;
+MATCHER_P(line, N, "") { return arg->Line == (unsigned)N; }
+
+TEST(RecordedIncludesTest, Match) {
+  // We're using synthetic data, but need a FileManager to obtain FileEntry*s.
+  // Ensure it doesn't do any actual IO.
+  auto FS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
+  FileManager FM(FileSystemOptions{});
+  const FileEntry *A = FM.getVirtualFile("/path/a", /*Size=*/0, time_t{});
+  const FileEntry *B = FM.getVirtualFile("/path/b", /*Size=*/0, time_t{});
+
+  RecordedPP::RecordedIncludes Includes;
+  Includes.add(Include{"a", A, SourceLocation(), 1});
+  Includes.add(Include{"a2", A, SourceLocation(), 2});
+  Includes.add(Include{"b", B, SourceLocation(), 3});
+  Includes.add(Include{"vector", B, SourceLocation(), 4});
+  Includes.add(Include{"vector", B, SourceLocation(), 5});
+  Includes.add(Include{"missing", nullptr, SourceLocation(), 6});
+
+  EXPECT_THAT(Includes.match(A), ElementsAre(line(1), line(2)));
+  EXPECT_THAT(Includes.match(B), ElementsAre(line(3), line(4), line(5)));
+  EXPECT_THAT(Includes.match(*tooling::stdlib::Header::named("<vector>")),
+              ElementsAre(line(4), line(5)));
 }
 
 class PragmaIncludeTest : public ::testing::Test {