From 9c254184242e9fa8b6e2c4c11a83544fc5e22994 Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Sun, 19 Jul 2015 21:41:12 +0000 Subject: [PATCH] [modules] Don't save uninteresting identifiers, and don't consider identifiers to be interesting just because they are the name of a builtin. Reduces the size of an empty module by over 80% (~100KB). llvm-svn: 242650 --- clang/include/clang/Basic/IdentifierTable.h | 14 +++++++++++++- clang/include/clang/Serialization/Module.h | 5 +++++ clang/lib/Parse/Parser.cpp | 2 +- clang/lib/Sema/SemaDecl.cpp | 2 +- clang/lib/Serialization/ASTReader.cpp | 18 +++++++++++++----- clang/lib/Serialization/ASTWriter.cpp | 21 ++++++++++++++------- clang/test/Modules/empty.modulemap | 6 ++++++ 7 files changed, 53 insertions(+), 15 deletions(-) diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h index 1785e04..af6ecf9 100644 --- a/clang/include/clang/Basic/IdentifierTable.h +++ b/clang/include/clang/Basic/IdentifierTable.h @@ -161,7 +161,7 @@ public: /// TokenID is normally read-only but there are 2 instances where we revert it /// to tok::identifier for libstdc++ 4.2. Keep track of when this happens /// using this method so we can inform serialization about it. - void RevertTokenIDToIdentifier() { + void revertTokenIDToIdentifier() { assert(TokenID != tok::identifier && "Already at tok::identifier"); TokenID = tok::identifier; RevertedTokenID = true; @@ -183,6 +183,18 @@ public: } void setObjCKeywordID(tok::ObjCKeywordKind ID) { ObjCOrBuiltinID = ID; } + /// \brief True if revertBuiltin() was called. + bool hasRevertedBuiltin() const { + return ObjCOrBuiltinID == tok::NUM_OBJC_KEYWORDS; + } + + /// \brief Revert the identifier to a non-builtin identifier. We do this if + /// the name of a known builtin library function is used to declare that + /// function, but an unexpected type is specified. 
+ void revertBuiltin() { + setBuiltinID(0); + } + /// \brief Return a value indicating whether this is a builtin function. /// /// 0 is not-built-in. 1 is builtin-for-some-nonprimary-target. diff --git a/clang/include/clang/Serialization/Module.h b/clang/include/clang/Serialization/Module.h index c98ced4..dae1c4a 100644 --- a/clang/include/clang/Serialization/Module.h +++ b/clang/include/clang/Serialization/Module.h @@ -476,6 +476,11 @@ public: /// any point during translation. bool isDirectlyImported() const { return DirectlyImported; } + /// \brief Is this a module file for a module (rather than a PCH or similar). + bool isModule() const { + return Kind == MK_ImplicitModule || Kind == MK_ExplicitModule; + } + /// \brief Dump debugging output for this module. void dump(); }; diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index e76f767..3c89df9 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -1489,7 +1489,7 @@ bool Parser::TryKeywordIdentFallback(bool DisableKeyword) { << PP.getSpelling(Tok) << DisableKeyword; if (DisableKeyword) - Tok.getIdentifierInfo()->RevertTokenIDToIdentifier(); + Tok.getIdentifierInfo()->revertTokenIDToIdentifier(); Tok.setKind(tok::identifier); return true; } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 2bf7356..042cffc 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3115,7 +3115,7 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, // remain visible, a single bogus local redeclaration (which is // actually only a warning) could break all the downstream code. 
if (!New->getLexicalDeclContext()->isFunctionOrMethod()) - New->getIdentifier()->setBuiltinID(Builtin::NotBuiltin); + New->getIdentifier()->revertBuiltin(); return false; } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 4bde2b5..1f47f57 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -735,10 +735,10 @@ ASTIdentifierLookupTraitBase::ReadKey(const unsigned char* d, unsigned n) { } /// \brief Whether the given identifier is "interesting". -static bool isInterestingIdentifier(IdentifierInfo &II) { +static bool isInterestingIdentifier(IdentifierInfo &II, bool IsModule) { return II.hadMacroDefinition() || II.isPoisoned() || - II.getObjCOrBuiltinID() || + (IsModule ? II.hasRevertedBuiltin() : II.getObjCOrBuiltinID()) || II.hasRevertedTokenIDToIdentifier() || II.getFETokenInfo(); } @@ -767,7 +767,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, } if (!II->isFromAST()) { II->setIsFromAST(); - if (isInterestingIdentifier(*II)) + if (isInterestingIdentifier(*II, F.isModule())) II->setChangedSinceDeserialization(); } Reader.markIdentifierUpToDate(II); @@ -784,6 +784,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, unsigned Bits = endian::readNext(d); bool CPlusPlusOperatorKeyword = readBit(Bits); bool HasRevertedTokenIDToIdentifier = readBit(Bits); + bool HasRevertedBuiltin = readBit(Bits); bool Poisoned = readBit(Bits); bool ExtensionToken = readBit(Bits); bool HadMacroDefinition = readBit(Bits); @@ -794,8 +795,15 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, // Set or check the various bits in the IdentifierInfo structure. // Token IDs are read-only. 
if (HasRevertedTokenIDToIdentifier && II->getTokenID() != tok::identifier) - II->RevertTokenIDToIdentifier(); - II->setObjCOrBuiltinID(ObjCOrBuiltinID); + II->revertTokenIDToIdentifier(); + if (!F.isModule()) + II->setObjCOrBuiltinID(ObjCOrBuiltinID); + else if (HasRevertedBuiltin && II->getBuiltinID()) { + II->revertBuiltin(); + assert((II->hasRevertedBuiltin() || + II->getObjCOrBuiltinID() == ObjCOrBuiltinID) && + "Incorrect ObjC keyword or builtin ID"); + } assert(II->isExtensionToken() == ExtensionToken && "Incorrect extension token flag"); (void)ExtensionToken; diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 13c6dcf..1c08cbf 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -3102,15 +3102,16 @@ class ASTIdentifierTableTrait { ASTWriter &Writer; Preprocessor &PP; IdentifierResolver &IdResolver; + bool IsModule; /// \brief Determines whether this is an "interesting" identifier that needs a /// full IdentifierInfo structure written into the hash table. Notably, this /// doesn't check whether the name has macros defined; use PublicMacroIterator /// to check that. - bool isInterestingIdentifier(IdentifierInfo *II, uint64_t MacroOffset) { + bool isInterestingIdentifier(const IdentifierInfo *II, uint64_t MacroOffset) { if (MacroOffset || II->isPoisoned() || - II->getObjCOrBuiltinID() || + (IsModule ? 
II->hasRevertedBuiltin() : II->getObjCOrBuiltinID()) || II->hasRevertedTokenIDToIdentifier() || II->getFETokenInfo()) return true; @@ -3129,13 +3130,17 @@ public: typedef unsigned offset_type; ASTIdentifierTableTrait(ASTWriter &Writer, Preprocessor &PP, - IdentifierResolver &IdResolver) - : Writer(Writer), PP(PP), IdResolver(IdResolver) {} + IdentifierResolver &IdResolver, bool IsModule) + : Writer(Writer), PP(PP), IdResolver(IdResolver), IsModule(IsModule) {} static hash_value_type ComputeHash(const IdentifierInfo* II) { return llvm::HashString(II->getName()); } + bool isInterestingNonMacroIdentifier(const IdentifierInfo *II) { + return isInterestingIdentifier(II, 0); + } + std::pair EmitKeyDataLength(raw_ostream& Out, IdentifierInfo* II, IdentID ID) { unsigned KeyLen = II->getLength() + 1; @@ -3192,6 +3197,7 @@ public: Bits = (Bits << 1) | unsigned(HadMacroDefinition); Bits = (Bits << 1) | unsigned(II->isExtensionToken()); Bits = (Bits << 1) | unsigned(II->isPoisoned()); + Bits = (Bits << 1) | unsigned(II->hasRevertedBuiltin()); Bits = (Bits << 1) | unsigned(II->hasRevertedTokenIDToIdentifier()); Bits = (Bits << 1) | unsigned(II->isCPlusPlusOperatorKeyword()); LE.write(Bits); @@ -3229,7 +3235,7 @@ void ASTWriter::WriteIdentifierTable(Preprocessor &PP, // strings. { llvm::OnDiskChainedHashTableGenerator Generator; - ASTIdentifierTableTrait Trait(*this, PP, IdResolver); + ASTIdentifierTableTrait Trait(*this, PP, IdResolver, IsModule); // Look for any identifiers that were named while processing the // headers, but are otherwise not needed. We add these to the hash @@ -3245,7 +3251,8 @@ void ASTWriter::WriteIdentifierTable(Preprocessor &PP, // that their order is stable. std::sort(IIs.begin(), IIs.end(), llvm::less_ptr()); for (const IdentifierInfo *II : IIs) - getIdentifierRef(II); + if (Trait.isInterestingNonMacroIdentifier(II)) + getIdentifierRef(II); // Create the on-disk hash table representation. 
We only store offsets // for identifiers that appear here for the first time. @@ -4444,6 +4451,7 @@ void ASTWriter::WriteASTCore(Sema &SemaRef, WriteHeaderSearch(PP.getHeaderSearchInfo()); WriteSelectors(SemaRef); WriteReferencedSelectorsPool(SemaRef); + WriteLateParsedTemplates(SemaRef); WriteIdentifierTable(PP, SemaRef.IdResolver, isModule); WriteFPPragmaOptions(SemaRef.getFPOptions()); WriteOpenCLExtensions(SemaRef); @@ -4559,7 +4567,6 @@ void ASTWriter::WriteASTCore(Sema &SemaRef, WriteDeclReplacementsBlock(); WriteRedeclarations(); WriteObjCCategories(); - WriteLateParsedTemplates(SemaRef); if(!WritingModule) WriteOptimizePragmaOptions(SemaRef); diff --git a/clang/test/Modules/empty.modulemap b/clang/test/Modules/empty.modulemap index 6350e2f..d451ad2 100644 --- a/clang/test/Modules/empty.modulemap +++ b/clang/test/Modules/empty.modulemap @@ -10,6 +10,12 @@ // RUN: -emit-module -fmodule-name=empty -o %t/check.pcm \ // RUN: %s // +// The module file should be identical each time we produce it. // RUN: diff %t/base.pcm %t/check.pcm +// +// We expect an empty module to be less than 30KB. +// REQUIRES: shell +// RUN: wc -c %t/base.pcm | FileCheck --check-prefix=CHECK-SIZE %s +// CHECK-SIZE: {{^[12][0-9]{4} }} module empty { header "Inputs/empty.h" export * } -- 2.7.4