From 8207641251706ea808df6d2a1ea8f87b8ee04c6d Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Thu, 4 Jun 2015 20:39:23 +0000 Subject: [PATCH] [GlobalMerge] Take into account minsize on Global users' parents. Now that we can look at users, we can trivially do this: when we would have otherwise disabled GlobalMerge (currently -O<3), we can just run it for minsize functions, as it's usually a codesize win. Differential Revision: http://reviews.llvm.org/D10054 llvm-svn: 239087 --- llvm/include/llvm/Transforms/Scalar.h | 9 ++- llvm/lib/CodeGen/GlobalMerge.cpp | 22 +++++-- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 10 ++- llvm/lib/Target/ARM/ARMTargetMachine.cpp | 9 ++- .../global-merge-ignore-single-use-minsize.ll | 74 ++++++++++++++++++++++ 5 files changed, 113 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index 293ceb1..481d4fc6 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -152,7 +152,14 @@ Pass *createLoopInterchangePass(); // Pass *createLoopStrengthReducePass(); -Pass *createGlobalMergePass(const TargetMachine *TM, unsigned MaximalOffset); +//===----------------------------------------------------------------------===// +// +// GlobalMerge - This pass merges internal (by default) globals into structs +// to enable reuse of a base pointer by indexed addressing modes. +// It can also be configured to focus on size optimizations only. +// +Pass *createGlobalMergePass(const TargetMachine *TM, unsigned MaximalOffset, + bool OnlyOptimizeForSize = false); //===----------------------------------------------------------------------===// // diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp index 79de175..df54a9c 100644 --- a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -124,6 +124,12 @@ namespace { // for more information. unsigned MaxOffset; + /// Whether we should try to optimize for size only. + /// Currently, this applies a dead simple heuristic: only consider globals + /// used in minsize functions for merging. + /// FIXME: This could learn about optsize, and be used in the cost model. + bool OnlyOptimizeForSize; + bool doMerge(SmallVectorImpl &Globals, Module &M, bool isConst, unsigned AddrSpace) const; /// \brief Merge everything in \p Globals for which the corresponding bit @@ -152,9 +158,10 @@ namespace { public: static char ID; // Pass identification, replacement for typeid. explicit GlobalMerge(const TargetMachine *TM = nullptr, - unsigned MaximalOffset = 0) + unsigned MaximalOffset = 0, + bool OnlyOptimizeForSize = false) : FunctionPass(ID), TM(TM), DL(TM->getDataLayout()), - MaxOffset(MaximalOffset) { + MaxOffset(MaximalOffset), OnlyOptimizeForSize(OnlyOptimizeForSize) { initializeGlobalMergePass(*PassRegistry::getPassRegistry()); } @@ -290,6 +297,12 @@ bool GlobalMerge::doMerge(SmallVectorImpl &Globals, continue; Function *ParentFn = I->getParent()->getParent(); + + // If we're only optimizing for size, ignore non-minsize functions. + if (OnlyOptimizeForSize && + !ParentFn->hasFnAttribute(Attribute::MinSize)) + continue; + size_t UGSIdx = GlobalUsesByFunction[ParentFn]; // If this is the first global the basic block uses, map it to the set @@ -585,6 +598,7 @@ bool GlobalMerge::doFinalization(Module &M) { return false; } -Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset) { - return new GlobalMerge(TM, Offset); +Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset, + bool OnlyOptimizeForSize) { + return new GlobalMerge(TM, Offset, OnlyOptimizeForSize); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index a9059ab..f23dd33 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -250,10 +250,14 @@ bool AArch64PassConfig::addPreISel() { // FIXME: On AArch64, this depends on the type. // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(). // and the offset has to be a multiple of the related size in bytes. - if ((TM->getOptLevel() == CodeGenOpt::Aggressive && + if ((TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge == cl::BOU_UNSET) || - EnableGlobalMerge == cl::BOU_TRUE) - addPass(createGlobalMergePass(TM, 4095)); + EnableGlobalMerge == cl::BOU_TRUE) { + bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) && + (EnableGlobalMerge == cl::BOU_UNSET); + addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize)); + } + if (TM->getOptLevel() != CodeGenOpt::None) addPass(createAArch64AddressTypePromotionPass()); diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index e794fb7..0f98d52 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -339,15 +339,18 @@ void ARMPassConfig::addIRPasses() { } bool ARMPassConfig::addPreISel() { - if ((TM->getOptLevel() == CodeGenOpt::Aggressive && + if ((TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge == cl::BOU_UNSET) || - EnableGlobalMerge == cl::BOU_TRUE) + EnableGlobalMerge == cl::BOU_TRUE) { // FIXME: This is using the thumb1 only constant value for // maximal global offset for merging globals. We may want // to look into using the old value for non-thumb1 code of // 4095 based on the TargetMachine, but this starts to become // tricky when doing code gen per function. - addPass(createGlobalMergePass(TM, 127)); + bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) && + (EnableGlobalMerge == cl::BOU_UNSET); + addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize)); + } return false; } diff --git a/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll b/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll new file mode 100644 index 0000000..e83cbab --- /dev/null +++ b/llvm/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll @@ -0,0 +1,74 @@ +; RUN: llc -mtriple=aarch64-apple-ios -asm-verbose=false -aarch64-collect-loh=false \ +; RUN: -O1 -global-merge-group-by-use -global-merge-ignore-single-use \ +; RUN: %s -o - | FileCheck %s + +; Check that, at -O1, we only merge globals used in minsize functions. +; We assume that globals of the same size aren't reordered inside a set. +; We use -global-merge-ignore-single-use, and thus only expect one merged set. + +@m1 = internal global i32 0, align 4 +@n1 = internal global i32 0, align 4 + +; CHECK-LABEL: f1: +define void @f1(i32 %a1, i32 %a2) minsize nounwind { +; CHECK-NEXT: adrp x8, [[SET:__MergedGlobals]]@PAGE +; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF +; CHECK-NEXT: stp w0, w1, [x8] +; CHECK-NEXT: ret + store i32 %a1, i32* @m1, align 4 + store i32 %a2, i32* @n1, align 4 + ret void +} + +@m2 = internal global i32 0, align 4 +@n2 = internal global i32 0, align 4 + +; CHECK-LABEL: f2: +define void @f2(i32 %a1, i32 %a2) nounwind { +; CHECK-NEXT: adrp x8, _m2@PAGE +; CHECK-NEXT: adrp x9, _n2@PAGE +; CHECK-NEXT: str w0, [x8, _m2@PAGEOFF] +; CHECK-NEXT: str w1, [x9, _n2@PAGEOFF] +; CHECK-NEXT: ret + store i32 %a1, i32* @m2, align 4 + store i32 %a2, i32* @n2, align 4 + ret void +} + +; If we have use sets partially overlapping between a minsize and a non-minsize +; function, explicitly check that we only consider the globals used in the +; minsize function for merging. + +@m3 = internal global i32 0, align 4 +@n3 = internal global i32 0, align 4 + +; CHECK-LABEL: f3: +define void @f3(i32 %a1, i32 %a2) minsize nounwind { +; CHECK-NEXT: adrp x8, [[SET]]@PAGE +; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF +; CHECK-NEXT: stp w0, w1, [x8, #8] +; CHECK-NEXT: ret + store i32 %a1, i32* @m3, align 4 + store i32 %a2, i32* @n3, align 4 + ret void +} + +@n4 = internal global i32 0, align 4 + +; CHECK-LABEL: f4: +define void @f4(i32 %a1, i32 %a2) nounwind { +; CHECK-NEXT: adrp x8, [[SET]]@PAGE +; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF +; CHECK-NEXT: adrp x9, _n4@PAGE +; CHECK-NEXT: str w0, [x8, #8] +; CHECK-NEXT: str w1, [x9, _n4@PAGEOFF] +; CHECK-NEXT: ret + store i32 %a1, i32* @m3, align 4 + store i32 %a2, i32* @n4, align 4 + ret void +} + +; CHECK-DAG: .zerofill __DATA,__bss,[[SET]],16,3 +; CHECK-DAG: .zerofill __DATA,__bss,_m2,4,2 +; CHECK-DAG: .zerofill __DATA,__bss,_n2,4,2 +; CHECK-DAG: .zerofill __DATA,__bss,_n4,4,2 -- 2.7.4