}
}
+ // If we have a cold call site, try to sink addressing computation into the
+ // cold block. This interacts with our handling for loads and stores to
+ // ensure that we can fold a potential addressing computation into all of
+ // its uses. TODO: generalize this to work over profiling data
+ if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+ for (auto &Arg : CI->arg_operands()) {
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ unsigned AS = Arg->getType()->getPointerAddressSpace();
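+ // Note that we only attempt this for the first pointer argument and then
+ // return, so any remaining pointer arguments are left untouched.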
+ return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
+ }
+
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
if (II) {
switch (II->getIntrinsicID()) {
if (!MightBeFoldableInst(I))
return true;
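+ // Record whether this function is optimized for size: if it is, cold call
+ // sites below are not treated as sinkable uses, since sinking duplicates
+ // the addressing computation and costs code size.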
+ const bool OptSize = I->getFunction()->optForSize();
+
// Loop over all the uses, recursively processing them.
for (Use &U : I->uses()) {
Instruction *UserI = cast<Instruction>(U.getUser());
}
if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
+ // If this is a cold call, we can sink the addressing calculation into
+ // the cold path. See optimizeCallInst.
+ if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+ continue;
+
InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
if (!IA) return true;
if (!BaseReg && !ScaledReg)
return true;
- // If all uses of this instruction are ultimately load/store/inlineasm's,
- // check to see if their addressing modes will include this instruction. If
- // so, we can fold it into all uses, so it doesn't matter if it has multiple
- // uses.
+ // If all uses of this instruction can have the address mode sunk into them,
+ // we can remove the addressing mode and effectively trade one live register
+ // for another (at worst). In this context, folding an addressing mode into
+ // the use is just a particularly nice way of sinking it.
SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
SmallPtrSet<Instruction*, 16> ConsideredInsts;
if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
// Now that we know that all uses of this instruction are part of a chain of
// computation involving only operations that could theoretically be folded
- // into a memory use, loop over each of these uses and see if they could
- // *actually* fold the instruction.
+ // into a memory use, loop over each of these memory operation uses and see
+ // if they could *actually* fold the instruction. The assumption is that
+ // addressing modes are cheap and that duplicating the computation involved
+ // many times is worthwhile, even on a fast path. For sinking candidates
+ // (i.e. cold call sites), this serves as a way to prevent excessive code
+ // growth, since most architectures have some reasonably small and fast way
+ // to compute an effective address (e.g. LEA on x86).
SmallVector<Instruction*, 32> MatchedAddrModeInsts;
for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
Instruction *User = MemoryUses[i].first;
return false;
}
+/// Sink addressing mode computation immediately before MemoryInst if doing so
+/// can be done without increasing register pressure. The need for the
+/// register pressure constraint means this can end up being an all-or-nothing
+/// decision for all uses of the same addressing computation.
+///
/// Load and Store Instructions often have addressing modes that can do
/// significant amounts of computation. As such, instruction selection will try
/// to get the load or store to do as much computation as possible for the
/// program. The problem is that isel can only see within a single block. As
/// such, we sink as much legal addressing mode work into the block as possible.
///
/// This method is used to optimize both load/store and inline asms with memory
-/// operands.
+/// operands. It's also used to sink addressing computations that feed cold
+/// call sites into their (cold) basic block.
+///
+/// The motivation for handling sinking into cold blocks is that doing so can
+/// both enable other address mode sinking (by satisfying the register pressure
+/// constraint above) and reduce register pressure globally (by removing the
+/// addressing mode computation from the fast path entirely).
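+///
+/// For example (illustrative IR only): given
+///   %addr = getelementptr inbounds i64, i64* %base, i64 5
+/// whose only memory use is in a different block, the 40-byte offset
+/// computation is recreated immediately before that use, so it can be folded
+/// into the use's addressing mode and the original computation no longer has
+/// to be kept live across blocks.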
bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Type *AccessTy, unsigned AddrSpace) {
Value *Repl = Addr;
continue;
}
- // For non-PHIs, determine the addressing mode being computed.
+ // For non-PHIs, determine the addressing mode being computed. Note that
+ // the result may differ depending on what other uses our candidate
+ // addressing instructions might have.
SmallVector<Instruction*, 16> NewAddrModeInsts;
ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,
--- /dev/null
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Can we sink a single addressing mode computation to its use?
+define void @test1(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test1
+; CHECK: add i64 {{.+}}, 40
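+; (an i64 gep with index 5 is a 40 byte offset, so the sunk address
+; computation shows up as an add of 40)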
+entry:
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %v = load i32, i32* %casted, align 4
+ br label %fallthrough
+
+fallthrough:
+ ret void
+}
+
+declare void @foo(i32)
+
+; Make sure sinking two copies of the addressing mode into different blocks works
+define void @test2(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test2
+entry:
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+ %cmp = icmp eq i32 %v1, 0
+ br i1 %cmp, label %next, label %fallthrough
+
+next:
+; CHECK-LABEL: next:
+; CHECK: add i64 {{.+}}, 40
+ %v2 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v2)
+ br label %fallthrough
+
+fallthrough:
+ ret void
+}
+
+; If we have two loads in the same block, we only need one copy of the
+; addressing mode - instruction selection will duplicate it if needed.
+define void @test3(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test3
+entry:
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+; CHECK-NOT: add i64 {{.+}}, 40
+ %v2 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v2)
+ br label %fallthrough
+
+fallthrough:
+ ret void
+}
+
+; Can we still sink the addressing mode if there's a cold use of the
+; address itself?
+define void @test4(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test4
+entry:
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+ %cmp = icmp eq i32 %v1, 0
+ br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+ ret void
+
+rare.1:
+; CHECK-LABEL: rare.1:
+; CHECK: add i64 {{.+}}, 40
+ call void @slowpath(i32 %v1, i32* %casted) cold
+ br label %fallthrough
+}
+
+; Negative test - don't duplicate the addressing computation into the hot path
+define void @test5(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test5
+entry:
+; CHECK: %addr = getelementptr
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+ %cmp = icmp eq i32 %v1, 0
+ br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+ ret void
+
+rare.1:
+ call void @slowpath(i32 %v1, i32* %casted) ;; NOT COLD
+ br label %fallthrough
+}
+
+; Negative test - don't sink when optimizing for size
+define void @test6(i1 %cond, i64* %base) minsize {
+; CHECK-LABEL: @test6
+entry:
+; CHECK: %addr = getelementptr
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+ %cmp = icmp eq i32 %v1, 0
+ br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+ ret void
+
+rare.1:
+ call void @slowpath(i32 %v1, i32* %casted) cold
+ br label %fallthrough
+}
+
+
+; Make sure sinking two copies of the addressing mode into different blocks works
+; when there are cold paths for each.
+define void @test7(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test7
+entry:
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+ %cmp = icmp eq i32 %v1, 0
+ br i1 %cmp, label %rare.1, label %next
+
+next:
+; CHECK-LABEL: next:
+; CHECK: add i64 {{.+}}, 40
+ %v2 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v2)
+ %cmp2 = icmp eq i32 %v2, 0
+ br i1 %cmp2, label %rare.2, label %fallthrough
+
+fallthrough:
+ ret void
+
+rare.1:
+; CHECK-LABEL: rare.1:
+; CHECK: add i64 {{.+}}, 40
+ call void @slowpath(i32 %v1, i32* %casted) cold
+ br label %next
+
+rare.2:
+; CHECK-LABEL: rare.2:
+; CHECK: add i64 {{.+}}, 40
+ call void @slowpath(i32 %v2, i32* %casted) cold
+ br label %fallthrough
+}
+
+
+declare void @slowpath(i32, i32*)