}
}
+ // If we have a cold call site, try to sink addressing computation into the
+ // cold block. This interacts with our handling for loads and stores to
+ // ensure that we can fold a potential addressing computation into all of
+ // its uses. TODO: generalize this to work over profiling data
+ if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+ for (auto &Arg : CI->arg_operands()) {
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ unsigned AS = Arg->getType()->getPointerAddressSpace();
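+ // Note that we only attempt this for the first pointer argument and then
+ // return, so any remaining pointer arguments are left untouched.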
+ return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
+ }
+
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
if (II) {
switch (II->getIntrinsicID()) {
if (!MightBeFoldableInst(I))
return true;
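+ // Record whether this function is optimized for size: if it is, cold call
+ // sites below are not treated as sinkable uses, since sinking duplicates
+ // the addressing computation and costs code size.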
+ const bool OptSize = I->getFunction()->optForSize();
+
// Loop over all the uses, recursively processing them.
for (Use &U : I->uses()) {
Instruction *UserI = cast<Instruction>(U.getUser());
}
if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
+ // If this is a cold call, we can sink the addressing calculation into
+ // the cold path. See optimizeCallInst.
+ if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+ continue;
+
InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
if (!IA) return true;
if (!BaseReg && !ScaledReg)
return true;
- // If all uses of this instruction are ultimately load/store/inlineasm's,
- // check to see if their addressing modes will include this instruction. If
- // so, we can fold it into all uses, so it doesn't matter if it has multiple
- // uses.
+ // If all uses of this instruction can have the address mode sunk into them,
+ // we can remove the addressing mode and effectively trade one live register
+ // for another (at worst). In this context, folding an addressing mode into
+ // the use is just a particularly nice way of sinking it.
SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
SmallPtrSet<Instruction*, 16> ConsideredInsts;
if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
// Now that we know that all uses of this instruction are part of a chain of
// computation involving only operations that could theoretically be folded
- // into a memory use, loop over each of these uses and see if they could
- // *actually* fold the instruction.
+ // into a memory use, loop over each of these memory operation uses and see
+ // if they could *actually* fold the instruction. The assumption is that
+ // addressing modes are cheap and that duplicating the computation involved
+ // many times is worthwhile, even on a fast path. For sinking candidates
+ // (i.e. cold call sites), this serves as a way to prevent excessive code
+ // growth, since most architectures have some reasonably small and fast way
+ // to compute an effective address (e.g. LEA on x86).
SmallVector<Instruction*, 32> MatchedAddrModeInsts;
for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
Instruction *User = MemoryUses[i].first;
return false;
}
+/// Sink addressing mode computation immediately before MemoryInst if doing so
+/// can be done without increasing register pressure. The need for the
+/// register pressure constraint means this can end up being an all-or-nothing
+/// decision for all uses of the same addressing computation.
+///
/// Load and Store Instructions often have addressing modes that can do
/// significant amounts of computation. As such, instruction selection will try
/// to get the load or store to do as much computation as possible for the
/// program. The problem is that isel can only see within a single block. As
/// such, we sink as much legal addressing mode work into the block as possible.
///
/// This method is used to optimize both load/store and inline asms with memory
-/// operands.
+/// operands. It's also used to sink addressing computations that feed cold
+/// call sites into their (cold) basic block.
+///
+/// The motivation for handling sinking into cold blocks is that doing so can
+/// both enable other address mode sinking (by satisfying the register pressure
+/// constraint above) and reduce register pressure globally (by removing the
+/// addressing mode computation from the fast path entirely).
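+///
+/// For example (illustrative IR only): given
+///   %addr = getelementptr inbounds i64, i64* %base, i64 5
+/// whose only memory use is in a different block, the 40-byte offset
+/// computation is recreated immediately before that use, so it can be folded
+/// into the use's addressing mode and the original computation no longer has
+/// to be kept live across blocks.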
bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Type *AccessTy, unsigned AddrSpace) {
Value *Repl = Addr;
continue;
}
- // For non-PHIs, determine the addressing mode being computed.
+ // For non-PHIs, determine the addressing mode being computed. Note that
+ // the result may differ depending on what other uses our candidate
+ // addressing instructions might have.
SmallVector<Instruction*, 16> NewAddrModeInsts;
ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,
--- /dev/null
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Can we sink a single addressing mode computation to its use?
+define void @test1(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test1
+; CHECK: add i64 {{.+}}, 40
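+; (an i64 gep with index 5 is a 40 byte offset, so the sunk address
+; computation shows up as an add of 40)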
+entry:
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+ %v = load i32, i32* %casted, align 4
+ br label %fallthrough
+
+fallthrough:
+ ret void
+}
+
+declare void @foo(i32)
+
+; Make sure sinking two copies of the addressing mode into different blocks works
+define void @test2(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test2
+entry:
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+ %cmp = icmp eq i32 %v1, 0
+ br i1 %cmp, label %next, label %fallthrough
+
+next:
+; CHECK-LABEL: next:
+; CHECK: add i64 {{.+}}, 40
+ %v2 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v2)
+ br label %fallthrough
+
+fallthrough:
+ ret void
+}
+
+; If we have two loads in the same block, we only need one copy of the
+; addressing mode - instruction selection will duplicate it if needed.
+define void @test3(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test3
+entry:
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+; CHECK-NOT: add i64 {{.+}}, 40
+ %v2 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v2)
+ br label %fallthrough
+
+fallthrough:
+ ret void
+}
+
+; Can we still sink the addressing mode if there's a cold use of the
+; address itself?
+define void @test4(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test4
+entry:
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+ %cmp = icmp eq i32 %v1, 0
+ br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+ ret void
+
+rare.1:
+; CHECK-LABEL: rare.1:
+; CHECK: add i64 {{.+}}, 40
+ call void @slowpath(i32 %v1, i32* %casted) cold
+ br label %fallthrough
+}
+
+; Negative test - don't duplicate the addressing computation into the hot path
+define void @test5(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test5
+entry:
+; CHECK: %addr = getelementptr
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+ %cmp = icmp eq i32 %v1, 0
+ br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+ ret void
+
+rare.1:
+ call void @slowpath(i32 %v1, i32* %casted) ;; NOT COLD
+ br label %fallthrough
+}
+
+; Negative test - don't sink when optimizing for size
+define void @test6(i1 %cond, i64* %base) minsize {
+; CHECK-LABEL: @test6
+entry:
+; CHECK: %addr = getelementptr
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+ %cmp = icmp eq i32 %v1, 0
+ br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+ ret void
+
+rare.1:
+ call void @slowpath(i32 %v1, i32* %casted) cold
+ br label %fallthrough
+}
+
+
+; Make sure sinking two copies of the addressing mode into different blocks works
+; when there are cold paths for each.
+define void @test7(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test7
+entry:
+ %addr = getelementptr inbounds i64, i64* %base, i64 5
+ %casted = bitcast i64* %addr to i32*
+ br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+ %v1 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v1)
+ %cmp = icmp eq i32 %v1, 0
+ br i1 %cmp, label %rare.1, label %next
+
+next:
+; CHECK-LABEL: next:
+; CHECK: add i64 {{.+}}, 40
+ %v2 = load i32, i32* %casted, align 4
+ call void @foo(i32 %v2)
+ %cmp2 = icmp eq i32 %v2, 0
+ br i1 %cmp2, label %rare.2, label %fallthrough
+
+fallthrough:
+ ret void
+
+rare.1:
+; CHECK-LABEL: rare.1:
+; CHECK: add i64 {{.+}}, 40
+ call void @slowpath(i32 %v1, i32* %casted) cold
+ br label %next
+
+rare.2:
+; CHECK-LABEL: rare.2:
+; CHECK: add i64 {{.+}}, 40
+ call void @slowpath(i32 %v2, i32* %casted) cold
+ br label %fallthrough
+}
+
+
+declare void @slowpath(i32, i32*)