[peephole] Enhance folding logic to work for STATEPOINTs

author Philip Reames <listmail@philipreames.com>
Tue, 13 Dec 2016 01:38:41 +0000 (01:38 +0000)
committer Philip Reames <listmail@philipreames.com>
Tue, 13 Dec 2016 01:38:41 +0000 (01:38 +0000)
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp

index 3cf67ee..1b8106d 100644 (file)
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1540,11 +1540,6 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
        if (MI->isDebugValue())
            continue;
  
-      // If we run into an instruction we can't fold across, discard
-      // the load candidates.
-      if (MI->isLoadFoldBarrier())
-        FoldAsLoadDefCandidates.clear();
-
        if (MI->isPosition() || MI->isPHI())
          continue;
  
@@ -1588,7 +1583,6 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
          DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI
                       << '\n');
          NAPhysToVirtMIs.clear();
-        continue;
        }
  
        if ((isUncoalescableCopy(*MI) &&
@@ -1639,8 +1633,14 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
        // earlier load into MI.
        if (!isLoadFoldable(MI, FoldAsLoadDefCandidates) &&
            !FoldAsLoadDefCandidates.empty()) {
+
+        // We visit each operand even after successfully folding a previous
+        // one.  This allows us to fold multiple loads into a single
+        // instruction.  We do assume that optimizeLoadInstr doesn't insert
+        // foldable uses earlier in the argument list.  Since we don't restart
+        // iteration, we'd miss such cases.
          const MCInstrDesc &MIDesc = MI->getDesc();
-        for (unsigned i = MIDesc.getNumDefs(); i != MIDesc.getNumOperands();
+        for (unsigned i = MIDesc.getNumDefs(); i != MI->getNumOperands();
               ++i) {
            const MachineOperand &MOp = MI->getOperand(i);
            if (!MOp.isReg())
@@ -1667,13 +1667,23 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
                MRI->markUsesInDebugValueAsUndef(FoldedReg);
                FoldAsLoadDefCandidates.erase(FoldedReg);
                ++NumLoadFold;
-              // MI is replaced with FoldMI.
+              
+              // MI is replaced with FoldMI so we can keep folding operands.
                Changed = true;
-              break;
+              MI = FoldMI;
              }
            }
          }
        }
+      
+      // If we run into an instruction we can't fold across, discard
+      // the load candidates.  Note: We might be able to fold *into* this
+      // instruction, so this needs to be after the folding logic.
+      if (MI->isLoadFoldBarrier()) {
+        DEBUG(dbgs() << "Encountered load fold barrier on " << *MI << "\n");
+        FoldAsLoadDefCandidates.clear();
+      }
+
      }
    }
  
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp

index ca0004a..dfd88e4 100644 (file)
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6576,14 +6576,6 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
                                                const MachineRegisterInfo *MRI,
                                                unsigned &FoldAsLoadDefReg,
                                                MachineInstr *&DefMI) const {
-  if (FoldAsLoadDefReg == 0)
-    return nullptr;
-  // To be conservative, if there exists another load, clear the load candidate.
-  if (MI.mayLoad()) {
-    FoldAsLoadDefReg = 0;
-    return nullptr;
-  }
-
    // Check whether we can move DefMI here.
    DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
    assert(DefMI);
@@ -6592,27 +6584,24 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
      return nullptr;
  
    // Collect information about virtual register operands of MI.
-  unsigned SrcOperandId = 0;
-  bool FoundSrcOperand = false;
-  for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+  SmallVector<unsigned, 1> SrcOperandIds;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      MachineOperand &MO = MI.getOperand(i);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (Reg != FoldAsLoadDefReg)
        continue;
-    // Do not fold if we have a subreg use or a def or multiple uses.
-    if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+    // Do not fold if we have a subreg use or a def.
+    if (MO.getSubReg() || MO.isDef())
        return nullptr;
-
-    SrcOperandId = i;
-    FoundSrcOperand = true;
+    SrcOperandIds.push_back(i);
    }
-  if (!FoundSrcOperand)
+  if (SrcOperandIds.empty())
      return nullptr;
  
    // Check whether we can fold the def into SrcOperandId.
-  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) {
+  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
      FoldAsLoadDefReg = 0;
      return FoldMI;
    }
diff --git a/llvm/test/CodeGen/X86/anyregcc.ll b/llvm/test/CodeGen/X86/anyregcc.ll

index 018a92a..1b51b53 100644 (file)
--- a/llvm/test/CodeGen/X86/anyregcc.ll
+++ b/llvm/test/CodeGen/X86/anyregcc.ll
@@ -34,7 +34,7 @@
  ; CHECK-NEXT:   .quad 56
  ; CHECK-NEXT:   .quad 1
  ; CHECK-NEXT:   .quad _anyreg_test2
-; CHECK-NEXT:   .quad 56
+; CHECK-NEXT:   .quad 8
  ; CHECK-NEXT:   .quad 1
  ; CHECK-NEXT:   .quad _patchpoint_spilldef
  ; CHECK-NEXT:   .quad 56
@@ -272,31 +272,31 @@ entry:
  ; CHECK-NEXT:   .byte 8
  ; CHECK-NEXT:   .short {{[0-9]+}}
  ; CHECK-NEXT:   .long 0
-; Loc 9: Register
-; CHECK-NEXT:   .byte 1
-; CHECK-NEXT:   .byte 8
-; CHECK-NEXT:   .short {{[0-9]+}}
-; CHECK-NEXT:   .long 0
-; Loc 10: Register
-; CHECK-NEXT:   .byte 1
-; CHECK-NEXT:   .byte 8
-; CHECK-NEXT:   .short {{[0-9]+}}
-; CHECK-NEXT:   .long 0
-; Loc 11: Register
-; CHECK-NEXT:   .byte 1
-; CHECK-NEXT:   .byte 8
-; CHECK-NEXT:   .short {{[0-9]+}}
-; CHECK-NEXT:   .long 0
-; Loc 12: Register
-; CHECK-NEXT:   .byte 1
-; CHECK-NEXT:   .byte 8
-; CHECK-NEXT:   .short {{[0-9]+}}
-; CHECK-NEXT:   .long 0
-; Loc 13: Register
-; CHECK-NEXT:   .byte 1
-; CHECK-NEXT:   .byte 8
-; CHECK-NEXT:   .short {{[0-9]+}}
-; CHECK-NEXT:   .long 0
+; Loc 9: Argument, still on stack
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 10: Argument, still on stack
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 11: Argument, still on stack
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 12: Argument, still on stack
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 13: Argument, still on stack
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
  define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
  entry:
    %f = inttoptr i64 12297829382473034410 to i8*
diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll

index fe38462..9818d35 100644 (file)
--- a/llvm/test/CodeGen/X86/stackmap.ll
+++ b/llvm/test/CodeGen/X86/stackmap.ll
@@ -38,10 +38,10 @@
  ; CHECK-NEXT:   .quad 8
  ; CHECK-NEXT:   .quad 1
  ; CHECK-NEXT:   .quad _spilledValue
-; CHECK-NEXT:   .quad 56
+; CHECK-NEXT:   .quad 8
  ; CHECK-NEXT:   .quad 1
  ; CHECK-NEXT:   .quad _spilledStackMapValue
-; CHECK-NEXT:   .quad 56
+; CHECK-NEXT:   .quad 8
  ; CHECK-NEXT:   .quad 1
  ; CHECK-NEXT:   .quad _spillSubReg
  ; CHECK-NEXT:   .quad 56
diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll

index a1718f0..b236393 100644 (file)
--- a/llvm/test/CodeGen/X86/statepoint-live-in.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll
@@ -34,35 +34,24 @@ define void @test3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
  entry:
  ; TODO: We should have folded the reload into the statepoint.
  ; CHECK-LABEL: @test3
-; CHECK:               movl    32(%rsp), %r10d
-; CHECK-NEXT:  movl    24(%rsp), %r11d
-; CHECK-NEXT:   movl   16(%rsp), %eax
+; CHECK:               pushq %rax
+; CHECK-NEXT:  Lcfi
+; CHECK-NEXT:   .cfi_def_cfa_offset 16
  ; CHECK-NEXT:   callq  _bar
    %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
    ret void
  }
  
  ; This case just confirms that we don't crash when given more live values
-; than registers.  This is a case where we *have* to use a stack slot.
+; than registers.  This is a case where we *have* to use a stack slot.  This
+; also ends up being a good test of whether we can fold loads from immutable
+; stack slots into the statepoint.
  define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
  entry:
-; TODO: We should have folded the reload into the statepoint.  
  ; CHECK-LABEL: test4
-; CHECK: pushq %r15
-; CHECK: pushq %r14
-; CHECK: pushq %r13
-; CHECK: pushq %r12
-; CHECK: pushq %rbx
-; CHECK: pushq %rax
-; CHECK:               movl    128(%rsp), %r13d
-; CHECK-NEXT:   movl   120(%rsp), %r12d
-; CHECK-NEXT:   movl   112(%rsp), %r15d
-; CHECK-NEXT:   movl   104(%rsp), %r14d
-; CHECK-NEXT:   movl   96(%rsp), %ebp
-; CHECK-NEXT:   movl   88(%rsp), %ebx
-; CHECK-NEXT:   movl   80(%rsp), %r11d
-; CHECK-NEXT:   movl   72(%rsp), %r10d
-; CHECK-NEXT:   movl   64(%rsp), %eax
+; CHECK:        pushq %rax
+; CHECK-NEXT:  Lcfi
+; CHECK-NEXT:   .cfi_def_cfa_offset 16
  ; CHECK-NEXT:   callq  _bar
    %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
    ret void
@@ -90,7 +79,7 @@ entry:
  ; CHECK:        movl %edi, %ebx
  ; CHECK:        movl %ebx, 12(%rsp)
  ; CHECK-NEXT:   callq  _baz
-; CHECK-NEXT:  Ltmp6:
+; CHECK-NEXT:  Ltmp
  ; CHECK-NEXT:   callq  _bar
    call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
    call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
author	Philip Reames <listmail@philipreames.com>
	Tue, 13 Dec 2016 01:38:41 +0000 (01:38 +0000)
committer	Philip Reames <listmail@philipreames.com>
	Tue, 13 Dec 2016 01:38:41 +0000 (01:38 +0000)
llvm/lib/CodeGen/PeepholeOptimizer.cpp		patch \| blob \| history
llvm/lib/Target/X86/X86InstrInfo.cpp		patch \| blob \| history
llvm/test/CodeGen/X86/anyregcc.ll		patch \| blob \| history
llvm/test/CodeGen/X86/stackmap.ll		patch \| blob \| history
llvm/test/CodeGen/X86/statepoint-live-in.ll		patch \| blob \| history