From 5dc14bd54ccc3562379f598ccd1ab917a07166cc Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 2 Apr 2014 22:59:58 +0000 Subject: [PATCH] [CodeGen] Teach the peephole optimizer to remember (and exploit) all folding opportunities in the current basic block, rather than just the last one seen. llvm-svn: 205481 --- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 79 +++++++++++++----------- llvm/test/CodeGen/X86/peephole-multiple-folds.ll | 29 +++++++++ 2 files changed, 73 insertions(+), 35 deletions(-) create mode 100644 llvm/test/CodeGen/X86/peephole-multiple-folds.ll diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 23be3c8..ca8334b 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -133,7 +133,8 @@ namespace { bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, SmallSet &ImmDefRegs, DenseMap &ImmDefMIs); - bool isLoadFoldable(MachineInstr *MI, unsigned &FoldAsLoadDefReg); + bool isLoadFoldable(MachineInstr *MI, + SmallSet &FoldAsLoadDefCandidates); }; } @@ -489,8 +490,9 @@ bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) { /// isLoadFoldable - Check whether MI is a candidate for folding into a later /// instruction. We only fold loads to virtual registers and the virtual /// register defined has a single use. 
-bool PeepholeOptimizer::isLoadFoldable(MachineInstr *MI, - unsigned &FoldAsLoadDefReg) { +bool PeepholeOptimizer::isLoadFoldable( + MachineInstr *MI, + SmallSet &FoldAsLoadDefCandidates) { if (!MI->canFoldAsLoad() || !MI->mayLoad()) return false; const MCInstrDesc &MCID = MI->getDesc(); @@ -504,7 +506,7 @@ bool PeepholeOptimizer::isLoadFoldable(MachineInstr *MI, if (!MI->getOperand(0).getSubReg() && TargetRegisterInfo::isVirtualRegister(Reg) && MRI->hasOneNonDBGUse(Reg)) { - FoldAsLoadDefReg = Reg; + FoldAsLoadDefCandidates.insert(Reg); return true; } return false; @@ -570,18 +572,14 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - SmallPtrSet LocalMIs; - SmallSet ImmDefRegs; - DenseMap ImmDefMIs; - unsigned FoldAsLoadDefReg; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { MachineBasicBlock *MBB = &*I; bool SeenMoveImm = false; - LocalMIs.clear(); - ImmDefRegs.clear(); - ImmDefMIs.clear(); - FoldAsLoadDefReg = 0; + SmallPtrSet LocalMIs; + SmallSet ImmDefRegs; + DenseMap ImmDefMIs; + SmallSet FoldAsLoadDefCandidates; for (MachineBasicBlock::iterator MII = I->begin(), MIE = I->end(); MII != MIE; ) { @@ -595,15 +593,15 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { continue; // If there exists an instruction which belongs to the following - // categories, we will discard the load candidate. + // categories, we will discard the load candidates. 
if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() || MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) { - FoldAsLoadDefReg = 0; + FoldAsLoadDefCandidates.clear(); continue; } if (MI->mayStore() || MI->isCall()) - FoldAsLoadDefReg = 0; + FoldAsLoadDefCandidates.clear(); if (((MI->isBitcast() || MI->isCopy()) && optimizeCopyOrBitcast(MI)) || (MI->isCompare() && optimizeCmpInstr(MI, MBB)) || @@ -630,30 +628,41 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { // Check whether MI is a load candidate for folding into a later // instruction. If MI is not a candidate, check whether we can fold an // earlier load into MI. - if (!isLoadFoldable(MI, FoldAsLoadDefReg) && FoldAsLoadDefReg) { + if (!isLoadFoldable(MI, FoldAsLoadDefCandidates) && + !FoldAsLoadDefCandidates.empty()) { // We need to fold load after optimizeCmpInstr, since optimizeCmpInstr // can enable folding by converting SUB to CMP. // Save FoldAsLoadDefReg because optimizeLoadInstr() resets it and we // need it for markUsesInDebugValueAsUndef(). - unsigned FoldedReg = FoldAsLoadDefReg; - MachineInstr *DefMI = 0; - MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI, - FoldAsLoadDefReg, DefMI); - if (FoldMI) { - // Update LocalMIs since we replaced MI with FoldMI and deleted DefMI. - DEBUG(dbgs() << "Replacing: " << *MI); - DEBUG(dbgs() << " With: " << *FoldMI); - LocalMIs.erase(MI); - LocalMIs.erase(DefMI); - LocalMIs.insert(FoldMI); - MI->eraseFromParent(); - DefMI->eraseFromParent(); - MRI->markUsesInDebugValueAsUndef(FoldedReg); - ++NumLoadFold; - - // MI is replaced with FoldMI. 
- Changed = true; - continue; + const MCInstrDesc &MIDesc = MI->getDesc(); + for (unsigned i = MIDesc.getNumDefs(); i != MIDesc.getNumOperands(); + ++i) { + const MachineOperand &MOp = MI->getOperand(i); + if (!MOp.isReg()) + continue; + unsigned TryFoldReg = MOp.getReg(); + if (FoldAsLoadDefCandidates.count(TryFoldReg)) { + MachineInstr *DefMI = 0; + MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI, TryFoldReg, + DefMI); + if (FoldMI) { + // Update LocalMIs since we replaced MI with FoldMI and deleted + // DefMI. + DEBUG(dbgs() << "Replacing: " << *MI); + DEBUG(dbgs() << " With: " << *FoldMI); + LocalMIs.erase(MI); + LocalMIs.erase(DefMI); + LocalMIs.insert(FoldMI); + MI->eraseFromParent(); + DefMI->eraseFromParent(); + MRI->markUsesInDebugValueAsUndef(TryFoldReg); + FoldAsLoadDefCandidates.erase(TryFoldReg); + ++NumLoadFold; + // MI is replaced with FoldMI. + Changed = true; + break; + } + } } } } diff --git a/llvm/test/CodeGen/X86/peephole-multiple-folds.ll b/llvm/test/CodeGen/X86/peephole-multiple-folds.ll new file mode 100644 index 0000000..9598c9f --- /dev/null +++ b/llvm/test/CodeGen/X86/peephole-multiple-folds.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s +; +; Test multiple peephole-time folds in a single basic block. 
+; + +define <8 x float> @test_peephole_multi_fold(<8 x float>* %p1, <8 x float>* %p2) { +entry: + br label %loopbody + +loopbody: +; CHECK-LABEL: test_peephole_multi_fold: +; CHECK: vfmadd231ps (%rdi), +; CHECK: vfmadd231ps (%rsi), + %vsum1 = phi <8 x float> [ %vsum1.next, %loopbody ], [ zeroinitializer, %entry ] + %vsum2 = phi <8 x float> [ %vsum2.next, %loopbody ], [ zeroinitializer, %entry ] + %m1 = load <8 x float>* %p1, align 1 + %m2 = load <8 x float>* %p2, align 1 + %vsum1.next = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %m1, <8 x float> zeroinitializer, <8 x float> %vsum1) + %vsum2.next = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %m2, <8 x float> zeroinitializer, <8 x float> %vsum2) + %vsum1.next.1 = extractelement <8 x float> %vsum1.next, i32 0 + %c = fcmp oeq float %vsum1.next.1, 0.0 + br i1 %c, label %loopbody, label %loopexit + +loopexit: + %r = fadd <8 x float> %vsum1.next, %vsum2.next + ret <8 x float> %r +} + +declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) -- 2.7.4