From 7c8a3b0ef6172a919d760a717d819069a4b55c71 Mon Sep 17 00:00:00 2001 From: Brendon Cahoon Date: Thu, 14 May 2015 20:36:19 +0000 Subject: [PATCH] [Hexagon] Generate hardware loop for a vectorized loop The induction variable in the vectorized loop wasn't recognized properly, so a hardware loop wasn't generated. Differential Revision: http://reviews.llvm.org/D9722 llvm-svn: 237388 --- llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp | 53 ++++++++++++-- llvm/test/CodeGen/Hexagon/hwloop5.ll | 93 ++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/hwloop5.ll diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index a4cd1f1..db72899 100644 --- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -1719,7 +1719,52 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { // compared against an immediate, we can fix it. const RegisterBump &RB = I->second; if (CmpRegs.count(RB.first)) { - if (!CmpImmOp) + if (!CmpImmOp) { + // If both operands to the compare instruction are registers, see if + // it can be changed to use induction register as one of the operands. + MachineInstr *IndI = nullptr; + MachineInstr *nonIndI = nullptr; + MachineOperand *IndMO = nullptr; + MachineOperand *nonIndMO = nullptr; + + for (unsigned i = 1, n = PredDef->getNumOperands(); i < n; ++i) { + MachineOperand &MO = PredDef->getOperand(i); + if (MO.isReg() && MO.getReg() == RB.first) { + DEBUG(dbgs() << "\n DefMI(" << i << ") = " + << *(MRI->getVRegDef(I->first))); + if (IndI) + return false; + + IndI = MRI->getVRegDef(I->first); + IndMO = &MO; + } else if (MO.isReg()) { + DEBUG(dbgs() << "\n DefMI(" << i << ") = " + << *(MRI->getVRegDef(MO.getReg()))); + if (nonIndI) + return false; + + nonIndI = MRI->getVRegDef(MO.getReg()); + nonIndMO = &MO; + } + } + if (IndI && nonIndI && + nonIndI->getOpcode() == Hexagon::A2_addi && + nonIndI->getOperand(2).isImm() && + nonIndI->getOperand(2).getImm() == - RB.second) { + bool Order = orderBumpCompare(IndI, PredDef); + if (Order) { + IndMO->setReg(I->first); + nonIndMO->setReg(nonIndI->getOperand(1).getReg()); + return true; + } + } + return false; + } + + // It is not valid to do this transformation on an unsigned comparison + // because it may underflow. + Comparison::Kind Cmp = getComparisonKind(PredDef->getOpcode(), 0, 0, 0); + if (!Cmp || Comparison::isUnsigned(Cmp)) return false; // If the register is being compared against an immediate, try changing @@ -1739,12 +1784,6 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { if (!isImmValidForOpcode(PredDef->getOpcode(), CmpImm)) return false; - // It is not valid to do this transformation on an unsigned comparison - // because it may underflow. - Comparison::Kind Cmp = getComparisonKind(PredDef->getOpcode(), 0, 0, 0); - if (!Cmp || Comparison::isUnsigned(Cmp)) - return false; - // Make sure that the compare happens after the bump. Otherwise, // after the fixup, the compare would use a yet-undefined register. MachineInstr *BumpI = MRI->getVRegDef(I->first); diff --git a/llvm/test/CodeGen/Hexagon/hwloop5.ll b/llvm/test/CodeGen/Hexagon/hwloop5.ll new file mode 100644 index 0000000..0886b03 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/hwloop5.ll @@ -0,0 +1,93 @@ +; RUN: llc -O3 -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s +; +; Generate hardware loop when unknown trip count loop is vectorized. + +; CHECK: loop0(.LBB{{[0-9]*}}_{{[0-9]*}}, r{{[0-9]+}}) +; CHECK: endloop0 +; CHECK: loop0(.LBB{{[0-9]*}}_{{[0-9]*}}, r{{[0-9]+}}) +; CHECK: endloop0 + +@A = common global [1000 x i32] zeroinitializer, align 8 +@B = common global [1000 x i32] zeroinitializer, align 8 + +define i32 @dotprod2(i32 %count) #0 { +entry.split: + %cmp6 = icmp sgt i32 %count, 0 + br i1 %cmp6, label %polly.cond, label %for.end + +for.end.loopexit: + br label %for.end + +for.end: + %sum.0.lcssa.reg2mem.0.load37 = phi i32 [ 0, %entry.split ], [ %p_add34, %polly.loop_if13 ], [ %p_add, %for.end.loopexit ] + ret i32 %sum.0.lcssa.reg2mem.0.load37 + +polly.cond: + %0 = icmp sgt i32 %count, 1 + br i1 %0, label %polly.loop_if, label %polly.loop_if13 + +polly.loop_exit.loopexit: + br label %polly.loop_exit + +polly.loop_exit: + %1 = phi <2 x i32> [ zeroinitializer, %polly.loop_if ], [ %addp_vec, %polly.loop_exit.loopexit ] + %2 = extractelement <2 x i32> %1, i32 0 + %3 = extractelement <2 x i32> %1, i32 1 + %add_sum = add i32 %2, %3 + br label %polly.loop_if13 + +polly.loop_if: + %4 = add i32 %count, -1 + %leftover_lb = and i32 %4, -2 + %polly.loop_guard = icmp eq i32 %leftover_lb, 0 + br i1 %polly.loop_guard, label %polly.loop_exit, label %polly.loop_preheader + +polly.stmt.for.body: + %addp_vec28 = phi <2 x i32> [ zeroinitializer, %polly.loop_preheader ], [ %addp_vec, %polly.stmt.for.body ] + %scevgep.phi = phi i32* [ getelementptr inbounds ([1000 x i32], [1000 x i32]* @A, i32 0, i32 0), %polly.loop_preheader ], [ %scevgep.inc, %polly.stmt.for.body ] + %scevgep9.phi = phi i32* [ getelementptr inbounds ([1000 x i32], [1000 x i32]* @B, i32 0, i32 0), %polly.loop_preheader ], [ %scevgep9.inc, %polly.stmt.for.body ] + %polly.indvar = phi i32 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.for.body ] + %vector_ptr = bitcast i32* %scevgep.phi to <2 x i32>* + %_p_vec_full = load <2 x i32>, <2 x i32>* %vector_ptr, align 8 + %vector_ptr10 = bitcast i32* %scevgep9.phi to <2 x i32>* + %_p_vec_full11 = load <2 x i32>, <2 x i32>* %vector_ptr10, align 8 + %mulp_vec = mul <2 x i32> %_p_vec_full11, %_p_vec_full + %addp_vec = add <2 x i32> %mulp_vec, %addp_vec28 + %polly.indvar_next = add nsw i32 %polly.indvar, 2 + %polly.loop_cond = icmp eq i32 %polly.indvar, %polly.adjust_ub + %scevgep.inc = getelementptr i32, i32* %scevgep.phi, i32 2 + %scevgep9.inc = getelementptr i32, i32* %scevgep9.phi, i32 2 + br i1 %polly.loop_cond, label %polly.loop_exit.loopexit, label %polly.stmt.for.body + +polly.loop_preheader: + %polly.adjust_ub = add i32 %leftover_lb, -2 + br label %polly.stmt.for.body + +polly.loop_if13: + %p_add34 = phi i32 [ 0, %polly.cond ], [ %add_sum, %polly.loop_exit ] + %merge.lb = phi i32 [ 0, %polly.cond ], [ %leftover_lb, %polly.loop_exit ] + %polly.loop_guard17 = icmp slt i32 %merge.lb, %count + br i1 %polly.loop_guard17, label %polly.loop_preheader15, label %for.end + +polly.stmt.for.body22: + %p_add30 = phi i32 [ %p_add34, %polly.loop_preheader15 ], [ %p_add, %polly.stmt.for.body22 ] + %polly.indvar18 = phi i32 [ %merge.lb, %polly.loop_preheader15 ], [ %polly.indvar_next19, %polly.stmt.for.body22 ] + %5 = tail call i32 @llvm.annotation.i32(i32 %polly.indvar18, i8* null, i8* null, i32 0), !polly.loop.smallTripCount !0 + %scevgep23 = getelementptr [1000 x i32], [1000 x i32]* @A, i32 0, i32 %polly.indvar18 + %_p_scalar_ = load i32, i32* %scevgep23, align 4 + %scevgep24 = getelementptr [1000 x i32], [1000 x i32]* @B, i32 0, i32 %polly.indvar18 + %_p_scalar_25 = load i32, i32* %scevgep24, align 4 + %p_mul = mul nsw i32 %_p_scalar_25, %_p_scalar_ + %p_add = add nsw i32 %p_mul, %p_add30 + %polly.indvar_next19 = add nsw i32 %polly.indvar18, 1 + %polly.loop_cond21 = icmp slt i32 %polly.indvar18, %polly.adjust_ub20 + br i1 %polly.loop_cond21, label %polly.stmt.for.body22, label %for.end.loopexit + +polly.loop_preheader15: + %polly.adjust_ub20 = add i32 %count, -1 + br label %polly.stmt.for.body22 +} + +declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1 + +!0 = !{} -- 2.7.4