From f312c1ecf4bc7003f4b10231a3147d004a39bfae Mon Sep 17 00:00:00 2001
From: Sam Tebbs <sam.tebbs@arm.com>
Date: Mon, 19 Aug 2019 09:38:28 +0000
Subject: [PATCH] [ARM] Add support for MVE vaddv

This patch adds vecreduce_add and the relevant instruction selection for
vaddv.

Differential revision: https://reviews.llvm.org/D66085

llvm-svn: 369245
---
 llvm/include/llvm/Target/TargetSelectionDAG.td |  5 ++++
 llvm/lib/Target/ARM/ARMISelLowering.cpp        |  3 +++
 llvm/lib/Target/ARM/ARMInstrMVE.td             |  6 +++++
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 25 +++++++++++++++++++
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h   |  7 ++++++
 llvm/test/CodeGen/Thumb2/mve-vaddv.ll          | 34 ++++++++++++++++++++++++++
 6 files changed, 80 insertions(+)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-vaddv.ll

diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 453948a..9e30ca1 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -239,6 +239,9 @@ def SDTVecExtract : SDTypeProfile<1, 2, [ // vector extract
 def SDTVecInsert : SDTypeProfile<1, 3, [  // vector insert
   SDTCisEltOfVec<2, 1>, SDTCisSameAs<0, 1>, SDTCisPtrTy<3>
 ]>;
+def SDTVecReduce : SDTypeProfile<1, 1, [  // vector reduction
+  SDTCisInt<0>, SDTCisVec<1>
+]>;
 
 def SDTSubVecExtract : SDTypeProfile<1, 2, [// subvector extract
   SDTCisSubVecOfVec<0,1>, SDTCisInt<2>
@@ -415,6 +418,8 @@ def addrspacecast : SDNode<"ISD::ADDRSPACECAST", SDTUnaryOp>;
 def extractelt : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDTVecExtract>;
 def insertelt  : SDNode<"ISD::INSERT_VECTOR_ELT",  SDTVecInsert>;
 
+def vecreduce_add : SDNode<"ISD::VECREDUCE_ADD", SDTVecReduce>;
+
 def fadd       : SDNode<"ISD::FADD"       , SDTFPBinOp, [SDNPCommutative]>;
 def fsub       : SDNode<"ISD::FSUB"       , SDTFPBinOp>;
 def fmul       : SDNode<"ISD::FMUL"       , SDTFPBinOp, [SDNPCommutative]>;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 63b8f2a..833040f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -267,6 +267,9 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
     setOperationAction(ISD::SREM, VT, Expand);
     setOperationAction(ISD::CTPOP, VT, Expand);
 
+    // Vector reductions
+    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
+
     if (!HasMVEFP) {
       setOperationAction(ISD::SINT_TO_FP, VT, Expand);
       setOperationAction(ISD::UINT_TO_FP, VT, Expand);
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index ce51322..8290ef5 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -549,6 +549,12 @@ defm MVE_VADDVu8  : MVE_VADDV_A<"u8",  0b1, 0b00>;
 defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
 defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
 
+let Predicates = [HasMVEInt] in {
+  def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>;
+  def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>;
+  def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>;
+}
+
 class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
                  list<dag> pattern=[]>
   : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6a128e8..3dbf5b9 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1044,3 +1044,28 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   if (Cost < 12)
     UP.Force = true;
 }
+
+bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                                       TTI::ReductionFlags Flags) const {
+  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+  unsigned ScalarBits = Ty->getScalarSizeInBits();
+  if (!ST->hasMVEIntegerOps())
+    return false;
+
+  switch (Opcode) {
+  case Instruction::FAdd:
+  case Instruction::FMul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Mul:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return false;
+  case Instruction::Add:
+    return ScalarBits * Ty->getVectorNumElements() == 128;
+  default:
+    llvm_unreachable("Unhandled reduction opcode");
+  }
+  return false;
+}
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index b966c76..507e018 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -156,6 +156,13 @@ public:
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                      Type *SubTp);
 
+  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                             TTI::ReductionFlags Flags) const;
+
+  bool shouldExpandReduction(const IntrinsicInst *II) const {
+    return false;
+  }
+
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        const Instruction *I = nullptr);
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
new file mode 100644
index 0000000..cc4f526
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
+
+declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
+define arm_aapcs_vfpcc i32 @vaddv_v4i32_i32(<4 x i32> %s1) {
+; CHECK-LABEL: vaddv_v4i32_i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %s1)
+  ret i32 %r
+}
+
+declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
+define arm_aapcs_vfpcc i16 @vaddv_v16i16_i16(<8 x i16> %s1) {
+; CHECK-LABEL: vaddv_v16i16_i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %r = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %s1)
+  ret i16 %r
+}
+
+declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
+define arm_aapcs_vfpcc i8 @vaddv_v16i8_i8(<16 x i8> %s1) {
+; CHECK-LABEL: vaddv_v16i8_i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vaddv.u8 r0, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %r = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %s1)
+  ret i8 %r
+}
-- 
2.7.4