From ef78f2106cd7d5bf9afeaaf030dc4e0d241f7ea3 Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Wed, 11 Aug 2021 16:47:12 +0100
Subject: [PATCH] [LegalizeTypes][VP] Add splitting support for binary VP ops

This patch extends D107904's introduction of vector-predicated (VP)
operation legalization to include vector splitting.

When the result of a binary VP operation needs splitting, all of its
operands are split in kind: the two data operands and the mask are split
as usual, and the vector-length parameter EVL is "split" such that the
low and high halves each execute the correct number of elements.

Tests have been added to the RISC-V target showing several splitting
scenarios for both fixed- and scalable-vector types. Without support for
`umax` (e.g., from the `B` extension), the generated code starts to
branch. Ideally, a cost model would prevent these branches from being
inserted in the first place.

These tests expose many opportunities for better codegen: combining
known-undef VP operations and constant-folding operations on
`ISD::VSCALE`, to name but a few.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D107957
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h      |   2 +-
 .../CodeGen/SelectionDAG/LegalizeVectorTypes.cpp   |  63 ++++-
 .../CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll     | 270 +++++++++++++++++++++
 llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll             | 200 +++++++++++++++
 4 files changed, 529 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 50882344..04b7744 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -818,7 +818,7 @@ private:
   // Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
   void SplitVectorResult(SDNode *N, unsigned ResNo);
-  void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi, bool IsVP);
   void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 2c6cef8..baac550 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1069,7 +1069,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::USHLSAT:
   case ISD::ROTL:
   case ISD::ROTR:
-    SplitVecRes_BinOp(N, Lo, Hi);
+    SplitVecRes_BinOp(N, Lo, Hi, /*IsVP*/ false);
     break;
   case ISD::FMA:
   case ISD::FSHL:
@@ -1106,6 +1106,26 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::UDIVFIXSAT:
     SplitVecRes_FIX(N, Lo, Hi);
     break;
+  case ISD::VP_ADD:
+  case ISD::VP_AND:
+  case ISD::VP_MUL:
+  case ISD::VP_OR:
+  case ISD::VP_SUB:
+  case ISD::VP_XOR:
+  case ISD::VP_SHL:
+  case ISD::VP_LSHR:
+  case ISD::VP_ASHR:
+  case ISD::VP_SDIV:
+  case ISD::VP_UDIV:
+  case ISD::VP_SREM:
+  case ISD::VP_UREM:
+  case ISD::VP_FADD:
+  case ISD::VP_FSUB:
+  case ISD::VP_FMUL:
+  case ISD::VP_FDIV:
+  case ISD::VP_FREM:
+    SplitVecRes_BinOp(N, Lo, Hi, /*IsVP*/ true);
+    break;
   }

   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -1137,8 +1157,8 @@ void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT,
   }
 }

-void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
-                                         SDValue &Hi) {
+void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi,
+                                         bool IsVP) {
   SDValue LHSLo, LHSHi;
   GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
   SDValue RHSLo, RHSHi;
@@ -1147,8 +1167,41 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,

   const SDNodeFlags Flags = N->getFlags();
   unsigned Opcode = N->getOpcode();
-  Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
-  Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
+  if (!IsVP) {
+    Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
+    Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
+    return;
+  }
+
+  // Split the mask.
+  SDValue MaskLo, MaskHi;
+  SDValue Mask = N->getOperand(2);
+  EVT MaskVT = Mask.getValueType();
+  if (getTypeAction(MaskVT) == TargetLowering::TypeSplitVector)
+    GetSplitVector(Mask, MaskLo, MaskHi);
+  else
+    std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, SDLoc(Mask));
+
+  // Split the vector length parameter.
+  // %evl -> umin(%evl, %halfnumelts) and usubsat(%evl, %halfnumelts).
+  SDValue EVL = N->getOperand(3);
+  EVT VecVT = N->getValueType(0);
+  EVT EVLVT = EVL.getValueType();
+  assert(VecVT.getVectorElementCount().isKnownEven() &&
+         "Expecting the type to be an evenly-sized vector");
+  unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2;
+  SDValue HalfNumElts =
+      VecVT.isFixedLengthVector()
+          ? DAG.getConstant(HalfMinNumElts, dl, EVLVT)
+          : DAG.getVScale(dl, EVLVT,
+                          APInt(EVLVT.getScalarSizeInBits(), HalfMinNumElts));
+  SDValue EVLLo = DAG.getNode(ISD::UMIN, dl, EVLVT, EVL, HalfNumElts);
+  SDValue EVLHi = DAG.getNode(ISD::USUBSAT, dl, EVLVT, EVL, HalfNumElts);
+
+  Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(),
+                   {LHSLo, RHSLo, MaskLo, EVLLo}, Flags);
+  Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(),
+                   {LHSHi, RHSHi, MaskHi, EVLHi}, Flags);
 }

 void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
index 696b38e..f366f98 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
@@ -384,6 +384,103 @@ define <16 x i8> @vadd_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
   ret <16 x i8> %v
 }

+declare <256 x i8> @llvm.vp.add.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
+
+define <256 x i8> @vadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vadd_vi_v258i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 128
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
+; CHECK-NEXT:    vle1.v v25, (a0)
+; CHECK-NEXT:    addi a0, a1, -128
+; CHECK-NEXT:    vmv1r.v v26, v0
+; CHECK-NEXT:    mv a3, zero
+; CHECK-NEXT:    bltu a1, a0, .LBB30_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB30_2:
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    bltu a1, a2, .LBB30_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    addi a1, zero, 128
+; CHECK-NEXT:  .LBB30_4:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> undef, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> undef, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.add.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+  ret <256 x i8> %v
+}
+
+define <256 x i8> @vadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vadd_vi_v258i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, a0, -128
+; CHECK-NEXT:    mv a2, zero
+; CHECK-NEXT:    bltu a0, a1, .LBB31_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:  .LBB31_2:
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
+; CHECK-NEXT:    addi a1, zero, 128
+; CHECK-NEXT:    vadd.vi v16, v16, -1
+; CHECK-NEXT:    bltu a0, a1, .LBB31_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    addi a0, zero, 128
+; CHECK-NEXT:  .LBB31_4:
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, mu
+; CHECK-NEXT:    vadd.vi v8, v8, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> undef, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> undef, <256 x i32> zeroinitializer
+  %head = insertelement <256 x i1> undef, i1 true, i32 0
+  %m = shufflevector <256 x i1> %head, <256 x i1> undef, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.add.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+  ret <256 x i8> %v
+}
+
+; Test splitting when the %evl is a known constant.
+
+define <256 x i8> @vadd_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vadd_vi_v258i8_evl129:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, zero, 128
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
+; CHECK-NEXT:    vle1.v v25, (a0)
+; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> undef, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> undef, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.add.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129)
+  ret <256 x i8> %v
+}
+
+; FIXME: The upper half is doing nothing.
+
+define <256 x i8> @vadd_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vadd_vi_v258i8_evl128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a1, zero, 128
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu
+; CHECK-NEXT:    vle1.v v25, (a0)
+; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    vsetivli zero, 0, e8, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <256 x i8> undef, i8 -1, i32 0
+  %vb = shufflevector <256 x i8> %elt.head, <256 x i8> undef, <256 x i32> zeroinitializer
+  %v = call <256 x i8> @llvm.vp.add.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128)
+  ret <256 x i8> %v
+}
+
 declare <2 x i16> @llvm.vp.add.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32)

 define <2 x i16> @vadd_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) {
@@ -1407,3 +1504,176 @@ define <16 x i64> @vadd_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
   %v = call <16 x i64> @llvm.vp.add.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
   ret <16 x i64> %v
 }
+
+; Test that split-legalization works as expected.
+
+declare <32 x i64> @llvm.vp.add.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
+
+define <32 x i64> @vadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vadd_vx_v32i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    mv a1, zero
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
+; RV32-NEXT:    vmv1r.v v1, v0
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
+; RV32-NEXT:    addi a2, zero, 32
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
+; RV32-NEXT:    addi a2, a0, -16
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    bltu a0, a2, .LBB106_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:  .LBB106_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; RV32-NEXT:    addi a1, zero, 16
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    bltu a0, a1, .LBB106_4
+; RV32-NEXT:  # %bb.3:
+; RV32-NEXT:    addi a0, zero, 16
+; RV32-NEXT:  .LBB106_4:
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vadd_vx_v32i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    mv a1, zero
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
+; RV64-NEXT:    addi a2, a0, -16
+; RV64-NEXT:    vmv1r.v v25, v0
+; RV64-NEXT:    vslidedown.vi v0, v0, 2
+; RV64-NEXT:    bltu a0, a2, .LBB106_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:  .LBB106_2:
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT:    addi a1, zero, 16
+; RV64-NEXT:    vadd.vi v16, v16, -1, v0.t
+; RV64-NEXT:    bltu a0, a1, .LBB106_4
+; RV64-NEXT:  # %bb.3:
+; RV64-NEXT:    addi a0, zero, 16
+; RV64-NEXT:  .LBB106_4:
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
+; RV64-NEXT:    vmv1r.v v0, v25
+; RV64-NEXT:    vadd.vi v8, v8, -1, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> undef, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> undef, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
+; RV32-LABEL: vadd_vi_v32i64_unmasked:
+; RV32:       # %bb.0:
+; RV32-NEXT:    mv a1, zero
+; RV32-NEXT:    addi a2, zero, 32
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
+; RV32-NEXT:    addi a2, a0, -16
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    bltu a0, a2, .LBB107_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:  .LBB107_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; RV32-NEXT:    addi a1, zero, 16
+; RV32-NEXT:    vadd.vv v16, v16, v24
+; RV32-NEXT:    bltu a0, a1, .LBB107_4
+; RV32-NEXT:  # %bb.3:
+; RV32-NEXT:    addi a0, zero, 16
+; RV32-NEXT:  .LBB107_4:
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vadd_vi_v32i64_unmasked:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi a1, a0, -16
+; RV64-NEXT:    mv a2, zero
+; RV64-NEXT:    bltu a0, a1, .LBB107_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:  .LBB107_2:
+; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV64-NEXT:    addi a1, zero, 16
+; RV64-NEXT:    vadd.vi v16, v16, -1
+; RV64-NEXT:    bltu a0, a1, .LBB107_4
+; RV64-NEXT:  # %bb.3:
+; RV64-NEXT:    addi a0, zero, 16
+; RV64-NEXT:  .LBB107_4:
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, mu
+; RV64-NEXT:    vadd.vi v8, v8, -1
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> undef, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> undef, <32 x i32> zeroinitializer
+  %head = insertelement <32 x i1> undef, i1 true, i32 0
+  %m = shufflevector <32 x i1> %head, <32 x i1> undef, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+  ret <32 x i64> %v
+}
+
+; FIXME: After splitting, the "high" vadd.vv is doing nothing; could be
+; replaced by undef.
+
+define <32 x i64> @vadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vadd_vx_v32i64_evl12:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    addi a0, zero, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    vsetivli zero, 12, e64, m8, ta, mu
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 0, e64, m8, ta, mu
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vadd_vx_v32i64_evl12:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
+; RV64-NEXT:    vslidedown.vi v25, v0, 2
+; RV64-NEXT:    vsetivli zero, 12, e64, m8, ta, mu
+; RV64-NEXT:    vadd.vi v8, v8, -1, v0.t
+; RV64-NEXT:    vsetivli zero, 0, e64, m8, ta, mu
+; RV64-NEXT:    vmv1r.v v0, v25
+; RV64-NEXT:    vadd.vi v16, v16, -1, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> undef, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> undef, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @vadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vadd_vx_v32i64_evl27:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
+; RV32-NEXT:    vslidedown.vi v1, v0, 2
+; RV32-NEXT:    addi a0, zero, 32
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
+; RV32-NEXT:    vmv.v.i v24, -1
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 11, e64, m8, ta, mu
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vadd_vx_v32i64_evl27:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
+; RV64-NEXT:    vslidedown.vi v25, v0, 2
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
+; RV64-NEXT:    vadd.vi v8, v8, -1, v0.t
+; RV64-NEXT:    vsetivli zero, 11, e64, m8, ta, mu
+; RV64-NEXT:    vmv1r.v v0, v25
+; RV64-NEXT:    vadd.vi v16, v16, -1, v0.t
+; RV64-NEXT:    ret
+  %elt.head = insertelement <32 x i64> undef, i64 -1, i32 0
+  %vb = shufflevector <32 x i64> %elt.head, <32 x i64> undef, <32 x i32> zeroinitializer
+  %v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27)
+  ret <32 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index 57e049e..ef54d51 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -612,6 +612,69 @@ define <vscale x 64 x i8> @vadd_vi_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
   ret <vscale x 64 x i8> %v
 }

+; Test that split-legalization works when the mask itself needs splitting.
+
+declare <vscale x 128 x i8> @llvm.vp.add.nxv128i8(<vscale x 128 x i8>, <vscale x 128 x i8>, <vscale x 128 x i1>, i32)
+
+define <vscale x 128 x i8> @vadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vadd_vi_nxv128i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    mv a3, a1
+; CHECK-NEXT:    bltu a1, a2, .LBB48_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB48_2:
+; CHECK-NEXT:    mv a4, zero
+; CHECK-NEXT:    vsetvli a5, zero, e8, m8, ta, mu
+; CHECK-NEXT:    vle1.v v25, (a0)
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, mu
+; CHECK-NEXT:    sub a0, a1, a2
+; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    bltu a1, a0, .LBB48_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv a4, a0
+; CHECK-NEXT:  .LBB48_4:
+; CHECK-NEXT:    vsetvli zero, a4, e8, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 128 x i8> undef, i8 -1, i32 0
+  %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> undef, <vscale x 128 x i32> zeroinitializer
+  %v = call <vscale x 128 x i8> @llvm.vp.add.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+  ret <vscale x 128 x i8> %v
+}
+
+define <vscale x 128 x i8> @vadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vadd_vi_nxv128i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:    bltu a0, a1, .LBB49_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:  .LBB49_2:
+; CHECK-NEXT:    mv a3, zero
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
+; CHECK-NEXT:    sub a1, a0, a1
+; CHECK-NEXT:    vadd.vi v8, v8, -1
+; CHECK-NEXT:    bltu a0, a1, .LBB49_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv a3, a1
+; CHECK-NEXT:  .LBB49_4:
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, mu
+; CHECK-NEXT:    vadd.vi v16, v16, -1
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 128 x i8> undef, i8 -1, i32 0
+  %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> undef, <vscale x 128 x i32> zeroinitializer
+  %head = insertelement <vscale x 128 x i1> undef, i1 true, i32 0
+  %m = shufflevector <vscale x 128 x i1> %head, <vscale x 128 x i1> undef, <vscale x 128 x i32> zeroinitializer
+  %v = call <vscale x 128 x i8> @llvm.vp.add.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+  ret <vscale x 128 x i8> %v
+}
+
 declare <vscale x 1 x i16> @llvm.vp.add.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)

 define <vscale x 1 x i16> @vadd_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -1448,6 +1511,143 @@ define <vscale x 16 x i32> @vadd_vi_nxv16i32_unmasked(<vscale x 16 x i32> %va, i
   ret <vscale x 16 x i32> %v
 }

+; Test that split-legalization works when the mask needs manual splitting.
+
+declare <vscale x 32 x i32> @llvm.vp.add.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i32> @vadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vadd_vi_nxv32i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mv a2, zero
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a4, a1, 2
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a4
+; CHECK-NEXT:    bltu a0, a3, .LBB116_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB116_2:
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
+; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    bltu a0, a1, .LBB116_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB116_4:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> undef, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> undef, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i32> @llvm.vp.add.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+; FIXME: We don't catch this as unmasked.
+
+define <vscale x 32 x i32> @vadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vadd_vi_nxv32i32_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mv a2, zero
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a4, a1, 2
+; CHECK-NEXT:    vsetvli a3, zero, e8, m4, ta, mu
+; CHECK-NEXT:    vmset.m v25
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vmv1r.v v26, v25
+; CHECK-NEXT:    vslidedown.vx v0, v25, a4
+; CHECK-NEXT:    bltu a0, a3, .LBB117_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB117_2:
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
+; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    bltu a0, a1, .LBB117_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB117_4:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> undef, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> undef, <vscale x 32 x i32> zeroinitializer
+  %head = insertelement <vscale x 32 x i1> undef, i1 true, i32 0
+  %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer
+  %v = call <vscale x 32 x i32> @llvm.vp.add.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+; Test splitting when the %evl is a constant (albeit an unknown one).
+
+declare i32 @llvm.vscale.i32()
+
+; FIXME: The upper half of the operation is doing nothing.
+; FIXME: The branches comparing vscale vs. vscale should be constant-foldable.
+
+define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m) {
+; CHECK-LABEL: vadd_vi_nxv32i32_evl_nx8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mv a2, zero
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a4, a0, 2
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a4
+; CHECK-NEXT:    bltu a0, a3, .LBB118_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB118_2:
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
+; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    bltu a0, a1, .LBB118_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB118_4:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> undef, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> undef, <vscale x 32 x i32> zeroinitializer
+  %evl = call i32 @llvm.vscale.i32()
+  %evl0 = mul i32 %evl, 8
+  %v = call <vscale x 32 x i32> @llvm.vp.add.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl0)
+  ret <vscale x 32 x i32> %v
+}
+
+; FIXME: The first vadd.vi should be able to infer that its AVL is equivalent to VLMAX.
+; FIXME: The upper half of the operation is doing nothing.
+
+define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx16(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m) {
+; CHECK-LABEL: vadd_vi_nxv32i32_evl_nx16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    vslidedown.vx v25, v0, a1
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
+; CHECK-NEXT:    vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT:    vsetivli zero, 0, e32, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 32 x i32> undef, i32 -1, i32 0
+  %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> undef, <vscale x 32 x i32> zeroinitializer
+  %evl = call i32 @llvm.vscale.i32()
+  %evl0 = mul i32 %evl, 16
+  %v = call <vscale x 32 x i32> @llvm.vp.add.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl0)
+  ret <vscale x 32 x i32> %v
+}
+
 declare <vscale x 1 x i64> @llvm.vp.add.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)

 define <vscale x 1 x i64> @vadd_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
-- 
2.7.4
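
A note on the EVL rule at the heart of this patch: the split arithmetic is
small enough to check outside of SelectionDAG. The following standalone C++
sketch is illustrative only; it uses no LLVM APIs, and `kHalfNumElts` is a
hypothetical stand-in for the patch's `HalfNumElts` value (128 for the
`<256 x i8>` tests above). It mirrors the `ISD::UMIN`/`ISD::USUBSAT`
computation of `EVLLo` and `EVLHi`:

  #include <algorithm>
  #include <cstdio>

  // Saturating unsigned subtraction, mirroring ISD::USUBSAT.
  static unsigned usubsat(unsigned A, unsigned B) { return A > B ? A - B : 0; }

  int main() {
    // Half the element count of the vector being split; 128 matches the
    // <256 x i8> -> 2 x <128 x i8> tests in this patch.
    const unsigned kHalfNumElts = 128;
    for (unsigned EVL : {0u, 12u, 128u, 129u, 256u}) {
      unsigned EVLLo = std::min(EVL, kHalfNumElts); // ISD::UMIN
      unsigned EVLHi = usubsat(EVL, kHalfNumElts);  // ISD::USUBSAT
      std::printf("evl=%3u -> lo half runs %3u elts, hi half runs %3u elts\n",
                  EVL, EVLLo, EVLHi);
    }
    return 0;
  }

For %evl = 129 this gives the low half 128 elements and the high half 1,
matching the `vsetivli zero, 1` in @vadd_vi_v258i8_evl129; for %evl = 128
the high half executes 0 elements, which is the redundant upper-half work
the FIXME comments call out.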