SDValue ExpandVSELECT(SDNode *Node);
SDValue ExpandVP_SELECT(SDNode *Node);
SDValue ExpandVP_MERGE(SDNode *Node);
+ SDValue ExpandVP_REM(SDNode *Node);
SDValue ExpandSELECT(SDNode *Node);
std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
SDValue ExpandStore(SDNode *N);
case ISD::VP_SELECT:
Results.push_back(ExpandVP_SELECT(Node));
return;
+ case ISD::VP_SREM:
+ case ISD::VP_UREM:
+ if (SDValue Expanded = ExpandVP_REM(Node)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
case ISD::SELECT:
Results.push_back(ExpandSELECT(Node));
return;
return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2);
}
+SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) {
+  // Implement VP_SREM/UREM in terms of VP_SDIV/VP_UDIV, VP_MUL, VP_SUB.
+  // Operands are (Dividend, Divisor, Mask, EVL); the same mask and EVL are
+  // threaded through every node built here, so lane predication of the
+  // original VP_SREM/VP_UREM carries over to the expansion.
+  EVT VT = Node->getValueType(0);
+
+  unsigned DivOpc =
+      Node->getOpcode() == ISD::VP_SREM ? ISD::VP_SDIV : ISD::VP_UDIV;
+
+  // Give up (caller falls back to the generic path) unless all three
+  // building-block VP operations are available for this type.
+  if (!TLI.isOperationLegalOrCustom(DivOpc, VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::VP_MUL, VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::VP_SUB, VT))
+    return SDValue();
+
+  SDLoc DL(Node);
+
+  SDValue Dividend = Node->getOperand(0);
+  SDValue Divisor = Node->getOperand(1);
+  SDValue Mask = Node->getOperand(2);
+  SDValue EVL = Node->getOperand(3);
+
+  // X % Y -> X-X/Y*Y
+  SDValue Div = DAG.getNode(DivOpc, DL, VT, Dividend, Divisor, Mask, EVL);
+  SDValue Mul = DAG.getNode(ISD::VP_MUL, DL, VT, Divisor, Div, Mask, EVL);
+  return DAG.getNode(ISD::VP_SUB, DL, VT, Dividend, Mul, Mask, EVL);
+}
+
void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
// Attempt to expand using TargetLowering.
// Named vector shuffles default to expand.
setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
+
+ // VP_SREM/UREM default to expand.
+ // TODO: Expand all VP intrinsics.
+ setOperationAction(ISD::VP_SREM, VT, Expand);
+ setOperationAction(ISD::VP_UREM, VT, Expand);
}
// Most targets ignore the @llvm.prefetch intrinsic.
-; REQUIRES: asserts
-; RUN: not --crash llc < %s -march=ve -mattr=+vpu -o /dev/null 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
-; CHECK: t{{[0-9]+}}: v256i32 = vp_srem [[A:t[0-9]+]], [[B:t[0-9]+]], [[MASK:t[0-9]+]], [[EVL:t[0-9]+]]
-; CHECK: [[A]]: v256i32
-; CHECK: [[B]]: v256i32
-; CHECK: [[MASK]]: v256i1
-; CHECK: [[EVL]]: i32
+declare <256 x i32> @llvm.vp.srem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
-define <256 x i32> @test_vp_int(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; i32 srem, vector dividend / vector divisor: expands to masked vdivs/vmuls/vsubs.
+define fastcc <256 x i32> @test_vp_srem_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivs.w.sx %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
%r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
ret <256 x i32> %r0
}
-; integer arith
-declare <256 x i32> @llvm.vp.srem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
+; i32 srem, splatted scalar dividend: the splat stays a scalar operand (%s0).
+define fastcc <256 x i32> @test_vp_srem_v256i32_rv(i32 %s0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.w.sx %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i32> undef, i32 %s0, i32 0
+ %i0 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+; i32 srem, splatted scalar divisor: the splat stays a scalar operand (%s0).
+define fastcc <256 x i32> @test_vp_srem_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.w.sx %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i32> undef, i32 %s1, i32 0
+ %i1 = shufflevector <256 x i32> %yins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+
+declare <256 x i64> @llvm.vp.srem.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)
+
+; i64 srem, vector dividend / vector divisor (renamed for consistency with
+; the other test_vp_srem_* functions in this file).
+define fastcc <256 x i64> @test_vp_srem_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivs.l %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.l %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+; i64 srem, splatted scalar dividend (64-bit: no (32)0 truncation of %s0).
+define fastcc <256 x i64> @test_vp_srem_v256i64_rv(i64 %s0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.l %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i64> undef, i64 %s0, i32 0
+ %i0 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+; i64 srem, splatted scalar divisor.
+define fastcc <256 x i64> @test_vp_srem_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.l %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i64> undef, i64 %s1, i32 0
+ %i1 = shufflevector <256 x i64> %yins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
-; REQUIRES: asserts
-; RUN: not --crash llc < %s -march=ve -mattr=+vpu -o /dev/null 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
-; CHECK: t{{[0-9]+}}: v256i32 = vp_urem [[A:t[0-9]+]], [[B:t[0-9]+]], [[MASK:t[0-9]+]], [[EVL:t[0-9]+]]
-; CHECK: [[A]]: v256i32
-; CHECK: [[B]]: v256i32
-; CHECK: [[MASK]]: v256i1
-; CHECK: [[EVL]]: i32
+declare <256 x i32> @llvm.vp.urem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
-define <256 x i32> @test_vp_int(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; i32 urem, vector dividend / vector divisor: unsigned divide (vdivu.w).
+define fastcc <256 x i32> @test_vp_urem_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivu.w %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
%r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
ret <256 x i32> %r0
}
-; integer arith
-declare <256 x i32> @llvm.vp.urem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
+; i32 urem, splatted scalar dividend.
+define fastcc <256 x i32> @test_vp_urem_v256i32_rv(i32 %s0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.w %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i32> undef, i32 %s0, i32 0
+ %i0 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+; i32 urem, splatted scalar divisor.
+define fastcc <256 x i32> @test_vp_urem_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.w %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i32> undef, i32 %s1, i32 0
+ %i1 = shufflevector <256 x i32> %yins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+
+declare <256 x i64> @llvm.vp.urem.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)
+
+; i64 urem, vector dividend / vector divisor (renamed for consistency with
+; the other test_vp_urem_* functions in this file).
+define fastcc <256 x i64> @test_vp_urem_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivu.l %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.l %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+; i64 urem, splatted scalar dividend.
+define fastcc <256 x i64> @test_vp_urem_v256i64_rv(i64 %s0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.l %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i64> undef, i64 %s0, i32 0
+ %i0 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+; i64 urem, splatted scalar divisor.
+define fastcc <256 x i64> @test_vp_urem_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.l %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i64> undef, i64 %s1, i32 0
+ %i1 = shufflevector <256 x i64> %yins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}