From 1121eca685a39c1fc370daa65ff918a3cd490dc4 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 16 Sep 2022 12:59:13 -0700
Subject: [PATCH] [VP][VE] Default VP_SREM/UREM to Expand and add generic
 expansion using VP_SDIV/UDIV+VP_MUL+VP_SUB.

I want to default all VP operations to Expand. These two were blocking
because VE doesn't support them and the tests expected them to fail in a
specific way. Using Expand caused them to fail differently. It seemed
better to emulate them using operations that are supported.

@simoll mentioned on Discord that VE has some expansion downstream. I'm
not sure if it's done like this or in the VE target.

Reviewed By: frasercrmck, efocht

Differential Revision: https://reviews.llvm.org/D133514
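
For illustration, the expansion rewrites a masked, EVL-predicated remainder
as X - (X / Y) * Y using the three operations that are supported. A minimal
IR sketch of what the generic expansion is equivalent to (the function and
value names below are illustrative, not taken from the tests):

  declare <256 x i32> @llvm.vp.sdiv.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
  declare <256 x i32> @llvm.vp.mul.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
  declare <256 x i32> @llvm.vp.sub.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)

  define <256 x i32> @srem_expanded(<256 x i32> %x, <256 x i32> %y, <256 x i1> %m, i32 %evl) {
    ; q = x / y (signed), under the same mask and EVL as the original srem
    %q = call <256 x i32> @llvm.vp.sdiv.v256i32(<256 x i32> %x, <256 x i32> %y, <256 x i1> %m, i32 %evl)
    ; p = y * q
    %p = call <256 x i32> @llvm.vp.mul.v256i32(<256 x i32> %y, <256 x i32> %q, <256 x i1> %m, i32 %evl)
    ; r = x - p, i.e. the remainder for the active lanes
    %r = call <256 x i32> @llvm.vp.sub.v256i32(<256 x i32> %x, <256 x i32> %p, <256 x i1> %m, i32 %evl)
    ret <256 x i32> %r
  }

VP_UREM is handled the same way with @llvm.vp.udiv in place of @llvm.vp.sdiv.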
---
 .../lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 32 +++++++
 llvm/lib/CodeGen/TargetLoweringBase.cpp            |  5 ++
 llvm/test/CodeGen/VE/Vector/vp_srem.ll             | 99 +++++++++++++++++++---
 llvm/test/CodeGen/VE/Vector/vp_urem.ll             | 99 +++++++++++++++++++---
 4 files changed, 215 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 4eafb8e..0132bf4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -132,6 +132,7 @@ class VectorLegalizer {
   SDValue ExpandVSELECT(SDNode *Node);
   SDValue ExpandVP_SELECT(SDNode *Node);
   SDValue ExpandVP_MERGE(SDNode *Node);
+  SDValue ExpandVP_REM(SDNode *Node);
   SDValue ExpandSELECT(SDNode *Node);
   std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
   SDValue ExpandStore(SDNode *N);
@@ -735,6 +736,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   case ISD::VP_SELECT:
     Results.push_back(ExpandVP_SELECT(Node));
     return;
+  case ISD::VP_SREM:
+  case ISD::VP_UREM:
+    if (SDValue Expanded = ExpandVP_REM(Node)) {
+      Results.push_back(Expanded);
+      return;
+    }
+    break;
   case ISD::SELECT:
     Results.push_back(ExpandSELECT(Node));
     return;
@@ -1310,6 +1318,30 @@ SDValue VectorLegalizer::ExpandVP_MERGE(SDNode *Node) {
   return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2);
 }
 
+SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) {
+  // Implement VP_SREM/UREM in terms of VP_SDIV/VP_UDIV, VP_MUL, VP_SUB.
+  EVT VT = Node->getValueType(0);
+
+  unsigned DivOpc =
+      Node->getOpcode() == ISD::VP_SREM ? ISD::VP_SDIV : ISD::VP_UDIV;
+
+  if (!TLI.isOperationLegalOrCustom(DivOpc, VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::VP_MUL, VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::VP_SUB, VT))
+    return SDValue();
+
+  SDLoc DL(Node);
+
+  SDValue Dividend = Node->getOperand(0);
+  SDValue Divisor = Node->getOperand(1);
+  SDValue Mask = Node->getOperand(2);
+  SDValue EVL = Node->getOperand(3);
+
+  // X % Y -> X-X/Y*Y
+  SDValue Div = DAG.getNode(DivOpc, DL, VT, Dividend, Divisor, Mask, EVL);
+  SDValue Mul = DAG.getNode(ISD::VP_MUL, DL, VT, Divisor, Div, Mask, EVL);
+  return DAG.getNode(ISD::VP_SUB, DL, VT, Dividend, Mul, Mask, EVL);
+}
+
 void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
                                        SmallVectorImpl<SDValue> &Results) {
   // Attempt to expand using TargetLowering.
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 8eaf1bb..9736318 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -870,6 +870,11 @@ void TargetLoweringBase::initActions() {
 
     // Named vector shuffles default to expand.
     setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
+
+    // VP_SREM/UREM default to expand.
+    // TODO: Expand all VP intrinsics.
+    setOperationAction(ISD::VP_SREM, VT, Expand);
+    setOperationAction(ISD::VP_UREM, VT, Expand);
   }
 
   // Most targets ignore the @llvm.prefetch intrinsic.
diff --git a/llvm/test/CodeGen/VE/Vector/vp_srem.ll b/llvm/test/CodeGen/VE/Vector/vp_srem.ll
index 15a3dca..ffd93e3 100644
--- a/llvm/test/CodeGen/VE/Vector/vp_srem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vp_srem.ll
@@ -1,16 +1,95 @@
-; REQUIRES: asserts
-; RUN: not --crash llc < %s -march=ve -mattr=+vpu -o /dev/null 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
 
-; CHECK: t{{[0-9]+}}: v256i32 = vp_srem [[A:t[0-9]+]], [[B:t[0-9]+]], [[MASK:t[0-9]+]], [[EVL:t[0-9]+]]
-; CHECK: [[A]]: v256i32
-; CHECK: [[B]]: v256i32
-; CHECK: [[MASK]]: v256i1
-; CHECK: [[EVL]]: i32
+declare <256 x i32> @llvm.vp.srem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
 
-define <256 x i32> @test_vp_int(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+define fastcc <256 x i32> @test_vp_srem_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vdivs.w.sx %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v1, %v1, %v2, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
   %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
   ret <256 x i32> %r0
 }
 
-; integer arith
-declare <256 x i32> @llvm.vp.srem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
+define fastcc <256 x i32> @test_vp_srem_v256i32_rv(i32 %s0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivs.w.sx %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i32> undef, i32 %s0, i32 0
+  %i0 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i32> %r0
+}
+
+define fastcc <256 x i32> @test_vp_srem_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivs.w.sx %v1, %v0, %s0, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x i32> undef, i32 %s1, i32 0
+  %i1 = shufflevector <256 x i32> %yins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i32> %r0
+}
+
+
+declare <256 x i64> @llvm.vp.srem.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)
+
+define fastcc <256 x i64> @test_vp_int_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_int_v256i64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vdivs.l %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    vmuls.l %v1, %v1, %v2, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_srem_v256i64_rv(i64 %s0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivs.l %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    vmuls.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i64> undef, i64 %s0, i32 0
+  %i0 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_srem_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivs.l %v1, %v0, %s0, %vm1
+; CHECK-NEXT:    vmuls.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x i64> undef, i64 %s1, i32 0
+  %i1 = shufflevector <256 x i64> %yins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_urem.ll b/llvm/test/CodeGen/VE/Vector/vp_urem.ll
index 5465e9f..0620c89 100644
--- a/llvm/test/CodeGen/VE/Vector/vp_urem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vp_urem.ll
@@ -1,16 +1,95 @@
-; REQUIRES: asserts
-; RUN: not --crash llc < %s -march=ve -mattr=+vpu -o /dev/null 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
 
-; CHECK: t{{[0-9]+}}: v256i32 = vp_urem [[A:t[0-9]+]], [[B:t[0-9]+]], [[MASK:t[0-9]+]], [[EVL:t[0-9]+]]
-; CHECK: [[A]]: v256i32
-; CHECK: [[B]]: v256i32
-; CHECK: [[MASK]]: v256i1
-; CHECK: [[EVL]]: i32
+declare <256 x i32> @llvm.vp.urem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
 
-define <256 x i32> @test_vp_int(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+define fastcc <256 x i32> @test_vp_urem_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vdivu.w %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v1, %v1, %v2, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
   %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
   ret <256 x i32> %r0
 }
 
-; integer arith
-declare <256 x i32> @llvm.vp.urem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
+define fastcc <256 x i32> @test_vp_urem_v256i32_rv(i32 %s0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivu.w %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i32> undef, i32 %s0, i32 0
+  %i0 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i32> %r0
+}
+
+define fastcc <256 x i32> @test_vp_urem_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivu.w %v1, %v0, %s0, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x i32> undef, i32 %s1, i32 0
+  %i1 = shufflevector <256 x i32> %yins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i32> %r0
+}
+
+
+declare <256 x i64> @llvm.vp.urem.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)
+
+define fastcc <256 x i64> @test_vp_int_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_int_v256i64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vdivu.l %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    vmuls.l %v1, %v1, %v2, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_urem_v256i64_rv(i64 %s0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivu.l %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    vmuls.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i64> undef, i64 %s0, i32 0
+  %i0 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_urem_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivu.l %v1, %v0, %s0, %vm1
+; CHECK-NEXT:    vmuls.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x i64> undef, i64 %s1, i32 0
+  %i1 = shufflevector <256 x i64> %yins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
-- 
2.7.4