From 1121eca685a39c1fc370daa65ff918a3cd490dc4 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 16 Sep 2022 12:59:13 -0700
Subject: [PATCH] [VP][VE] Default VP_SREM/UREM to Expand and add generic
 expansion using VP_SDIV/UDIV+VP_MUL+VP_SUB.

I want to default all VP operations to Expand. These two were blocking
because VE doesn't support them and the tests expected them to fail in a
specific way. Using Expand caused them to fail differently. It seemed
better to emulate them using operations that are supported.

@simoll mentioned on Discord that VE has some expansion downstream. I'm
not sure if it's done like this or in the VE target.

Reviewed By: frasercrmck, efocht

Differential Revision: https://reviews.llvm.org/D133514
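
For illustration, the expansion rewrites a masked, EVL-predicated remainder
as X - (X / Y) * Y using the three operations that are supported. A minimal
IR sketch of what the generic expansion is equivalent to (the function and
value names below are illustrative, not taken from the tests):

  declare <256 x i32> @llvm.vp.sdiv.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
  declare <256 x i32> @llvm.vp.mul.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
  declare <256 x i32> @llvm.vp.sub.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)

  define <256 x i32> @srem_expanded(<256 x i32> %x, <256 x i32> %y, <256 x i1> %m, i32 %evl) {
    ; q = x / y (signed), under the same mask and EVL as the original srem
    %q = call <256 x i32> @llvm.vp.sdiv.v256i32(<256 x i32> %x, <256 x i32> %y, <256 x i1> %m, i32 %evl)
    ; p = y * q
    %p = call <256 x i32> @llvm.vp.mul.v256i32(<256 x i32> %y, <256 x i32> %q, <256 x i1> %m, i32 %evl)
    ; r = x - p, i.e. the remainder for the active lanes
    %r = call <256 x i32> @llvm.vp.sub.v256i32(<256 x i32> %x, <256 x i32> %p, <256 x i1> %m, i32 %evl)
    ret <256 x i32> %r
  }

VP_UREM is handled the same way with @llvm.vp.udiv in place of @llvm.vp.sdiv.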
---
 .../lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 32 +++++++
 llvm/lib/CodeGen/TargetLoweringBase.cpp            |  5 ++
 llvm/test/CodeGen/VE/Vector/vp_srem.ll             | 99 +++++++++++++++++++---
 llvm/test/CodeGen/VE/Vector/vp_urem.ll             | 99 +++++++++++++++++++---
 4 files changed, 215 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 4eafb8e..0132bf4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -132,6 +132,7 @@ class VectorLegalizer {
   SDValue ExpandVSELECT(SDNode *Node);
   SDValue ExpandVP_SELECT(SDNode *Node);
   SDValue ExpandVP_MERGE(SDNode *Node);
+  SDValue ExpandVP_REM(SDNode *Node);
   SDValue ExpandSELECT(SDNode *Node);
   std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
   SDValue ExpandStore(SDNode *N);
@@ -735,6 +736,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   case ISD::VP_SELECT:
     Results.push_back(ExpandVP_SELECT(Node));
     return;
+  case ISD::VP_SREM:
+  case ISD::VP_UREM:
+    if (SDValue Expanded = ExpandVP_REM(Node)) {
+      Results.push_back(Expanded);
+      return;
+    }
+    break;
   case ISD::SELECT:
     Results.push_back(ExpandSELECT(Node));
     return;
@@ -1310,6 +1318,30 @@ SDValue VectorLegalizer::ExpandVP_MERGE(SDNode *Node) {
   return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2);
 }
 
+SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) {
+  // Implement VP_SREM/UREM in terms of VP_SDIV/VP_UDIV, VP_MUL, VP_SUB.
+  EVT VT = Node->getValueType(0);
+
+  unsigned DivOpc =
+      Node->getOpcode() == ISD::VP_SREM ? ISD::VP_SDIV : ISD::VP_UDIV;
+
+  if (!TLI.isOperationLegalOrCustom(DivOpc, VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::VP_MUL, VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::VP_SUB, VT))
+    return SDValue();
+
+  SDLoc DL(Node);
+
+  SDValue Dividend = Node->getOperand(0);
+  SDValue Divisor = Node->getOperand(1);
+  SDValue Mask = Node->getOperand(2);
+  SDValue EVL = Node->getOperand(3);
+
+  // X % Y -> X-X/Y*Y
+  SDValue Div = DAG.getNode(DivOpc, DL, VT, Dividend, Divisor, Mask, EVL);
+  SDValue Mul = DAG.getNode(ISD::VP_MUL, DL, VT, Divisor, Div, Mask, EVL);
+  return DAG.getNode(ISD::VP_SUB, DL, VT, Dividend, Mul, Mask, EVL);
+}
+
 void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
                                        SmallVectorImpl<SDValue> &Results) {
   // Attempt to expand using TargetLowering.
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 8eaf1bb..9736318 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -870,6 +870,11 @@ void TargetLoweringBase::initActions() {
 
     // Named vector shuffles default to expand.
     setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
+
+    // VP_SREM/UREM default to expand.
+    // TODO: Expand all VP intrinsics.
+    setOperationAction(ISD::VP_SREM, VT, Expand);
+    setOperationAction(ISD::VP_UREM, VT, Expand);
   }
 
   // Most targets ignore the @llvm.prefetch intrinsic.
diff --git a/llvm/test/CodeGen/VE/Vector/vp_srem.ll b/llvm/test/CodeGen/VE/Vector/vp_srem.ll
index 15a3dca..ffd93e3 100644
--- a/llvm/test/CodeGen/VE/Vector/vp_srem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vp_srem.ll
@@ -1,16 +1,95 @@
-; REQUIRES: asserts
-; RUN: not --crash llc < %s -march=ve -mattr=+vpu -o /dev/null 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
 
-; CHECK: t{{[0-9]+}}: v256i32 = vp_srem [[A:t[0-9]+]], [[B:t[0-9]+]], [[MASK:t[0-9]+]], [[EVL:t[0-9]+]]
-; CHECK: [[A]]: v256i32
-; CHECK: [[B]]: v256i32
-; CHECK: [[MASK]]: v256i1
-; CHECK: [[EVL]]: i32
+declare <256 x i32> @llvm.vp.srem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
 
-define <256 x i32> @test_vp_int(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+define fastcc <256 x i32> @test_vp_srem_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vdivs.w.sx %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v1, %v1, %v2, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
   %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
   ret <256 x i32> %r0
 }
 
-; integer arith
-declare <256 x i32> @llvm.vp.srem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
+define fastcc <256 x i32> @test_vp_srem_v256i32_rv(i32 %s0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivs.w.sx %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i32> undef, i32 %s0, i32 0
+  %i0 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i32> %r0
+}
+
+define fastcc <256 x i32> @test_vp_srem_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivs.w.sx %v1, %v0, %s0, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x i32> undef, i32 %s1, i32 0
+  %i1 = shufflevector <256 x i32> %yins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i32> %r0
+}
+
+
+declare <256 x i64> @llvm.vp.srem.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)
+
+define fastcc <256 x i64> @test_vp_int_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_int_v256i64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vdivs.l %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    vmuls.l %v1, %v1, %v2, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_srem_v256i64_rv(i64 %s0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivs.l %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    vmuls.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i64> undef, i64 %s0, i32 0
+  %i0 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_srem_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivs.l %v1, %v0, %s0, %vm1
+; CHECK-NEXT:    vmuls.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x i64> undef, i64 %s1, i32 0
+  %i1 = shufflevector <256 x i64> %yins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_urem.ll b/llvm/test/CodeGen/VE/Vector/vp_urem.ll
index 5465e9f..0620c89 100644
--- a/llvm/test/CodeGen/VE/Vector/vp_urem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vp_urem.ll
@@ -1,16 +1,95 @@
-; REQUIRES: asserts
-; RUN: not --crash llc < %s -march=ve -mattr=+vpu -o /dev/null 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
 
-; CHECK: t{{[0-9]+}}: v256i32 = vp_urem [[A:t[0-9]+]], [[B:t[0-9]+]], [[MASK:t[0-9]+]], [[EVL:t[0-9]+]]
-; CHECK: [[A]]: v256i32
-; CHECK: [[B]]: v256i32
-; CHECK: [[MASK]]: v256i1
-; CHECK: [[EVL]]: i32
+declare <256 x i32> @llvm.vp.urem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
 
-define <256 x i32> @test_vp_int(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+define fastcc <256 x i32> @test_vp_urem_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vdivu.w %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v1, %v1, %v2, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
   %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
   ret <256 x i32> %r0
 }
 
-; integer arith
-declare <256 x i32> @llvm.vp.urem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
+define fastcc <256 x i32> @test_vp_urem_v256i32_rv(i32 %s0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivu.w %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i32> undef, i32 %s0, i32 0
+  %i0 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i32> %r0
+}
+
+define fastcc <256 x i32> @test_vp_urem_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivu.w %v1, %v0, %s0, %vm1
+; CHECK-NEXT:    vmuls.w.sx %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x i32> undef, i32 %s1, i32 0
+  %i1 = shufflevector <256 x i32> %yins, <256 x i32> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i32> %r0
+}
+
+
+declare <256 x i64> @llvm.vp.urem.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)
+
+define fastcc <256 x i64> @test_vp_int_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_int_v256i64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vdivu.l %v2, %v0, %v1, %vm1
+; CHECK-NEXT:    vmuls.l %v1, %v1, %v2, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_urem_v256i64_rv(i64 %s0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivu.l %v1, %s0, %v0, %vm1
+; CHECK-NEXT:    vmuls.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x i64> undef, i64 %s0, i32 0
+  %i0 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
+
+define fastcc <256 x i64> @test_vp_urem_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vdivu.l %v1, %v0, %s0, %vm1
+; CHECK-NEXT:    vmuls.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x i64> undef, i64 %s1, i32 0
+  %i1 = shufflevector <256 x i64> %yins, <256 x i64> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x i64> %r0
+}
-- 
2.7.4