SDValue ExpandVSELECT(SDNode *Node);
SDValue ExpandVP_SELECT(SDNode *Node);
SDValue ExpandVP_MERGE(SDNode *Node);
+ SDValue ExpandVP_REM(SDNode *Node);
SDValue ExpandSELECT(SDNode *Node);
std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
SDValue ExpandStore(SDNode *N);
case ISD::VP_SELECT:
Results.push_back(ExpandVP_SELECT(Node));
return;
+ case ISD::VP_SREM:
+ case ISD::VP_UREM:
+ if (SDValue Expanded = ExpandVP_REM(Node)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
case ISD::SELECT:
Results.push_back(ExpandSELECT(Node));
return;
return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2);
}
+SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) {
+  // Implement VP_SREM/UREM in terms of VP_SDIV/VP_UDIV, VP_MUL, VP_SUB.
+  // Operands are (Dividend, Divisor, Mask, EVL); the same mask and EVL are
+  // threaded through every node built here, so lane predication of the
+  // original VP_SREM/VP_UREM carries over to the expansion.
+  EVT VT = Node->getValueType(0);
+
+  unsigned DivOpc =
+      Node->getOpcode() == ISD::VP_SREM ? ISD::VP_SDIV : ISD::VP_UDIV;
+
+  // Give up (caller falls back to the generic path) unless all three
+  // building-block VP operations are available for this type.
+  if (!TLI.isOperationLegalOrCustom(DivOpc, VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::VP_MUL, VT) ||
+      !TLI.isOperationLegalOrCustom(ISD::VP_SUB, VT))
+    return SDValue();
+
+  SDLoc DL(Node);
+
+  SDValue Dividend = Node->getOperand(0);
+  SDValue Divisor = Node->getOperand(1);
+  SDValue Mask = Node->getOperand(2);
+  SDValue EVL = Node->getOperand(3);
+
+  // X % Y -> X-X/Y*Y
+  SDValue Div = DAG.getNode(DivOpc, DL, VT, Dividend, Divisor, Mask, EVL);
+  SDValue Mul = DAG.getNode(ISD::VP_MUL, DL, VT, Divisor, Div, Mask, EVL);
+  return DAG.getNode(ISD::VP_SUB, DL, VT, Dividend, Mul, Mask, EVL);
+}
+
void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
// Attempt to expand using TargetLowering.
// Named vector shuffles default to expand.
setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
+
+ // VP_SREM/UREM default to expand.
+ // TODO: Expand all VP intrinsics.
+ setOperationAction(ISD::VP_SREM, VT, Expand);
+ setOperationAction(ISD::VP_UREM, VT, Expand);
}
// Most targets ignore the @llvm.prefetch intrinsic.
-; REQUIRES: asserts
-; RUN: not --crash llc < %s -march=ve -mattr=+vpu -o /dev/null 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
-; CHECK: t{{[0-9]+}}: v256i32 = vp_srem [[A:t[0-9]+]], [[B:t[0-9]+]], [[MASK:t[0-9]+]], [[EVL:t[0-9]+]]
-; CHECK: [[A]]: v256i32
-; CHECK: [[B]]: v256i32
-; CHECK: [[MASK]]: v256i1
-; CHECK: [[EVL]]: i32
+declare <256 x i32> @llvm.vp.srem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
-define <256 x i32> @test_vp_int(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; i32 srem, vector dividend / vector divisor: expands to masked vdivs/vmuls/vsubs.
+define fastcc <256 x i32> @test_vp_srem_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivs.w.sx %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
%r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
ret <256 x i32> %r0
}
-; integer arith
-declare <256 x i32> @llvm.vp.srem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
+; i32 srem, splatted scalar dividend: the splat stays a scalar operand (%s0).
+define fastcc <256 x i32> @test_vp_srem_v256i32_rv(i32 %s0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.w.sx %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i32> undef, i32 %s0, i32 0
+ %i0 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+; i32 srem, splatted scalar divisor: the splat stays a scalar operand (%s0).
+define fastcc <256 x i32> @test_vp_srem_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i32_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.w.sx %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i32> undef, i32 %s1, i32 0
+ %i1 = shufflevector <256 x i32> %yins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.srem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+
+declare <256 x i64> @llvm.vp.srem.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)
+
+; i64 srem, vector dividend / vector divisor (renamed for consistency with
+; the other test_vp_srem_* functions in this file).
+define fastcc <256 x i64> @test_vp_srem_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivs.l %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.l %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+; i64 srem, splatted scalar dividend (64-bit: no (32)0 truncation of %s0).
+define fastcc <256 x i64> @test_vp_srem_v256i64_rv(i64 %s0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.l %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i64> undef, i64 %s0, i32 0
+ %i0 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+; i64 srem, splatted scalar divisor.
+define fastcc <256 x i64> @test_vp_srem_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_srem_v256i64_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivs.l %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i64> undef, i64 %s1, i32 0
+ %i1 = shufflevector <256 x i64> %yins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.srem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
-; REQUIRES: asserts
-; RUN: not --crash llc < %s -march=ve -mattr=+vpu -o /dev/null 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
-; CHECK: t{{[0-9]+}}: v256i32 = vp_urem [[A:t[0-9]+]], [[B:t[0-9]+]], [[MASK:t[0-9]+]], [[EVL:t[0-9]+]]
-; CHECK: [[A]]: v256i32
-; CHECK: [[B]]: v256i32
-; CHECK: [[MASK]]: v256i1
-; CHECK: [[EVL]]: i32
+declare <256 x i32> @llvm.vp.urem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
-define <256 x i32> @test_vp_int(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; i32 urem, vector dividend / vector divisor: unsigned divide (vdivu.w).
+define fastcc <256 x i32> @test_vp_urem_v256i32_vv(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivu.w %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
%r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
ret <256 x i32> %r0
}
-; integer arith
-declare <256 x i32> @llvm.vp.urem.v256i32(<256 x i32>, <256 x i32>, <256 x i1>, i32)
+; i32 urem, splatted scalar dividend.
+define fastcc <256 x i32> @test_vp_urem_v256i32_rv(i32 %s0, <256 x i32> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.w %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i32> undef, i32 %s0, i32 0
+ %i0 = shufflevector <256 x i32> %xins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+; i32 urem, splatted scalar divisor.
+define fastcc <256 x i32> @test_vp_urem_v256i32_vr(<256 x i32> %i0, i32 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i32_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.w %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.w.sx %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.w.sx %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i32> undef, i32 %s1, i32 0
+ %i1 = shufflevector <256 x i32> %yins, <256 x i32> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i32> @llvm.vp.urem.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i32> %r0
+}
+
+
+declare <256 x i64> @llvm.vp.urem.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)
+
+; i64 urem, vector dividend / vector divisor (renamed for consistency with
+; the other test_vp_urem_* functions in this file).
+define fastcc <256 x i64> @test_vp_urem_v256i64_vv(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_vv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vdivu.l %v2, %v0, %v1, %vm1
+; CHECK-NEXT: vmuls.l %v1, %v1, %v2, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+; i64 urem, splatted scalar dividend.
+define fastcc <256 x i64> @test_vp_urem_v256i64_rv(i64 %s0, <256 x i64> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_rv:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.l %v1, %s0, %v0, %vm1
+; CHECK-NEXT: vmuls.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %xins = insertelement <256 x i64> undef, i64 %s0, i32 0
+ %i0 = shufflevector <256 x i64> %xins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}
+
+; i64 urem, splatted scalar divisor.
+define fastcc <256 x i64> @test_vp_urem_v256i64_vr(<256 x i64> %i0, i64 %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_urem_v256i64_vr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vdivu.l %v1, %v0, %s0, %vm1
+; CHECK-NEXT: vmuls.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vsubs.l %v0, %v0, %v1, %vm1
+; CHECK-NEXT: b.l.t (, %s10)
+ %yins = insertelement <256 x i64> undef, i64 %s1, i32 0
+ %i1 = shufflevector <256 x i64> %yins, <256 x i64> undef, <256 x i32> zeroinitializer
+ %r0 = call <256 x i64> @llvm.vp.urem.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %n)
+ ret <256 x i64> %r0
+}