From 156fc07e19ae599e638e18e598dbf5c5a4247408 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 18 Feb 2021 09:09:33 -0800 Subject: [PATCH] [RISCV] Add support for fixed vector MULHU/MULHS. This allows the division by constant optimization to use MULHU/MULHS. Reviewed By: frasercrmck, arcbbb Differential Revision: https://reviews.llvm.org/D96934 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 + llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 + llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td | 4 + llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll | 1010 ++++++++++++++++++++ 4 files changed, 1025 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0ae8085..9576d3c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -559,6 +559,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMIN, VT, Custom); setOperationAction(ISD::UMAX, VT, Custom); + setOperationAction(ISD::MULHS, VT, Custom); + setOperationAction(ISD::MULHU, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); @@ -1219,6 +1222,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerToScalableOp(Op, DAG, RISCVISD::SUB_VL); case ISD::MUL: return lowerToScalableOp(Op, DAG, RISCVISD::MUL_VL); + case ISD::MULHS: + return lowerToScalableOp(Op, DAG, RISCVISD::MULHS_VL); + case ISD::MULHU: + return lowerToScalableOp(Op, DAG, RISCVISD::MULHU_VL); case ISD::AND: return lowerFixedLengthVectorLogicOpToRVV(Op, DAG, RISCVISD::VMAND_VL, RISCVISD::AND_VL); @@ -4968,6 +4975,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SMAX_VL) NODE_NAME_CASE(UMIN_VL) NODE_NAME_CASE(UMAX_VL) + NODE_NAME_CASE(MULHS_VL) + NODE_NAME_CASE(MULHU_VL) NODE_NAME_CASE(SETCC_VL) NODE_NAME_CASE(VSELECT_VL) NODE_NAME_CASE(VMAND_VL) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index b91437a..8d761d2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -170,6 +170,8 @@ enum NodeType : unsigned { SMAX_VL, UMIN_VL, UMAX_VL, + MULHS_VL, + MULHU_VL, // Vector compare producing a mask. Fourth operand is input mask. Fifth // operand is VL. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index d6db0cc..b7c08d5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -63,6 +63,8 @@ def riscv_vse_vl : SDNode<"RISCVISD::VSE_VL", SDT_RISCVVSE_VL, def riscv_add_vl : SDNode<"RISCVISD::ADD_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; def riscv_sub_vl : SDNode<"RISCVISD::SUB_VL", SDT_RISCVIntBinOp_VL>; def riscv_mul_vl : SDNode<"RISCVISD::MUL_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_mulhs_vl : SDNode<"RISCVISD::MULHS_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_mulhu_vl : SDNode<"RISCVISD::MULHU_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; def riscv_and_vl : SDNode<"RISCVISD::AND_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; def riscv_or_vl : SDNode<"RISCVISD::OR_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; def riscv_xor_vl : SDNode<"RISCVISD::XOR_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; @@ -443,6 +445,8 @@ defm "" : VPatBinaryVL_VV_VX; // 12.10.
Vector Single-Width Integer Multiply Instructions defm "" : VPatBinaryVL_VV_VX; +defm "" : VPatBinaryVL_VV_VX; +defm "" : VPatBinaryVL_VV_VX; // 12.11. Vector Integer Divide Instructions defm "" : VPatBinaryVL_VV_VX; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 5903754..6a8b061 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -888,6 +888,283 @@ define void @urem_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ret void } +define void @mulhu_v16i8(<16 x i8>* %x) { +; CHECK-LABEL: mulhu_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 16 +; CHECK-NEXT: vsetvli a1, a1, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: lui a1, %hi(.LCPI52_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI52_0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: lui a1, %hi(.LCPI52_1) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI52_1) +; CHECK-NEXT: vle8.v v27, (a1) +; CHECK-NEXT: vsrl.vv v26, v25, v26 +; CHECK-NEXT: vmulhu.vv v26, v26, v27 +; CHECK-NEXT: lui a1, %hi(.LCPI52_2) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI52_2) +; CHECK-NEXT: vle8.v v27, (a1) +; CHECK-NEXT: lui a1, %hi(.LCPI52_3) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI52_3) +; CHECK-NEXT: vle8.v v28, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vmulhu.vv v25, v25, v27 +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vsrl.vv v25, v25, v28 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = udiv <16 x i8> %a, + store <16 x i8> %b, <16 x i8>* %x + ret void +} + +define void @mulhu_v8i16(<8 x i16>* %x) { +; CHECK-LABEL: mulhu_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 8 +; CHECK-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: lui a1, %hi(.LCPI53_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI53_0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: lui a1, %hi(.LCPI53_1) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI53_1) +; CHECK-NEXT: vle16.v v27, (a1) +; CHECK-NEXT: vsrl.vv v26, v25, v26 +; CHECK-NEXT: vmulhu.vv v26, v26, v27 +; CHECK-NEXT: lui a1, %hi(.LCPI53_2) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI53_2) +; CHECK-NEXT: vle16.v v27, (a1) +; CHECK-NEXT: lui a1, %hi(.LCPI53_3) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI53_3) +; CHECK-NEXT: vle16.v v28, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vmulhu.vv v25, v25, v27 +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vsrl.vv v25, v25, v28 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = udiv <8 x i16> %a, + store <8 x i16> %b, <8 x i16>* %x + ret void +} + +define void @mulhu_v4i32(<4 x i32>* %x) { +; CHECK-LABEL: mulhu_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 4 +; CHECK-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: lui a1, %hi(.LCPI54_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI54_0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vmulhu.vv v26, v25, v26 +; CHECK-NEXT: lui a1, %hi(.LCPI54_1) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI54_1) +; CHECK-NEXT: vle32.v v27, (a1) +; CHECK-NEXT: lui a1, %hi(.LCPI54_2) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI54_2) +; CHECK-NEXT: vle32.v v28, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vmulhu.vv v25, v25, v27 +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vsrl.vv v25, v25, v28 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = udiv <4 x i32> %a, + store <4 x i32> %b, <4 x i32>* %x + ret void +} + 
+define void @mulhu_v2i64(<2 x i64>* %x) { +; LMULMAX1-RV32-LABEL: mulhu_v2i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI55_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI55_0) +; LMULMAX1-RV32-NEXT: addi a3, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a4, a3, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmulhu.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI55_1) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI55_1) +; LMULMAX1-RV32-NEXT: vsetvli a3, a3, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhu_v2i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI55_0) +; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI55_0) +; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) +; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI55_1) +; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI55_1) +; LMULMAX1-RV64-NEXT: vle64.v v27, (a1) +; LMULMAX1-RV64-NEXT: vmulhu.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = udiv <2 x i64> %a, + store <2 x i64> %b, <2 x i64>* %x + ret void +} + +define void @mulhs_v16i8(<16 x i8>* %x) { +; CHECK-LABEL: mulhs_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 16 +; CHECK-NEXT: vsetvli a1, a1, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: lui a1, %hi(.LCPI56_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI56_0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: lui a1, %hi(.LCPI56_1) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI56_1) +; CHECK-NEXT: vle8.v v27, (a1) +; CHECK-NEXT: vmulhu.vv v25, v25, v26 +; CHECK-NEXT: vsrl.vv v25, v25, v27 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = udiv <16 x i8> %a, + store <16 x i8> %b, <16 x i8>* %x + ret void +} + +define void @mulhs_v8i16(<8 x i16>* %x) { +; CHECK-LABEL: mulhs_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 8 +; CHECK-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: lui a1, %hi(.LCPI57_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI57_0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vmulh.vv v25, v25, v26 +; CHECK-NEXT: vsra.vi v25, v25, 1 +; CHECK-NEXT: vsrl.vi v26, v25, 15 +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = sdiv <8 x i16> %a, + store <8 x i16> %b, <8 x i16>* %x + ret void +} + +define void @mulhs_v4i32(<4 x i32>* %x) { +; LMULMAX1-RV32-LABEL: mulhs_v4i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI58_0) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI58_0) +; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) +; LMULMAX1-RV32-NEXT: vmulh.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 31 +; LMULMAX1-RV32-NEXT: vsra.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: 
vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhs_v4i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI58_0) +; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI58_0) +; LMULMAX1-RV64-NEXT: vle32.v v26, (a1) +; LMULMAX1-RV64-NEXT: vmulh.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsra.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 31 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = sdiv <4 x i32> %a, + store <4 x i32> %b, <4 x i32>* %x + ret void +} + +define void @mulhs_v2i64(<2 x i64>* %x) { +; LMULMAX1-RV32-LABEL: mulhs_v2i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI59_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI59_0) +; LMULMAX1-RV32-NEXT: addi a3, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a4, a3, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmul.vv v26, v25, v26 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI59_1) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI59_1) +; LMULMAX1-RV32-NEXT: vsetvli a4, a3, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmulh.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI59_2) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI59_2) +; LMULMAX1-RV32-NEXT: vsetvli a4, a3, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v26, v25, v26 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI59_3) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI59_3) +; LMULMAX1-RV32-NEXT: vsetvli a3, a3, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsra.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhs_v2i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI59_0) +; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI59_0) +; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) +; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI59_1) +; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI59_1) +; LMULMAX1-RV64-NEXT: vle64.v v27, (a1) +; LMULMAX1-RV64-NEXT: vmul.vv v26, v25, v26 +; LMULMAX1-RV64-NEXT: vmulh.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: addi a1, zero, 63 +; LMULMAX1-RV64-NEXT: vsrl.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vid.v v27 +; LMULMAX1-RV64-NEXT: vsra.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = sdiv <2 x i64> %a, + store <2 x i64> %b, <2 x i64>* %x + ret void +} + define void @smin_v16i8(<16 x i8>* %x, <16 x i8>* %y) { ; CHECK-LABEL: smin_v16i8: ; CHECK: # %bb.0: @@ -3778,6 +4055,478 @@ define void @extract_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ret void } +define void 
@mulhu_v32i8(<32 x i8>* %x) { +; LMULMAX2-LABEL: mulhu_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, zero, 32 +; LMULMAX2-NEXT: vsetvli a1, a1, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI129_0) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI129_0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI129_1) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI129_1) +; LMULMAX2-NEXT: vle8.v v30, (a1) +; LMULMAX2-NEXT: vsrl.vv v28, v26, v28 +; LMULMAX2-NEXT: vmulhu.vv v28, v28, v30 +; LMULMAX2-NEXT: lui a1, %hi(.LCPI129_2) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI129_2) +; LMULMAX2-NEXT: vle8.v v30, (a1) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI129_3) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI129_3) +; LMULMAX2-NEXT: vle8.v v8, (a1) +; LMULMAX2-NEXT: vsub.vv v26, v26, v28 +; LMULMAX2-NEXT: vmulhu.vv v26, v26, v30 +; LMULMAX2-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vsrl.vv v26, v26, v8 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: mulhu_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v25, (a1) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI129_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI129_0) +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: vle8.v v27, (a0) +; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vse8.v v26, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v25, (a1) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhu_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v25, (a1) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI129_0) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI129_0) +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v27, (a0) +; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vse8.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v25, (a1) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = udiv <32 x i8> %a, + store <32 x i8> %b, <32 x i8>* %x + ret void +} + +define void @mulhu_v16i16(<16 x i16>* %x) { +; LMULMAX2-LABEL: mulhu_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, zero, 16 +; LMULMAX2-NEXT: vsetvli a1, a1, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI130_0) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI130_0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI130_1) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI130_1) +; LMULMAX2-NEXT: vle16.v v30, (a1) +; LMULMAX2-NEXT: vsrl.vv v28, v26, v28 +; LMULMAX2-NEXT: vmulhu.vv v28, v28, v30 +; LMULMAX2-NEXT: lui a1, %hi(.LCPI130_2) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI130_2) +; LMULMAX2-NEXT: vle16.v v30, (a1) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI130_3) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI130_3) +; LMULMAX2-NEXT: vle16.v v8, (a1) +; LMULMAX2-NEXT: vsub.vv v26, v26, v28 +; LMULMAX2-NEXT: vmulhu.vv v26, v26, v30 +; LMULMAX2-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vsrl.vv v26, v26, v8 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: mulhu_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a1, 
a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v25, (a1) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI130_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI130_0) +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: vle16.v v27, (a0) +; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vse16.v v26, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v25, (a1) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhu_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI130_0) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI130_0) +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v27, (a0) +; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vse16.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v25, (a1) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = udiv <16 x i16> %a, + store <16 x i16> %b, <16 x i16>* %x + ret void +} + +define void @mulhu_v8i32(<8 x i32>* %x) { +; LMULMAX2-LABEL: mulhu_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, zero, 8 +; LMULMAX2-NEXT: vsetvli a1, a1, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI131_0) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI131_0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vmulhu.vv v28, v26, v28 +; LMULMAX2-NEXT: lui a1, %hi(.LCPI131_1) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI131_1) +; LMULMAX2-NEXT: vle32.v v30, (a1) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI131_2) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI131_2) +; LMULMAX2-NEXT: vle32.v v8, (a1) +; LMULMAX2-NEXT: vsub.vv v26, v26, v28 +; LMULMAX2-NEXT: vmulhu.vv v26, v26, v30 +; LMULMAX2-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vsrl.vv v26, v26, v8 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: mulhu_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI131_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI131_0) +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vle32.v v27, (a0) +; LMULMAX1-RV32-NEXT: vmulhu.vv v28, v25, v26 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI131_1) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI131_1) +; LMULMAX1-RV32-NEXT: vle32.v v29, (a2) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI131_2) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI131_2) +; LMULMAX1-RV32-NEXT: vle32.v v30, (a2) +; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vmulhu.vv v25, v25, v29 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v30 +; LMULMAX1-RV32-NEXT: vmulhu.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vsub.vv v27, v27, v26 +; LMULMAX1-RV32-NEXT: vmulhu.vv v27, v27, v29 +; LMULMAX1-RV32-NEXT: vadd.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vsrl.vv v26, v26, v30 +; LMULMAX1-RV32-NEXT: vse32.v v26, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v25, (a1) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhu_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV64-NEXT: lui 
a2, %hi(.LCPI131_0) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI131_0) +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v27, (a0) +; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vse32.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v25, (a1) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = udiv <8 x i32> %a, + store <8 x i32> %b, <8 x i32>* %x + ret void +} + +define void @mulhu_v4i64(<4 x i64>* %x) { +; LMULMAX1-RV32-LABEL: mulhu_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: lui a3, %hi(.LCPI132_0) +; LMULMAX1-RV32-NEXT: addi a3, a3, %lo(.LCPI132_0) +; LMULMAX1-RV32-NEXT: addi a4, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a5, a4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vsetvli a3, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vdivu.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: lui a3, %hi(.LCPI132_1) +; LMULMAX1-RV32-NEXT: addi a3, a3, %lo(.LCPI132_1) +; LMULMAX1-RV32-NEXT: vsetvli a4, a4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhu_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v25, (a1) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI132_0) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI132_0) +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI132_1) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI132_1) +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a0) +; LMULMAX1-RV64-NEXT: vmulhu.vv v26, v25, v26 +; LMULMAX1-RV64-NEXT: vsub.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vmulhu.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI132_2) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI132_2) +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI132_3) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI132_3) +; LMULMAX1-RV64-NEXT: vle64.v v29, (a2) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI132_4) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI132_4) +; LMULMAX1-RV64-NEXT: vle64.v v30, (a2) +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vmulhu.vv v26, v28, v29 +; LMULMAX1-RV64-NEXT: vsrl.vv v26, v26, v30 +; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v25, (a1) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = udiv <4 x i64> %a, + store <4 x i64> %b, <4 x i64>* %x + ret void +} + +define void @mulhs_v32i8(<32 x i8>* %x) { +; LMULMAX2-LABEL: mulhs_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, zero, 32 +; LMULMAX2-NEXT: vsetvli a1, a1, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI133_0) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI133_0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI133_1) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI133_1) +; LMULMAX2-NEXT: vle8.v v30, (a1) +; LMULMAX2-NEXT: vmulhu.vv v26, v26, v28 +; 
LMULMAX2-NEXT: vsrl.vv v26, v26, v30 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: mulhs_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v25, (a1) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI133_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI133_0) +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: vle8.v v27, (a0) +; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vse8.v v26, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v25, (a1) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhs_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v25, (a1) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI133_0) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI133_0) +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v27, (a0) +; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vse8.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v25, (a1) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = udiv <32 x i8> %a, + store <32 x i8> %b, <32 x i8>* %x + ret void +} + +define void @mulhs_v16i16(<16 x i16>* %x) { +; LMULMAX2-LABEL: mulhs_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, zero, 16 +; LMULMAX2-NEXT: vsetvli a1, a1, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: lui a1, %hi(.LCPI134_0) +; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI134_0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vmulh.vv v26, v26, v28 +; LMULMAX2-NEXT: vsra.vi v26, v26, 1 +; LMULMAX2-NEXT: vsrl.vi v28, v26, 15 +; LMULMAX2-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: mulhs_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v25, (a1) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI134_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI134_0) +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: vle16.v v27, (a0) +; LMULMAX1-RV32-NEXT: vdiv.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vdiv.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vse16.v v26, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v25, (a1) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhs_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI134_0) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI134_0) +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v27, (a0) +; LMULMAX1-RV64-NEXT: vdiv.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vdiv.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vse16.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v25, (a1) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = sdiv <16 x i16> %a, + store <16 x i16> %b, <16 x i16>* %x + ret void +} + +define void @mulhs_v8i32(<8 x i32>* %x) { +; LMULMAX1-RV32-LABEL: mulhs_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, 
e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI135_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI135_0) +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vle32.v v27, (a0) +; LMULMAX1-RV32-NEXT: vmulh.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v28, v25, 31 +; LMULMAX1-RV32-NEXT: vsra.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vmulh.vv v26, v27, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 31 +; LMULMAX1-RV32-NEXT: vsra.vi v26, v26, 1 +; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vse32.v v26, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v25, (a1) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhs_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v25, (a1) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI135_0) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI135_0) +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v27, (a0) +; LMULMAX1-RV64-NEXT: vdiv.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vdiv.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vse32.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v25, (a1) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = sdiv <8 x i32> %a, + store <8 x i32> %b, <8 x i32>* %x + ret void +} + +define void @mulhs_v4i64(<4 x i64>* %x) { +; LMULMAX1-RV32-LABEL: mulhs_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: lui a3, %hi(.LCPI136_0) +; LMULMAX1-RV32-NEXT: addi a3, a3, %lo(.LCPI136_0) +; LMULMAX1-RV32-NEXT: addi a4, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a4, a4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vdiv.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vdiv.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhs_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v25, (a1) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI136_0) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI136_0) +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI136_1) +; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI136_1) +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a0) +; LMULMAX1-RV64-NEXT: vmul.vv v29, v25, v26 +; LMULMAX1-RV64-NEXT: vmulh.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v29 +; LMULMAX1-RV64-NEXT: addi a2, zero, 63 +; LMULMAX1-RV64-NEXT: vsrl.vx v29, v25, a2 +; LMULMAX1-RV64-NEXT: vid.v v30 +; LMULMAX1-RV64-NEXT: vsra.vv v25, v25, v30 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v29 +; LMULMAX1-RV64-NEXT: vmul.vv v26, v28, v26 +; LMULMAX1-RV64-NEXT: vmulh.vv v27, v28, v27 +; LMULMAX1-RV64-NEXT: vadd.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, a2 +; LMULMAX1-RV64-NEXT: vsra.vv v26, v26, v30 +; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v25, (a1) +; LMULMAX1-RV64-NEXT: 
ret + %a = load <4 x i64>, <4 x i64>* %x + %b = sdiv <4 x i64> %a, + store <4 x i64> %b, <4 x i64>* %x + ret void +} + define void @smin_v32i8(<32 x i8>* %x, <32 x i8>* %y) { ; LMULMAX2-LABEL: smin_v32i8: ; LMULMAX2: # %bb.0: @@ -6212,3 +6961,264 @@ define void @urem_vx_v4i32(<4 x i32>* %x, i32 %y) { store <4 x i32> %d, <4 x i32>* %x ret void } + +define void @mulhu_vx_v16i8(<16 x i8>* %x) { +; CHECK-LABEL: mulhu_vx_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 16 +; CHECK-NEXT: vsetvli a1, a1, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: addi a1, zero, 57 +; CHECK-NEXT: vmulhu.vx v25, v25, a1 +; CHECK-NEXT: vsrl.vi v25, v25, 1 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = udiv <16 x i8> %a, + store <16 x i8> %b, <16 x i8>* %x + ret void +} + +define void @mulhu_vx_v8i16(<8 x i16>* %x) { +; LMULMAX1-RV32-LABEL: mulhu_vx_v8i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: lui a1, 2 +; LMULMAX1-RV32-NEXT: addi a1, a1, 1171 +; LMULMAX1-RV32-NEXT: vmulhu.vx v26, v25, a1 +; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhu_vx_v8i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: lui a1, 2 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 1171 +; LMULMAX1-RV64-NEXT: vmulhu.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vsub.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = udiv <8 x i16> %a, + store <8 x i16> %b, <8 x i16>* %x + ret void +} + +define void @mulhu_vx_v4i32(<4 x i32>* %x) { +; LMULMAX1-RV32-LABEL: mulhu_vx_v4i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: lui a1, 838861 +; LMULMAX1-RV32-NEXT: addi a1, a1, -819 +; LMULMAX1-RV32-NEXT: vmulhu.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhu_vx_v4i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: lui a1, 838861 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 +; LMULMAX1-RV64-NEXT: vmulhu.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = udiv <4 x i32> %a, + store <4 x i32> %b, <4 x i32>* %x + ret void +} + +define void @mulhu_vx_v2i64(<2 x i64>* %x) { +; LMULMAX1-RV32-LABEL: mulhu_vx_v2i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI252_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI252_0) +; LMULMAX1-RV32-NEXT: addi a3, zero, 4 +; 
LMULMAX1-RV32-NEXT: vsetvli a4, a3, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmulhu.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI252_1) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI252_1) +; LMULMAX1-RV32-NEXT: vsetvli a3, a3, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhu_vx_v2i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: lui a1, 1026731 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: vmulhu.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = udiv <2 x i64> %a, + store <2 x i64> %b, <2 x i64>* %x + ret void +} + +define void @mulhs_vx_v16i8(<16 x i8>* %x) { +; CHECK-LABEL: mulhs_vx_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 16 +; CHECK-NEXT: vsetvli a1, a1, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: addi a1, zero, -123 +; CHECK-NEXT: vmulhu.vx v25, v25, a1 +; CHECK-NEXT: vsrl.vi v25, v25, 7 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = udiv <16 x i8> %a, + store <16 x i8> %b, <16 x i8>* %x + ret void +} + +define void @mulhs_vx_v8i16(<8 x i16>* %x) { +; LMULMAX1-RV32-LABEL: mulhs_vx_v8i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: lui a1, 5 +; LMULMAX1-RV32-NEXT: addi a1, a1, -1755 +; LMULMAX1-RV32-NEXT: vmulh.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vsra.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 15 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhs_vx_v8i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: lui a1, 5 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -1755 +; LMULMAX1-RV64-NEXT: vmulh.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsra.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 15 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = sdiv <8 x i16> %a, + store <8 x i16> %b, <8 x i16>* %x + ret void +} + +define void @mulhs_vx_v4i32(<4 x i32>* %x) { +; LMULMAX1-RV32-LABEL: mulhs_vx_v4i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: lui a1, 629146 +; LMULMAX1-RV32-NEXT: addi a1, a1, -1639 +; LMULMAX1-RV32-NEXT: vmulh.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 31 +; LMULMAX1-RV32-NEXT: vsra.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 +; 
LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhs_vx_v4i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: lui a1, 629146 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -1639 +; LMULMAX1-RV64-NEXT: vmulh.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsra.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 31 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = sdiv <4 x i32> %a, + store <4 x i32> %b, <4 x i32>* %x + ret void +} + +define void @mulhs_vx_v2i64(<2 x i64>* %x) { +; LMULMAX1-RV32-LABEL: mulhs_vx_v2i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a1, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI256_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI256_0) +; LMULMAX1-RV32-NEXT: addi a3, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a4, a3, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vsetvli a2, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmulh.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI256_1) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI256_1) +; LMULMAX1-RV32-NEXT: vsetvli a3, a3, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v26, v25, v26 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mulhs_vx_v2i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a1, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: lui a1, 21845 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1366 +; LMULMAX1-RV64-NEXT: vmulh.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: addi a1, zero, 63 +; LMULMAX1-RV64-NEXT: vsrl.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = sdiv <2 x i64> %a, + store <2 x i64> %b, <2 x i64>* %x + ret void +} -- 2.7.4
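For reference, the transform these tests exercise can be summarized in one self-contained snippet: once ISD::MULHS/ISD::MULHU are custom-lowered for fixed vectors, SelectionDAG's divide-by-constant optimization is able to replace a vector udiv/sdiv by a constant splat with a multiply-high plus shift sequence instead of a vdivu/vdiv. The sketch below is illustrative only and is not part of the patch: the function name @udiv7_v4i32 and the divisor 7 are invented for the example (the actual test divisors are elided above), and the expected codegen is described in comments rather than asserted with CHECK lines.

; Hypothetical example, assuming the same typed-pointer IR style as the
; tests above: an unsigned divide of a fixed <4 x i32> vector by a splat 7.
define void @udiv7_v4i32(<4 x i32>* %x) {
  ; Load the four 32-bit lanes from memory.
  %a = load <4 x i32>, <4 x i32>* %x
  ; With ISD::MULHU marked Custom for this fixed vector type, this udiv is
  ; expected to lower to a vmulhu-based sequence (multiply-high, sub/add
  ; fixup, final shift), roughly as in mulhu_v4i32 above, rather than to
  ; a vdivu.vv instruction.
  %b = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ; Store the quotients back to the same location.
  store <4 x i32> %b, <4 x i32>* %x
  ret void
}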