From: Philip Reames
Date: Thu, 29 Jun 2023 14:24:54 +0000 (-0700)
Subject: [RISCV] Remove legacy TA/TU pseudo distinction for unary instructions
X-Git-Tag: upstream/17.0.6~3432
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=92b5a3405de9f04cb31fae21138183c417afc42e;p=platform%2Fupstream%2Fllvm.git

[RISCV] Remove legacy TA/TU pseudo distinction for unary instructions

This change continues with the line of work discussed in
https://discourse.llvm.org/t/riscv-transition-in-vector-pseudo-structure-policy-variants/71295.

In D153155, we started removing the legacy distinction between unsuffixed
(TA) and _TU pseudos. This patch continues that effort for the unary
instruction families.

The change consists of a few interacting pieces:
* Adding a vector policy operand to VPseudoUnaryNoMaskTU.
* Then using VPseudoUnaryNoMaskTU for all cases where VPseudoUnaryNoMask
  was previously used and deleting the unsuffixed form.
* Then renaming VPseudoUnaryNoMaskTU to VPseudoUnaryNoMask, and adjusting
  the RISCVMaskedPseudo table to use the combined pseudo.
* Fixing up two places in C++ code which manually construct VMV_V_*
  instructions.

Normally, I'd try to factor this into a couple of changes, but in this
case, the table structure is tied to naming and thus we can't really
separate the otherwise NFC bits.

As before, we see codegen changes (some improvements and some regressions)
due to scheduling differences caused by the extra implicit_def
instructions.

Differential Revision: https://reviews.llvm.org/D153899
---

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 69c20dd..03b320c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3423,7 +3423,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N, bool IsTA) {
 bool RISCVDAGToDAGISel::performVMergeToVMv(SDNode *N) {
 #define CASE_VMERGE_TO_VMV(lmul)                                               \
   case RISCV::PseudoVMERGE_VVM_##lmul##_TU:                                    \
-    NewOpc = RISCV::PseudoVMV_V_V_##lmul##_TU;                                 \
+    NewOpc = RISCV::PseudoVMV_V_V_##lmul;                                      \
     break;
   unsigned NewOpc;
   switch (N->getMachineOpcode()) {
@@ -3441,9 +3441,13 @@ bool RISCVDAGToDAGISel::performVMergeToVMv(SDNode *N) {
   if (!usesAllOnesMask(N, /* MaskOpIdx */ 3))
     return false;

+  SDLoc DL(N);
+  SDValue PolicyOp =
+      CurDAG->getTargetConstant(/*TUMU*/ 0, DL, Subtarget->getXLenVT());
   SDNode *Result = CurDAG->getMachineNode(
-      NewOpc, SDLoc(N), N->getValueType(0),
-      {N->getOperand(1), N->getOperand(2), N->getOperand(4), N->getOperand(5)});
+      NewOpc, DL, N->getValueType(0),
+      {N->getOperand(1), N->getOperand(2), N->getOperand(4), N->getOperand(5),
+       PolicyOp});
   ReplaceUses(N, Result);
   return true;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index b88befe..a1f315d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -453,14 +453,17 @@ void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,

   if (NF == 1) {
     auto MIB = BuildMI(MBB, MBBI, DL, get(Opc), DstReg);
+    if (UseVMV_V_V)
+      MIB.addReg(DstReg, RegState::Undef);
     if (UseVMV_V_I)
-      MIB = MIB.add(DefMBBI->getOperand(1));
+      MIB = MIB.add(DefMBBI->getOperand(2));
     else
       MIB = MIB.addReg(SrcReg, getKillRegState(KillSrc));
     if (UseVMV_V_V) {
       const MCInstrDesc &Desc = DefMBBI->getDesc();
       MIB.add(DefMBBI->getOperand(RISCVII::getVLOpNum(Desc)));  // AVL
       MIB.add(DefMBBI->getOperand(RISCVII::getSEWOpNum(Desc))); // SEW
+      MIB.addImm(0);                                            // tu, mu
       MIB.addReg(RISCV::VL, RegState::Implicit);
MIB.addReg(RISCV::VTYPE, RegState::Implicit); } @@ -481,8 +484,11 @@ void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB, for (; I != End; I += Incr) { auto MIB = BuildMI(MBB, MBBI, DL, get(Opc), TRI->getSubReg(DstReg, SubRegIdx + I)); + if (UseVMV_V_V) + MIB.addReg(TRI->getSubReg(DstReg, SubRegIdx + I), + RegState::Undef); if (UseVMV_V_I) - MIB = MIB.add(DefMBBI->getOperand(1)); + MIB = MIB.add(DefMBBI->getOperand(2)); else MIB = MIB.addReg(TRI->getSubReg(SrcReg, SubRegIdx + I), getKillRegState(KillSrc)); @@ -490,6 +496,7 @@ void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const MCInstrDesc &Desc = DefMBBI->getDesc(); MIB.add(DefMBBI->getOperand(RISCVII::getVLOpNum(Desc))); // AVL MIB.add(DefMBBI->getOperand(RISCVII::getSEWOpNum(Desc))); // SEW + MIB.addImm(0); // tu, mu MIB.addReg(RISCV::VL, RegState::Implicit); MIB.addReg(RISCV::VTYPE, RegState::Implicit); } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 5a21308..620ef90 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -1030,21 +1030,9 @@ class VPseudoNullaryPseudoM class VPseudoUnaryNoMask : - Pseudo<(outs RetClass:$rd), - (ins OpClass:$rs2, AVL:$vl, ixlenimm:$sew), []>, - RISCVVPseudo { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let Constraints = Constraint; - let HasVLOp = 1; - let HasSEWOp = 1; -} - -class VPseudoUnaryNoMaskTU : Pseudo<(outs RetClass:$rd), - (ins RetClass:$merge, OpClass:$rs2, AVL:$vl, ixlenimm:$sew), []>, + (ins RetClass:$merge, OpClass:$rs2, AVL:$vl, ixlenimm:$sew, + ixlenimm:$policy), []>, RISCVVPseudo { let mayLoad = 0; let mayStore = 0; @@ -1052,6 +1040,7 @@ class VPseudoUnaryNoMaskTU.ret; let HasVLOp = 1; let HasSEWOp = 1; + let HasVecPolicyOp = 1; } class VPseudoUnaryMask : @@ -1958,10 +1947,10 @@ multiclass VPseudoVIOT_M { let VLMul = m.value in { def "_" # m.MX : VPseudoUnaryNoMask, Sched<[WriteVMIotV_MX, ReadVMIotV_MX, ReadVMask]>; - def "_" # m.MX # "_TU" : VPseudoUnaryNoMaskTU, - Sched<[WriteVMIotV_MX, ReadVMIotV_MX, ReadVMask]>; def "_" # m.MX # "_MASK" : VPseudoUnaryMask, - RISCVMaskedPseudo, + RISCVMaskedPseudo, Sched<[WriteVMIotV_MX, ReadVMIotV_MX, ReadVMask]>; } } @@ -2328,12 +2317,6 @@ multiclass VPseudoUnaryVMV_V_X_I { Sched<[WriteVIMovX_MX, ReadVIMovX_MX]>; def "_I_" # mx : VPseudoUnaryNoMask, Sched<[WriteVIMovI_MX]>; - def "_V_" # mx # "_TU": VPseudoUnaryNoMaskTU, - Sched<[WriteVIMovV_MX, ReadVIMovV_MX]>; - def "_X_" # mx # "_TU": VPseudoUnaryNoMaskTU, - Sched<[WriteVIMovX_MX, ReadVIMovX_MX]>; - def "_I_" # mx # "_TU": VPseudoUnaryNoMaskTU, - Sched<[WriteVIMovI_MX]>; } } } @@ -2350,9 +2333,6 @@ multiclass VPseudoVMV_F { def "_" # f.FX # "_" # mx : VPseudoUnaryNoMask, Sched<[WriteVFMovV_MX, ReadVFMovF_MX]>; - def "_" # f.FX # "_" # mx # "_TU": - VPseudoUnaryNoMaskTU, - Sched<[WriteVFMovV_MX, ReadVFMovF_MX]>; } } } @@ -2367,10 +2347,10 @@ multiclass VPseudoVCLS_V { let VLMul = m.value in { def "_V_" # mx : VPseudoUnaryNoMask, Sched<[WriteVFClassV_MX, ReadVFClassV_MX, ReadVMask]>; - def "_V_" # mx # "_TU": VPseudoUnaryNoMaskTU, - Sched<[WriteVFClassV_MX, ReadVFClassV_MX, ReadVMask]>; def "_V_" # mx # "_MASK" : VPseudoUnaryMask, - RISCVMaskedPseudo, + RISCVMaskedPseudo, Sched<[WriteVFClassV_MX, ReadVFClassV_MX, ReadVMask]>; } } @@ -2390,11 +2370,10 @@ multiclass VPseudoVSQR_V { def "_V" # suffix : VPseudoUnaryNoMask, Sched<[WriteVFSqrtV_MX_E, ReadVFSqrtV_MX_E, ReadVMask]>; - def "_V" # suffix # "_TU": VPseudoUnaryNoMaskTU, - 
Sched<[WriteVFSqrtV_MX_E, ReadVFSqrtV_MX_E, - ReadVMask]>; def "_V" # suffix # "_MASK" : VPseudoUnaryMask, - RISCVMaskedPseudo, + RISCVMaskedPseudo, Sched<[WriteVFSqrtV_MX_E, ReadVFSqrtV_MX_E, ReadVMask]>; } @@ -2410,10 +2389,10 @@ multiclass VPseudoVRCP_V { let VLMul = m.value in { def "_V_" # mx : VPseudoUnaryNoMask, Sched<[WriteVFRecpV_MX, ReadVFRecpV_MX, ReadVMask]>; - def "_V_" # mx # "_TU": VPseudoUnaryNoMaskTU, - Sched<[WriteVFRecpV_MX, ReadVFRecpV_MX, ReadVMask]>; def "_V_" # mx # "_MASK" : VPseudoUnaryMask, - RISCVMaskedPseudo, + RISCVMaskedPseudo, Sched<[WriteVFRecpV_MX, ReadVFRecpV_MX, ReadVMask]>; } } @@ -2430,11 +2409,9 @@ multiclass PseudoVEXT_VF2 { let VLMul = m.value in { def "_" # mx : VPseudoUnaryNoMask, Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>; - def "_" # mx # "_TU": VPseudoUnaryNoMaskTU, - Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>; def "_" # mx # "_MASK" : VPseudoUnaryMask, - RISCVMaskedPseudo, + RISCVMaskedPseudo, Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>; } } @@ -2451,11 +2428,9 @@ multiclass PseudoVEXT_VF4 { let VLMul = m.value in { def "_" # mx : VPseudoUnaryNoMask, Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>; - def "_" # mx # "_TU": VPseudoUnaryNoMaskTU, - Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>; def "_" # mx # "_MASK" : VPseudoUnaryMask, - RISCVMaskedPseudo, + RISCVMaskedPseudo, Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>; } } @@ -2472,11 +2447,9 @@ multiclass PseudoVEXT_VF8 { let VLMul = m.value in { def "_" # mx : VPseudoUnaryNoMask, Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>; - def "_" # mx # "_TU": VPseudoUnaryNoMaskTU, - Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>; def "_" # mx # "_MASK" : VPseudoUnaryMask, - RISCVMaskedPseudo, + RISCVMaskedPseudo, Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>; } } @@ -3554,10 +3527,11 @@ multiclass VPseudoConversion { let VLMul = MInfo.value in { def "_" # MInfo.MX : VPseudoUnaryNoMask; - def "_" # MInfo.MX # "_TU": VPseudoUnaryNoMaskTU; def "_" # MInfo.MX # "_MASK" : VPseudoUnaryMask, - RISCVMaskedPseudo; + RISCVMaskedPseudo; } } @@ -3933,40 +3907,20 @@ class VPatUnaryNoMask : Pat<(result_type (!cast(intrinsic_name) - (result_type undef), + (result_type result_reg_class:$merge), (op2_type op2_reg_class:$rs2), VLOpFrag)), (!cast( !if(isSEWAware, inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew), inst#"_"#kind#"_"#vlmul.MX)) - (op2_type op2_reg_class:$rs2), - GPR:$vl, log2sew)>; - -class VPatUnaryNoMaskTU : - Pat<(result_type (!cast(intrinsic_name) (result_type result_reg_class:$merge), (op2_type op2_reg_class:$rs2), - VLOpFrag)), - (!cast( - !if(isSEWAware, - inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_TU", - inst#"_"#kind#"_"#vlmul.MX#"_TU")) - (result_type result_reg_class:$merge), - (op2_type op2_reg_class:$rs2), - GPR:$vl, log2sew)>; + GPR:$vl, log2sew, TU_MU)>; class VPatUnaryMask(inst#"_M_"#mti.BX) + (mti.Mask (IMPLICIT_DEF)), (mti.Mask VR:$rs2), - GPR:$vl, mti.Log2SEW)>; + GPR:$vl, mti.Log2SEW, TU_MU)>; class VPatMaskUnaryMask foreach vti = AllIntegerVectors in { let Predicates = GetVTypePredicates.Predicates in { def : VPatUnaryNoMask; - def : VPatUnaryNoMaskTU; + vti.Log2SEW, vti.LMul, vti.RegClass, VR>; def : VPatUnaryMask; } @@ -4497,10 +4450,7 @@ multiclass VPatUnaryV_VF.Predicates) in { def : VPatUnaryNoMask; - def : VPatUnaryNoMaskTU; + vti.Log2SEW, vti.LMul, vti.RegClass, fti.RegClass>; def : VPatUnaryMask; @@ -4514,10 +4464,7 @@ multiclass VPatUnaryV_V.Predicates in { def : VPatUnaryNoMask; - def : VPatUnaryNoMaskTU; + vti.LMul, vti.RegClass, vti.RegClass, 
isSEWAware>; def : VPatUnaryMask; @@ -4720,9 +4667,7 @@ multiclass VPatConversionTA { def : VPatUnaryNoMask; - def : VPatUnaryNoMaskTU; + sew, vlmul, result_reg_class, op1_reg_class>; def : VPatUnaryMask; } @@ -6561,16 +6506,11 @@ defm : VPatBinaryV_VM_XM_IM<"int_riscv_vmerge", "PseudoVMERGE">; //===----------------------------------------------------------------------===// foreach vti = AllVectors in { let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector undef), - (vti.Vector vti.RegClass:$rs1), - VLOpFrag)), - (!cast("PseudoVMV_V_V_"#vti.LMul.MX) - $rs1, GPR:$vl, vti.Log2SEW)>; def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$passthru), (vti.Vector vti.RegClass:$rs1), VLOpFrag)), - (!cast("PseudoVMV_V_V_"#vti.LMul.MX#"_TU") - $passthru, $rs1, GPR:$vl, vti.Log2SEW)>; + (!cast("PseudoVMV_V_V_"#vti.LMul.MX) + $passthru, $rs1, GPR:$vl, vti.Log2SEW, TU_MU)>; // vmv.v.x/vmv.v.i are handled in RISCInstrVInstrInfoVVLPatterns.td } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index cb5b2d3..bb77e01 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -316,7 +316,8 @@ multiclass VPatExtendSDNode_V ops, string inst_name, string suffix, GetVTypePredicates.Predicates) in def : Pat<(vti.Vector (op (fti.Vector fti.RegClass:$rs2))), (!cast(inst_name#"_"#suffix#"_"#vti.LMul.MX) - fti.RegClass:$rs2, fti.AVL, vti.Log2SEW)>; + (vti.Vector (IMPLICIT_DEF)), + fti.RegClass:$rs2, fti.AVL, vti.Log2SEW, TU_MU)>; } } @@ -328,7 +329,8 @@ multiclass VPatConvertI2FPSDNode_V.Predicates) in def : Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1))), (!cast(instruction_name#"_"#fvti.LMul.MX) - ivti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>; + (fvti.Vector (IMPLICIT_DEF)), + ivti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW, TU_MU)>; } } @@ -340,7 +342,8 @@ multiclass VPatConvertFP2ISDNode_V.Predicates) in def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1))), (!cast(instruction_name#"_"#ivti.LMul.MX) - fvti.RegClass:$rs1, ivti.AVL, ivti.Log2SEW)>; + (ivti.Vector (IMPLICIT_DEF)), + fvti.RegClass:$rs1, ivti.AVL, ivti.Log2SEW, TU_MU)>; } } @@ -353,7 +356,8 @@ multiclass VPatWConvertI2FPSDNode_V.Predicates) in def : Pat<(fwti.Vector (vop (ivti.Vector ivti.RegClass:$rs1))), (!cast(instruction_name#"_"#ivti.LMul.MX) - ivti.RegClass:$rs1, ivti.AVL, ivti.Log2SEW)>; + (fwti.Vector (IMPLICIT_DEF)), + ivti.RegClass:$rs1, ivti.AVL, ivti.Log2SEW, TU_MU)>; } } @@ -366,7 +370,8 @@ multiclass VPatWConvertFP2ISDNode_V.Predicates) in def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1))), (!cast(instruction_name#"_"#fvti.LMul.MX) - fvti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>; + (iwti.Vector (IMPLICIT_DEF)), + fvti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW, TU_MU)>; } } @@ -379,7 +384,8 @@ multiclass VPatNConvertI2FPSDNode_W.Predicates) in def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1))), (!cast(instruction_name#"_"#fvti.LMul.MX) - iwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>; + (fvti.Vector (IMPLICIT_DEF)), + iwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW, TU_MU)>; } } @@ -392,7 +398,8 @@ multiclass VPatNConvertFP2ISDNode_W.Predicates) in def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1))), (!cast(instruction_name#"_"#vti.LMul.MX) - fwti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>; + (vti.Vector (IMPLICIT_DEF)), + fwti.RegClass:$rs1, vti.AVL, vti.Log2SEW, TU_MU)>; } } @@ -1039,7 +1046,8 @@ foreach vti = 
AllFloatVectors in { // 13.8. Vector Floating-Point Square-Root Instruction def : Pat<(any_fsqrt (vti.Vector vti.RegClass:$rs2)), (!cast("PseudoVFSQRT_V_"# vti.LMul.MX#"_E"#vti.SEW) - vti.RegClass:$rs2, vti.AVL, vti.Log2SEW)>; + (vti.Vector (IMPLICIT_DEF)), + vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TU_MU)>; // 13.12. Vector Floating-Point Sign-Injection Instructions def : Pat<(fabs (vti.Vector vti.RegClass:$rs)), @@ -1141,7 +1149,8 @@ foreach fvtiToFWti = AllWidenableFloatVectors in { GetVTypePredicates.Predicates) in def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))), (!cast("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX) - fwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>; + (fvti.Vector (IMPLICIT_DEF)), + fwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW, TU_MU)>; } //===----------------------------------------------------------------------===// @@ -1152,12 +1161,14 @@ foreach fvti = AllFloatVectors in { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(fvti.Vector (SplatFPOp fvti.ScalarRegClass:$rs1)), (!cast("PseudoVFMV_V_"#fvti.ScalarSuffix#"_"#fvti.LMul.MX) + (fvti.Vector (IMPLICIT_DEF)), (fvti.Scalar fvti.ScalarRegClass:$rs1), - fvti.AVL, fvti.Log2SEW)>; + fvti.AVL, fvti.Log2SEW, TU_MU)>; def : Pat<(fvti.Vector (SplatFPOp (fvti.Scalar fpimm0))), (!cast("PseudoVMV_V_I_"#fvti.LMul.MX) - 0, fvti.AVL, fvti.Log2SEW)>; + (fvti.Vector (IMPLICIT_DEF)), + 0, fvti.AVL, fvti.Log2SEW, TU_MU)>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 96b6f8e..ffa785a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -1788,32 +1788,21 @@ foreach vti = AllIntegerVectors in { // 11.16. Vector Integer Move Instructions foreach vti = AllVectors in { let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(vti.Vector (riscv_vmv_v_v_vl (vti.Vector undef), - vti.RegClass:$rs2, VLOpFrag)), - (!cast("PseudoVMV_V_V_"#vti.LMul.MX) - vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>; def : Pat<(vti.Vector (riscv_vmv_v_v_vl vti.RegClass:$passthru, vti.RegClass:$rs2, VLOpFrag)), - (!cast("PseudoVMV_V_V_"#vti.LMul.MX#"_TU") - vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>; + (!cast("PseudoVMV_V_V_"#vti.LMul.MX) + vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>; } foreach vti = AllIntegerVectors in { - def : Pat<(vti.Vector (riscv_vmv_v_x_vl (vti.Vector undef), GPR:$rs2, VLOpFrag)), - (!cast("PseudoVMV_V_X_"#vti.LMul.MX) - GPR:$rs2, GPR:$vl, vti.Log2SEW)>; def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.RegClass:$passthru, GPR:$rs2, VLOpFrag)), - (!cast("PseudoVMV_V_X_"#vti.LMul.MX#"_TU") - vti.RegClass:$passthru, GPR:$rs2, GPR:$vl, vti.Log2SEW)>; + (!cast("PseudoVMV_V_X_"#vti.LMul.MX) + vti.RegClass:$passthru, GPR:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>; defvar ImmPat = !cast("sew"#vti.SEW#"simm5"); - def : Pat<(vti.Vector (riscv_vmv_v_x_vl (vti.Vector undef), (ImmPat simm5:$imm5), - VLOpFrag)), - (!cast("PseudoVMV_V_I_"#vti.LMul.MX) - simm5:$imm5, GPR:$vl, vti.Log2SEW)>; def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.RegClass:$passthru, (ImmPat simm5:$imm5), VLOpFrag)), - (!cast("PseudoVMV_V_I_"#vti.LMul.MX#"_TU") - vti.RegClass:$passthru, simm5:$imm5, GPR:$vl, vti.Log2SEW)>; + (!cast("PseudoVMV_V_I_"#vti.LMul.MX) + vti.RegClass:$passthru, simm5:$imm5, GPR:$vl, vti.Log2SEW, TU_MU)>; } } @@ -1947,7 +1936,7 @@ foreach vti = AllFloatVectors in { def : Pat<(riscv_fclass_vl (vti.Vector vti.RegClass:$rs2), (vti.Mask true_mask), 
VLOpFrag), (!cast("PseudoVFCLASS_V_"# vti.LMul.MX) - vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>; + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>; } } @@ -2021,34 +2010,20 @@ foreach fvti = AllFloatVectors in { // 13.16. Vector Floating-Point Move Instruction // If we're splatting fpimm0, use vmv.v.x vd, x0. def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - (fvti.Vector undef), (fvti.Scalar (fpimm0)), VLOpFrag)), - (!cast("PseudoVMV_V_I_"#fvti.LMul.MX) - 0, GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl fvti.Vector:$passthru, (fvti.Scalar (fpimm0)), VLOpFrag)), - (!cast("PseudoVMV_V_I_"#fvti.LMul.MX#"_TU") - $passthru, 0, GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - (fvti.Vector undef), (fvti.Scalar (SelectFPImm (XLenVT GPR:$imm))), VLOpFrag)), - (!cast("PseudoVMV_V_X_"#fvti.LMul.MX) - GPR:$imm, GPR:$vl, fvti.Log2SEW)>; + (!cast("PseudoVMV_V_I_"#fvti.LMul.MX) + $passthru, 0, GPR:$vl, fvti.Log2SEW, TU_MU)>; def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl fvti.Vector:$passthru, (fvti.Scalar (SelectFPImm (XLenVT GPR:$imm))), VLOpFrag)), - (!cast("PseudoVMV_V_X_"#fvti.LMul.MX#"_TU") - $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW)>; + (!cast("PseudoVMV_V_X_"#fvti.LMul.MX) + $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW, TU_MU)>; def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - (fvti.Vector undef), (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), - (!cast("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" # - fvti.LMul.MX) - (fvti.Scalar fvti.ScalarRegClass:$rs2), - GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), (!cast("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" # - fvti.LMul.MX # "_TU") + fvti.LMul.MX) $passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), - GPR:$vl, fvti.Log2SEW)>; + GPR:$vl, fvti.Log2SEW, TU_MU)>; } } diff --git a/llvm/test/CodeGen/RISCV/calling-conv-vector-on-stack.ll b/llvm/test/CodeGen/RISCV/calling-conv-vector-on-stack.ll index 3e2af11..776ad18 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-vector-on-stack.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-vector-on-stack.ll @@ -17,11 +17,11 @@ define void @bar() nounwind { ; CHECK-NEXT: andi sp, sp, -64 ; CHECK-NEXT: mv s1, sp ; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: addi a0, s1, 64 -; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: sd a0, 0(sp) +; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: li a2, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir b/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir index 72d30cb..5294214 100644 --- a/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir +++ b/llvm/test/CodeGen/RISCV/rvv/emergency-slot.mir @@ -88,7 +88,7 @@ body: | ; CHECK-NEXT: $x2 = frame-setup SUB $x2, killed $x10 ; CHECK-NEXT: $x2 = frame-setup ANDI $x2, -128 ; CHECK-NEXT: dead renamable $x15 = PseudoVSETIVLI 1, 72 /* e16, m1, ta, mu */, implicit-def $vl, implicit-def $vtype - ; CHECK-NEXT: renamable $v25 = PseudoVMV_V_X_M1 killed renamable $x12, $noreg, 4 /* e16 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: renamable $v25 = PseudoVMV_V_X_M1 undef $v25, killed renamable $x12, $noreg, 4 /* e16 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $x10 = PseudoReadVLENB ; CHECK-NEXT: $x11 = ADDI killed $x0, 50 ; CHECK-NEXT: $x10 = MUL killed $x10, killed $x11 @@ -172,7 
+172,7 @@ body: | liveins: $x12 dead renamable $x15 = PseudoVSETIVLI 1, 72, implicit-def $vl, implicit-def $vtype - renamable $v25 = PseudoVMV_V_X_M1 killed renamable $x12, $noreg, 4, implicit $vl, implicit $vtype + renamable $v25 = PseudoVMV_V_X_M1 undef $v25, killed renamable $x12, $noreg, 4, 0, implicit $vl, implicit $vtype VS1R_V killed renamable $v25, %stack.1 :: (store unknown-size into %stack.1, align 8) renamable $x1 = ADDI $x0, 255 renamable $x5 = nuw ADDI %stack.0, 256 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll index 4f29bc4..2b6e66b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll @@ -42,12 +42,12 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-NEXT: vid.v v11 ; RV32-NEXT: vrgather.vv v10, v8, v11 -; RV32-NEXT: vadd.vi v8, v11, -1 ; RV32-NEXT: lui a0, 11 ; RV32-NEXT: addi a0, a0, -1366 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV32-NEXT: vmv.v.x v0, a0 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV32-NEXT: vadd.vi v8, v11, -1 ; RV32-NEXT: vrgather.vv v10, v9, v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret @@ -57,12 +57,12 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64-NEXT: vid.v v11 ; RV64-NEXT: vrgather.vv v10, v8, v11 -; RV64-NEXT: vadd.vi v8, v11, -1 ; RV64-NEXT: lui a0, 11 ; RV64-NEXT: addiw a0, a0, -1366 ; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV64-NEXT: vmv.v.x v0, a0 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64-NEXT: vadd.vi v8, v11, -1 ; RV64-NEXT: vrgather.vv v10, v9, v8, v0.t ; RV64-NEXT: vmv.v.v v8, v10 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll index 16e0114..e38bc44 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll @@ -1396,10 +1396,9 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e ; RV32-NEXT: vand.vx v11, v11, a3, v0.t ; RV32-NEXT: vor.vv v10, v11, v10, v0.t ; RV32-NEXT: vsrl.vi v11, v8, 8, v0.t -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: lui a4, 1044480 ; RV32-NEXT: vmerge.vxm v12, v12, a4, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma @@ -1528,10 +1527,9 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v10, v10, a3 ; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: lui a4, 1044480 ; RV32-NEXT: vmerge.vxm v10, v10, a4, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma @@ -1665,31 +1663,31 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v14, v14, a3, v0.t ; RV32-NEXT: vor.vv v12, v14, v12, v0.t -; RV32-NEXT: vsrl.vi v14, v8, 8, v0.t -; RV32-NEXT: li a4, 85 +; RV32-NEXT: vsrl.vi v14, v8, 24, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v14, v14, a4, v0.t +; RV32-NEXT: 
vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: li a5, 85 ; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.x v0, a4 +; RV32-NEXT: vmv.v.x v0, a5 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 +; RV32-NEXT: vmv.v.i v18, 0 +; RV32-NEXT: lui a5, 1044480 +; RV32-NEXT: vmerge.vxm v18, v18, a5, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vand.vv v14, v14, v16, v0.t -; RV32-NEXT: vsrl.vi v18, v8, 24, v0.t -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v18, v18, a4, v0.t -; RV32-NEXT: vor.vv v14, v14, v18, v0.t +; RV32-NEXT: vand.vv v16, v16, v18, v0.t +; RV32-NEXT: vor.vv v14, v16, v14, v0.t ; RV32-NEXT: vor.vv v12, v14, v12, v0.t ; RV32-NEXT: vsll.vx v14, v8, a1, v0.t -; RV32-NEXT: vand.vx v18, v8, a3, v0.t -; RV32-NEXT: vsll.vx v18, v18, a2, v0.t -; RV32-NEXT: vor.vv v14, v14, v18, v0.t -; RV32-NEXT: vand.vx v18, v8, a4, v0.t -; RV32-NEXT: vsll.vi v18, v18, 24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a3, v0.t +; RV32-NEXT: vsll.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v14, v14, v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a4, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: vand.vv v8, v8, v18, v0.t ; RV32-NEXT: vsll.vi v8, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v18, v8, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: vor.vv v8, v14, v8, v0.t ; RV32-NEXT: vor.vv v8, v8, v12, v0.t ; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t @@ -1799,31 +1797,31 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v12, v12, a3 ; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vsrl.vi v12, v8, 8 ; RV32-NEXT: li a4, 85 ; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV32-NEXT: vmv.v.x v0, a4 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmv.v.i v14, 0 ; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v12, v12, a4, v0 +; RV32-NEXT: vmerge.vxm v14, v14, a4, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v14, v8, 8 -; RV32-NEXT: vand.vv v14, v14, v12 +; RV32-NEXT: vand.vv v12, v12, v14 ; RV32-NEXT: vsrl.vi v16, v8, 24 ; RV32-NEXT: lui a4, 4080 ; RV32-NEXT: vand.vx v16, v16, a4 -; RV32-NEXT: vor.vv v14, v14, v16 -; RV32-NEXT: vor.vv v10, v14, v10 -; RV32-NEXT: vsll.vx v14, v8, a1 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vsll.vx v12, v8, a1 ; RV32-NEXT: vand.vx v16, v8, a3 ; RV32-NEXT: vsll.vx v16, v16, a2 -; RV32-NEXT: vor.vv v14, v14, v16 -; RV32-NEXT: vand.vv v12, v8, v12 -; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v14, v8 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v16, v16, 24 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsrl.vi v10, v8, 4 ; RV32-NEXT: lui a1, 61681 @@ -1936,35 +1934,35 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v20, v20, a3, v0.t -; RV32-NEXT: vor.vv v20, v20, v16, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t -; RV32-NEXT: lui a4, 5 -; RV32-NEXT: addi a4, a4, 1365 +; RV32-NEXT: vor.vv v16, v20, v16, 
v0.t +; RV32-NEXT: vsrl.vi v20, v8, 24, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v24, v20, a4, v0.t +; RV32-NEXT: vsrl.vi v28, v8, 8, v0.t +; RV32-NEXT: lui a5, 5 +; RV32-NEXT: addi a5, a5, 1365 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vmv.v.x v0, a4 +; RV32-NEXT: vmv.v.x v0, a5 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 +; RV32-NEXT: vmv.v.i v20, 0 +; RV32-NEXT: lui a5, 1044480 +; RV32-NEXT: vmerge.vxm v20, v20, a5, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vmv1r.v v0, v12 -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsrl.vi v28, v8, 24, v0.t -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v28, v28, a4, v0.t -; RV32-NEXT: vor.vv v24, v24, v28, v0.t -; RV32-NEXT: vor.vv v20, v24, v20, v0.t +; RV32-NEXT: vand.vv v28, v28, v20, v0.t +; RV32-NEXT: vor.vv v24, v28, v24, v0.t +; RV32-NEXT: vor.vv v16, v24, v16, v0.t ; RV32-NEXT: vsll.vx v24, v8, a1, v0.t ; RV32-NEXT: vand.vx v28, v8, a3, v0.t ; RV32-NEXT: vsll.vx v28, v28, a2, v0.t ; RV32-NEXT: vor.vv v24, v24, v28, v0.t ; RV32-NEXT: vand.vx v28, v8, a4, v0.t ; RV32-NEXT: vsll.vi v28, v28, 24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v20, v0.t ; RV32-NEXT: vsll.vi v8, v8, 8, v0.t ; RV32-NEXT: vor.vv v8, v28, v8, v0.t ; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: vor.vv v8, v8, v20, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 @@ -2072,6 +2070,7 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v16, v16, a3 ; RV32-NEXT: vor.vv v12, v16, v12 +; RV32-NEXT: vsrl.vi v20, v8, 8 ; RV32-NEXT: lui a4, 5 ; RV32-NEXT: addi a4, a4, 1365 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma @@ -2081,7 +2080,6 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: lui a4, 1044480 ; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v20, v8, 8 ; RV32-NEXT: vand.vv v20, v20, v16 ; RV32-NEXT: vsrl.vi v24, v8, 24 ; RV32-NEXT: lui a4, 4080 @@ -2092,11 +2090,11 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: vand.vx v24, v8, a3 ; RV32-NEXT: vsll.vx v24, v24, a2 ; RV32-NEXT: vor.vv v20, v20, v24 -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vand.vx v24, v8, a4 +; RV32-NEXT: vsll.vi v24, v24, 24 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: vor.vv v8, v20, v8 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsrl.vi v12, v8, 4 @@ -2204,54 +2202,36 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a3, 56 
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v24, v8, a3, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: li a4, 40 +; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t ; RV32-NEXT: lui a1, 16 -; RV32-NEXT: addi a4, a1, -256 -; RV32-NEXT: vand.vx v24, v8, a4, v0.t -; RV32-NEXT: li a5, 40 -; RV32-NEXT: vsll.vx v24, v24, a5, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: addi a5, a1, -256 +; RV32-NEXT: vand.vx v24, v24, a5, v0.t ; RV32-NEXT: vor.vv v24, v24, v16, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t ; RV32-NEXT: lui a6, 4080 -; RV32-NEXT: vand.vx v24, v8, a6, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a6, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: lui a2, 349525 ; RV32-NEXT: addi a2, a2, 1365 @@ -2260,189 +2240,103 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV32-NEXT: lui a7, 1044480 ; RV32-NEXT: vmv.v.x v0, a2 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a7, v0 -; RV32-NEXT: addi a7, sp, 16 -; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vmerge.vxm v16, v24, a7, v0 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t0, 24 +; RV32-NEXT: mul a7, a7, t0 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: addi a7, sp, 16 +; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t0, 24 +; RV32-NEXT: mul a7, a7, t0 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vl8r.v v16, (a7) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t ; RV32-NEXT: csrr a7, vlenb ; RV32-NEXT: slli a7, a7, 3 ; RV32-NEXT: add a7, sp, a7 ; RV32-NEXT: addi a7, a7, 16 ; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t ; RV32-NEXT: csrr a7, vlenb ; RV32-NEXT: slli a7, a7, 4 ; RV32-NEXT: add a7, sp, a7 ; RV32-NEXT: addi a7, a7, 16 ; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t +; RV32-NEXT: vor.vv v24, v16, v24, v0.t ; RV32-NEXT: 
csrr a7, vlenb ; RV32-NEXT: slli a7, a7, 4 ; RV32-NEXT: add a7, sp, a7 ; RV32-NEXT: addi a7, a7, 16 ; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t +; RV32-NEXT: vsll.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v24, v8, a5, v0.t +; RV32-NEXT: vsll.vx v24, v24, a4, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vx v24, v8, a6, v0.t +; RV32-NEXT: vsll.vi v24, v24, 24, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a5, v0.t -; RV32-NEXT: vand.vx v16, v24, a4, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v8, 8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t -; RV32-NEXT: vand.vx v8, v8, a6, v0.t ; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 4, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 4, v0.t ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: addi a3, a3, -241 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: vand.vv v8, v24, v8, v0.t ; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; 
RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v24, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: addi a3, a3, 819 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: vand.vv v8, v24, v8, v0.t ; RV32-NEXT: vsll.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v24, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 1, v0.t ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: vand.vv v8, v24, v8, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -2536,70 +2430,74 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, 
m8, ta, ma +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v24, v16, 24 +; RV32-NEXT: li a5, 32 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: lui a6, 349525 +; RV32-NEXT: addi a6, a6, 1365 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a2 -; RV32-NEXT: lui a3, 1044480 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v16, v16, a3, v0 +; RV32-NEXT: lui a7, 1044480 +; RV32-NEXT: vmv.v.x v0, a6 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; RV32-NEXT: vmerge.vxm v16, v16, a7, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsrl.vi v0, v8, 24 -; RV32-NEXT: lui a3, 4080 -; RV32-NEXT: vand.vx v0, v0, a3 +; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: vsll.vi v0, v0, 8 ; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: li a4, 56 -; RV32-NEXT: li a5, 40 -; RV32-NEXT: vsrl.vx v0, v8, a5 -; RV32-NEXT: lui a6, 16 -; RV32-NEXT: addi a6, a6, -256 -; RV32-NEXT: vand.vx v0, v0, a6 -; RV32-NEXT: vsrl.vx v24, v8, a4 -; RV32-NEXT: vor.vv v24, v0, v24 ; RV32-NEXT: addi a7, sp, 16 ; RV32-NEXT: vl8r.v v0, (a7) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vand.vx v0, v8, a3 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vsll.vx v0, v8, a4 -; RV32-NEXT: vand.vx v8, v8, a6 -; RV32-NEXT: vsll.vx v8, v8, a5 -; RV32-NEXT: vor.vv v8, v0, v8 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v0, v8, a3 +; RV32-NEXT: vand.vx v0, v0, a2 +; RV32-NEXT: vsrl.vx v24, v8, a1 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: lui a3, 61681 -; RV32-NEXT: addi a3, a3, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv 
v8, v8, v24 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a6 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 @@ -2684,54 +2582,36 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a3, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v24, v8, a3, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: li a4, 40 +; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t ; RV32-NEXT: lui a1, 16 -; RV32-NEXT: addi a4, a1, -256 -; RV32-NEXT: vand.vx v24, v8, a4, v0.t -; RV32-NEXT: li a5, 40 -; RV32-NEXT: vsll.vx v24, v24, a5, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: addi a5, a1, -256 +; RV32-NEXT: vand.vx v24, v24, a5, v0.t ; RV32-NEXT: vor.vv v24, v24, v16, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t ; RV32-NEXT: lui a6, 4080 -; RV32-NEXT: vand.vx v24, v8, a6, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a6, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: lui a2, 349525 ; RV32-NEXT: addi a2, a2, 1365 @@ -2740,189 +2620,103 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV32-NEXT: lui a7, 1044480 ; RV32-NEXT: vmv.v.x v0, a2 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a7, v0 -; RV32-NEXT: addi a7, sp, 16 -; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vmerge.vxm v16, v24, a7, v0 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t0, 24 +; RV32-NEXT: mul a7, a7, t0 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vs8r.v v16, (a7) # Unknown-size Folded 
Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: addi a7, sp, 16 +; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t0, 24 +; RV32-NEXT: mul a7, a7, t0 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vl8r.v v16, (a7) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t ; RV32-NEXT: csrr a7, vlenb ; RV32-NEXT: slli a7, a7, 3 ; RV32-NEXT: add a7, sp, a7 ; RV32-NEXT: addi a7, a7, 16 ; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t ; RV32-NEXT: csrr a7, vlenb ; RV32-NEXT: slli a7, a7, 4 ; RV32-NEXT: add a7, sp, a7 ; RV32-NEXT: addi a7, a7, 16 ; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t +; RV32-NEXT: vor.vv v24, v16, v24, v0.t ; RV32-NEXT: csrr a7, vlenb ; RV32-NEXT: slli a7, a7, 4 ; RV32-NEXT: add a7, sp, a7 ; RV32-NEXT: addi a7, a7, 16 ; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t +; RV32-NEXT: vsll.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v24, v8, a5, v0.t +; RV32-NEXT: vsll.vx v24, v24, a4, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vx v24, v8, a6, v0.t +; RV32-NEXT: vsll.vi v24, v24, 24, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a5, v0.t -; RV32-NEXT: vand.vx v16, v24, a4, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v8, 8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t -; RV32-NEXT: vand.vx v8, v8, a6, v0.t ; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 4, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 4, v0.t ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: addi a3, a3, -241 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; 
RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: vand.vv v8, v24, v8, v0.t ; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v24, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: addi a3, a3, 819 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: vand.vv v8, v24, v8, v0.t ; RV32-NEXT: vsll.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v24, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 1, v0.t ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; 
RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: vand.vv v8, v24, v8, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -3016,70 +2810,74 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v24, v16, 24 +; RV32-NEXT: li a5, 32 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: lui a6, 349525 +; RV32-NEXT: addi a6, a6, 1365 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a2 -; RV32-NEXT: lui a3, 1044480 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v16, v16, a3, v0 +; RV32-NEXT: lui a7, 1044480 +; RV32-NEXT: vmv.v.x v0, a6 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; RV32-NEXT: vmerge.vxm v16, v16, a7, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsrl.vi v0, v8, 24 -; RV32-NEXT: lui a3, 4080 -; RV32-NEXT: vand.vx v0, v0, a3 +; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: vsll.vi v0, v0, 8 ; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: li a4, 56 -; RV32-NEXT: li a5, 40 -; RV32-NEXT: vsrl.vx v0, v8, a5 -; RV32-NEXT: lui a6, 16 -; RV32-NEXT: addi a6, a6, -256 -; RV32-NEXT: vand.vx v0, v0, a6 -; RV32-NEXT: vsrl.vx v24, v8, a4 -; RV32-NEXT: vor.vv v24, v0, v24 ; RV32-NEXT: addi a7, sp, 16 ; RV32-NEXT: vl8r.v v0, (a7) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vand.vx v0, v8, a3 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vsll.vx v0, v8, a4 -; RV32-NEXT: vand.vx v8, v8, a6 -; RV32-NEXT: vsll.vx v8, v8, a5 -; RV32-NEXT: vor.vv v8, v0, v8 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v0, v8, a3 +; RV32-NEXT: vand.vx v0, v0, a2 +; RV32-NEXT: vsrl.vx v24, v8, a1 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vor.vv v8, v8, v24 
+; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: lui a3, 61681 -; RV32-NEXT: addi a3, a3, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a6 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll index 6fb287d..52399b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll @@ -165,9 +165,9 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: lui a1, 1044480 ; RV32-NEXT: vmerge.vxm v9, v9, a1, v0 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -696,40 +696,40 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) { ; LMULMAX2-RV32: # %bb.0: ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32-NEXT: li a1, 85 +; LMULMAX2-RV32-NEXT: li a1, 56 +; LMULMAX2-RV32-NEXT: vsrl.vx v10, v8, a1 +; LMULMAX2-RV32-NEXT: li a2, 40 +; LMULMAX2-RV32-NEXT: vsrl.vx v12, v8, a2 +; LMULMAX2-RV32-NEXT: lui a3, 16 +; LMULMAX2-RV32-NEXT: addi a3, a3, -256 +; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a3 +; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10 +; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 24 +; LMULMAX2-RV32-NEXT: lui a4, 4080 +; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a4 +; LMULMAX2-RV32-NEXT: li a5, 85 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v0, a1 +; LMULMAX2-RV32-NEXT: vmv.v.x v0, a5 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v10, 0 -; LMULMAX2-RV32-NEXT: lui a1, 1044480 -; LMULMAX2-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 +; LMULMAX2-RV32-NEXT: vmv.v.i v14, 0 +; LMULMAX2-RV32-NEXT: lui a5, 1044480 +; LMULMAX2-RV32-NEXT: vmerge.vxm v14, v14, a5, v0 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 8 -; LMULMAX2-RV32-NEXT: vand.vv v12, v12, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v14, v8, 24 -; LMULMAX2-RV32-NEXT: lui a1, 4080 -; LMULMAX2-RV32-NEXT: vand.vx v14, v14, a1 -; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v14 -; 
LMULMAX2-RV32-NEXT: li a2, 56 -; LMULMAX2-RV32-NEXT: vsrl.vx v14, v8, a2 -; LMULMAX2-RV32-NEXT: li a3, 40 -; LMULMAX2-RV32-NEXT: vsrl.vx v16, v8, a3 -; LMULMAX2-RV32-NEXT: lui a4, 16 -; LMULMAX2-RV32-NEXT: addi a4, a4, -256 -; LMULMAX2-RV32-NEXT: vand.vx v16, v16, a4 -; LMULMAX2-RV32-NEXT: vor.vv v14, v16, v14 -; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v14 -; LMULMAX2-RV32-NEXT: vand.vv v10, v8, v10 -; LMULMAX2-RV32-NEXT: vsll.vi v10, v10, 8 -; LMULMAX2-RV32-NEXT: vand.vx v14, v8, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v14, v14, 24 -; LMULMAX2-RV32-NEXT: vor.vv v10, v14, v10 -; LMULMAX2-RV32-NEXT: vsll.vx v14, v8, a2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX2-RV32-NEXT: vsll.vx v8, v8, a3 -; LMULMAX2-RV32-NEXT: vor.vv v8, v14, v8 +; LMULMAX2-RV32-NEXT: vsrl.vi v16, v8, 8 +; LMULMAX2-RV32-NEXT: vand.vv v16, v16, v14 +; LMULMAX2-RV32-NEXT: vor.vv v12, v16, v12 +; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10 +; LMULMAX2-RV32-NEXT: vsll.vx v12, v8, a1 +; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a3 +; LMULMAX2-RV32-NEXT: vsll.vx v16, v16, a2 +; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v16 +; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a4 +; LMULMAX2-RV32-NEXT: vsll.vi v16, v16, 24 +; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v14 +; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 8 +; LMULMAX2-RV32-NEXT: vor.vv v8, v16, v8 +; LMULMAX2-RV32-NEXT: vor.vv v8, v12, v8 ; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12 ; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 ; LMULMAX2-RV32-NEXT: lui a1, 61681 ; LMULMAX2-RV32-NEXT: addi a1, a1, -241 @@ -831,9 +831,9 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) { ; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 ; LMULMAX1-RV32-NEXT: vle64.v v10, (a1) -; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV32-NEXT: vmv.v.i v9, 0 +; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5 ; LMULMAX1-RV32-NEXT: lui a2, 1044480 ; LMULMAX1-RV32-NEXT: vmerge.vxm v9, v9, a2, v0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll index 7c40108..7f4304c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll @@ -436,10 +436,9 @@ define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: vand.vx v11, v11, a3, v0.t ; RV32-NEXT: vor.vv v10, v11, v10, v0.t ; RV32-NEXT: vsrl.vi v11, v8, 8, v0.t -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: lui a4, 1044480 ; RV32-NEXT: vmerge.vxm v12, v12, a4, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma @@ -511,10 +510,9 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v10, v10, a3 ; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: lui a4, 1044480 ; RV32-NEXT: vmerge.vxm v10, v10, a4, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma @@ -591,31 +589,31 @@ define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v14, v14, a3, v0.t ; RV32-NEXT: vor.vv v12, v14, v12, v0.t -; RV32-NEXT: 
vsrl.vi v14, v8, 8, v0.t -; RV32-NEXT: li a4, 85 +; RV32-NEXT: vsrl.vi v14, v8, 24, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v14, v14, a4, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: li a5, 85 ; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.x v0, a4 +; RV32-NEXT: vmv.v.x v0, a5 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 +; RV32-NEXT: vmv.v.i v18, 0 +; RV32-NEXT: lui a5, 1044480 +; RV32-NEXT: vmerge.vxm v18, v18, a5, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vand.vv v14, v14, v16, v0.t -; RV32-NEXT: vsrl.vi v18, v8, 24, v0.t -; RV32-NEXT: lui a0, 4080 -; RV32-NEXT: vand.vx v18, v18, a0, v0.t -; RV32-NEXT: vor.vv v14, v14, v18, v0.t +; RV32-NEXT: vand.vv v16, v16, v18, v0.t +; RV32-NEXT: vor.vv v14, v16, v14, v0.t ; RV32-NEXT: vor.vv v12, v14, v12, v0.t ; RV32-NEXT: vsll.vx v14, v8, a1, v0.t -; RV32-NEXT: vand.vx v18, v8, a3, v0.t -; RV32-NEXT: vsll.vx v18, v18, a2, v0.t -; RV32-NEXT: vor.vv v14, v14, v18, v0.t -; RV32-NEXT: vand.vx v18, v8, a0, v0.t -; RV32-NEXT: vsll.vi v18, v18, 24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a3, v0.t +; RV32-NEXT: vsll.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v14, v14, v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a4, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: vand.vv v8, v8, v18, v0.t ; RV32-NEXT: vsll.vi v8, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v18, v8, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: vor.vv v8, v14, v8, v0.t ; RV32-NEXT: vor.vv v8, v8, v12, v0.t ; RV32-NEXT: ret @@ -668,31 +666,31 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v12, v12, a3 ; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vsrl.vi v12, v8, 8 ; RV32-NEXT: li a4, 85 ; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV32-NEXT: vmv.v.x v0, a4 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmv.v.i v14, 0 ; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v12, v12, a4, v0 +; RV32-NEXT: vmerge.vxm v14, v14, a4, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v14, v8, 8 -; RV32-NEXT: vand.vv v14, v14, v12 +; RV32-NEXT: vand.vv v12, v12, v14 ; RV32-NEXT: vsrl.vi v16, v8, 24 ; RV32-NEXT: lui a0, 4080 ; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vor.vv v14, v14, v16 -; RV32-NEXT: vor.vv v10, v14, v10 -; RV32-NEXT: vsll.vx v14, v8, a1 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vsll.vx v12, v8, a1 ; RV32-NEXT: vand.vx v16, v8, a3 ; RV32-NEXT: vsll.vx v16, v16, a2 -; RV32-NEXT: vor.vv v14, v14, v16 -; RV32-NEXT: vand.vv v12, v8, v12 -; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v14, v8 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsll.vi v16, v16, 24 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: ret ; @@ -748,35 +746,35 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v20, v20, a3, v0.t -; RV32-NEXT: vor.vv v20, v20, v16, v0.t -; RV32-NEXT: vsrl.vi v24, 
v8, 8, v0.t -; RV32-NEXT: lui a4, 5 -; RV32-NEXT: addi a4, a4, 1365 +; RV32-NEXT: vor.vv v16, v20, v16, v0.t +; RV32-NEXT: vsrl.vi v20, v8, 24, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v24, v20, a4, v0.t +; RV32-NEXT: vsrl.vi v28, v8, 8, v0.t +; RV32-NEXT: lui a5, 5 +; RV32-NEXT: addi a5, a5, 1365 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vmv.v.x v0, a4 +; RV32-NEXT: vmv.v.x v0, a5 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 +; RV32-NEXT: vmv.v.i v20, 0 +; RV32-NEXT: lui a5, 1044480 +; RV32-NEXT: vmerge.vxm v20, v20, a5, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vmv1r.v v0, v12 -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsrl.vi v28, v8, 24, v0.t -; RV32-NEXT: lui a0, 4080 -; RV32-NEXT: vand.vx v28, v28, a0, v0.t -; RV32-NEXT: vor.vv v24, v24, v28, v0.t -; RV32-NEXT: vor.vv v20, v24, v20, v0.t +; RV32-NEXT: vand.vv v28, v28, v20, v0.t +; RV32-NEXT: vor.vv v24, v28, v24, v0.t +; RV32-NEXT: vor.vv v16, v24, v16, v0.t ; RV32-NEXT: vsll.vx v24, v8, a1, v0.t ; RV32-NEXT: vand.vx v28, v8, a3, v0.t ; RV32-NEXT: vsll.vx v28, v28, a2, v0.t ; RV32-NEXT: vor.vv v24, v24, v28, v0.t -; RV32-NEXT: vand.vx v28, v8, a0, v0.t +; RV32-NEXT: vand.vx v28, v8, a4, v0.t ; RV32-NEXT: vsll.vi v28, v28, 24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v20, v0.t ; RV32-NEXT: vsll.vi v8, v8, 8, v0.t ; RV32-NEXT: vor.vv v8, v28, v8, v0.t ; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: vor.vv v8, v8, v20, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bswap_v8i64: @@ -827,6 +825,7 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v16, v16, a3 ; RV32-NEXT: vor.vv v12, v16, v12 +; RV32-NEXT: vsrl.vi v20, v8, 8 ; RV32-NEXT: lui a4, 5 ; RV32-NEXT: addi a4, a4, 1365 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma @@ -836,7 +835,6 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: lui a4, 1044480 ; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v20, v8, 8 ; RV32-NEXT: vand.vv v20, v20, v16 ; RV32-NEXT: vsrl.vi v24, v8, 24 ; RV32-NEXT: lui a0, 4080 @@ -847,11 +845,11 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vand.vx v24, v8, a3 ; RV32-NEXT: vsll.vx v24, v24, a2 ; RV32-NEXT: vor.vv v20, v20, v24 -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vand.vx v24, v8, a0 +; RV32-NEXT: vsll.vi v24, v24, 24 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: vor.vv v8, v20, v8 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: ret @@ -902,54 +900,36 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 
16 + 32 * vlenb ; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v24, v8, a1, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: li a2, 40 +; RV32-NEXT: vsrl.vx v24, v8, a2, v0.t +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: addi a3, a3, -256 +; RV32-NEXT: vand.vx v24, v24, a3, v0.t ; RV32-NEXT: vor.vv v24, v24, v16, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v24, v8, a4, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t ; RV32-NEXT: csrr a5, vlenb ; RV32-NEXT: slli a5, a5, 3 ; RV32-NEXT: add a5, sp, a5 ; RV32-NEXT: addi a5, a5, 16 ; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill ; RV32-NEXT: li a5, 32 +; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t +; RV32-NEXT: addi a6, sp, 16 +; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma ; RV32-NEXT: lui a6, 349525 ; RV32-NEXT: addi a6, a6, 1365 @@ -958,71 +938,75 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev ; RV32-NEXT: lui a7, 1044480 ; RV32-NEXT: vmv.v.x v0, a6 ; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a7, v0 -; RV32-NEXT: addi a5, sp, 16 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vmerge.vxm v16, v24, a7, v0 +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: li a6, 24 +; RV32-NEXT: mul a5, a5, a6 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a5, 24 +; RV32-NEXT: mul a0, a0, a5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; 
RV32-NEXT: vor.vv v24, v24, v16, v0.t +; RV32-NEXT: vor.vv v24, v16, v24, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; RV32-NEXT: vand.vx v24, v8, a3, v0.t +; RV32-NEXT: vsll.vx v24, v24, a2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vx v24, v8, a4, v0.t +; RV32-NEXT: vsll.vi v24, v24, 24, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: vand.vx v16, v24, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v8, 8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t -; RV32-NEXT: vand.vx v8, v8, a4, v0.t ; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -1089,47 +1073,50 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v24, v16, 24 +; RV32-NEXT: li a5, 32 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: lui a6, 349525 +; RV32-NEXT: addi a6, a6, 1365 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a2 -; RV32-NEXT: lui a2, 1044480 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v16, v16, a2, v0 +; RV32-NEXT: 
lui a7, 1044480 +; RV32-NEXT: vmv.v.x v0, a6 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; RV32-NEXT: vmerge.vxm v16, v16, a7, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsrl.vi v0, v8, 24 -; RV32-NEXT: lui a0, 4080 -; RV32-NEXT: vand.vx v0, v0, a0 +; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: vsll.vi v0, v0, 8 ; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: li a1, 56 -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v0, v8, a2 -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v0, v0, a3 -; RV32-NEXT: vsrl.vx v24, v8, a1 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v0, v8, a3 +; RV32-NEXT: vand.vx v0, v0, a2 +; RV32-NEXT: vsrl.vx v24, v8, a1 ; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vand.vx v0, v8, a0 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vsll.vx v0, v8, a1 -; RV32-NEXT: vand.vx v8, v8, a3 -; RV32-NEXT: vsll.vx v8, v8, a2 -; RV32-NEXT: vor.vv v8, v0, v8 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 @@ -1182,54 +1169,36 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v24, v8, a1, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -256 -; RV32-NEXT: vand.vx v24, v8, a2, v0.t -; RV32-NEXT: li a3, 40 -; RV32-NEXT: vsll.vx v24, v24, a3, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: li a2, 40 +; RV32-NEXT: vsrl.vx v24, v8, a2, v0.t +; 
RV32-NEXT: lui a3, 16 +; RV32-NEXT: addi a3, a3, -256 +; RV32-NEXT: vand.vx v24, v24, a3, v0.t ; RV32-NEXT: vor.vv v24, v24, v16, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v24, v8, a4, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t ; RV32-NEXT: csrr a5, vlenb ; RV32-NEXT: slli a5, a5, 3 ; RV32-NEXT: add a5, sp, a5 ; RV32-NEXT: addi a5, a5, 16 ; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill ; RV32-NEXT: li a5, 32 +; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t +; RV32-NEXT: addi a6, sp, 16 +; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma ; RV32-NEXT: lui a6, 349525 ; RV32-NEXT: addi a6, a6, 1365 @@ -1238,71 +1207,75 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev ; RV32-NEXT: lui a7, 1044480 ; RV32-NEXT: vmv.v.x v0, a6 ; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a7, v0 -; RV32-NEXT: addi a5, sp, 16 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vmerge.vxm v16, v24, a7, v0 +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: li a6, 24 +; RV32-NEXT: mul a5, a5, a6 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a5, 24 +; RV32-NEXT: mul a0, a0, a5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t +; RV32-NEXT: vor.vv v24, v16, v24, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; RV32-NEXT: vand.vx v24, v8, a3, v0.t +; RV32-NEXT: vsll.vx v24, v24, a2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vx v24, v8, a4, v0.t +; RV32-NEXT: vsll.vi v24, v24, 24, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: vand.vx v16, v24, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v8, 8, v0.t ; 
RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t -; RV32-NEXT: vand.vx v8, v8, a4, v0.t ; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -1369,47 +1342,50 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v24, v16, 24 +; RV32-NEXT: li a5, 32 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 +; RV32-NEXT: lui a6, 349525 +; RV32-NEXT: addi a6, a6, 1365 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a2 -; RV32-NEXT: lui a2, 1044480 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v16, v16, a2, v0 +; RV32-NEXT: lui a7, 1044480 +; RV32-NEXT: vmv.v.x v0, a6 +; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; RV32-NEXT: vmerge.vxm v16, v16, a7, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsrl.vi v0, v8, 24 -; RV32-NEXT: lui a0, 4080 -; RV32-NEXT: vand.vx v0, v0, a0 +; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: vsll.vi v0, v0, 8 ; RV32-NEXT: vor.vv v24, v24, v0 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: li a1, 56 -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v0, v8, a2 -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v0, v0, a3 -; RV32-NEXT: vsrl.vx v24, v8, a1 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v0, v8, a3 +; RV32-NEXT: vand.vx v0, v0, a2 +; RV32-NEXT: vsrl.vx v24, v8, a1 ; RV32-NEXT: vor.vv v24, 
v0, v24 -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vand.vx v0, v8, a0 -; RV32-NEXT: vsll.vi v0, v0, 24 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vsll.vx v0, v8, a1 -; RV32-NEXT: vand.vx v8, v8, a3 -; RV32-NEXT: vsll.vx v8, v8, a2 -; RV32-NEXT: vor.vv v8, v0, v8 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll index 8f4ad2a..7b2486e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -71,9 +71,9 @@ define void @bswap_v2i64(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: lui a1, 1044480 ; RV32-NEXT: vmerge.vxm v9, v9, a1, v0 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -317,40 +317,40 @@ define void @bswap_v4i64(ptr %x, ptr %y) { ; LMULMAX2-RV32: # %bb.0: ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32-NEXT: li a1, 85 +; LMULMAX2-RV32-NEXT: li a1, 56 +; LMULMAX2-RV32-NEXT: vsrl.vx v10, v8, a1 +; LMULMAX2-RV32-NEXT: li a2, 40 +; LMULMAX2-RV32-NEXT: vsrl.vx v12, v8, a2 +; LMULMAX2-RV32-NEXT: lui a3, 16 +; LMULMAX2-RV32-NEXT: addi a3, a3, -256 +; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a3 +; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10 +; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 24 +; LMULMAX2-RV32-NEXT: lui a4, 4080 +; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a4 +; LMULMAX2-RV32-NEXT: li a5, 85 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v0, a1 +; LMULMAX2-RV32-NEXT: vmv.v.x v0, a5 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v10, 0 -; LMULMAX2-RV32-NEXT: lui a1, 1044480 -; LMULMAX2-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 +; LMULMAX2-RV32-NEXT: vmv.v.i v14, 0 +; LMULMAX2-RV32-NEXT: lui a5, 1044480 +; LMULMAX2-RV32-NEXT: vmerge.vxm v14, v14, a5, v0 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 8 -; LMULMAX2-RV32-NEXT: vand.vv v12, v12, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v14, v8, 24 -; LMULMAX2-RV32-NEXT: lui a1, 4080 -; LMULMAX2-RV32-NEXT: vand.vx v14, v14, a1 -; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v14 -; LMULMAX2-RV32-NEXT: li a2, 56 -; LMULMAX2-RV32-NEXT: vsrl.vx v14, v8, a2 -; LMULMAX2-RV32-NEXT: li a3, 40 -; LMULMAX2-RV32-NEXT: vsrl.vx v16, v8, a3 -; LMULMAX2-RV32-NEXT: lui a4, 16 -; LMULMAX2-RV32-NEXT: addi a4, a4, -256 -; LMULMAX2-RV32-NEXT: vand.vx v16, v16, a4 -; LMULMAX2-RV32-NEXT: vor.vv v14, v16, v14 -; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v14 -; LMULMAX2-RV32-NEXT: vand.vv v10, v8, v10 -; LMULMAX2-RV32-NEXT: vsll.vi v10, v10, 8 -; LMULMAX2-RV32-NEXT: vand.vx v14, v8, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v14, v14, 24 -; LMULMAX2-RV32-NEXT: vor.vv v10, v14, v10 -; LMULMAX2-RV32-NEXT: vsll.vx v14, v8, a2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX2-RV32-NEXT: vsll.vx v8, v8, a3 -; LMULMAX2-RV32-NEXT: vor.vv 
v8, v14, v8 +; LMULMAX2-RV32-NEXT: vsrl.vi v16, v8, 8 +; LMULMAX2-RV32-NEXT: vand.vv v16, v16, v14 +; LMULMAX2-RV32-NEXT: vor.vv v12, v16, v12 +; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10 +; LMULMAX2-RV32-NEXT: vsll.vx v12, v8, a1 +; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a3 +; LMULMAX2-RV32-NEXT: vsll.vx v16, v16, a2 +; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v16 +; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a4 +; LMULMAX2-RV32-NEXT: vsll.vi v16, v16, 24 +; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v14 +; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 8 +; LMULMAX2-RV32-NEXT: vor.vv v8, v16, v8 +; LMULMAX2-RV32-NEXT: vor.vv v8, v12, v8 ; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12 ; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV32-NEXT: ret ; @@ -395,9 +395,9 @@ define void @bswap_v4i64(ptr %x, ptr %y) { ; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 ; LMULMAX1-RV32-NEXT: vle64.v v9, (a1) -; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV32-NEXT: vmv.v.i v10, 0 +; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5 ; LMULMAX1-RV32-NEXT: lui a2, 1044480 ; LMULMAX1-RV32-NEXT: vmerge.vxm v10, v10, a2, v0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll index 768a00e..1c2efca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll @@ -412,9 +412,9 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3 ; LMULMAX4-NEXT: addi s0, sp, 256 ; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 ; LMULMAX4-NEXT: andi sp, sp, -128 -; LMULMAX4-NEXT: addi a0, sp, 64 ; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; LMULMAX4-NEXT: vmv.v.i v8, 0 +; LMULMAX4-NEXT: addi a0, sp, 64 ; LMULMAX4-NEXT: vse32.v v8, (a0) ; LMULMAX4-NEXT: mv a0, sp ; LMULMAX4-NEXT: li a1, 1 @@ -516,9 +516,9 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> ; LMULMAX4-NEXT: sd a0, 136(sp) ; LMULMAX4-NEXT: li a0, 13 ; LMULMAX4-NEXT: sd a0, 0(sp) -; LMULMAX4-NEXT: addi a0, sp, 72 ; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; LMULMAX4-NEXT: vmv.v.i v8, 0 +; LMULMAX4-NEXT: addi a0, sp, 72 ; LMULMAX4-NEXT: vse32.v v8, (a0) ; LMULMAX4-NEXT: addi a0, sp, 8 ; LMULMAX4-NEXT: li a1, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll index 0818402..497cafd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -1618,15 +1618,14 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v15i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 @@ -1776,15 +1775,14 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, 
<16 x i1> %m, i32 zeroext %ev define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v16i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 @@ -1871,104 +1869,110 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: li a2, 16 ; RV32-NEXT: vslidedown.vi v24, v0, 2 -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB34_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB34_2: -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 40 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vmv.v.x v8, a2 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a2, a2, a4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a2, a2, a4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a2 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a2, a1, 1365 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a2, a2, a4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bltu a0, a3, .LBB34_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; 
RV32-NEXT: .LBB34_2: +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t -; RV32-NEXT: vadd.vv v16, v16, v8, v0.t -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a2 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: lui a3, 209715 +; RV32-NEXT: addi a3, a3, 819 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: addi a3, a3, -241 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul 
a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: lui a3, 4112 +; RV32-NEXT: addi a3, a3, 257 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t @@ -1993,7 +1997,7 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 @@ -2009,8 +2013,7 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2023,8 +2026,7 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2039,7 +2041,8 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2153,72 +2156,65 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; RV32-NEXT: li a2, 16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB35_2 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a2, a1, 1365 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vmv.v.x v0, a2 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bltu a0, a3, .LBB35_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 +; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB35_2: -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a2, 349525 -; 
RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a2, a2, a4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v0, a2 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: sub sp, sp, a3 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: lui a3, 209715 +; RV32-NEXT: addi a3, a3, 819 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 24 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vmv.v.x v0, a3 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a2 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v24, v24, v8 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: addi a3, a3, -241 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: lui a3, 4112 +; RV32-NEXT: addi a3, a3, 257 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v8, v24, a1 +; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 @@ -2229,20 +2225,15 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: 
vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vsrl.vi v24, v16, 1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsub.vv v24, v8, v24 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v24, v8 +; RV32-NEXT: vsub.vv v24, v16, v24 ; RV32-NEXT: vand.vv v8, v24, v0 ; RV32-NEXT: vsrl.vi v24, v24, 2 ; RV32-NEXT: vand.vv v24, v24, v0 @@ -2265,8 +2256,7 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll index c5ed48f..cf4727a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll @@ -264,13 +264,13 @@ define void @ctpop_v2i64(ptr %x, ptr %y) { ; LMULMAX2-RV32: # %bb.0: ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 ; LMULMAX2-RV32-NEXT: lui a1, 349525 ; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v9, v9, v10 +; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32-NEXT: vand.vv v9, v10, v9 ; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 ; LMULMAX2-RV32-NEXT: lui a1, 209715 ; LMULMAX2-RV32-NEXT: addi a1, a1, 819 @@ -340,13 +340,13 @@ define void @ctpop_v2i64(ptr %x, ptr %y) { ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 ; LMULMAX1-RV32-NEXT: lui a1, 349525 ; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10 +; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV32-NEXT: vand.vv v9, v10, v9 ; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 ; LMULMAX1-RV32-NEXT: lui a1, 209715 ; LMULMAX1-RV32-NEXT: addi a1, a1, 819 @@ -772,13 +772,13 @@ define void @ctpop_v4i64(ptr %x, ptr %y) { ; LMULMAX2-RV32: # %bb.0: ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 ; LMULMAX2-RV32-NEXT: lui a1, 349525 ; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1 +; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12 +; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 1 +; LMULMAX2-RV32-NEXT: vand.vv v10, v12, v10 ; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 ; 
LMULMAX2-RV32-NEXT: lui a1, 209715 ; LMULMAX2-RV32-NEXT: addi a1, a1, 819 @@ -847,56 +847,56 @@ define void @ctpop_v4i64(ptr %x, ptr %y) { ; LMULMAX1-RV32-LABEL: ctpop_v4i64: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle64.v v9, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV32-NEXT: vle64.v v9, (a1) ; LMULMAX1-RV32-NEXT: lui a2, 349525 ; LMULMAX1-RV32-NEXT: addi a2, a2, 1365 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v11, a2 +; LMULMAX1-RV32-NEXT: vmv.v.x v10, a2 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v11 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 +; LMULMAX1-RV32-NEXT: vsrl.vi v11, v9, 1 +; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v10 +; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v11 ; LMULMAX1-RV32-NEXT: lui a2, 209715 ; LMULMAX1-RV32-NEXT: addi a2, a2, 819 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v10, a2 +; LMULMAX1-RV32-NEXT: vmv.v.x v11, a2 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v12, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v12, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v12, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v12 +; LMULMAX1-RV32-NEXT: vand.vv v12, v9, v11 +; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v11 +; LMULMAX1-RV32-NEXT: vadd.vv v9, v12, v9 +; LMULMAX1-RV32-NEXT: vsrl.vi v12, v9, 4 +; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v12 ; LMULMAX1-RV32-NEXT: lui a2, 61681 ; LMULMAX1-RV32-NEXT: addi a2, a2, -241 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV32-NEXT: vmv.v.x v12, a2 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v12 +; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v12 ; LMULMAX1-RV32-NEXT: lui a2, 4112 ; LMULMAX1-RV32-NEXT: addi a2, a2, 257 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV32-NEXT: vmv.v.x v13, a2 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v13 -; LMULMAX1-RV32-NEXT: li a2, 56 -; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a2 -; LMULMAX1-RV32-NEXT: vsrl.vi v14, v9, 1 -; LMULMAX1-RV32-NEXT: vand.vv v11, v14, v11 -; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v11 -; LMULMAX1-RV32-NEXT: vand.vv v11, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v11, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v12 ; LMULMAX1-RV32-NEXT: vmul.vv v9, v9, v13 +; LMULMAX1-RV32-NEXT: li a2, 56 ; LMULMAX1-RV32-NEXT: vsrl.vx v9, v9, a2 -; LMULMAX1-RV32-NEXT: vse64.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vsrl.vi v14, v8, 1 +; LMULMAX1-RV32-NEXT: vand.vv v10, v14, v10 +; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 +; LMULMAX1-RV32-NEXT: vand.vv v10, v8, v11 +; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v11 +; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 +; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 +; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v12 +; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v13 +; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a2 
+; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v9, (a1) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: ctpop_v4i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index a08e678..689b037 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -2133,47 +2133,32 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb +; RV32-NEXT: vmv1r.v v24, v0 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv8r.v v16, v8 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: li a1, 16 -; RV32-NEXT: vslidedown.vi v24, v0, 2 -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a1, .LBB34_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: .LBB34_2: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 56 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a2, a0, a1 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a2, a2, a1 ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v8, -1 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: li a1, 1 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsub.vx v16, v16, a1, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 @@ -2191,27 +2176,27 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 24 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 56 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; 
RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 @@ -2222,12 +2207,6 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a4, a4, 819 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 48 @@ -2248,6 +2227,12 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 56 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 4 @@ -2261,36 +2246,33 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a4, a4, -241 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a4, a4, 257 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a4 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vmv.v.x v8, a4 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vmul.vv v8, v16, v8, v0.t ; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, a0, -16 -; RV32-NEXT: sltu a0, a0, a3 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: bltu a0, a3, .LBB34_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 16 +; RV32-NEXT: .LBB34_2: ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a0, vlenb @@ -2299,26 +2281,23 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, 
(a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v16, v16, v8, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsub.vx v16, v16, a1, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 @@ -2330,31 +2309,50 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 56 @@ -2373,8 +2371,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2385,12 +2382,9 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, 
a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 6 ; RV32-NEXT: add sp, sp, a0 @@ -2508,24 +2502,23 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; RV32-NEXT: li a2, 1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a2 -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v16, -1 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vxor.vv v16, v8, v16 +; RV32-NEXT: li a3, 1 +; RV32-NEXT: vsub.vx v8, v8, a3 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a4, 349525 ; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 24 @@ -2538,7 +2531,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a4, 209715 ; RV32-NEXT: addi a4, a4, 819 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a4 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16 @@ -2549,7 +2542,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: lui a4, 61681 ; RV32-NEXT: addi a4, a4, -241 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 4 @@ -2560,23 +2553,23 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a4, a4, 257 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a4 -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, a0, -16 -; RV32-NEXT: sltu a0, a0, a3 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # 
Unknown-size Folded Spill +; RV32-NEXT: addi a2, a0, -16 +; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 @@ -2584,7 +2577,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vxor.vv v8, v0, v8 -; RV32-NEXT: vsub.vx v0, v0, a2 +; RV32-NEXT: vsub.vx v0, v0, a3 ; RV32-NEXT: vand.vv v8, v8, v0 ; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: csrr a0, vlenb @@ -4785,47 +4778,32 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb +; RV32-NEXT: vmv1r.v v24, v0 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv8r.v v16, v8 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: li a1, 16 -; RV32-NEXT: vslidedown.vi v24, v0, 2 -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a1, .LBB70_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: .LBB70_2: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 56 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a2, a0, a1 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a2, a2, a1 ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v8, -1 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: li a1, 1 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsub.vx v16, v16, a1, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 @@ -4843,27 +4821,27 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 24 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 56 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size 
Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 @@ -4874,12 +4852,6 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a4, a4, 819 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 48 @@ -4900,6 +4872,12 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 56 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 4 @@ -4913,36 +4891,33 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a4, a4, -241 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a4, a4, 257 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a4 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vmv.v.x v8, a4 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vmul.vv v8, v16, v8, v0.t ; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, a0, -16 -; RV32-NEXT: sltu a0, a0, a3 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: bltu a0, a3, .LBB70_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 16 +; RV32-NEXT: .LBB70_2: ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a0, vlenb @@ -4951,26 +4926,23 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> 
%va, <32 x i1> %m, i32 z ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v16, v16, v8, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsub.vx v16, v16, a1, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 @@ -4982,31 +4954,50 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 56 @@ -5025,8 +5016,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -5037,12 +5027,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: 
addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 6 ; RV32-NEXT: add sp, sp, a0 @@ -5160,24 +5147,23 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; RV32-NEXT: li a2, 1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a2 -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v16, -1 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24 -; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vxor.vv v16, v8, v16 +; RV32-NEXT: li a3, 1 +; RV32-NEXT: vsub.vx v8, v8, a3 +; RV32-NEXT: vand.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a4, 349525 ; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 24 @@ -5190,7 +5176,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a4, 209715 ; RV32-NEXT: addi a4, a4, 819 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a4 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16 @@ -5201,7 +5187,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: lui a4, 61681 ; RV32-NEXT: addi a4, a4, -241 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 4 @@ -5212,23 +5198,23 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a4, a4, 257 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a4 -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v 
v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, a0, -16 -; RV32-NEXT: sltu a0, a0, a3 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: addi a2, a0, -16 +; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 @@ -5236,7 +5222,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vxor.vv v8, v0, v8 -; RV32-NEXT: vsub.vx v0, v0, a2 +; RV32-NEXT: vsub.vx v0, v0, a3 ; RV32-NEXT: vand.vv v8, v8, v0 ; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll index fd3c65d..c83845a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll @@ -141,12 +141,13 @@ define <4 x i64> @sextload_v4i8_v4i64(ptr %x) { ; LMULMAX1-LABEL: sextload_v4i8_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vle8.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf8 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v9, v8 -; LMULMAX1-NEXT: vsext.vf8 v8, v10 +; LMULMAX1-NEXT: vsext.vf8 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v4i8_v4i64: @@ -164,12 +165,13 @@ define <4 x i64> @zextload_v4i8_v4i64(ptr %x) { ; LMULMAX1-LABEL: zextload_v4i8_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vle8.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf8 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v9, v8 -; LMULMAX1-NEXT: vzext.vf8 v8, v10 +; LMULMAX1-NEXT: vzext.vf8 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v4i8_v4i64: @@ -211,12 +213,13 @@ define <8 x i32> @sextload_v8i8_v8i32(ptr %x) { ; LMULMAX1-LABEL: sextload_v8i8_v8i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vle8.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v9, v8 -; LMULMAX1-NEXT: vsext.vf4 v8, v10 +; LMULMAX1-NEXT: vsext.vf4 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v8i8_v8i32: @@ -234,12 +237,13 @@ define <8 x i32> @zextload_v8i8_v8i32(ptr %x) { ; LMULMAX1-LABEL: zextload_v8i8_v8i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vle8.v v9, (a0) +; 
LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v9, v8 -; LMULMAX1-NEXT: vzext.vf4 v8, v10 +; LMULMAX1-NEXT: vzext.vf4 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v8i8_v8i32: @@ -257,20 +261,21 @@ define <8 x i64> @sextload_v8i8_v8i64(ptr %x) { ; LMULMAX1-LABEL: sextload_v8i8_v8i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; LMULMAX1-NEXT: vle8.v v12, (a0) +; LMULMAX1-NEXT: vle8.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf8 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v10, v8 +; LMULMAX1-NEXT: vsext.vf8 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v9, v11 +; LMULMAX1-NEXT: vsext.vf8 v9, v12 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v11, v8 -; LMULMAX1-NEXT: vsext.vf8 v8, v12 +; LMULMAX1-NEXT: vsext.vf8 v11, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v8i8_v8i64: @@ -288,20 +293,21 @@ define <8 x i64> @zextload_v8i8_v8i64(ptr %x) { ; LMULMAX1-LABEL: zextload_v8i8_v8i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; LMULMAX1-NEXT: vle8.v v12, (a0) +; LMULMAX1-NEXT: vle8.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf8 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v10, v8 +; LMULMAX1-NEXT: vzext.vf8 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v9, v11 +; LMULMAX1-NEXT: vzext.vf8 v9, v12 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v11, v8 -; LMULMAX1-NEXT: vzext.vf8 v8, v12 +; LMULMAX1-NEXT: vzext.vf8 v11, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v8i8_v8i64: @@ -319,12 +325,13 @@ define <16 x i16> @sextload_v16i8_v16i16(ptr %x) { ; LMULMAX1-LABEL: sextload_v16i8_v16i16: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vle8.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 8 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 8 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v8 -; LMULMAX1-NEXT: vsext.vf2 v8, v10 +; LMULMAX1-NEXT: vsext.vf2 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: 
sextload_v16i8_v16i16: @@ -342,12 +349,13 @@ define <16 x i16> @zextload_v16i8_v16i16(ptr %x) { ; LMULMAX1-LABEL: zextload_v16i8_v16i16: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vle8.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 8 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 8 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v8 -; LMULMAX1-NEXT: vzext.vf2 v8, v10 +; LMULMAX1-NEXT: vzext.vf2 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i8_v16i16: @@ -365,20 +373,21 @@ define <16 x i32> @sextload_v16i8_v16i32(ptr %x) { ; LMULMAX1-LABEL: sextload_v16i8_v16i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-NEXT: vle8.v v12, (a0) +; LMULMAX1-NEXT: vle8.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 8 +; LMULMAX1-NEXT: vslidedown.vi v11, v9, 8 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v10, v8 +; LMULMAX1-NEXT: vsext.vf4 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v9, v11 +; LMULMAX1-NEXT: vsext.vf4 v9, v12 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v11, v8 -; LMULMAX1-NEXT: vsext.vf4 v8, v12 +; LMULMAX1-NEXT: vsext.vf4 v11, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i8_v16i32: @@ -396,20 +405,21 @@ define <16 x i32> @zextload_v16i8_v16i32(ptr %x) { ; LMULMAX1-LABEL: zextload_v16i8_v16i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-NEXT: vle8.v v12, (a0) +; LMULMAX1-NEXT: vle8.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 8 +; LMULMAX1-NEXT: vslidedown.vi v11, v9, 8 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v10, v8 +; LMULMAX1-NEXT: vzext.vf4 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v9, v11 +; LMULMAX1-NEXT: vzext.vf4 v9, v12 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v11, v8 -; LMULMAX1-NEXT: vzext.vf4 v8, v12 +; LMULMAX1-NEXT: vzext.vf4 v11, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i8_v16i32: @@ -427,47 +437,49 @@ define <16 x i64> @sextload_v16i8_v16i64(ptr %x) { ; LMULMAX1-LABEL: sextload_v16i8_v16i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-NEXT: vle8.v v16, (a0) +; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf8 v8, v10 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, 
m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v16, 8 +; LMULMAX1-NEXT: vslidedown.vi v11, v10, 8 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v12, v8 +; LMULMAX1-NEXT: vsext.vf8 v12, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v16, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v9, v10 +; LMULMAX1-NEXT: vsext.vf8 v9, v13 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v16, 4 +; LMULMAX1-NEXT: vslidedown.vi v15, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v10, v11 +; LMULMAX1-NEXT: vsext.vf8 v10, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v14, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v14, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf8 v13, v14 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v16, v11, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v14, v8 +; LMULMAX1-NEXT: vsext.vf8 v14, v16 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v15, v11, 2 +; LMULMAX1-NEXT: vslidedown.vi v15, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf8 v11, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v16, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v15, v8 -; LMULMAX1-NEXT: vsext.vf8 v8, v16 +; LMULMAX1-NEXT: vsext.vf8 v15, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i8_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX4-NEXT: vle8.v v16, (a0) +; LMULMAX4-NEXT: vle8.v v12, (a0) +; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; LMULMAX4-NEXT: vsext.vf8 v8, v12 ; LMULMAX4-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 +; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vsext.vf8 v12, v8 -; LMULMAX4-NEXT: vsext.vf8 v8, v16 +; LMULMAX4-NEXT: vsext.vf8 v12, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i8>, ptr %x %z = sext <16 x i8> %y to <16 x i64> @@ -478,47 +490,49 @@ define <16 x i64> @zextload_v16i8_v16i64(ptr %x) { ; LMULMAX1-LABEL: zextload_v16i8_v16i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-NEXT: vle8.v v16, (a0) +; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf8 v8, v10 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v16, 8 +; LMULMAX1-NEXT: vslidedown.vi v11, v10, 8 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v12, v8 +; LMULMAX1-NEXT: vzext.vf8 v12, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v16, 2 +; LMULMAX1-NEXT: vslidedown.vi v13, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v9, v10 +; LMULMAX1-NEXT: vzext.vf8 v9, v13 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v16, 4 +; LMULMAX1-NEXT: vslidedown.vi v15, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v10, v11 +; LMULMAX1-NEXT: 
vzext.vf8 v10, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v14, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v14, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf8 v13, v14 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v16, v11, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v14, v8 +; LMULMAX1-NEXT: vzext.vf8 v14, v16 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v15, v11, 2 +; LMULMAX1-NEXT: vslidedown.vi v15, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf8 v11, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v16, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v15, v8 -; LMULMAX1-NEXT: vzext.vf8 v8, v16 +; LMULMAX1-NEXT: vzext.vf8 v15, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i8_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX4-NEXT: vle8.v v16, (a0) +; LMULMAX4-NEXT: vle8.v v12, (a0) +; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; LMULMAX4-NEXT: vzext.vf8 v8, v12 ; LMULMAX4-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 +; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vzext.vf8 v12, v8 -; LMULMAX4-NEXT: vzext.vf8 v8, v16 +; LMULMAX4-NEXT: vzext.vf8 v12, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i8>, ptr %x %z = zext <16 x i8> %y to <16 x i64> @@ -646,12 +660,13 @@ define <4 x i64> @sextload_v4i16_v4i64(ptr %x) { ; LMULMAX1-LABEL: sextload_v4i16_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vle16.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v9, v8 -; LMULMAX1-NEXT: vsext.vf4 v8, v10 +; LMULMAX1-NEXT: vsext.vf4 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v4i16_v4i64: @@ -669,12 +684,13 @@ define <4 x i64> @zextload_v4i16_v4i64(ptr %x) { ; LMULMAX1-LABEL: zextload_v4i16_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vle16.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v9, v8 -; LMULMAX1-NEXT: vzext.vf4 v8, v10 +; LMULMAX1-NEXT: vzext.vf4 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v4i16_v4i64: @@ -704,12 +720,13 @@ define <8 x i32> @sextload_v8i16_v8i32(ptr %x) { ; LMULMAX1-LABEL: sextload_v8i16_v8i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vle16.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 +; 
LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v8 -; LMULMAX1-NEXT: vsext.vf2 v8, v10 +; LMULMAX1-NEXT: vsext.vf2 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v8i16_v8i32: @@ -727,12 +744,13 @@ define <8 x i32> @zextload_v8i16_v8i32(ptr %x) { ; LMULMAX1-LABEL: zextload_v8i16_v8i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vle16.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v8 -; LMULMAX1-NEXT: vzext.vf2 v8, v10 +; LMULMAX1-NEXT: vzext.vf2 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v8i16_v8i32: @@ -750,20 +768,21 @@ define <8 x i64> @sextload_v8i16_v8i64(ptr %x) { ; LMULMAX1-LABEL: sextload_v8i16_v8i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v12, (a0) +; LMULMAX1-NEXT: vle16.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v10, v8 +; LMULMAX1-NEXT: vsext.vf4 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v9, v11 +; LMULMAX1-NEXT: vsext.vf4 v9, v12 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v11, v8 -; LMULMAX1-NEXT: vsext.vf4 v8, v12 +; LMULMAX1-NEXT: vsext.vf4 v11, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v8i16_v8i64: @@ -781,20 +800,21 @@ define <8 x i64> @zextload_v8i16_v8i64(ptr %x) { ; LMULMAX1-LABEL: zextload_v8i16_v8i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v12, (a0) +; LMULMAX1-NEXT: vle16.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v10, v8 +; LMULMAX1-NEXT: vzext.vf4 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v9, v11 +; LMULMAX1-NEXT: vzext.vf4 v9, v12 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v11, v8 -; LMULMAX1-NEXT: vzext.vf4 v8, v12 +; LMULMAX1-NEXT: vzext.vf4 v11, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v8i16_v8i64: @@ -834,19 +854,20 @@ define <16 x i32> @sextload_v16i16_v16i32(ptr %x) { ; LMULMAX1-LABEL: sextload_v16i16_v16i32: ; LMULMAX1: # %bb.0: ; 
LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vle16.v v9, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle16.v v12, (a0) +; LMULMAX1-NEXT: vle16.v v11, (a0) +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsext.vf2 v9, v10 +; LMULMAX1-NEXT: vsext.vf2 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v11, v8 -; LMULMAX1-NEXT: vsext.vf2 v8, v10 -; LMULMAX1-NEXT: vsext.vf2 v10, v12 +; LMULMAX1-NEXT: vsext.vf2 v11, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i16_v16i32: @@ -864,19 +885,20 @@ define <16 x i32> @zextload_v16i16_v16i32(ptr %x) { ; LMULMAX1-LABEL: zextload_v16i16_v16i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vle16.v v9, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle16.v v12, (a0) +; LMULMAX1-NEXT: vle16.v v11, (a0) +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vzext.vf2 v9, v10 +; LMULMAX1-NEXT: vzext.vf2 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v11, v8 -; LMULMAX1-NEXT: vzext.vf2 v8, v10 -; LMULMAX1-NEXT: vzext.vf2 v10, v12 +; LMULMAX1-NEXT: vzext.vf2 v11, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i16_v16i32: @@ -894,46 +916,48 @@ define <16 x i64> @sextload_v16i16_v16i64(ptr %x) { ; LMULMAX1-LABEL: sextload_v16i16_v16i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v12, (a0) +; LMULMAX1-NEXT: vle16.v v9, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle16.v v16, (a0) +; LMULMAX1-NEXT: vle16.v v13, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v10, v8 +; LMULMAX1-NEXT: vsext.vf4 v10, v11 +; LMULMAX1-NEXT: vsext.vf4 v12, v13 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v15, v16, 4 +; LMULMAX1-NEXT: vslidedown.vi v15, v13, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf4 v14, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v9, v11 +; LMULMAX1-NEXT: vsext.vf4 v9, v16 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2 ; LMULMAX1-NEXT: 
vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v11, v8 +; LMULMAX1-NEXT: vsext.vf4 v11, v16 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v13, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v13, v8 +; LMULMAX1-NEXT: vsext.vf4 v13, v16 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v15, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v15, v8 -; LMULMAX1-NEXT: vsext.vf4 v8, v12 -; LMULMAX1-NEXT: vsext.vf4 v12, v16 +; LMULMAX1-NEXT: vsext.vf4 v15, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i16_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX4-NEXT: vle16.v v16, (a0) +; LMULMAX4-NEXT: vle16.v v12, (a0) +; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; LMULMAX4-NEXT: vsext.vf4 v8, v12 ; LMULMAX4-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 +; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vsext.vf4 v12, v8 -; LMULMAX4-NEXT: vsext.vf4 v8, v16 +; LMULMAX4-NEXT: vsext.vf4 v12, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i16>, ptr %x %z = sext <16 x i16> %y to <16 x i64> @@ -944,46 +968,48 @@ define <16 x i64> @zextload_v16i16_v16i64(ptr %x) { ; LMULMAX1-LABEL: zextload_v16i16_v16i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v12, (a0) +; LMULMAX1-NEXT: vle16.v v9, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle16.v v16, (a0) +; LMULMAX1-NEXT: vle16.v v13, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v10, v8 +; LMULMAX1-NEXT: vzext.vf4 v10, v11 +; LMULMAX1-NEXT: vzext.vf4 v12, v13 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v15, v16, 4 +; LMULMAX1-NEXT: vslidedown.vi v15, v13, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf4 v14, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v9, v11 +; LMULMAX1-NEXT: vzext.vf4 v9, v16 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v11, v8 +; LMULMAX1-NEXT: vzext.vf4 v11, v16 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v13, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v13, v8 +; LMULMAX1-NEXT: vzext.vf4 v13, v16 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v15, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v15, v8 -; LMULMAX1-NEXT: vzext.vf4 v8, v12 -; LMULMAX1-NEXT: vzext.vf4 v12, v16 +; LMULMAX1-NEXT: vzext.vf4 v15, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i16_v16i64: ; 
LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX4-NEXT: vle16.v v16, (a0) +; LMULMAX4-NEXT: vle16.v v12, (a0) +; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; LMULMAX4-NEXT: vzext.vf4 v8, v12 ; LMULMAX4-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 +; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vzext.vf4 v12, v8 -; LMULMAX4-NEXT: vzext.vf4 v8, v16 +; LMULMAX4-NEXT: vzext.vf4 v12, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i16>, ptr %x %z = zext <16 x i16> %y to <16 x i64> @@ -1070,12 +1096,13 @@ define <4 x i64> @sextload_v4i32_v4i64(ptr %x) { ; LMULMAX1-LABEL: sextload_v4i32_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v10, (a0) +; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v8 -; LMULMAX1-NEXT: vsext.vf2 v8, v10 +; LMULMAX1-NEXT: vsext.vf2 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v4i32_v4i64: @@ -1093,12 +1120,13 @@ define <4 x i64> @zextload_v4i32_v4i64(ptr %x) { ; LMULMAX1-LABEL: zextload_v4i32_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v10, (a0) +; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v8 -; LMULMAX1-NEXT: vzext.vf2 v8, v10 +; LMULMAX1-NEXT: vzext.vf2 v9, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v4i32_v4i64: @@ -1167,19 +1195,20 @@ define <8 x i64> @sextload_v8i32_v8i64(ptr %x) { ; LMULMAX1-LABEL: sextload_v8i32_v8i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v10, (a0) +; LMULMAX1-NEXT: vle32.v v9, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v12, (a0) +; LMULMAX1-NEXT: vle32.v v11, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsext.vf2 v9, v10 +; LMULMAX1-NEXT: vsext.vf2 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v11, v8 -; LMULMAX1-NEXT: vsext.vf2 v8, v10 -; LMULMAX1-NEXT: vsext.vf2 v10, v12 +; LMULMAX1-NEXT: vsext.vf2 v11, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v8i32_v8i64: @@ -1197,19 +1226,20 @@ define <8 x i64> @zextload_v8i32_v8i64(ptr %x) { ; LMULMAX1-LABEL: zextload_v8i32_v8i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v10, (a0) +; LMULMAX1-NEXT: vle32.v v9, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v12, (a0) +; LMULMAX1-NEXT: vle32.v v11, (a0) +; LMULMAX1-NEXT: vsetivli zero, 
2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vzext.vf2 v9, v10 +; LMULMAX1-NEXT: vzext.vf2 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v11, v8 -; LMULMAX1-NEXT: vzext.vf2 v8, v10 -; LMULMAX1-NEXT: vzext.vf2 v10, v12 +; LMULMAX1-NEXT: vzext.vf2 v11, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v8i32_v8i64: @@ -1298,43 +1328,45 @@ define <16 x i64> @sextload_v16i32_v16i64(ptr %x) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: addi a1, a0, 48 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v16, (a1) +; LMULMAX1-NEXT: vle32.v v15, (a1) ; LMULMAX1-NEXT: addi a1, a0, 32 -; LMULMAX1-NEXT: vle32.v v14, (a1) -; LMULMAX1-NEXT: vle32.v v10, (a0) +; LMULMAX1-NEXT: vle32.v v13, (a1) +; LMULMAX1-NEXT: vle32.v v9, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v12, (a0) +; LMULMAX1-NEXT: vle32.v v11, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsext.vf2 v9, v10 +; LMULMAX1-NEXT: vsext.vf2 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v11, v8 +; LMULMAX1-NEXT: vsext.vf2 v11, v12 +; LMULMAX1-NEXT: vsext.vf2 v12, v13 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v14, 2 +; LMULMAX1-NEXT: vslidedown.vi v14, v13, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v13, v8 +; LMULMAX1-NEXT: vsext.vf2 v13, v14 +; LMULMAX1-NEXT: vsext.vf2 v14, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v15, v8 -; LMULMAX1-NEXT: vsext.vf2 v8, v10 -; LMULMAX1-NEXT: vsext.vf2 v10, v12 -; LMULMAX1-NEXT: vsext.vf2 v12, v14 -; LMULMAX1-NEXT: vsext.vf2 v14, v16 +; LMULMAX1-NEXT: vsext.vf2 v15, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i32_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; LMULMAX4-NEXT: vle32.v v16, (a0) +; LMULMAX4-NEXT: vle32.v v12, (a0) +; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; LMULMAX4-NEXT: vsext.vf2 v8, v12 ; LMULMAX4-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 +; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vsext.vf2 v12, v8 -; LMULMAX4-NEXT: vsext.vf2 v8, v16 +; LMULMAX4-NEXT: vsext.vf2 v12, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i32>, ptr %x %z = sext <16 x i32> %y to <16 x i64> @@ -1346,43 +1378,45 @@ define <16 x i64> @zextload_v16i32_v16i64(ptr %x) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: addi a1, a0, 48 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: 
vle32.v v16, (a1) +; LMULMAX1-NEXT: vle32.v v15, (a1) ; LMULMAX1-NEXT: addi a1, a0, 32 -; LMULMAX1-NEXT: vle32.v v14, (a1) -; LMULMAX1-NEXT: vle32.v v10, (a0) +; LMULMAX1-NEXT: vle32.v v13, (a1) +; LMULMAX1-NEXT: vle32.v v9, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v12, (a0) +; LMULMAX1-NEXT: vle32.v v11, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf2 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vzext.vf2 v9, v10 +; LMULMAX1-NEXT: vzext.vf2 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2 +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v11, v8 +; LMULMAX1-NEXT: vzext.vf2 v11, v12 +; LMULMAX1-NEXT: vzext.vf2 v12, v13 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v14, 2 +; LMULMAX1-NEXT: vslidedown.vi v14, v13, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v13, v8 +; LMULMAX1-NEXT: vzext.vf2 v13, v14 +; LMULMAX1-NEXT: vzext.vf2 v14, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v15, v8 -; LMULMAX1-NEXT: vzext.vf2 v8, v10 -; LMULMAX1-NEXT: vzext.vf2 v10, v12 -; LMULMAX1-NEXT: vzext.vf2 v12, v14 -; LMULMAX1-NEXT: vzext.vf2 v14, v16 +; LMULMAX1-NEXT: vzext.vf2 v15, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i32_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; LMULMAX4-NEXT: vle32.v v16, (a0) +; LMULMAX4-NEXT: vle32.v v12, (a0) +; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; LMULMAX4-NEXT: vzext.vf2 v8, v12 ; LMULMAX4-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 +; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vzext.vf2 v12, v8 -; LMULMAX4-NEXT: vzext.vf2 v8, v16 +; LMULMAX4-NEXT: vzext.vf2 v12, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i32>, ptr %x %z = zext <16 x i32> %y to <16 x i64> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 740ad95..71f60d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -95,14 +95,14 @@ define void @buildvec_dominant1_v2f32(<2 x float>* %x) { define void @buildvec_dominant0_v4f32(<4 x float>* %x) { ; CHECK-LABEL: buildvec_dominant0_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v8, zero ; CHECK-NEXT: lui a1, 262144 -; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vmv.s.x v9, zero ; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v9, v8, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret store <4 x float> , <4 x float>* %x ret void @@ -112,12 +112,12 @@ define void @buildvec_dominant1_v4f32(<4 x float>* %x, float %f) { ; CHECK-LABEL: 
buildvec_dominant1_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfmv.v.f v8, fa0 +; CHECK-NEXT: vmv.s.x v9, zero ; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %v0 = insertelement <4 x float> poison, float %f, i32 0 %v1 = insertelement <4 x float> %v0, float 0.0, i32 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll index b3e590c..ffd35f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll @@ -47,11 +47,12 @@ define void @fpext_v8f16_v8f32(ptr %x, ptr %y) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; LMULMAX1-NEXT: vle16.v v8, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v9, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v9 ; LMULMAX1-NEXT: vfwcvt.f.f.v v9, v8 +; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v8 ; LMULMAX1-NEXT: addi a0, a1, 16 ; LMULMAX1-NEXT: vse32.v v10, (a0) ; LMULMAX1-NEXT: vse32.v v9, (a1) @@ -92,13 +93,13 @@ define void @fpext_v8f16_v8f64(ptr %x, ptr %y) { ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; LMULMAX1-NEXT: vfwcvt.f.f.v v11, v12 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v10 -; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v12 -; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v8 ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; LMULMAX1-NEXT: vfwcvt.f.f.v v8, v12 +; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v10 +; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v12 ; LMULMAX1-NEXT: addi a0, a1, 32 ; LMULMAX1-NEXT: vse64.v v10, (a0) ; LMULMAX1-NEXT: vse64.v v8, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll index 9761196..72ff4f4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll @@ -160,8 +160,8 @@ define void @splat_zero_16f16(ptr %x) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; LMULMAX1-NEXT: vmv.v.i v8, 0 -; LMULMAX1-NEXT: vse16.v v8, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse16.v v8, (a1) ; LMULMAX1-NEXT: vse16.v v8, (a0) ; LMULMAX1-NEXT: ret %a = insertelement <16 x half> poison, half 0.0, i32 0 @@ -182,8 +182,8 @@ define void @splat_zero_v8f32(ptr %x) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-NEXT: vmv.v.i v8, 0 -; LMULMAX1-NEXT: vse32.v v8, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse32.v v8, (a1) ; LMULMAX1-NEXT: vse32.v v8, (a0) ; LMULMAX1-NEXT: ret %a = insertelement <8 x float> poison, float 0.0, i32 0 @@ -204,8 +204,8 @@ define void @splat_zero_v4f64(ptr %x) { ; LMULMAX1: # %bb.0: ; 
LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vmv.v.i v8, 0 -; LMULMAX1-NEXT: vse64.v v8, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse64.v v8, (a1) ; LMULMAX1-NEXT: vse64.v v8, (a0) ; LMULMAX1-NEXT: ret %a = insertelement <4 x double> poison, double 0.0, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll index 9b46d44..62a0c2c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll @@ -77,8 +77,8 @@ define void @fp2si_v2f32_v2i64(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.x.f.v v9, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vmerge.vim v8, v9, 0, v0 ; CHECK-NEXT: vse64.v v8, (a1) @@ -95,8 +95,8 @@ define void @fp2ui_v2f32_v2i64(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v9, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vmerge.vim v8, v9, 0, v0 ; CHECK-NEXT: vse64.v v8, (a1) @@ -114,8 +114,8 @@ define void @fp2si_v8f32_v8i64(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.x.f.v v12, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: vse64.v v8, (a1) @@ -133,8 +133,8 @@ define void @fp2ui_v8f32_v8i64(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v12, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: vse64.v v8, (a1) @@ -151,12 +151,13 @@ define void @fp2si_v2f16_v2i64(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwcvt.rtz.x.f.v v8, v9 +; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: ret %a = load <2 x half>, ptr %x @@ -171,12 +172,13 @@ define void @fp2ui_v2f16_v2i64(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwcvt.rtz.xu.f.v v8, v9 +; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: ret %a = load <2 x half>, ptr %x @@ -661,11 +663,12 @@ declare <8 x i8> 
@llvm.fptoui.sat.v8i8.v8f64(<8 x double> %a) define void @fp2si_v2f64_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: fp2si_v2f64_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8 ; CHECK-NEXT: vmerge.vim v8, v9, 0, v0 ; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: ret @@ -679,11 +682,12 @@ declare <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double>) define void @fp2ui_v2f64_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: fp2ui_v2f64_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8 ; CHECK-NEXT: vmerge.vim v8, v9, 0, v0 ; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll index bb39fee..8be3d42 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -491,16 +491,17 @@ define void @fp2si_v8f32_v8i64(ptr %x, ptr %y) { ; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: vle32.v v8, (a2) ; LMULMAX1-NEXT: vle32.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v11, v10 +; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v10, v8 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v12, v10 -; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v11, v8 ; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v8, v9 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v9, v9, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v12, v9 ; LMULMAX1-NEXT: addi a0, a1, 16 ; LMULMAX1-NEXT: vse64.v v12, (a0) ; LMULMAX1-NEXT: vse64.v v8, (a1) @@ -530,16 +531,17 @@ define void @fp2ui_v8f32_v8i64(ptr %x, ptr %y) { ; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: vle32.v v8, (a2) ; LMULMAX1-NEXT: vle32.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v11, v10 +; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v10, v8 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v12, v10 -; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v10, v8 +; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v11, v8 ; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v8, v9 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v9, v9, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v12, v9 ; LMULMAX1-NEXT: addi a0, a1, 16 ; LMULMAX1-NEXT: vse64.v v12, (a0) ; 
LMULMAX1-NEXT: vse64.v v8, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll index 5f1cc2d4..08f27bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll @@ -799,9 +799,9 @@ define <16 x i64> @fshr_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v24, (a0) ; RV32-NEXT: addi a0, sp, 16 @@ -809,20 +809,30 @@ define <16 x i64> @fshr_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 ; RV32-NEXT: li a0, 63 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vx v8, v24, a0, v0.t -; RV32-NEXT: vsrl.vv v16, v16, v8, v0.t +; RV32-NEXT: vsrl.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsll.vi v16, v8, 1, v0.t ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v8, -1 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vxor.vv v8, v24, v8, v0.t ; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsll.vi v24, v24, 1, v0.t -; RV32-NEXT: vsll.vv v8, v24, v8, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsll.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -866,9 +876,9 @@ define <16 x i64> @fshl_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v24, (a0) ; RV32-NEXT: addi a0, sp, 16 @@ -878,19 +888,29 @@ define <16 x i64> @fshl_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vx v8, v24, a0, v0.t ; RV32-NEXT: vsll.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; 
RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 +; RV32-NEXT: vmv.v.i v8, -1 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v16, v24, v16, v0.t -; RV32-NEXT: vand.vx v16, v16, a0, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v24, v24, 1, v0.t -; RV32-NEXT: vsrl.vv v16, v24, v16, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vxor.vv v8, v24, v8, v0.t +; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll index b3cda0a..8019d3f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -456,10 +456,10 @@ define void @si2fp_v8i16_v8f64(ptr %x, ptr %y) { ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; LMULMAX1-NEXT: vsext.vf2 v12, v11 ; LMULMAX1-NEXT: vfwcvt.f.x.v v11, v12 -; LMULMAX1-NEXT: vsext.vf2 v12, v10 -; LMULMAX1-NEXT: vfwcvt.f.x.v v10, v12 ; LMULMAX1-NEXT: vsext.vf2 v12, v8 ; LMULMAX1-NEXT: vfwcvt.f.x.v v8, v12 +; LMULMAX1-NEXT: vsext.vf2 v12, v10 +; LMULMAX1-NEXT: vfwcvt.f.x.v v10, v12 ; LMULMAX1-NEXT: addi a0, a1, 32 ; LMULMAX1-NEXT: vse64.v v10, (a0) ; LMULMAX1-NEXT: vse64.v v8, (a1) @@ -500,10 +500,10 @@ define void @ui2fp_v8i16_v8f64(ptr %x, ptr %y) { ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; LMULMAX1-NEXT: vzext.vf2 v12, v11 ; LMULMAX1-NEXT: vfwcvt.f.xu.v v11, v12 -; LMULMAX1-NEXT: vzext.vf2 v12, v10 -; LMULMAX1-NEXT: vfwcvt.f.xu.v v10, v12 ; LMULMAX1-NEXT: vzext.vf2 v12, v8 ; LMULMAX1-NEXT: vfwcvt.f.xu.v v8, v12 +; LMULMAX1-NEXT: vzext.vf2 v12, v10 +; LMULMAX1-NEXT: vfwcvt.f.xu.v v10, v12 ; LMULMAX1-NEXT: addi a0, a1, 32 ; LMULMAX1-NEXT: vse64.v v10, (a0) ; LMULMAX1-NEXT: vse64.v v8, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll index b823814..1732e19 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll @@ -106,14 +106,15 @@ define <64 x i1> @insertelt_v64i1(<64 x i1> %x, i1 %elt) nounwind { ; CHECK-LABEL: insertelt_v64i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv.s.x v12, a0 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: vslideup.vi v12, v8, 1 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vand.vi v8, v12, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %y = insertelement <64 x i1> %x, i1 %elt, i64 1 @@ -124,15 +125,16 @@ define <64 x i1> @insertelt_idx_v64i1(<64 x i1> %x, i1 %elt, i32 zeroext %idx) n ; CHECK-LABEL: insertelt_idx_v64i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 +; 
CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a0 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv.s.x v12, a0 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 ; CHECK-NEXT: addi a0, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma -; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: vslideup.vx v12, v8, a1 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vand.vi v8, v12, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret %y = insertelement <64 x i1> %x, i1 %elt, i32 %idx diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll index 6ad736f..8395c09 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll @@ -458,14 +458,15 @@ define @insert_nxv2i1_v4i1_0( %v, ptr %svp) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vlm.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, ma ; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-NEXT: vmsne.vi v0, v9, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index 2d4d798..4e75644 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -154,10 +154,10 @@ define <4 x i8> @buildvec_vid_stepn3_add3_v4i8() { ; CHECK-LABEL: buildvec_vid_stepn3_add3_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: vmv.v.i v8, 3 +; CHECK-NEXT: vmv.v.i v9, 3 +; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: li a0, -3 -; CHECK-NEXT: vmacc.vx v8, a0, v9 +; CHECK-NEXT: vmadd.vx v8, a0, v9 ; CHECK-NEXT: ret ret <4 x i8> } @@ -166,10 +166,10 @@ define void @buildvec_vid_stepn3_addn3_v4i32(ptr %z0, ptr %z1, ptr %z2, ptr %z3) ; CHECK-LABEL: buildvec_vid_stepn3_addn3_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vmv.v.i v9, -3 +; CHECK-NEXT: vmv.v.i v8, -3 +; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: li a4, -3 -; CHECK-NEXT: vmacc.vx v9, a4, v8 +; CHECK-NEXT: vmadd.vx v9, a4, v8 ; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: vse32.v v9, (a1) ; CHECK-NEXT: vse32.v v9, (a2) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll index d3c843c..4686870 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll @@ -55,11 +55,12 @@ define void @sext_v8i8_v8i32(ptr %x, ptr %z) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; LMULMAX1-NEXT: vle8.v v8, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v9, 
v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v10, v9 ; LMULMAX1-NEXT: vsext.vf4 v9, v8 +; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v10, v8 ; LMULMAX1-NEXT: addi a0, a1, 16 ; LMULMAX1-NEXT: vse32.v v10, (a0) ; LMULMAX1-NEXT: vse32.v v9, (a1) @@ -132,17 +133,17 @@ define void @sext_v32i8_v32i32(ptr %x, ptr %z) { ; LMULMAX1-NEXT: vslidedown.vi v15, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf4 v16, v15 -; LMULMAX1-NEXT: vsext.vf4 v15, v10 -; LMULMAX1-NEXT: vsext.vf4 v10, v12 -; LMULMAX1-NEXT: vsext.vf4 v12, v8 -; LMULMAX1-NEXT: vsext.vf4 v8, v9 +; LMULMAX1-NEXT: vsext.vf4 v15, v8 +; LMULMAX1-NEXT: vsext.vf4 v8, v10 +; LMULMAX1-NEXT: vsext.vf4 v10, v9 +; LMULMAX1-NEXT: vsext.vf4 v9, v12 ; LMULMAX1-NEXT: addi a0, a1, 32 -; LMULMAX1-NEXT: vse32.v v10, (a0) -; LMULMAX1-NEXT: vse32.v v8, (a1) +; LMULMAX1-NEXT: vse32.v v9, (a0) +; LMULMAX1-NEXT: vse32.v v10, (a1) ; LMULMAX1-NEXT: addi a0, a1, 96 -; LMULMAX1-NEXT: vse32.v v15, (a0) +; LMULMAX1-NEXT: vse32.v v8, (a0) ; LMULMAX1-NEXT: addi a0, a1, 64 -; LMULMAX1-NEXT: vse32.v v12, (a0) +; LMULMAX1-NEXT: vse32.v v15, (a0) ; LMULMAX1-NEXT: addi a0, a1, 48 ; LMULMAX1-NEXT: vse32.v v16, (a0) ; LMULMAX1-NEXT: addi a0, a1, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index 40412ae..adbb69a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -99,10 +99,10 @@ define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) { ; CHECK-LABEL: vrgather_shuffle_xv_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: vrsub.vi v10, v9, 4 -; CHECK-NEXT: vmv.v.i v0, 12 ; CHECK-NEXT: vmv.v.i v9, 5 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vrsub.vi v10, v10, 4 ; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret @@ -219,33 +219,32 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) { define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) { ; RV32-LABEL: vrgather_shuffle_xv_v8i64: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: lui a0, %hi(.LCPI12_0) ; RV32-NEXT: addi a0, a0, %lo(.LCPI12_0) -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle16.v v16, (a0) ; RV32-NEXT: vmv.v.i v20, -1 ; RV32-NEXT: vrgatherei16.vv v12, v20, v16 -; RV32-NEXT: lui a0, %hi(.LCPI12_1) -; RV32-NEXT: addi a0, a0, %lo(.LCPI12_1) -; RV32-NEXT: vle16.v v16, (a0) ; RV32-NEXT: li a0, 113 ; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV32-NEXT: vmv.v.x v0, a0 +; RV32-NEXT: lui a0, %hi(.LCPI12_1) +; RV32-NEXT: addi a0, a0, %lo(.LCPI12_1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: vle16.v v16, (a0) ; RV32-NEXT: vrgatherei16.vv v12, v8, v16, v0.t ; RV32-NEXT: vmv.v.v v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: vrgather_shuffle_xv_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI12_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI12_0) -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vle64.v v16, (a0) ; RV64-NEXT: li a0, 113 ; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV64-NEXT: vmv.v.x v0, a0 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: lui a0, %hi(.LCPI12_0) +; RV64-NEXT: addi a0, a0, 
%lo(.LCPI12_0) +; RV64-NEXT: vle64.v v16, (a0) ; RV64-NEXT: vmv.v.i v12, -1 ; RV64-NEXT: vrgather.vv v12, v8, v16, v0.t ; RV64-NEXT: vmv.v.v v8, v12 @@ -262,13 +261,13 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) { ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle16.v v16, (a0) ; RV32-NEXT: vrgatherei16.vv v12, v8, v16 -; RV32-NEXT: lui a0, %hi(.LCPI13_1) -; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1) -; RV32-NEXT: vle16.v v8, (a0) ; RV32-NEXT: li a0, 140 ; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV32-NEXT: vmv.v.x v0, a0 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: lui a0, %hi(.LCPI13_1) +; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1) +; RV32-NEXT: vle16.v v8, (a0) ; RV32-NEXT: vmv.v.i v16, 5 ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv.v.v v8, v12 @@ -276,14 +275,13 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) { ; ; RV64-LABEL: vrgather_shuffle_vx_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI13_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI13_0) -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vle64.v v16, (a0) ; RV64-NEXT: li a0, 115 ; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV64-NEXT: vmv.v.x v0, a0 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: lui a0, %hi(.LCPI13_0) +; RV64-NEXT: addi a0, a0, %lo(.LCPI13_0) +; RV64-NEXT: vle64.v v16, (a0) ; RV64-NEXT: vmv.v.i v12, 5 ; RV64-NEXT: vrgather.vv v12, v8, v16, v0.t ; RV64-NEXT: vmv.v.v v8, v12 @@ -388,9 +386,10 @@ define <8 x i8> @splat_ve4_ins_i1ve3(<8 x i8> %v) { define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: splat_ve2_we0: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: li a0, 66 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v0, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vi v10, v8, 2 ; CHECK-NEXT: vrgather.vi v10, v9, 0, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 @@ -421,12 +420,13 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @splat_ve2_we0_ins_i0we4(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: splat_ve2_we0_ins_i0we4: ; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 67 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vrgather.vi v10, v8, 2 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 4 -; CHECK-NEXT: li a0, 67 -; CHECK-NEXT: vmv.v.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 @@ -475,9 +475,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) { ; CHECK-NEXT: vmv.v.i v11, 0 ; CHECK-NEXT: vsetivli zero, 3, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v11, v10, 2 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: li a0, 70 ; CHECK-NEXT: vmv.v.x v0, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vi v10, v8, 2 ; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll index c8c2aea..7e092ae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll @@ -339,8 +339,8 @@ define void @splat_zero_v32i8(ptr %x) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; LMULMAX1-NEXT: vmv.v.i v8, 0 -; 
LMULMAX1-NEXT: vse8.v v8, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse8.v v8, (a1) ; LMULMAX1-NEXT: vse8.v v8, (a0) ; LMULMAX1-NEXT: ret %a = insertelement <32 x i8> poison, i8 0, i32 0 @@ -368,8 +368,8 @@ define void @splat_zero_v16i16(ptr %x) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; LMULMAX1-NEXT: vmv.v.i v8, 0 -; LMULMAX1-NEXT: vse16.v v8, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse16.v v8, (a1) ; LMULMAX1-NEXT: vse16.v v8, (a0) ; LMULMAX1-NEXT: ret %a = insertelement <16 x i16> poison, i16 0, i32 0 @@ -397,8 +397,8 @@ define void @splat_zero_v8i32(ptr %x) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-NEXT: vmv.v.i v8, 0 -; LMULMAX1-NEXT: vse32.v v8, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse32.v v8, (a1) ; LMULMAX1-NEXT: vse32.v v8, (a0) ; LMULMAX1-NEXT: ret %a = insertelement <8 x i32> poison, i32 0, i32 0 @@ -426,8 +426,8 @@ define void @splat_zero_v4i64(ptr %x) { ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV32-NEXT: vmv.v.i v8, 0 -; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV32-NEXT: addi a0, a0, 16 +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vse32.v v8, (a1) ; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) ; LMULMAX1-RV32-NEXT: ret ; @@ -435,8 +435,8 @@ define void @splat_zero_v4i64(ptr %x) { ; LMULMAX1-RV64: # %bb.0: ; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-RV64-NEXT: vmv.v.i v8, 0 -; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV64-NEXT: addi a0, a0, 16 +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vse64.v v8, (a1) ; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret %a = insertelement <4 x i64> poison, i64 0, i32 0 @@ -632,8 +632,8 @@ define void @splat_allones_v32i8(ptr %x) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; LMULMAX1-NEXT: vmv.v.i v8, -1 -; LMULMAX1-NEXT: vse8.v v8, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse8.v v8, (a1) ; LMULMAX1-NEXT: vse8.v v8, (a0) ; LMULMAX1-NEXT: ret %a = insertelement <32 x i8> poison, i8 -1, i32 0 @@ -661,8 +661,8 @@ define void @splat_allones_v16i16(ptr %x) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; LMULMAX1-NEXT: vmv.v.i v8, -1 -; LMULMAX1-NEXT: vse16.v v8, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse16.v v8, (a1) ; LMULMAX1-NEXT: vse16.v v8, (a0) ; LMULMAX1-NEXT: ret %a = insertelement <16 x i16> poison, i16 -1, i32 0 @@ -690,8 +690,8 @@ define void @splat_allones_v8i32(ptr %x) { ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-NEXT: vmv.v.i v8, -1 -; LMULMAX1-NEXT: vse32.v v8, (a0) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse32.v v8, (a1) ; LMULMAX1-NEXT: vse32.v v8, (a0) ; LMULMAX1-NEXT: ret %a = insertelement <8 x i32> poison, i32 -1, i32 0 @@ -719,8 +719,8 @@ define void @splat_allones_v4i64(ptr %x) { ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV32-NEXT: vmv.v.i v8, -1 -; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV32-NEXT: addi a0, a0, 16 +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vse32.v v8, (a1) ; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) ; LMULMAX1-RV32-NEXT: ret ; @@ -728,8 
+728,8 @@ define void @splat_allones_v4i64(ptr %x) { ; LMULMAX1-RV64: # %bb.0: ; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-RV64-NEXT: vmv.v.i v8, -1 -; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV64-NEXT: addi a0, a0, 16 +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vse64.v v8, (a1) ; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret %a = insertelement <4 x i64> poison, i64 -1, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 7dc7d7c..98f43e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1105,45 +1105,45 @@ define void @mulhu_v16i8(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-NEXT: vle8.v v8, (a0) -; RV32-NEXT: lui a1, 3 -; RV32-NEXT: addi a1, a1, -2044 +; RV32-NEXT: li a1, 513 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV32-NEXT: vmv.v.x v0, a1 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, 0 -; RV32-NEXT: li a1, -128 -; RV32-NEXT: vmerge.vxm v10, v9, a1, v0 +; RV32-NEXT: vmv.v.i v9, 4 +; RV32-NEXT: vmerge.vim v9, v9, 1, v0 ; RV32-NEXT: lui a1, 1 -; RV32-NEXT: addi a2, a1, 32 +; RV32-NEXT: addi a2, a1, 78 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV32-NEXT: vmv.v.x v0, a2 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: lui a2, %hi(.LCPI65_0) -; RV32-NEXT: addi a2, a2, %lo(.LCPI65_0) -; RV32-NEXT: vle8.v v11, (a2) -; RV32-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32-NEXT: vsrl.vv v9, v8, v9 -; RV32-NEXT: vmulhu.vv v9, v9, v11 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: vmulhu.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: li a2, 513 +; RV32-NEXT: vmerge.vim v9, v9, 3, v0 +; RV32-NEXT: lui a2, 8 +; RV32-NEXT: addi a2, a2, 304 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV32-NEXT: vmv.v.x v0, a2 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, 4 -; RV32-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32-NEXT: addi a1, a1, 78 +; RV32-NEXT: vmerge.vim v9, v9, 2, v0 +; RV32-NEXT: lui a2, 3 +; RV32-NEXT: addi a2, a2, -2044 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vmv.v.x v0, a1 +; RV32-NEXT: vmv.v.x v0, a2 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmerge.vim v9, v9, 3, v0 -; RV32-NEXT: lui a1, 8 -; RV32-NEXT: addi a1, a1, 304 +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: li a2, -128 +; RV32-NEXT: vmerge.vxm v11, v10, a2, v0 +; RV32-NEXT: addi a1, a1, 32 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV32-NEXT: vmv.v.x v0, a1 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmerge.vim v9, v9, 2, v0 +; RV32-NEXT: lui a1, %hi(.LCPI65_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI65_0) +; RV32-NEXT: vle8.v v12, (a1) +; RV32-NEXT: vmerge.vim v10, v10, 1, v0 +; RV32-NEXT: vsrl.vv v10, v8, v10 +; RV32-NEXT: vmulhu.vv v10, v10, v12 +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: vmulhu.vv v8, v8, v11 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsrl.vv v8, v8, v9 ; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: ret @@ -1152,45 +1152,45 @@ define void @mulhu_v16i8(ptr %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64-NEXT: vle8.v v8, (a0) -; RV64-NEXT: lui a1, 3 -; RV64-NEXT: addiw a1, a1, -2044 +; RV64-NEXT: li a1, 513 ; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV64-NEXT: vmv.v.x v0, a1 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmv.v.i v9, 0 -; RV64-NEXT: li a1, -128 -; RV64-NEXT: 
vmerge.vxm v10, v9, a1, v0 +; RV64-NEXT: vmv.v.i v9, 4 +; RV64-NEXT: vmerge.vim v9, v9, 1, v0 ; RV64-NEXT: lui a1, 1 -; RV64-NEXT: addiw a2, a1, 32 +; RV64-NEXT: addiw a2, a1, 78 ; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV64-NEXT: vmv.v.x v0, a2 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: lui a2, %hi(.LCPI65_0) -; RV64-NEXT: addi a2, a2, %lo(.LCPI65_0) -; RV64-NEXT: vle8.v v11, (a2) -; RV64-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64-NEXT: vsrl.vv v9, v8, v9 -; RV64-NEXT: vmulhu.vv v9, v9, v11 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: vmulhu.vv v8, v8, v10 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: li a2, 513 +; RV64-NEXT: vmerge.vim v9, v9, 3, v0 +; RV64-NEXT: lui a2, 8 +; RV64-NEXT: addiw a2, a2, 304 ; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV64-NEXT: vmv.v.x v0, a2 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmv.v.i v9, 4 -; RV64-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64-NEXT: addiw a1, a1, 78 +; RV64-NEXT: vmerge.vim v9, v9, 2, v0 +; RV64-NEXT: lui a2, 3 +; RV64-NEXT: addiw a2, a2, -2044 ; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV64-NEXT: vmv.v.x v0, a1 +; RV64-NEXT: vmv.v.x v0, a2 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmerge.vim v9, v9, 3, v0 -; RV64-NEXT: lui a1, 8 -; RV64-NEXT: addiw a1, a1, 304 +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: li a2, -128 +; RV64-NEXT: vmerge.vxm v11, v10, a2, v0 +; RV64-NEXT: addiw a1, a1, 32 ; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV64-NEXT: vmv.v.x v0, a1 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmerge.vim v9, v9, 2, v0 +; RV64-NEXT: lui a1, %hi(.LCPI65_0) +; RV64-NEXT: addi a1, a1, %lo(.LCPI65_0) +; RV64-NEXT: vle8.v v12, (a1) +; RV64-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-NEXT: vsrl.vv v10, v8, v10 +; RV64-NEXT: vmulhu.vv v10, v10, v12 +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: vmulhu.vv v8, v8, v11 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: vsrl.vv v8, v8, v9 ; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret @@ -1205,30 +1205,31 @@ define void @mulhu_v8i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: li a1, 33 -; CHECK-NEXT: vmv.v.x v0, a1 -; CHECK-NEXT: vmv.v.i v9, 3 -; CHECK-NEXT: vmerge.vim v9, v9, 2, v0 -; CHECK-NEXT: vmv.v.i v10, 1 -; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma -; CHECK-NEXT: vslideup.vi v9, v10, 6 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: lui a1, 1048568 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma -; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: vmv.s.x v12, a1 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv.s.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v11, 1 ; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma -; CHECK-NEXT: vslideup.vi v11, v10, 6 +; CHECK-NEXT: vslideup.vi v9, v11, 6 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: lui a1, %hi(.LCPI66_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI66_0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vsrl.vv v11, v8, v11 -; CHECK-NEXT: vmulhu.vv v10, v11, v10 -; CHECK-NEXT: vsub.vv v8, v8, v10 -; CHECK-NEXT: vmulhu.vv v8, v8, v12 -; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vsrl.vv v9, v8, v9 +; CHECK-NEXT: vmulhu.vv v9, v9, v12 +; CHECK-NEXT: vsub.vv v8, v8, v9 +; CHECK-NEXT: vmulhu.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: li a1, 33 +; CHECK-NEXT: vmv.v.x v0, a1 +; CHECK-NEXT: vmv.v.i v9, 3 +; 
CHECK-NEXT: vmerge.vim v9, v9, 2, v0 +; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vi v9, v11, 6 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret @@ -1438,16 +1439,16 @@ define void @mulhs_v6i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.v.i v0, 6 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v0, 6 ; CHECK-NEXT: vmv.v.i v9, -7 ; CHECK-NEXT: vmerge.vim v9, v9, 7, v0 ; CHECK-NEXT: vdiv.vv v9, v8, v9 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vmv.v.i v11, 7 +; CHECK-NEXT: vmv.v.i v10, 7 +; CHECK-NEXT: vid.v v11 ; CHECK-NEXT: li a1, -14 -; CHECK-NEXT: vmacc.vx v11, a1, v10 +; CHECK-NEXT: vmadd.vx v11, a1, v10 ; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 4 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma @@ -5072,38 +5073,38 @@ define void @mulhu_v16i16(ptr %x) { ; LMULMAX2-RV32: # %bb.0: ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; LMULMAX2-RV32-NEXT: vle16.v v10, (a0) -; LMULMAX2-RV32-NEXT: li a1, 257 +; LMULMAX2-RV32-NEXT: lui a1, 2 +; LMULMAX2-RV32-NEXT: addi a1, a1, 289 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; LMULMAX2-RV32-NEXT: vmv.v.x v0, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v12, 0 -; LMULMAX2-RV32-NEXT: lui a1, 1048568 -; LMULMAX2-RV32-NEXT: vmerge.vxm v14, v12, a1, v0 +; LMULMAX2-RV32-NEXT: vmv.v.i v8, 3 +; LMULMAX2-RV32-NEXT: vmerge.vim v12, v8, 2, v0 ; LMULMAX2-RV32-NEXT: lui a1, 4 ; LMULMAX2-RV32-NEXT: addi a1, a1, 64 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; LMULMAX2-RV32-NEXT: vmv.v.x v8, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI182_0) -; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI182_0) -; LMULMAX2-RV32-NEXT: vle16.v v16, (a1) ; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8 ; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 1, v0 -; LMULMAX2-RV32-NEXT: vsrl.vv v12, v10, v12 -; LMULMAX2-RV32-NEXT: vmulhu.vv v12, v12, v16 -; LMULMAX2-RV32-NEXT: vsub.vv v10, v10, v12 -; LMULMAX2-RV32-NEXT: vmulhu.vv v10, v10, v14 -; LMULMAX2-RV32-NEXT: vadd.vv v10, v10, v12 -; LMULMAX2-RV32-NEXT: lui a1, 2 -; LMULMAX2-RV32-NEXT: addi a1, a1, 289 +; LMULMAX2-RV32-NEXT: li a1, 257 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; LMULMAX2-RV32-NEXT: vmv.v.x v0, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v12, 3 -; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 2, v0 +; LMULMAX2-RV32-NEXT: vmv.v.i v14, 0 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI182_0) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI182_0) +; LMULMAX2-RV32-NEXT: vle16.v v16, (a1) +; LMULMAX2-RV32-NEXT: lui a1, 1048568 +; LMULMAX2-RV32-NEXT: vmerge.vxm v18, v14, a1, v0 ; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8 -; LMULMAX2-RV32-NEXT: vmerge.vim v8, v12, 1, v0 +; LMULMAX2-RV32-NEXT: vmerge.vim v8, v14, 1, v0 ; LMULMAX2-RV32-NEXT: vsrl.vv v8, v10, v8 +; LMULMAX2-RV32-NEXT: vmulhu.vv v8, v8, v16 +; LMULMAX2-RV32-NEXT: vsub.vv v10, v10, v8 +; LMULMAX2-RV32-NEXT: vmulhu.vv v10, v10, v18 +; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32-NEXT: vsrl.vv v8, v8, v12 ; LMULMAX2-RV32-NEXT: vse16.v v8, (a0) ; LMULMAX2-RV32-NEXT: ret ; @@ -5111,38 +5112,38 @@ define void @mulhu_v16i16(ptr %x) { ; LMULMAX2-RV64: # %bb.0: ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, 
m2, ta, ma ; LMULMAX2-RV64-NEXT: vle16.v v10, (a0) -; LMULMAX2-RV64-NEXT: li a1, 257 +; LMULMAX2-RV64-NEXT: lui a1, 2 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 289 ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; LMULMAX2-RV64-NEXT: vmv.v.x v0, a1 ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX2-RV64-NEXT: vmv.v.i v12, 0 -; LMULMAX2-RV64-NEXT: lui a1, 1048568 -; LMULMAX2-RV64-NEXT: vmerge.vxm v14, v12, a1, v0 +; LMULMAX2-RV64-NEXT: vmv.v.i v8, 3 +; LMULMAX2-RV64-NEXT: vmerge.vim v12, v8, 2, v0 ; LMULMAX2-RV64-NEXT: lui a1, 4 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 64 ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; LMULMAX2-RV64-NEXT: vmv.v.x v8, a1 ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI182_0) -; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI182_0) -; LMULMAX2-RV64-NEXT: vle16.v v16, (a1) ; LMULMAX2-RV64-NEXT: vmv1r.v v0, v8 ; LMULMAX2-RV64-NEXT: vmerge.vim v12, v12, 1, v0 -; LMULMAX2-RV64-NEXT: vsrl.vv v12, v10, v12 -; LMULMAX2-RV64-NEXT: vmulhu.vv v12, v12, v16 -; LMULMAX2-RV64-NEXT: vsub.vv v10, v10, v12 -; LMULMAX2-RV64-NEXT: vmulhu.vv v10, v10, v14 -; LMULMAX2-RV64-NEXT: vadd.vv v10, v10, v12 -; LMULMAX2-RV64-NEXT: lui a1, 2 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 289 +; LMULMAX2-RV64-NEXT: li a1, 257 ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; LMULMAX2-RV64-NEXT: vmv.v.x v0, a1 ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX2-RV64-NEXT: vmv.v.i v12, 3 -; LMULMAX2-RV64-NEXT: vmerge.vim v12, v12, 2, v0 +; LMULMAX2-RV64-NEXT: vmv.v.i v14, 0 +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI182_0) +; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI182_0) +; LMULMAX2-RV64-NEXT: vle16.v v16, (a1) +; LMULMAX2-RV64-NEXT: lui a1, 1048568 +; LMULMAX2-RV64-NEXT: vmerge.vxm v18, v14, a1, v0 ; LMULMAX2-RV64-NEXT: vmv1r.v v0, v8 -; LMULMAX2-RV64-NEXT: vmerge.vim v8, v12, 1, v0 +; LMULMAX2-RV64-NEXT: vmerge.vim v8, v14, 1, v0 ; LMULMAX2-RV64-NEXT: vsrl.vv v8, v10, v8 +; LMULMAX2-RV64-NEXT: vmulhu.vv v8, v8, v16 +; LMULMAX2-RV64-NEXT: vsub.vv v10, v10, v8 +; LMULMAX2-RV64-NEXT: vmulhu.vv v10, v10, v18 +; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV64-NEXT: vsrl.vv v8, v8, v12 ; LMULMAX2-RV64-NEXT: vse16.v v8, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -5656,10 +5657,10 @@ define void @mulhs_v4i64(ptr %x) { ; LMULMAX2-RV64: # %bb.0: ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV64-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV64-NEXT: vmv.v.i v10, 1 ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; LMULMAX2-RV64-NEXT: vmv.v.i v0, 5 ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV64-NEXT: vmv.v.i v10, 1 ; LMULMAX2-RV64-NEXT: vmerge.vim v10, v10, 0, v0 ; LMULMAX2-RV64-NEXT: lui a1, 349525 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index a491b26..80abefb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -12359,24 +12359,26 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; ; RV64V-LABEL: mgather_baseidx_v32i8: ; RV64V: # %bb.0: -; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64V-NEXT: vsext.vf8 v16, v8 -; RV64V-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64V-NEXT: vmv1r.v v12, v10 -; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64V-NEXT: vmv1r.v v12, v0 ; RV64V-NEXT: vsetivli zero, 16, e8, m2, 
ta, ma -; RV64V-NEXT: vslidedown.vi v10, v10, 16 -; RV64V-NEXT: vslidedown.vi v8, v8, 16 +; RV64V-NEXT: vslidedown.vi v14, v8, 16 ; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsext.vf8 v16, v14 +; RV64V-NEXT: vsetivli zero, 16, e8, m2, ta, ma +; RV64V-NEXT: vslidedown.vi v14, v10, 16 ; RV64V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64V-NEXT: vslidedown.vi v0, v0, 2 ; RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64V-NEXT: vluxei64.v v14, (a0), v16, v0.t +; RV64V-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64V-NEXT: vmv1r.v v0, v12 ; RV64V-NEXT: vluxei64.v v10, (a0), v16, v0.t ; RV64V-NEXT: li a0, 32 ; RV64V-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; RV64V-NEXT: vslideup.vi v12, v10, 16 -; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: vslideup.vi v10, v14, 16 +; RV64V-NEXT: vmv.v.v v8, v10 ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_baseidx_v32i8: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 74eeb01..5637765 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -10854,10 +10854,11 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 16 ; RV64-NEXT: vslidedown.vi v10, v10, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma +; RV64-NEXT: vslidedown.vi v8, v8, 16 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index 369a7ad..1769caa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -968,10 +968,10 @@ define i64 @vwreduce_add_v1i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vsext.vf2 v9, v8 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v8, v9, a0 -; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsrl.vx v8, v9, a1 +; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_add_v1i64: @@ -993,10 +993,10 @@ define i64 @vwreduce_uadd_v1i64(ptr %x) { ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: vzext.vf2 v9, v8 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v8, v9, a0 -; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsrl.vx v8, v9, a1 +; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vwreduce_uadd_v1i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index 6bdb1a7..236ae79 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -246,8 +246,8 @@ define <8 x i1> @fcmp_ord_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli 
zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmand.mm v0, v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -262,8 +262,8 @@ define <8 x i1> @fcmp_ord_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmand.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -512,8 +512,8 @@ define <8 x i1> @fcmp_uno_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -528,8 +528,8 @@ define <8 x i1> @fcmp_uno_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -854,9 +854,9 @@ define <8 x i1> @fcmp_ord_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vf v16, v12, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t -; CHECK-NEXT: vmand.mm v0, v12, v16 +; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v8, v12, fa0, v0.t +; CHECK-NEXT: vmand.mm v0, v16, v8 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer @@ -870,9 +870,9 @@ define <8 x i1> @fcmp_ord_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vf v16, v12, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t -; CHECK-NEXT: vmand.mm v0, v16, v12 +; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v8, v12, fa0, v0.t +; CHECK-NEXT: vmand.mm v0, v8, v16 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer @@ -1123,9 +1123,9 @@ define <8 x i1> @fcmp_uno_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmfne.vf v16, v12, fa0, v0.t -; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t -; CHECK-NEXT: vmor.mm v0, v12, v16 +; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v8, v12, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v16, v8 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x 
double> poison, <8 x i32> zeroinitializer @@ -1139,9 +1139,9 @@ define <8 x i1> @fcmp_uno_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmfne.vf v16, v12, fa0, v0.t -; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t -; CHECK-NEXT: vmor.mm v0, v16, v12 +; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v8, v12, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v8, v16 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll index ea14e40..e6b3c25 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll @@ -139,10 +139,10 @@ define void @store_constant_v2i32(ptr %p) { ; CHECK-LABEL: store_constant_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vmv.v.i v9, 3 +; CHECK-NEXT: vmv.v.i v8, 3 +; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: li a1, 3 -; CHECK-NEXT: vmacc.vx v9, a1, v8 +; CHECK-NEXT: vmadd.vx v9, a1, v8 ; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: ret store <2 x i32> , ptr %p @@ -215,10 +215,10 @@ define void @store_constant_v2i8_align1(ptr %p) { ; CHECK-LABEL: store_constant_v2i8_align1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vmv.v.i v9, 3 +; CHECK-NEXT: vmv.v.i v8, 3 +; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: li a1, 3 -; CHECK-NEXT: vmacc.vx v9, a1, v8 +; CHECK-NEXT: vmadd.vx v9, a1, v8 ; CHECK-NEXT: vse8.v v9, (a0) ; CHECK-NEXT: ret store <2 x i8> , ptr %p, align 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll index b435aed..d0b2cab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -1649,16 +1649,17 @@ define <32 x i64> @vadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { define <32 x i64> @vadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { ; RV32-LABEL: vadd_vx_v32i64_evl27: ; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: li a0, 32 ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vadd_vx_v32i64_evl27: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll index 2046c51..7b41aea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll @@ -3227,10 +3227,10 @@ define <32 x i1> @fcmps_uno_fv_v32f16(<32 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vv v16, 
v8, v8 -; CHECK-NEXT: vmfle.vf v8, v12, fa0 -; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmfle.vf v16, v12, fa0 +; CHECK-NEXT: vmnot.m v12, v16 +; CHECK-NEXT: vmfle.vv v13, v8, v8 +; CHECK-NEXT: vmorn.mm v0, v12, v13 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll index 30be917..7ebfca7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -1941,41 +1941,41 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v32i8_v32f64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vsext.vf4 v16, v8 -; RV32-NEXT: vsll.vi v24, v16, 3 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a3, .LBB87_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: .LBB87_2: +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a3, a1, a2 -; RV32-NEXT: addi a3, a3, -1 -; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t -; RV32-NEXT: li a2, 16 -; RV32-NEXT: bltu a1, a2, .LBB87_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB87_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_v32i8_v32f64: ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v10, v0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 16 +; RV64-NEXT: vslidedown.vi v8, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsext.vf8 v16, v8 ; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: vsext.vf8 v24, v8 -; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a3, a1, a2 ; RV64-NEXT: addi a3, a3, -1 @@ -2001,29 +2001,28 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs, define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_sext_v32i8_v32f64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vsext.vf4 v16, v8 -; RV32-NEXT: vsll.vi v24, v16, 3 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a3, .LBB88_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; 
RV32-NEXT: .LBB88_2: +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a3, a1, a2 -; RV32-NEXT: addi a3, a3, -1 -; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t -; RV32-NEXT: li a2, 16 -; RV32-NEXT: bltu a1, a2, .LBB88_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB88_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_sext_v32i8_v32f64: @@ -2034,8 +2033,8 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v16, v12 ; RV64-NEXT: vsext.vf8 v24, v8 -; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a3, a1, a2 ; RV64-NEXT: addi a3, a3, -1 @@ -2062,29 +2061,28 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v32i8_v32f64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vzext.vf4 v16, v8 -; RV32-NEXT: vsll.vi v24, v16, 3 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a3, .LBB89_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: .LBB89_2: +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a3, a1, a2 -; RV32-NEXT: addi a3, a3, -1 -; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t -; RV32-NEXT: li a2, 16 -; RV32-NEXT: bltu a1, a2, .LBB89_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB89_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v32i8_v32f64: @@ -2095,8 +2093,8 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vzext.vf8 v16, v12 ; RV64-NEXT: vzext.vf8 v24, v8 -; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a3, a1, a2 ; RV64-NEXT: addi a3, a3, -1 @@ -2123,41 +2121,41 @@ define <32 
x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v32i16_v32f64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vsext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v24, v16, 3 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a3, .LBB90_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: .LBB90_2: +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a3, a1, a2 -; RV32-NEXT: addi a3, a3, -1 -; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t -; RV32-NEXT: li a2, 16 -; RV32-NEXT: bltu a1, a2, .LBB90_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB90_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_v32i16_v32f64: ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vslidedown.vi v8, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf4 v24, v16 -; RV64-NEXT: vsll.vi v16, v24, 3 -; RV64-NEXT: vsext.vf4 v24, v8 -; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a3, a1, a2 ; RV64-NEXT: addi a3, a3, -1 @@ -2183,29 +2181,28 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_sext_v32i16_v32f64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vsext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v24, v16, 3 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a3, .LBB91_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: .LBB91_2: +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a3, a1, a2 -; RV32-NEXT: addi a3, a3, -1 -; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t -; 
RV32-NEXT: li a2, 16 -; RV32-NEXT: bltu a1, a2, .LBB91_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB91_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_sext_v32i16_v32f64: @@ -2214,10 +2211,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma ; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf4 v24, v16 -; RV64-NEXT: vsext.vf4 v0, v8 -; RV64-NEXT: vsll.vi v16, v24, 3 -; RV64-NEXT: vsll.vi v24, v0, 3 +; RV64-NEXT: vsext.vf4 v0, v16 +; RV64-NEXT: vsext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v24, v16, 3 +; RV64-NEXT: vsll.vi v16, v0, 3 ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a3, a1, a2 ; RV64-NEXT: addi a3, a3, -1 @@ -2244,29 +2241,28 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v32i16_v32f64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vzext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v24, v16, 3 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a3, .LBB92_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: .LBB92_2: +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a3, a1, a2 -; RV32-NEXT: addi a3, a3, -1 -; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t -; RV32-NEXT: li a2, 16 -; RV32-NEXT: bltu a1, a2, .LBB92_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: .LBB92_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v32i16_v32f64: @@ -2275,10 +2271,10 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma ; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vzext.vf4 v24, v16 -; RV64-NEXT: vzext.vf4 v0, v8 -; RV64-NEXT: vsll.vi v16, v24, 3 -; RV64-NEXT: vsll.vi v24, v0, 3 +; RV64-NEXT: vzext.vf4 v0, v16 +; RV64-NEXT: vzext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v24, v16, 3 +; RV64-NEXT: vsll.vi v16, v0, 3 ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a3, a1, a2 ; RV64-NEXT: addi a3, a3, -1 @@ -2331,22 +2327,31 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs ; ; RV64-LABEL: vpgather_baseidx_v32i32_v32f64: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: 
.cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: vmv1r.v v24, v0 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vslidedown.vi v8, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf2 v0, v16 -; RV64-NEXT: vsll.vi v16, v0, 3 -; RV64-NEXT: vsext.vf2 v0, v8 -; RV64-NEXT: vsll.vi v8, v0, 3 +; RV64-NEXT: vsext.vf2 v16, v8 +; RV64-NEXT: vsll.vi v8, v16, 3 ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a3, a1, a2 ; RV64-NEXT: addi a3, a3, -1 ; RV64-NEXT: and a2, a3, a2 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v0, v24, 2 +; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t +; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t ; RV64-NEXT: li a2, 16 ; RV64-NEXT: bltu a1, a2, .LBB93_2 ; RV64-NEXT: # %bb.1: @@ -2354,7 +2359,13 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs ; RV64-NEXT: .LBB93_2: ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %ptrs = getelementptr inbounds double, ptr %base, <32 x i32> %idxs %v = call <32 x double> @llvm.vp.gather.v32f64.v32p0(<32 x ptr> %ptrs, <32 x i1> %m, i32 %evl) @@ -2390,22 +2401,14 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32> ; ; RV64-LABEL: vpgather_baseidx_sext_v32i32_v32f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 1 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v24, v0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vsext.vf2 v0, v16 -; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsext.vf2 v16, v8 +; RV64-NEXT: vsll.vi v8, v16, 3 ; RV64-NEXT: vsll.vi v16, v0, 3 -; RV64-NEXT: vsll.vi v8, v24, 3 -; RV64-NEXT: vl1r.v v24, (a2) # Unknown-size Folded Reload ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a3, a1, a2 ; RV64-NEXT: addi a3, a3, -1 @@ -2422,10 +2425,6 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32> ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %eidxs = sext <32 x i32> %idxs to <32 x i64> %ptrs = getelementptr inbounds double, ptr %base, <32 x i64> %eidxs @@ -2462,22 +2461,14 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32> ; ; RV64-LABEL: vpgather_baseidx_zext_v32i32_v32f64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; 
RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 1 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v24, v0 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v16, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vzext.vf2 v0, v16 -; RV64-NEXT: vzext.vf2 v24, v8 +; RV64-NEXT: vzext.vf2 v16, v8 +; RV64-NEXT: vsll.vi v8, v16, 3 ; RV64-NEXT: vsll.vi v16, v0, 3 -; RV64-NEXT: vsll.vi v8, v24, 3 -; RV64-NEXT: vl1r.v v24, (a2) # Unknown-size Folded Reload ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a3, a1, a2 ; RV64-NEXT: addi a3, a3, -1 @@ -2494,10 +2485,6 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32> ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %eidxs = zext <32 x i32> %idxs to <32 x i64> %ptrs = getelementptr inbounds double, ptr %base, <32 x i64> %eidxs diff --git a/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll index 15d4e70..2c917fc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll @@ -53,10 +53,11 @@ define @test_signed_v8f32_v8i32( %f) { define @test_signed_v4f32_v4i16( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v8 ; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f32.nxv4i16( %f) @@ -66,10 +67,11 @@ define @test_signed_v4f32_v4i16( %f) { define @test_signed_v8f32_v8i16( %f) { ; CHECK-LABEL: test_signed_v8f32_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8 ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv8f32.nxv8i16( %f) @@ -80,8 +82,8 @@ define @test_signed_v2f32_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f32_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret @@ -93,8 +95,8 @@ define @test_signed_v4f32_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.x.f.v v12, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: ret @@ -115,10 +117,11 @@ declare @llvm.fptosi.sat.nxv4f64.nxv4i64( @test_signed_v2f64_v2i32( %f) { ; 
CHECK-LABEL: test_signed_v2f64_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v8 ; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv2f64.nxv2i32( %f) @@ -128,10 +131,11 @@ define @test_signed_v2f64_v2i32( %f) { define @test_signed_v4f64_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f64_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8 ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv4f64.nxv4i32( %f) @@ -141,10 +145,11 @@ define @test_signed_v4f64_v4i32( %f) { define @test_signed_v8f64_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f64_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vfncvt.rtz.x.f.w v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.rtz.x.f.w v16, v8 ; CHECK-NEXT: vmerge.vim v8, v16, 0, v0 ; CHECK-NEXT: ret %x = call @llvm.fptosi.sat.nxv8f64.nxv8i32( %f) @@ -236,8 +241,8 @@ define @test_signed_v2f16_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.x.f.v v9, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmerge.vim v8, v9, 0, v0 ; CHECK-NEXT: ret @@ -249,8 +254,8 @@ define @test_signed_v4f16_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret @@ -262,8 +267,8 @@ define @test_signed_v8f16_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.x.f.v v12, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: ret @@ -299,10 +304,11 @@ define @test_signed_v2f16_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret @@ -314,10 +320,11 @@ define @test_signed_v4f16_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.f.f.v v10, v8 ; CHECK-NEXT: vsetvli 
zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.rtz.x.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll index dc068ef..368a552 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll @@ -53,10 +53,11 @@ define @test_signed_v8f32_v8i32( %f) { define @test_signed_v4f32_v4i16( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvt.rtz.xu.f.w v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvt.rtz.xu.f.w v10, v8 ; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f32.nxv4i16( %f) @@ -66,10 +67,11 @@ define @test_signed_v4f32_v4i16( %f) { define @test_signed_v8f32_v8i16( %f) { ; CHECK-LABEL: test_signed_v8f32_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v8 ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv8f32.nxv8i16( %f) @@ -80,8 +82,8 @@ define @test_signed_v2f32_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f32_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret @@ -93,8 +95,8 @@ define @test_signed_v4f32_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v12, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: ret @@ -115,10 +117,11 @@ declare @llvm.fptoui.sat.nxv4f64.nxv4i64( @test_signed_v2f64_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f64_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vfncvt.rtz.xu.f.w v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfncvt.rtz.xu.f.w v10, v8 ; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv2f64.nxv2i32( %f) @@ -128,10 +131,11 @@ define @test_signed_v2f64_v2i32( %f) { define @test_signed_v4f64_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f64_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v8 ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv4f64.nxv4i32( 
%f) @@ -141,10 +145,11 @@ define @test_signed_v4f64_v4i32( %f) { define @test_signed_v8f64_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f64_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vfncvt.rtz.xu.f.w v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.rtz.xu.f.w v16, v8 ; CHECK-NEXT: vmerge.vim v8, v16, 0, v0 ; CHECK-NEXT: ret %x = call @llvm.fptoui.sat.nxv8f64.nxv8i32( %f) @@ -254,8 +259,8 @@ define @test_signed_v2f16_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v9, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmerge.vim v8, v9, 0, v0 ; CHECK-NEXT: ret @@ -267,8 +272,8 @@ define @test_signed_v4f16_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret @@ -280,8 +285,8 @@ define @test_signed_v8f16_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v12, v8 +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: ret @@ -317,10 +322,11 @@ define @test_signed_v2f16_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret @@ -332,10 +338,11 @@ define @test_signed_v4f16_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vfwcvt.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vmerge.vim v8, v12, 0, v0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll index d6ec784..fd6cf68 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -2176,10 +2176,7 @@ define @mgather_baseidx_nxv16i8(ptr %base, ; ; RV64-LABEL: mgather_baseidx_nxv16i8: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v16, v8 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma @@ -2188,6 +2185,11 @@ define @mgather_baseidx_nxv16i8(ptr %base, ; RV64-NEXT: vsext.vf8 v16, v9 ; 
RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v11, (a0), v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t ; RV64-NEXT: vmv2r.v v8, v10 ; RV64-NEXT: ret %ptrs = getelementptr inbounds i8, ptr %base, %idxs @@ -2200,45 +2202,49 @@ declare @llvm.masked.gather.nxv32i8.nxv32p0( @mgather_baseidx_nxv32i8(ptr %base, %idxs, %m, %passthru) { ; RV32-LABEL: mgather_baseidx_nxv32i8: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vsext.vf4 v16, v8 -; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32-NEXT: vmv1r.v v16, v0 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; RV32-NEXT: vslidedown.vx v0, v0, a1 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vsext.vf4 v16, v10 +; RV32-NEXT: vsext.vf4 v24, v10 +; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; RV32-NEXT: vluxei32.v v14, (a0), v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; RV32-NEXT: vsext.vf4 v24, v8 ; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu -; RV32-NEXT: vluxei32.v v14, (a0), v16, v0.t +; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: vluxei32.v v12, (a0), v24, v0.t ; RV32-NEXT: vmv4r.v v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: mgather_baseidx_nxv32i8: ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v16, v0 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v24, v8 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v12, (a0), v24, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: srli a2, a1, 3 -; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v0, a2 -; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma +; RV64-NEXT: srli a2, a1, 2 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vslidedown.vx v17, v0, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vsext.vf8 v24, v10 +; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64-NEXT: vmv1r.v v0, v17 +; RV64-NEXT: vluxei64.v v14, (a0), v24, v0.t +; RV64-NEXT: srli a1, a1, 3 +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vx v0, v16, a1 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v24, v9 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v13, (a0), v24, v0.t -; RV64-NEXT: srli a1, a1, 2 -; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v0, v16, a1 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vsext.vf8 v24, v8 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t -; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v0, a2 +; RV64-NEXT: vmv1r.v v0, v16 +; RV64-NEXT: vluxei64.v v12, (a0), v24, v0.t +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vx v0, v17, a1 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v16, v11 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index c4c849d..a3f74ea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -905,8 +905,8 @@ 
define void @test_dag_loop() { ; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (zero) ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetivli zero, 0, e8, m4, tu, mu ; CHECK-NEXT: vmv4r.v v20, v16 ; CHECK-NEXT: vssubu.vx v20, v16, zero, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index b86f363..941c3bb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -234,8 +234,8 @@ define @fcmp_ord_vf_nxv1f16( %va, half %b, ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmand.mm v0, v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -250,8 +250,8 @@ define @fcmp_ord_vf_swap_nxv1f16( %va, half ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmand.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -500,8 +500,8 @@ define @fcmp_uno_vf_nxv1f16( %va, half %b, ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -516,8 +516,8 @@ define @fcmp_uno_vf_swap_nxv1f16( %va, half ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 @@ -783,9 +783,9 @@ define @fcmp_ord_vf_nxv8f16( %va, half %b, ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmfeq.vf v12, v10, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v10, v8, v8, v0.t -; CHECK-NEXT: vmand.mm v0, v10, v12 +; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v8, v10, fa0, v0.t +; CHECK-NEXT: vmand.mm v0, v12, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -799,9 +799,9 @@ define @fcmp_ord_vf_swap_nxv8f16( %va, half ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmfeq.vf v12, v10, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v10, v8, v8, v0.t -; CHECK-NEXT: vmand.mm v0, v12, v10 +; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v8, v10, fa0, v0.t +; CHECK-NEXT: vmand.mm v0, v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1052,9 +1052,9 @@ define @fcmp_uno_vf_nxv8f16( %va, half %b, ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, 
e16, m2, ta, ma -; CHECK-NEXT: vmfne.vf v12, v10, fa0, v0.t -; CHECK-NEXT: vmfne.vv v10, v8, v8, v0.t -; CHECK-NEXT: vmor.mm v0, v10, v12 +; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v8, v10, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v12, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1068,9 +1068,9 @@ define @fcmp_uno_vf_swap_nxv8f16( %va, half ; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmfne.vf v12, v10, fa0, v0.t -; CHECK-NEXT: vmfne.vv v10, v8, v8, v0.t -; CHECK-NEXT: vmor.mm v0, v12, v10 +; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v8, v10, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement poison, half %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1369,8 +1369,8 @@ define @fcmp_ord_vf_nxv1f64( %va, double ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmand.mm v0, v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -1385,8 +1385,8 @@ define @fcmp_ord_vf_swap_nxv1f64( %va, do ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmand.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -1635,8 +1635,8 @@ define @fcmp_uno_vf_nxv1f64( %va, double ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -1651,8 +1651,8 @@ define @fcmp_uno_vf_swap_nxv1f64( %va, do ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t ; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 @@ -1919,9 +1919,9 @@ define @fcmp_ord_vf_nxv8f64( %va, double ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-NEXT: vfmv.v.f v16, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vf v24, v16, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t -; CHECK-NEXT: vmand.mm v0, v16, v24 +; CHECK-NEXT: vmfeq.vv v24, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v8, v16, fa0, v0.t +; CHECK-NEXT: vmand.mm v0, v24, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1935,9 +1935,9 @@ define @fcmp_ord_vf_swap_nxv8f64( %va, do ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-NEXT: vfmv.v.f v16, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vf v24, v16, fa0, v0.t -; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t -; CHECK-NEXT: vmand.mm v0, v24, v16 +; CHECK-NEXT: vmfeq.vv v24, v8, v8, v0.t +; CHECK-NEXT: vmfeq.vf v8, v16, 
fa0, v0.t +; CHECK-NEXT: vmand.mm v0, v8, v24 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2188,9 +2188,9 @@ define @fcmp_uno_vf_nxv8f64( %va, double ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-NEXT: vfmv.v.f v16, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfne.vf v24, v16, fa0, v0.t -; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t -; CHECK-NEXT: vmor.mm v0, v16, v24 +; CHECK-NEXT: vmfne.vv v24, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v8, v16, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v24, v8 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2204,9 +2204,9 @@ define @fcmp_uno_vf_swap_nxv8f64( %va, do ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-NEXT: vfmv.v.f v16, fa0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfne.vf v24, v16, fa0, v0.t -; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t -; CHECK-NEXT: vmor.mm v0, v24, v16 +; CHECK-NEXT: vmfne.vv v24, v8, v8, v0.t +; CHECK-NEXT: vmfne.vf v8, v16, fa0, v0.t +; CHECK-NEXT: vmor.mm v0, v8, v24 ; CHECK-NEXT: ret %elt.head = insertelement poison, double %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll index 30ec089..5b4018f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll @@ -78,11 +78,11 @@ define <16 x i8> @v8i8_2(<8 x i8> %a, <8 x i8> %b) { ; CHECK-NEXT: vid.v v11 ; CHECK-NEXT: vrsub.vi v12, v11, 15 ; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrsub.vi v8, v11, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v0, a0 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vrsub.vi v8, v11, 7 ; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -224,11 +224,11 @@ define <16 x i16> @v8i16_2(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NEXT: vid.v v14 ; CHECK-NEXT: vrsub.vi v16, v14, 15 ; CHECK-NEXT: vrgather.vv v10, v8, v16 -; CHECK-NEXT: vrsub.vi v8, v14, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v0, a0 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vrsub.vi v8, v14, 7 ; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -341,10 +341,10 @@ define <8 x i32> @v4i32_2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: vid.v v14 ; CHECK-NEXT: vrsub.vi v16, v14, 7 ; CHECK-NEXT: vrgather.vv v10, v8, v16 +; CHECK-NEXT: vrsub.vi v8, v14, 3 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vrsub.vi v8, v14, 3 ; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -373,11 +373,11 @@ define <16 x i32> @v8i32_2(<8 x i32> %a, <8 x i32> %b) { ; CHECK-NEXT: vid.v v20 ; CHECK-NEXT: vrsub.vi v24, v20, 15 ; CHECK-NEXT: vrgather.vv v12, v8, v24 -; CHECK-NEXT: vrsub.vi v8, v20, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v0, a0 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vrsub.vi v8, v20, 7 ; CHECK-NEXT: vrgather.vv v12, v16, v8, v0.t ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -501,10 +501,9 @@ define <8 x i64> @v4i64_2(<4 x i64> %a, <4 x i64> %b) { ; RV32-NEXT: 
vrsub.vi v19, v18, 7 ; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; RV32-NEXT: vrgatherei16.vv v12, v8, v19 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 15 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32-NEXT: vrsub.vi v8, v18, 3 +; RV32-NEXT: vmv.v.i v0, 15 ; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv.v.v v8, v12 @@ -517,10 +516,10 @@ define <8 x i64> @v4i64_2(<4 x i64> %a, <4 x i64> %b) { ; RV64-NEXT: vid.v v20 ; RV64-NEXT: vrsub.vi v24, v20, 7 ; RV64-NEXT: vrgather.vv v12, v8, v24 +; RV64-NEXT: vrsub.vi v8, v20, 3 ; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV64-NEXT: vmv.v.i v0, 15 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrsub.vi v8, v20, 3 ; RV64-NEXT: vrgather.vv v12, v16, v8, v0.t ; RV64-NEXT: vmv.v.v v8, v12 ; RV64-NEXT: ret @@ -605,11 +604,11 @@ define <16 x half> @v8f16_2(<8 x half> %a, <8 x half> %b) { ; CHECK-NEXT: vid.v v14 ; CHECK-NEXT: vrsub.vi v16, v14, 15 ; CHECK-NEXT: vrgather.vv v10, v8, v16 -; CHECK-NEXT: vrsub.vi v8, v14, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v0, a0 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vrsub.vi v8, v14, 7 ; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -693,10 +692,10 @@ define <8 x float> @v4f32_2(<4 x float> %a, <4 x float> %b) { ; CHECK-NEXT: vid.v v14 ; CHECK-NEXT: vrsub.vi v16, v14, 7 ; CHECK-NEXT: vrgather.vv v10, v8, v16 +; CHECK-NEXT: vrsub.vi v8, v14, 3 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vrsub.vi v8, v14, 3 ; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -725,11 +724,11 @@ define <16 x float> @v8f32_2(<8 x float> %a, <8 x float> %b) { ; CHECK-NEXT: vid.v v20 ; CHECK-NEXT: vrsub.vi v24, v20, 15 ; CHECK-NEXT: vrgather.vv v12, v8, v24 -; CHECK-NEXT: vrsub.vi v8, v20, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v0, a0 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vrsub.vi v8, v20, 7 ; CHECK-NEXT: vrgather.vv v12, v16, v8, v0.t ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -796,10 +795,9 @@ define <8 x double> @v4f64_2(<4 x double> %a, <4 x double> %b) { ; RV32-NEXT: vrsub.vi v19, v18, 7 ; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; RV32-NEXT: vrgatherei16.vv v12, v8, v19 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 15 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32-NEXT: vrsub.vi v8, v18, 3 +; RV32-NEXT: vmv.v.i v0, 15 ; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t ; RV32-NEXT: vmv.v.v v8, v12 @@ -812,10 +810,10 @@ define <8 x double> @v4f64_2(<4 x double> %a, <4 x double> %b) { ; RV64-NEXT: vid.v v20 ; RV64-NEXT: vrsub.vi v24, v20, 7 ; RV64-NEXT: vrgather.vv v12, v8, v24 +; RV64-NEXT: vrsub.vi v8, v20, 3 ; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV64-NEXT: vmv.v.i v0, 15 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrsub.vi v8, v20, 3 ; RV64-NEXT: vrgather.vv v12, v16, v8, v0.t ; RV64-NEXT: vmv.v.v v8, v12 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll index 
443fe93..8fb00364 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll @@ -9,18 +9,18 @@ declare <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8>, <16 x i8>) define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-LABEL: vec_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: srli a1, a0, 1 -; CHECK-NEXT: vsll.vv v10, v8, v9 -; CHECK-NEXT: vsra.vv v9, v10, v9 -; CHECK-NEXT: vmsne.vv v8, v8, v9 -; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsll.vv v11, v8, v9 +; CHECK-NEXT: vsra.vv v9, v11, v9 +; CHECK-NEXT: vmsne.vv v9, v8, v9 +; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: slli a0, a0, 63 -; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 +; CHECK-NEXT: vmerge.vxm v8, v10, a0, v0 +; CHECK-NEXT: vmv.v.v v0, v9 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 ; CHECK-NEXT: ret %tmp = call <2 x i64> @llvm.sshl.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %tmp @@ -29,19 +29,19 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: vec_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: lui a0, 524288 ; CHECK-NEXT: addiw a0, a0, -1 -; CHECK-NEXT: vsll.vv v10, v8, v9 -; CHECK-NEXT: vsra.vv v9, v10, v9 -; CHECK-NEXT: vmsne.vv v8, v8, v9 -; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsll.vv v11, v8, v9 +; CHECK-NEXT: vsra.vv v9, v11, v9 +; CHECK-NEXT: vmsne.vv v9, v8, v9 +; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: slli a0, a0, 31 -; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 +; CHECK-NEXT: vmerge.vxm v8, v10, a0, v0 +; CHECK-NEXT: vmv.v.v v0, v9 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 ; CHECK-NEXT: ret %tmp = call <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %tmp @@ -50,10 +50,10 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; CHECK-LABEL: vec_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: lui a0, 8 ; CHECK-NEXT: addiw a1, a0, -1 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: vsll.vv v10, v8, v9 ; CHECK-NEXT: vsra.vv v9, v10, v9 ; CHECK-NEXT: vmsne.vv v8, v8, v9 @@ -69,17 +69,17 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: vec_v16i8: ; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 127 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsll.vv v11, v8, v9 +; CHECK-NEXT: vsra.vv v9, v11, v9 +; CHECK-NEXT: vmsne.vv v9, v8, v9 ; CHECK-NEXT: vmsle.vi v0, v8, -1 -; CHECK-NEXT: li a0, 127 -; CHECK-NEXT: vsll.vv v10, v8, v9 -; CHECK-NEXT: vsra.vv v9, v10, v9 -; CHECK-NEXT: vmsne.vv v8, v8, v9 -; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: li a0, 128 -; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 +; CHECK-NEXT: vmerge.vxm v8, 
v10, a0, v0 +; CHECK-NEXT: vmv.v.v v0, v9 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 ; CHECK-NEXT: ret %tmp = call <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8> %x, <16 x i8> %y) ret <16 x i8> %tmp @@ -94,15 +94,15 @@ define @vec_nxv2i64( %x, ; CHECK-LABEL: vec_nxv2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, -1 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: vsll.vv v12, v8, v10 ; CHECK-NEXT: vsra.vv v14, v12, v10 ; CHECK-NEXT: vmsne.vv v10, v8, v14 -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: slli a0, a0, 63 -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vmerge.vxm v8, v14, a0, v0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret @@ -114,16 +114,16 @@ define @vec_nxv4i32( %x, ; CHECK-LABEL: vec_nxv4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, -1 -; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: addiw a0, a0, -1 ; CHECK-NEXT: vsll.vv v12, v8, v10 ; CHECK-NEXT: vsra.vv v14, v12, v10 ; CHECK-NEXT: vmsne.vv v10, v8, v14 -; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vmv.v.x v14, a0 +; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: slli a0, a0, 31 -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vmerge.vxm v8, v14, a0, v0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret @@ -135,12 +135,12 @@ define @vec_nxv8i16( %x, ; CHECK-LABEL: vec_nxv8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, -1 -; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: addiw a1, a0, -1 ; CHECK-NEXT: vsll.vv v12, v8, v10 ; CHECK-NEXT: vsra.vv v14, v12, v10 ; CHECK-NEXT: vmsne.vv v10, v8, v14 +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addiw a1, a0, -1 +; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-NEXT: vmv1r.v v0, v10 @@ -154,14 +154,14 @@ define @vec_nxv16i8( %x, ; CHECK-LABEL: vec_nxv16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmsle.vi v0, v8, -1 -; CHECK-NEXT: li a0, 127 ; CHECK-NEXT: vsll.vv v12, v8, v10 ; CHECK-NEXT: vsra.vv v14, v12, v10 ; CHECK-NEXT: vmsne.vv v10, v8, v14 -; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: li a0, 127 +; CHECK-NEXT: vmv.v.x v14, a0 +; CHECK-NEXT: vmsle.vi v0, v8, -1 ; CHECK-NEXT: li a0, 128 -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vmerge.vxm v8, v14, a0, v0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 8d353dd..5e1e0fb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -17,12 +17,12 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { ; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-NEXT: vmerge.vim v8, v8, 1, v0 -; RV32-NEXT: vadd.vi v12, v11, -16 ; RV32-NEXT: lui a0, 16 ; RV32-NEXT: addi a0, a0, -256 ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV32-NEXT: vmv.v.x v0, a0 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV32-NEXT: vadd.vi v12, v11, -16 ; RV32-NEXT: vrgather.vv v9, 
v8, v12, v0.t ; RV32-NEXT: vmsne.vi v9, v9, 0 ; RV32-NEXT: vadd.vi v12, v11, 1 @@ -45,12 +45,12 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { ; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64-NEXT: vadd.vi v12, v11, -16 ; RV64-NEXT: lui a0, 16 ; RV64-NEXT: addiw a0, a0, -256 ; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; RV64-NEXT: vmv.v.x v0, a0 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64-NEXT: vadd.vi v12, v11, -16 ; RV64-NEXT: vrgather.vv v9, v8, v12, v0.t ; RV64-NEXT: vmsne.vi v9, v9, 0 ; RV64-NEXT: vadd.vi v12, v11, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll index 18ced70..1ec6382 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -7,12 +7,13 @@ define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) { ; CHECK-LABEL: vector_interleave_v32i1_v16i1: ; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vslideup.vi v0, v8, 2 -; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 ; CHECK-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir b/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir index 116d8d5..46f3b09 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vmv-copy.mir @@ -31,7 +31,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: $v28m4 = PseudoVLE32_V_M4 killed $x16, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v12m4 = PseudoVMV_V_V_M4 $v28m4, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v12m4 = PseudoVMV_V_V_M4 undef $v12m4, $v28m4, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype $x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype $v28m4 = PseudoVLE32_V_M4 killed $x16, $noreg, 5, implicit $vl, implicit $vtype $v12m4 = COPY $v28m4 @@ -47,10 +47,10 @@ body: | ; CHECK: liveins: $x14 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype - ; CHECK-NEXT: $v28m4 = PseudoVMV_V_I_M4 0, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v12m4 = PseudoVMV_V_I_M4 0, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v28m4 = PseudoVMV_V_I_M4 undef $v28m4, 0, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v12m4 = PseudoVMV_V_I_M4 undef $v12m4, 0, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype $x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype - $v28m4 = PseudoVMV_V_I_M4 0, $noreg, 5, implicit $vl, implicit $vtype + $v28m4 = PseudoVMV_V_I_M4 undef $v28m4, 0, $noreg, 5, 0, implicit $vl, implicit $vtype $v12m4 = COPY $v28m4 ... 
 ---
@@ -81,11 +81,11 @@ body: |
 ; CHECK: liveins: $x14, $x16
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype
- ; CHECK-NEXT: $v28m4 = PseudoVMV_V_I_M4 0, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v28m4 = PseudoVMV_V_I_M4 undef $v28m4, 0, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
 ; CHECK-NEXT: $v4m4, $x0 = PseudoVLE32FF_V_M4 $x16, $noreg, 5 /* e32 */, implicit-def $vl
 ; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4
 $x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype
- $v28m4 = PseudoVMV_V_I_M4 0, $noreg, 5, implicit $vl, implicit $vtype
+ $v28m4 = PseudoVMV_V_I_M4 undef $v28m4, 0, $noreg, 5, 0, implicit $vl, implicit $vtype
 $v4m4,$x0 = PseudoVLE32FF_V_M4 $x16, $noreg, 5, implicit-def $vl
 $v12m4 = COPY $v28m4
 ...
@@ -132,7 +132,7 @@ body: |
 ; CHECK-NEXT: $v0m2 = PseudoVLE32_V_M2 $x18, $noreg, 4 /* e16 */, implicit $vl, implicit $vtype
 ; CHECK-NEXT: $x0 = PseudoVSETVLIX0 $x0, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype
 ; CHECK-NEXT: $v4m4 = PseudoVLE32_V_M4 killed $x18, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: $v12m4 = PseudoVMV_V_V_M4 $v28m4, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v12m4 = PseudoVMV_V_V_M4 undef $v12m4, $v28m4, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
 $x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype
 $v28m4 = PseudoVLE32_V_M4 killed $x16, $noreg, 5, implicit $vl, implicit $vtype
 $x0 = PseudoVSETVLIX0 $x0, 73, implicit-def $vl, implicit-def $vtype
@@ -253,8 +253,8 @@ body: |
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 80 /* e32, m1, ta, mu */, implicit-def $vl, implicit-def $vtype
 ; CHECK-NEXT: $v8_v9 = PseudoVLSEG2E32_V_M1 killed $x16, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: $v10 = PseudoVMV_V_V_M1 $v8, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: $v11 = PseudoVMV_V_V_M1 $v9, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v10 = PseudoVMV_V_V_M1 undef $v10, $v8, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v11 = PseudoVMV_V_V_M1 undef $v11, $v9, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
 $x15 = PseudoVSETVLI $x14, 80, implicit-def $vl, implicit-def $vtype
 $v8_v9 = PseudoVLSEG2E32_V_M1 killed $x16, $noreg, 5, implicit $vl, implicit $vtype
 $v10_v11 = COPY $v8_v9
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index be927fc..e51bc9c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -978,14 +978,14 @@ declare half @llvm.vector.reduce.fadd.nxv3f16(half, <vscale x 3 x half>)
 define half @vreduce_ord_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv3f16:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: srli a0, a0, 3
 ; CHECK-NEXT: slli a1, a0, 1
 ; CHECK-NEXT: add a1, a1, a0
 ; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: lui a2, 1048568
-; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a2
 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
 ; CHECK-NEXT: vslideup.vx v8, v9, a1
 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
@@ -1002,16 +1002,16 @@ declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)
 define half @vreduce_ord_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv6f16:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v10, fa0
 ; CHECK-NEXT: lui a0, 1048568
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vmv.v.x v11, a0
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: srli a0, a0, 2
 ; CHECK-NEXT: add a1, a0, a0
 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vx v9, v10, a0
+; CHECK-NEXT: vslideup.vx v9, v11, a0
 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfmv.s.f v10, fa0
 ; CHECK-NEXT: vfredosum.vs v8, v8, v10
 ; CHECK-NEXT: vfmv.f.s fa0, v8
 ; CHECK-NEXT: ret
@@ -1029,13 +1029,12 @@ define half @vreduce_ord_fadd_nxv10f16(<vscale x 10 x half> %v, half %s) {
 ; CHECK-NEXT: vmv.v.x v12, a0
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vx v10, v12, a0
 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
 ; CHECK-NEXT: vmv.v.v v11, v12
+; CHECK-NEXT: add a1, a0, a0
 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
 ; CHECK-NEXT: vslideup.vx v11, v12, a0
+; CHECK-NEXT: vslideup.vx v10, v12, a0
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfmv.s.f v12, fa0
 ; CHECK-NEXT: vfredosum.vs v8, v8, v12
@@ -1066,14 +1065,14 @@ define half @vreduce_ord_fadd_nxv12f16(<vscale x 12 x half> %v, half %s) {
 define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv3f16:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: srli a0, a0, 3
 ; CHECK-NEXT: slli a1, a0, 1
 ; CHECK-NEXT: add a1, a1, a0
 ; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: lui a2, 1048568
-; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a2
 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
 ; CHECK-NEXT: vslideup.vx v8, v9, a1
 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
@@ -1088,16 +1087,16 @@ define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
 define half @vreduce_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv6f16:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v10, fa0
 ; CHECK-NEXT: lui a0, 1048568
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vmv.v.x v11, a0
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: srli a0, a0, 2
 ; CHECK-NEXT: add a1, a0, a0
 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vx v9, v10, a0
+; CHECK-NEXT: vslideup.vx v9, v11, a0
 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfmv.s.f v10, fa0
 ; CHECK-NEXT: vfredusum.vs v8, v8, v10
 ; CHECK-NEXT: vfmv.f.s fa0, v8
 ; CHECK-NEXT: ret
@@ -1110,19 +1109,18 @@ declare half @llvm.vector.reduce.fmin.nxv10f16(<vscale x 10 x half>)
 define half @vreduce_fmin_nxv10f16(<vscale x 10 x half> %v) {
 ; CHECK-LABEL: vreduce_fmin_nxv10f16:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI73_0)
+; CHECK-NEXT: flh fa5, %lo(.LCPI73_0)(a0)
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfmv.v.f v12, fa5
 ; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: lui a1, %hi(.LCPI73_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI73_0)(a1)
 ; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfmv.v.f v12, fa5
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vx v10, v12, a0
 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
 ;
CHECK-NEXT: vmv.v.v v11, v12 +; CHECK-NEXT: add a1, a0, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma ; CHECK-NEXT: vslideup.vx v11, v12, a0 +; CHECK-NEXT: vslideup.vx v10, v12, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfmv.s.f v12, fa5 ; CHECK-NEXT: vfredmin.vs v8, v8, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir index d29b630..0cbb7df 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir @@ -284,15 +284,17 @@ body: | ; CHECK-NEXT: bb.1.if.then: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %dead1:vr = IMPLICIT_DEF ; CHECK-NEXT: dead $x0 = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl - ; CHECK-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 %dead1, [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.if.else: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %dead2:vr = IMPLICIT_DEF ; CHECK-NEXT: dead $x0 = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl - ; CHECK-NEXT: early-clobber %2:vr = PseudoVSEXT_VF2_M1 [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: early-clobber %2:vr = PseudoVSEXT_VF2_M1 %dead2, [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.if.end: ; CHECK-NEXT: [[PHI:%[0-9]+]]:vr = PHI %1, %bb.1, %2, %bb.2 @@ -312,11 +314,13 @@ body: | PseudoBR %bb.1 bb.1.if.then: - early-clobber %1:vr = PseudoVZEXT_VF2_M1 %0, %7, 6 + %dead1:vr = IMPLICIT_DEF + early-clobber %1:vr = PseudoVZEXT_VF2_M1 %dead1, %0, %7, 6, 0 PseudoBR %bb.3 bb.2.if.else: - early-clobber %2:vr = PseudoVSEXT_VF2_M1 %0, %7, 6 + %dead2:vr = IMPLICIT_DEF + early-clobber %2:vr = PseudoVSEXT_VF2_M1 %dead2, %0, %7, 6, 0 bb.3.if.end: %3:vr = PHI %1, %bb.1, %2, %bb.2 @@ -510,8 +514,9 @@ body: | ; CHECK-NEXT: %pt:vr = IMPLICIT_DEF ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 223 /* e64, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: [[PseudoVID_V_MF2_:%[0-9]+]]:vr = PseudoVID_V_MF2 %pt, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: %pt2:vr = IMPLICIT_DEF ; CHECK-NEXT: dead [[PseudoVSETVLIX0_1:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 215 /* e32, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype - ; CHECK-NEXT: [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vrnov0 = PseudoVMV_V_I_MF2 0, -1, 5 /* e32 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vrnov0 = PseudoVMV_V_I_MF2 %pt2, 0, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -546,7 +551,8 @@ body: | %2:gpr = IMPLICIT_DEF %pt:vr = IMPLICIT_DEF %3:vr = PseudoVID_V_MF2 %pt, -1, 6, 0 - %4:vrnov0 = PseudoVMV_V_I_MF2 0, -1, 5 + %pt2:vr = IMPLICIT_DEF + %4:vrnov0 = PseudoVMV_V_I_MF2 %pt2, 0, -1, 5, 0 bb.1: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -761,8 +767,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: 
[[COPY:%[0-9]+]]:gpr = COPY $x12 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x10 + ; CHECK-NEXT: %dead:vr = IMPLICIT_DEF ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 4, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype - ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 0, 4, 5 /* e32 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 %dead, 0, 4, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr = COPY [[PseudoVMV_V_I_M1_]] ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vr = COPY [[COPY2]] ; CHECK-NEXT: [[LUI:%[0-9]+]]:gpr = LUI 1 @@ -796,7 +803,8 @@ body: | %8:gpr = COPY $x12 %6:gpr = COPY $x10 - %11:vr = PseudoVMV_V_I_M1 0, 4, 5 + %dead:vr = IMPLICIT_DEF + %11:vr = PseudoVMV_V_I_M1 %dead, 0, 4, 5, 0 %12:vr = COPY %11 %10:vr = COPY %12 %13:gpr = LUI 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir index 247b835..80665cd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir @@ -200,13 +200,15 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x10 ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY]], 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: [[PseudoVLE32_V_MF2_:%[0-9]+]]:vr = PseudoVLE32_V_MF2 [[COPY1]], $noreg, 5 /* e32 */, implicit $vl, implicit $vtype - ; CHECK-NEXT: early-clobber %3:vr = PseudoVZEXT_VF2_M1 killed [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: %dead:vr = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:vr = PseudoVZEXT_VF2_M1 %dead, killed [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $v8 = COPY %3 ; CHECK-NEXT: PseudoRET implicit $v8 %1:gprnox0 = COPY $x11 %0:gpr = COPY $x10 %2:vr = PseudoVLE32_V_MF2 %0, %1, 5 - early-clobber %3:vr = PseudoVZEXT_VF2_M1 killed %2, %1, 6 + %dead:vr = IMPLICIT_DEF + early-clobber %3:vr = PseudoVZEXT_VF2_M1 %dead, killed %2, %1, 6, 0 $v8 = COPY %3 PseudoRET implicit $v8 @@ -307,8 +309,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 2, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 [[COPY]], 2, 6 /* e64 */, implicit $vl, implicit $vtype :: (load (s128) from %ir.x) - ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype - ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 0, -1, 6 /* e64 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 152 /* e64, m1, tu, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 undef $v2, 0, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 2, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: [[PseudoVREDSUM_VS_M1_E8_:%[0-9]+]]:vr = PseudoVREDSUM_VS_M1_E8 [[DEF]], killed [[PseudoVLE64_V_M1_]], killed [[PseudoVMV_V_I_M1_]], 2, 6 /* e64 */, 1 /* ta, mu */, implicit $vl, implicit $vtype @@ -317,7 +319,7 @@ body: | ; CHECK-NEXT: PseudoRET implicit $x10 %0:gpr = COPY $x10 %1:vr = PseudoVLE64_V_M1 %0, 2, 6 :: (load (s128) from %ir.x) - %2:vr = PseudoVMV_V_I_M1 0, -1, 6 + %2:vr = 
PseudoVMV_V_I_M1 undef $v2, 0, -1, 6, 0 %4:vr = IMPLICIT_DEF %3:vr = PseudoVREDSUM_VS_M1_E8 %4, killed %1, killed %2, 2, 6, 1 %5:gpr = PseudoVMV_X_S_M1 killed %3, 6 @@ -422,11 +424,11 @@ body: | ; CHECK-NEXT: %pt:vrm2 = IMPLICIT_DEF ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 4, 217 /* e64, m2, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: [[PseudoVID_V_M2_:%[0-9]+]]:vrm2 = PseudoVID_V_M2 %pt, 4, 6 /* e64 */, 3 /* ta, ma */, implicit $vl, implicit $vtype - ; CHECK-NEXT: dead $x0 = PseudoVSETVLIX0 killed $x0, 198 /* e8, mf4, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl - ; CHECK-NEXT: [[PseudoVMV_V_I_MF4_:%[0-9]+]]:vr = PseudoVMV_V_I_MF4 0, 4, 3 /* e8 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoVSETVLIX0 killed $x0, 134 /* e8, mf4, tu, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl + ; CHECK-NEXT: [[PseudoVMV_V_I_MF4_:%[0-9]+]]:vr = PseudoVMV_V_I_MF4 undef [[PseudoVMV_V_I_MF4_]], 0, 4, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: PseudoRET %pt:vrm2 = IMPLICIT_DEF %0:vrm2 = PseudoVID_V_M2 %pt, 4, 6, 3 - %4:vr = PseudoVMV_V_I_MF4 0, 4, 3 + %4:vr = PseudoVMV_V_I_MF4 undef %4, 0, 4, 3, 0 PseudoRET ... diff --git a/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir b/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir index 7dfc79a..6d05a8e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir +++ b/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir @@ -169,7 +169,7 @@ body: | ; CHECK-NEXT: $x10 = ADD $x2, killed $x10 ; CHECK-NEXT: SD killed renamable $x16, killed $x10, 64 :: (store (s64) into %fixed-stack.1, align 16) ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 2, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype - ; CHECK-NEXT: renamable $v8 = PseudoVMV_V_I_MF8 0, 2, 3 /* e8 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: renamable $v8 = PseudoVMV_V_I_MF8 undef $v8, 0, 2, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $x10 = ADDI $x2, 32 ; CHECK-NEXT: VS1R_V killed renamable $v8, killed $x10 :: (store unknown-size into %stack.1, align 8) ; CHECK-NEXT: {{ $}} @@ -200,7 +200,7 @@ body: | SD killed renamable $x17, %fixed-stack.0, 0 :: (store (s64)) SD killed renamable $x16, %fixed-stack.1, 0 :: (store (s64) into %fixed-stack.1, align 16) dead $x0 = PseudoVSETIVLI 2, 69, implicit-def $vl, implicit-def $vtype - renamable $v8 = PseudoVMV_V_I_MF8 0, 2, 3, implicit $vl, implicit $vtype + renamable $v8 = PseudoVMV_V_I_MF8 undef $v8, 0, 2, 3, 0, implicit $vl, implicit $vtype VS1R_V killed renamable $v8, %stack.1 :: (store unknown-size into %stack.1, align 8) bb.1.while.cond: