This change continues the line of work discussed in https://discourse.llvm.org/t/riscv-transition-in-vector-pseudo-structure-policy-variants/71295. In D153155, we started removing the legacy distinction between unsuffixed (TA) and _TU pseudos. This patch continues that effort for the unary instruction families.
The change consists of a few interacting pieces:
* Adding a vector policy operand to VPseudoUnaryNoMaskTU.
* Then using VPseudoUnaryNoMaskTU for all cases where VPseudoUnaryNoMask was previously used and deleting the unsuffixed form.
* Then renaming VPseudoUnaryNoMaskTU to VPseudoUnaryNoMask, and adjusting the RISCVMaskedPseudo table to use the combined pseudo.
* Fixing up the two places in the C++ code that manually construct VMV_V_* instructions.
Normally, I'd try to factor this into a couple of changes, but here the table structure is tied to the naming, so the otherwise-NFC pieces can't really be separated.
As before, we see codegen changes (some improvements and some regressions) due to scheduling differences caused by the extra implicit_def instructions.
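To make the combined operand layout concrete, here is an illustrative before/after sketch based on the PseudoVMV_V_X_M1 lines in the MIR test update below (operand comments are informal; 0 is the existing tu,mu policy encoding):

  Before:  $v25 = PseudoVMV_V_X_M1 $x12, $noreg, 4 /* e16 */
  After:   $v25 = PseudoVMV_V_X_M1 undef $v25, $x12, $noreg, 4 /* e16 */, 0 /* tu, mu */

An undef (IMPLICIT_DEF) passthru recovers the behavior of the old unsuffixed (TA) pseudo, while a real passthru value corresponds to the old _TU pseudo; the rewritten patterns pass the tu,mu policy immediate in both cases.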
Differential Revision: https://reviews.llvm.org/D153899
bool RISCVDAGToDAGISel::performVMergeToVMv(SDNode *N) {
#define CASE_VMERGE_TO_VMV(lmul) \
case RISCV::PseudoVMERGE_VVM_##lmul##_TU: \
- NewOpc = RISCV::PseudoVMV_V_V_##lmul##_TU; \
+ NewOpc = RISCV::PseudoVMV_V_V_##lmul; \
break;
unsigned NewOpc;
switch (N->getMachineOpcode()) {
if (!usesAllOnesMask(N, /* MaskOpIdx */ 3))
return false;
+ SDLoc DL(N);
+ SDValue PolicyOp =
+ CurDAG->getTargetConstant(/*TUMU*/ 0, DL, Subtarget->getXLenVT());
SDNode *Result = CurDAG->getMachineNode(
- NewOpc, SDLoc(N), N->getValueType(0),
- {N->getOperand(1), N->getOperand(2), N->getOperand(4), N->getOperand(5)});
+ NewOpc, DL, N->getValueType(0),
+ {N->getOperand(1), N->getOperand(2), N->getOperand(4), N->getOperand(5),
+ PolicyOp});
ReplaceUses(N, Result);
return true;
}
if (NF == 1) {
auto MIB = BuildMI(MBB, MBBI, DL, get(Opc), DstReg);
+ if (UseVMV_V_V)
+ MIB.addReg(DstReg, RegState::Undef);
if (UseVMV_V_I)
- MIB = MIB.add(DefMBBI->getOperand(1));
+ MIB = MIB.add(DefMBBI->getOperand(2));
else
MIB = MIB.addReg(SrcReg, getKillRegState(KillSrc));
if (UseVMV_V_V) {
const MCInstrDesc &Desc = DefMBBI->getDesc();
MIB.add(DefMBBI->getOperand(RISCVII::getVLOpNum(Desc))); // AVL
MIB.add(DefMBBI->getOperand(RISCVII::getSEWOpNum(Desc))); // SEW
+ MIB.addImm(0); // tu, mu
MIB.addReg(RISCV::VL, RegState::Implicit);
MIB.addReg(RISCV::VTYPE, RegState::Implicit);
}
for (; I != End; I += Incr) {
auto MIB = BuildMI(MBB, MBBI, DL, get(Opc),
TRI->getSubReg(DstReg, SubRegIdx + I));
+ if (UseVMV_V_V)
+ MIB.addReg(TRI->getSubReg(DstReg, SubRegIdx + I),
+ RegState::Undef);
if (UseVMV_V_I)
- MIB = MIB.add(DefMBBI->getOperand(1));
+ MIB = MIB.add(DefMBBI->getOperand(2));
else
MIB = MIB.addReg(TRI->getSubReg(SrcReg, SubRegIdx + I),
getKillRegState(KillSrc));
const MCInstrDesc &Desc = DefMBBI->getDesc();
MIB.add(DefMBBI->getOperand(RISCVII::getVLOpNum(Desc))); // AVL
MIB.add(DefMBBI->getOperand(RISCVII::getSEWOpNum(Desc))); // SEW
+ MIB.addImm(0); // tu, mu
MIB.addReg(RISCV::VL, RegState::Implicit);
MIB.addReg(RISCV::VTYPE, RegState::Implicit);
}
class VPseudoUnaryNoMask<DAGOperand RetClass, DAGOperand OpClass,
string Constraint = ""> :
- Pseudo<(outs RetClass:$rd),
- (ins OpClass:$rs2, AVL:$vl, ixlenimm:$sew), []>,
- RISCVVPseudo {
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let Constraints = Constraint;
- let HasVLOp = 1;
- let HasSEWOp = 1;
-}
-
-class VPseudoUnaryNoMaskTU<DAGOperand RetClass, DAGOperand OpClass,
- string Constraint = ""> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge, OpClass:$rs2, AVL:$vl, ixlenimm:$sew), []>,
+ (ins RetClass:$merge, OpClass:$rs2, AVL:$vl, ixlenimm:$sew,
+ ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret;
let HasVLOp = 1;
let HasSEWOp = 1;
+ let HasVecPolicyOp = 1;
}
class VPseudoUnaryMask<VReg RetClass, VReg OpClass, string Constraint = ""> :
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, VR, constraint>,
Sched<[WriteVMIotV_MX, ReadVMIotV_MX, ReadVMask]>;
- def "_" # m.MX # "_TU" : VPseudoUnaryNoMaskTU<m.vrclass, VR, constraint>,
- Sched<[WriteVMIotV_MX, ReadVMIotV_MX, ReadVMask]>;
def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, VR, constraint>,
- RISCVMaskedPseudo</*MaskOpIdx*/ 2>,
+ RISCVMaskedPseudo</*MaskOpIdx*/ 2,
+ /*HasTU*/ false,
+ /*IsCombined*/true>,
Sched<[WriteVMIotV_MX, ReadVMIotV_MX, ReadVMask]>;
}
}
Sched<[WriteVIMovX_MX, ReadVIMovX_MX]>;
def "_I_" # mx : VPseudoUnaryNoMask<m.vrclass, simm5>,
Sched<[WriteVIMovI_MX]>;
- def "_V_" # mx # "_TU": VPseudoUnaryNoMaskTU<m.vrclass, m.vrclass>,
- Sched<[WriteVIMovV_MX, ReadVIMovV_MX]>;
- def "_X_" # mx # "_TU": VPseudoUnaryNoMaskTU<m.vrclass, GPR>,
- Sched<[WriteVIMovX_MX, ReadVIMovX_MX]>;
- def "_I_" # mx # "_TU": VPseudoUnaryNoMaskTU<m.vrclass, simm5>,
- Sched<[WriteVIMovI_MX]>;
}
}
}
def "_" # f.FX # "_" # mx :
VPseudoUnaryNoMask<m.vrclass, f.fprclass>,
Sched<[WriteVFMovV_MX, ReadVFMovF_MX]>;
- def "_" # f.FX # "_" # mx # "_TU":
- VPseudoUnaryNoMaskTU<m.vrclass, f.fprclass>,
- Sched<[WriteVFMovV_MX, ReadVFMovF_MX]>;
}
}
}
let VLMul = m.value in {
def "_V_" # mx : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
Sched<[WriteVFClassV_MX, ReadVFClassV_MX, ReadVMask]>;
- def "_V_" # mx # "_TU": VPseudoUnaryNoMaskTU<m.vrclass, m.vrclass>,
- Sched<[WriteVFClassV_MX, ReadVFClassV_MX, ReadVMask]>;
def "_V_" # mx # "_MASK" : VPseudoUnaryMask<m.vrclass, m.vrclass>,
- RISCVMaskedPseudo</*MaskOpIdx*/ 2>,
+ RISCVMaskedPseudo</*MaskOpIdx*/ 2,
+ /*HasTU*/ false,
+ /*IsCombined*/true>,
Sched<[WriteVFClassV_MX, ReadVFClassV_MX, ReadVMask]>;
}
}
def "_V" # suffix : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
Sched<[WriteVFSqrtV_MX_E, ReadVFSqrtV_MX_E,
ReadVMask]>;
- def "_V" # suffix # "_TU": VPseudoUnaryNoMaskTU<m.vrclass, m.vrclass>,
- Sched<[WriteVFSqrtV_MX_E, ReadVFSqrtV_MX_E,
- ReadVMask]>;
def "_V" # suffix # "_MASK" : VPseudoUnaryMask<m.vrclass, m.vrclass>,
- RISCVMaskedPseudo</*MaskOpIdx*/ 2>,
+ RISCVMaskedPseudo</*MaskOpIdx*/ 2,
+ /*HasTU*/ false,
+ /*IsCombined*/true>,
Sched<[WriteVFSqrtV_MX_E, ReadVFSqrtV_MX_E,
ReadVMask]>;
}
let VLMul = m.value in {
def "_V_" # mx : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
Sched<[WriteVFRecpV_MX, ReadVFRecpV_MX, ReadVMask]>;
- def "_V_" # mx # "_TU": VPseudoUnaryNoMaskTU<m.vrclass, m.vrclass>,
- Sched<[WriteVFRecpV_MX, ReadVFRecpV_MX, ReadVMask]>;
def "_V_" # mx # "_MASK" : VPseudoUnaryMask<m.vrclass, m.vrclass>,
- RISCVMaskedPseudo</*MaskOpIdx*/ 2>,
+ RISCVMaskedPseudo</*MaskOpIdx*/ 2,
+ /*HasTU*/ false,
+ /*IsCombined*/true>,
Sched<[WriteVFRecpV_MX, ReadVFRecpV_MX, ReadVMask]>;
}
}
let VLMul = m.value in {
def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints>,
Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>;
- def "_" # mx # "_TU": VPseudoUnaryNoMaskTU<m.vrclass, m.f2vrclass, constraints>,
- Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>;
def "_" # mx # "_MASK" :
VPseudoUnaryMask<m.vrclass, m.f2vrclass, constraints>,
- RISCVMaskedPseudo</*MaskOpIdx*/ 2>,
+ RISCVMaskedPseudo</*MaskOpIdx*/ 2, /*HasTU*/ false, /*IsCombined*/true>,
Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>;
}
}
let VLMul = m.value in {
def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints>,
Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>;
- def "_" # mx # "_TU": VPseudoUnaryNoMaskTU<m.vrclass, m.f4vrclass, constraints>,
- Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>;
def "_" # mx # "_MASK" :
VPseudoUnaryMask<m.vrclass, m.f4vrclass, constraints>,
- RISCVMaskedPseudo</*MaskOpIdx*/ 2>,
+ RISCVMaskedPseudo</*MaskOpIdx*/ 2, /*HasTU*/ false, /*IsCombined*/true>,
Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>;
}
}
let VLMul = m.value in {
def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints>,
Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>;
- def "_" # mx # "_TU": VPseudoUnaryNoMaskTU<m.vrclass, m.f8vrclass, constraints>,
- Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>;
def "_" # mx # "_MASK" :
VPseudoUnaryMask<m.vrclass, m.f8vrclass, constraints>,
- RISCVMaskedPseudo</*MaskOpIdx*/ 2>,
+ RISCVMaskedPseudo</*MaskOpIdx*/ 2, /*HasTU*/ false, /*IsCombined*/true>,
Sched<[WriteVExtV_MX, ReadVExtV_MX, ReadVMask]>;
}
}
string Constraint = ""> {
let VLMul = MInfo.value in {
def "_" # MInfo.MX : VPseudoUnaryNoMask<RetClass, Op1Class, Constraint>;
- def "_" # MInfo.MX # "_TU": VPseudoUnaryNoMaskTU<RetClass, Op1Class, Constraint>;
def "_" # MInfo.MX # "_MASK" : VPseudoUnaryMask<RetClass, Op1Class,
Constraint>,
- RISCVMaskedPseudo</*MaskOpIdx*/ 2>;
+ RISCVMaskedPseudo</*MaskOpIdx*/ 2,
+ /*HasTU*/ false,
+ /*IsCombined*/true>;
}
}
ValueType op2_type,
int log2sew,
LMULInfo vlmul,
+ VReg result_reg_class,
VReg op2_reg_class,
bit isSEWAware = 0> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
- (result_type undef),
+ (result_type result_reg_class:$merge),
(op2_type op2_reg_class:$rs2),
VLOpFrag)),
(!cast<Instruction>(
!if(isSEWAware,
inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew),
inst#"_"#kind#"_"#vlmul.MX))
- (op2_type op2_reg_class:$rs2),
- GPR:$vl, log2sew)>;
-
-class VPatUnaryNoMaskTU<string intrinsic_name,
- string inst,
- string kind,
- ValueType result_type,
- ValueType op2_type,
- int log2sew,
- LMULInfo vlmul,
- VReg result_reg_class,
- VReg op2_reg_class,
- bit isSEWAware = 0> :
- Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
(result_type result_reg_class:$merge),
(op2_type op2_reg_class:$rs2),
- VLOpFrag)),
- (!cast<Instruction>(
- !if(isSEWAware,
- inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_TU",
- inst#"_"#kind#"_"#vlmul.MX#"_TU"))
- (result_type result_reg_class:$merge),
- (op2_type op2_reg_class:$rs2),
- GPR:$vl, log2sew)>;
+ GPR:$vl, log2sew, TU_MU)>;
class VPatUnaryMask<string intrinsic_name,
string inst,
(mti.Mask VR:$rs2),
VLOpFrag)),
(!cast<Instruction>(inst#"_M_"#mti.BX)
+ (mti.Mask (IMPLICIT_DEF)),
(mti.Mask VR:$rs2),
- GPR:$vl, mti.Log2SEW)>;
+ GPR:$vl, mti.Log2SEW, TU_MU)>;
class VPatMaskUnaryMask<string intrinsic_name,
string inst,
foreach vti = AllIntegerVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
def : VPatUnaryNoMask<intrinsic, instruction, "M", vti.Vector, vti.Mask,
- vti.Log2SEW, vti.LMul, VR>;
- def : VPatUnaryNoMaskTU<intrinsic, instruction, "M", vti.Vector, vti.Mask,
- vti.Log2SEW, vti.LMul, vti.RegClass,VR>;
+ vti.Log2SEW, vti.LMul, vti.RegClass, VR>;
def : VPatUnaryMask<intrinsic, instruction, "M", vti.Vector, vti.Mask,
vti.Mask, vti.Log2SEW, vti.LMul, vti.RegClass, VR>;
}
GetVTypePredicates<fti>.Predicates) in {
def : VPatUnaryNoMask<intrinsic, instruction, suffix,
vti.Vector, fti.Vector,
- vti.Log2SEW, vti.LMul, fti.RegClass>;
- def : VPatUnaryNoMaskTU<intrinsic, instruction, suffix,
- vti.Vector, fti.Vector,
- vti.Log2SEW, vti.LMul, vti.RegClass, fti.RegClass>;
+ vti.Log2SEW, vti.LMul, vti.RegClass, fti.RegClass>;
def : VPatUnaryMask<intrinsic, instruction, suffix,
vti.Vector, fti.Vector, vti.Mask,
vti.Log2SEW, vti.LMul, vti.RegClass, fti.RegClass>;
let Predicates = GetVTypePredicates<vti>.Predicates in {
def : VPatUnaryNoMask<intrinsic, instruction, "V",
vti.Vector, vti.Vector, vti.Log2SEW,
- vti.LMul, vti.RegClass, isSEWAware>;
- def : VPatUnaryNoMaskTU<intrinsic, instruction, "V",
- vti.Vector, vti.Vector, vti.Log2SEW,
- vti.LMul, vti.RegClass, vti.RegClass, isSEWAware>;
+ vti.LMul, vti.RegClass, vti.RegClass, isSEWAware>;
def : VPatUnaryMask<intrinsic, instruction, "V",
vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
vti.LMul, vti.RegClass, vti.RegClass, isSEWAware>;
VReg op1_reg_class>
{
def : VPatUnaryNoMask<intrinsic, inst, kind, result_type, op1_type,
- sew, vlmul, op1_reg_class>;
- def : VPatUnaryNoMaskTU<intrinsic, inst, kind, result_type, op1_type,
- sew, vlmul, result_reg_class, op1_reg_class>;
+ sew, vlmul, result_reg_class, op1_reg_class>;
def : VPatUnaryMask<intrinsic, inst, kind, result_type, op1_type,
mask_type, sew, vlmul, result_reg_class, op1_reg_class>;
}
//===----------------------------------------------------------------------===//
foreach vti = AllVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector undef),
- (vti.Vector vti.RegClass:$rs1),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_V_"#vti.LMul.MX)
- $rs1, GPR:$vl, vti.Log2SEW)>;
def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs1),
VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_V_"#vti.LMul.MX#"_TU")
- $passthru, $rs1, GPR:$vl, vti.Log2SEW)>;
+ (!cast<Instruction>("PseudoVMV_V_V_"#vti.LMul.MX)
+ $passthru, $rs1, GPR:$vl, vti.Log2SEW, TU_MU)>;
// vmv.v.x/vmv.v.i are handled in RISCInstrVInstrInfoVVLPatterns.td
}
GetVTypePredicates<fti>.Predicates) in
def : Pat<(vti.Vector (op (fti.Vector fti.RegClass:$rs2))),
(!cast<Instruction>(inst_name#"_"#suffix#"_"#vti.LMul.MX)
- fti.RegClass:$rs2, fti.AVL, vti.Log2SEW)>;
+ (vti.Vector (IMPLICIT_DEF)),
+ fti.RegClass:$rs2, fti.AVL, vti.Log2SEW, TU_MU)>;
}
}
GetVTypePredicates<ivti>.Predicates) in
def : Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1))),
(!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX)
- ivti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>;
+ (fvti.Vector (IMPLICIT_DEF)),
+ ivti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW, TU_MU)>;
}
}
GetVTypePredicates<ivti>.Predicates) in
def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1))),
(!cast<Instruction>(instruction_name#"_"#ivti.LMul.MX)
- fvti.RegClass:$rs1, ivti.AVL, ivti.Log2SEW)>;
+ (ivti.Vector (IMPLICIT_DEF)),
+ fvti.RegClass:$rs1, ivti.AVL, ivti.Log2SEW, TU_MU)>;
}
}
GetVTypePredicates<fwti>.Predicates) in
def : Pat<(fwti.Vector (vop (ivti.Vector ivti.RegClass:$rs1))),
(!cast<Instruction>(instruction_name#"_"#ivti.LMul.MX)
- ivti.RegClass:$rs1, ivti.AVL, ivti.Log2SEW)>;
+ (fwti.Vector (IMPLICIT_DEF)),
+ ivti.RegClass:$rs1, ivti.AVL, ivti.Log2SEW, TU_MU)>;
}
}
GetVTypePredicates<iwti>.Predicates) in
def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1))),
(!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX)
- fvti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>;
+ (iwti.Vector (IMPLICIT_DEF)),
+ fvti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW, TU_MU)>;
}
}
GetVTypePredicates<iwti>.Predicates) in
def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1))),
(!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX)
- iwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>;
+ (fvti.Vector (IMPLICIT_DEF)),
+ iwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW, TU_MU)>;
}
}
GetVTypePredicates<fwti>.Predicates) in
def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1))),
(!cast<Instruction>(instruction_name#"_"#vti.LMul.MX)
- fwti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>;
+ (vti.Vector (IMPLICIT_DEF)),
+ fwti.RegClass:$rs1, vti.AVL, vti.Log2SEW, TU_MU)>;
}
}
// 13.8. Vector Floating-Point Square-Root Instruction
def : Pat<(any_fsqrt (vti.Vector vti.RegClass:$rs2)),
(!cast<Instruction>("PseudoVFSQRT_V_"# vti.LMul.MX#"_E"#vti.SEW)
- vti.RegClass:$rs2, vti.AVL, vti.Log2SEW)>;
+ (vti.Vector (IMPLICIT_DEF)),
+ vti.RegClass:$rs2, vti.AVL, vti.Log2SEW, TU_MU)>;
// 13.12. Vector Floating-Point Sign-Injection Instructions
def : Pat<(fabs (vti.Vector vti.RegClass:$rs)),
GetVTypePredicates<fwti>.Predicates) in
def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
(!cast<Instruction>("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX)
- fwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW)>;
+ (fvti.Vector (IMPLICIT_DEF)),
+ fwti.RegClass:$rs1, fvti.AVL, fvti.Log2SEW, TU_MU)>;
}
//===----------------------------------------------------------------------===//
let Predicates = GetVTypePredicates<fvti>.Predicates in {
def : Pat<(fvti.Vector (SplatFPOp fvti.ScalarRegClass:$rs1)),
(!cast<Instruction>("PseudoVFMV_V_"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
+ (fvti.Vector (IMPLICIT_DEF)),
(fvti.Scalar fvti.ScalarRegClass:$rs1),
- fvti.AVL, fvti.Log2SEW)>;
+ fvti.AVL, fvti.Log2SEW, TU_MU)>;
def : Pat<(fvti.Vector (SplatFPOp (fvti.Scalar fpimm0))),
(!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
- 0, fvti.AVL, fvti.Log2SEW)>;
+ (fvti.Vector (IMPLICIT_DEF)),
+ 0, fvti.AVL, fvti.Log2SEW, TU_MU)>;
}
}
// 11.16. Vector Integer Move Instructions
foreach vti = AllVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(vti.Vector (riscv_vmv_v_v_vl (vti.Vector undef),
- vti.RegClass:$rs2, VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_V_"#vti.LMul.MX)
- vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>;
def : Pat<(vti.Vector (riscv_vmv_v_v_vl vti.RegClass:$passthru,
vti.RegClass:$rs2, VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_V_"#vti.LMul.MX#"_TU")
- vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>;
+ (!cast<Instruction>("PseudoVMV_V_V_"#vti.LMul.MX)
+ vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>;
}
foreach vti = AllIntegerVectors in {
- def : Pat<(vti.Vector (riscv_vmv_v_x_vl (vti.Vector undef), GPR:$rs2, VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_X_"#vti.LMul.MX)
- GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.RegClass:$passthru, GPR:$rs2, VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_X_"#vti.LMul.MX#"_TU")
- vti.RegClass:$passthru, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
+ (!cast<Instruction>("PseudoVMV_V_X_"#vti.LMul.MX)
+ vti.RegClass:$passthru, GPR:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>;
defvar ImmPat = !cast<ComplexPattern>("sew"#vti.SEW#"simm5");
- def : Pat<(vti.Vector (riscv_vmv_v_x_vl (vti.Vector undef), (ImmPat simm5:$imm5),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_I_"#vti.LMul.MX)
- simm5:$imm5, GPR:$vl, vti.Log2SEW)>;
def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.RegClass:$passthru, (ImmPat simm5:$imm5),
VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_I_"#vti.LMul.MX#"_TU")
- vti.RegClass:$passthru, simm5:$imm5, GPR:$vl, vti.Log2SEW)>;
+ (!cast<Instruction>("PseudoVMV_V_I_"#vti.LMul.MX)
+ vti.RegClass:$passthru, simm5:$imm5, GPR:$vl, vti.Log2SEW, TU_MU)>;
}
}
def : Pat<(riscv_fclass_vl (vti.Vector vti.RegClass:$rs2),
(vti.Mask true_mask), VLOpFrag),
(!cast<Instruction>("PseudoVFCLASS_V_"# vti.LMul.MX)
- vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW)>;
+ (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2, GPR:$vl, vti.Log2SEW, TU_MU)>;
}
}
// 13.16. Vector Floating-Point Move Instruction
// If we're splatting fpimm0, use vmv.v.x vd, x0.
def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
- (fvti.Vector undef), (fvti.Scalar (fpimm0)), VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
- 0, GPR:$vl, fvti.Log2SEW)>;
- def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
fvti.Vector:$passthru, (fvti.Scalar (fpimm0)), VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX#"_TU")
- $passthru, 0, GPR:$vl, fvti.Log2SEW)>;
- def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
- (fvti.Vector undef), (fvti.Scalar (SelectFPImm (XLenVT GPR:$imm))), VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_X_"#fvti.LMul.MX)
- GPR:$imm, GPR:$vl, fvti.Log2SEW)>;
+ (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
+ $passthru, 0, GPR:$vl, fvti.Log2SEW, TU_MU)>;
def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
fvti.Vector:$passthru, (fvti.Scalar (SelectFPImm (XLenVT GPR:$imm))), VLOpFrag)),
- (!cast<Instruction>("PseudoVMV_V_X_"#fvti.LMul.MX#"_TU")
- $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW)>;
+ (!cast<Instruction>("PseudoVMV_V_X_"#fvti.LMul.MX)
+ $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW, TU_MU)>;
def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
- (fvti.Vector undef), (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)),
- (!cast<Instruction>("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" #
- fvti.LMul.MX)
- (fvti.Scalar fvti.ScalarRegClass:$rs2),
- GPR:$vl, fvti.Log2SEW)>;
- def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)),
(!cast<Instruction>("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" #
- fvti.LMul.MX # "_TU")
+ fvti.LMul.MX)
$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2),
- GPR:$vl, fvti.Log2SEW)>;
+ GPR:$vl, fvti.Log2SEW, TU_MU)>;
}
}
; CHECK-NEXT: andi sp, sp, -64
; CHECK-NEXT: mv s1, sp
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: addi a0, s1, 64
-; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: sd a0, 0(sp)
+; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: li a0, 0
; CHECK-NEXT: li a1, 0
; CHECK-NEXT: li a2, 0
; CHECK-NEXT: $x2 = frame-setup SUB $x2, killed $x10
; CHECK-NEXT: $x2 = frame-setup ANDI $x2, -128
; CHECK-NEXT: dead renamable $x15 = PseudoVSETIVLI 1, 72 /* e16, m1, ta, mu */, implicit-def $vl, implicit-def $vtype
- ; CHECK-NEXT: renamable $v25 = PseudoVMV_V_X_M1 killed renamable $x12, $noreg, 4 /* e16 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: renamable $v25 = PseudoVMV_V_X_M1 undef $v25, killed renamable $x12, $noreg, 4 /* e16 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: $x10 = PseudoReadVLENB
; CHECK-NEXT: $x11 = ADDI killed $x0, 50
; CHECK-NEXT: $x10 = MUL killed $x10, killed $x11
liveins: $x12
dead renamable $x15 = PseudoVSETIVLI 1, 72, implicit-def $vl, implicit-def $vtype
- renamable $v25 = PseudoVMV_V_X_M1 killed renamable $x12, $noreg, 4, implicit $vl, implicit $vtype
+ renamable $v25 = PseudoVMV_V_X_M1 undef $v25, killed renamable $x12, $noreg, 4, 0, implicit $vl, implicit $vtype
VS1R_V killed renamable $v25, %stack.1 :: (store unknown-size into %stack.1, align 8)
renamable $x1 = ADDI $x0, 255
renamable $x5 = nuw ADDI %stack.0, 256
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV32-NEXT: vid.v v11
; RV32-NEXT: vrgather.vv v10, v8, v11
-; RV32-NEXT: vadd.vi v8, v11, -1
; RV32-NEXT: lui a0, 11
; RV32-NEXT: addi a0, a0, -1366
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV32-NEXT: vmv.v.x v0, a0
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; RV32-NEXT: vadd.vi v8, v11, -1
; RV32-NEXT: vrgather.vv v10, v9, v8, v0.t
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV64-NEXT: vid.v v11
; RV64-NEXT: vrgather.vv v10, v8, v11
-; RV64-NEXT: vadd.vi v8, v11, -1
; RV64-NEXT: lui a0, 11
; RV64-NEXT: addiw a0, a0, -1366
; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV64-NEXT: vmv.v.x v0, a0
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; RV64-NEXT: vadd.vi v8, v11, -1
; RV64-NEXT: vrgather.vv v10, v9, v8, v0.t
; RV64-NEXT: vmv.v.v v8, v10
; RV64-NEXT: ret
; RV32-NEXT: vand.vx v11, v11, a3, v0.t
; RV32-NEXT: vor.vv v10, v11, v10, v0.t
; RV32-NEXT: vsrl.vi v11, v8, 8, v0.t
-; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vmv.v.i v12, 0
+; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: lui a4, 1044480
; RV32-NEXT: vmerge.vxm v12, v12, a4, v0
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vand.vx v10, v10, a3
; RV32-NEXT: vor.vv v9, v10, v9
-; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: lui a4, 1044480
; RV32-NEXT: vmerge.vxm v10, v10, a4, v0
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vand.vx v14, v14, a3, v0.t
; RV32-NEXT: vor.vv v12, v14, v12, v0.t
-; RV32-NEXT: vsrl.vi v14, v8, 8, v0.t
-; RV32-NEXT: li a4, 85
+; RV32-NEXT: vsrl.vi v14, v8, 24, v0.t
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v14, v14, a4, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT: li a5, 85
; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.x v0, a4
+; RV32-NEXT: vmv.v.x v0, a5
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: lui a4, 1044480
-; RV32-NEXT: vmerge.vxm v16, v16, a4, v0
+; RV32-NEXT: vmv.v.i v18, 0
+; RV32-NEXT: lui a5, 1044480
+; RV32-NEXT: vmerge.vxm v18, v18, a5, v0
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmv1r.v v0, v10
-; RV32-NEXT: vand.vv v14, v14, v16, v0.t
-; RV32-NEXT: vsrl.vi v18, v8, 24, v0.t
-; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: vand.vx v18, v18, a4, v0.t
-; RV32-NEXT: vor.vv v14, v14, v18, v0.t
+; RV32-NEXT: vand.vv v16, v16, v18, v0.t
+; RV32-NEXT: vor.vv v14, v16, v14, v0.t
; RV32-NEXT: vor.vv v12, v14, v12, v0.t
; RV32-NEXT: vsll.vx v14, v8, a1, v0.t
-; RV32-NEXT: vand.vx v18, v8, a3, v0.t
-; RV32-NEXT: vsll.vx v18, v18, a2, v0.t
-; RV32-NEXT: vor.vv v14, v14, v18, v0.t
-; RV32-NEXT: vand.vx v18, v8, a4, v0.t
-; RV32-NEXT: vsll.vi v18, v18, 24, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vx v16, v8, a3, v0.t
+; RV32-NEXT: vsll.vx v16, v16, a2, v0.t
+; RV32-NEXT: vor.vv v14, v14, v16, v0.t
+; RV32-NEXT: vand.vx v16, v8, a4, v0.t
+; RV32-NEXT: vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT: vand.vv v8, v8, v18, v0.t
; RV32-NEXT: vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT: vor.vv v8, v18, v8, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vor.vv v8, v14, v8, v0.t
; RV32-NEXT: vor.vv v8, v8, v12, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vand.vx v12, v12, a3
; RV32-NEXT: vor.vv v10, v12, v10
+; RV32-NEXT: vsrl.vi v12, v8, 8
; RV32-NEXT: li a4, 85
; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; RV32-NEXT: vmv.v.x v0, a4
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.i v12, 0
+; RV32-NEXT: vmv.v.i v14, 0
; RV32-NEXT: lui a4, 1044480
-; RV32-NEXT: vmerge.vxm v12, v12, a4, v0
+; RV32-NEXT: vmerge.vxm v14, v14, a4, v0
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v14, v8, 8
-; RV32-NEXT: vand.vv v14, v14, v12
+; RV32-NEXT: vand.vv v12, v12, v14
; RV32-NEXT: vsrl.vi v16, v8, 24
; RV32-NEXT: lui a4, 4080
; RV32-NEXT: vand.vx v16, v16, a4
-; RV32-NEXT: vor.vv v14, v14, v16
-; RV32-NEXT: vor.vv v10, v14, v10
-; RV32-NEXT: vsll.vx v14, v8, a1
+; RV32-NEXT: vor.vv v12, v12, v16
+; RV32-NEXT: vor.vv v10, v12, v10
+; RV32-NEXT: vsll.vx v12, v8, a1
; RV32-NEXT: vand.vx v16, v8, a3
; RV32-NEXT: vsll.vx v16, v16, a2
-; RV32-NEXT: vor.vv v14, v14, v16
-; RV32-NEXT: vand.vv v12, v8, v12
-; RV32-NEXT: vsll.vi v12, v12, 8
-; RV32-NEXT: vand.vx v8, v8, a4
-; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vor.vv v8, v14, v8
+; RV32-NEXT: vor.vv v12, v12, v16
+; RV32-NEXT: vand.vx v16, v8, a4
+; RV32-NEXT: vsll.vi v16, v16, 24
+; RV32-NEXT: vand.vv v8, v8, v14
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: lui a3, 16
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vand.vx v20, v20, a3, v0.t
-; RV32-NEXT: vor.vv v20, v20, v16, v0.t
-; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT: lui a4, 5
-; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: vor.vv v16, v20, v16, v0.t
+; RV32-NEXT: vsrl.vi v20, v8, 24, v0.t
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v24, v20, a4, v0.t
+; RV32-NEXT: vsrl.vi v28, v8, 8, v0.t
+; RV32-NEXT: lui a5, 5
+; RV32-NEXT: addi a5, a5, 1365
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT: vmv.v.x v0, a4
+; RV32-NEXT: vmv.v.x v0, a5
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: lui a4, 1044480
-; RV32-NEXT: vmerge.vxm v16, v16, a4, v0
+; RV32-NEXT: vmv.v.i v20, 0
+; RV32-NEXT: lui a5, 1044480
+; RV32-NEXT: vmerge.vxm v20, v20, a5, v0
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmv1r.v v0, v12
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: vsrl.vi v28, v8, 24, v0.t
-; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: vand.vx v28, v28, a4, v0.t
-; RV32-NEXT: vor.vv v24, v24, v28, v0.t
-; RV32-NEXT: vor.vv v20, v24, v20, v0.t
+; RV32-NEXT: vand.vv v28, v28, v20, v0.t
+; RV32-NEXT: vor.vv v24, v28, v24, v0.t
+; RV32-NEXT: vor.vv v16, v24, v16, v0.t
; RV32-NEXT: vsll.vx v24, v8, a1, v0.t
; RV32-NEXT: vand.vx v28, v8, a3, v0.t
; RV32-NEXT: vsll.vx v28, v28, a2, v0.t
; RV32-NEXT: vor.vv v24, v24, v28, v0.t
; RV32-NEXT: vand.vx v28, v8, a4, v0.t
; RV32-NEXT: vsll.vi v28, v28, 24, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v8, v20, v0.t
; RV32-NEXT: vsll.vi v8, v8, 8, v0.t
; RV32-NEXT: vor.vv v8, v28, v8, v0.t
; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: vor.vv v8, v8, v20, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: lui a1, 61681
; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vand.vx v16, v16, a3
; RV32-NEXT: vor.vv v12, v16, v12
+; RV32-NEXT: vsrl.vi v20, v8, 8
; RV32-NEXT: lui a4, 5
; RV32-NEXT: addi a4, a4, 1365
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV32-NEXT: lui a4, 1044480
; RV32-NEXT: vmerge.vxm v16, v16, a4, v0
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v20, v8, 8
; RV32-NEXT: vand.vv v20, v20, v16
; RV32-NEXT: vsrl.vi v24, v8, 24
; RV32-NEXT: lui a4, 4080
; RV32-NEXT: vand.vx v24, v8, a3
; RV32-NEXT: vsll.vx v24, v24, a2
; RV32-NEXT: vor.vv v20, v20, v24
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsll.vi v16, v16, 8
-; RV32-NEXT: vand.vx v8, v8, a4
-; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vand.vx v24, v8, a4
+; RV32-NEXT: vsll.vi v24, v24, 24
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: vor.vv v8, v20, v8
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v24, v8, a3, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a4, a1, -256
-; RV32-NEXT: vand.vx v24, v8, a4, v0.t
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: vsll.vx v24, v24, a5, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: addi a5, a1, -256
+; RV32-NEXT: vand.vx v24, v24, a5, v0.t
; RV32-NEXT: vor.vv v24, v24, v16, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
; RV32-NEXT: lui a6, 4080
-; RV32-NEXT: vand.vx v24, v8, a6, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT: vand.vx v24, v24, a6, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: lui a2, 349525
; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: lui a7, 1044480
; RV32-NEXT: vmv.v.x v0, a2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vxm v24, v24, a7, v0
-; RV32-NEXT: addi a7, sp, 16
-; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill
+; RV32-NEXT: vmerge.vxm v16, v24, a7, v0
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: li t0, 24
+; RV32-NEXT: mul a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 16
+; RV32-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a7, sp, 16
+; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: li t0, 24
+; RV32-NEXT: mul a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 16
+; RV32-NEXT: vl8r.v v16, (a7) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: csrr a7, vlenb
; RV32-NEXT: slli a7, a7, 3
; RV32-NEXT: add a7, sp, a7
; RV32-NEXT: addi a7, a7, 16
; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
; RV32-NEXT: csrr a7, vlenb
; RV32-NEXT: slli a7, a7, 4
; RV32-NEXT: add a7, sp, a7
; RV32-NEXT: addi a7, a7, 16
; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v24, v16, v0.t
+; RV32-NEXT: vor.vv v24, v16, v24, v0.t
; RV32-NEXT: csrr a7, vlenb
; RV32-NEXT: slli a7, a7, 4
; RV32-NEXT: add a7, sp, a7
; RV32-NEXT: addi a7, a7, 16
; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT: vsll.vx v16, v8, a3, v0.t
+; RV32-NEXT: vand.vx v24, v8, a5, v0.t
+; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v24, v8, a6, v0.t
+; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v24, v8, a5, v0.t
-; RV32-NEXT: vand.vx v16, v24, a4, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v8, 8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT: addi a3, sp, 16
; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t
-; RV32-NEXT: vand.vx v8, v8, a6, v0.t
; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 4, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v16, v24, 4, v0.t
; RV32-NEXT: lui a3, 61681
; RV32-NEXT: addi a3, a3, -241
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v8, v24, v8, v0.t
; RV32-NEXT: vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vor.vv v24, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v8, v24, v8, v0.t
; RV32-NEXT: vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vor.vv v24, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v24, 1, v0.t
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v8, v24, v8, v0.t
; RV32-NEXT: vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vx v16, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v24, v8, a2
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: vsll.vx v24, v24, a3
+; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v16, v8, a4
+; RV32-NEXT: vsll.vi v24, v16, 24
+; RV32-NEXT: li a5, 32
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: lui a6, 349525
+; RV32-NEXT: addi a6, a6, 1365
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: lui a3, 1044480
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vxm v16, v16, a3, v0
+; RV32-NEXT: lui a7, 1044480
+; RV32-NEXT: vmv.v.x v0, a6
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vxm v16, v16, a7, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsrl.vi v0, v8, 24
-; RV32-NEXT: lui a3, 4080
-; RV32-NEXT: vand.vx v0, v0, a3
+; RV32-NEXT: vand.vv v0, v8, v16
+; RV32-NEXT: vsll.vi v0, v0, 8
; RV32-NEXT: vor.vv v24, v24, v0
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: li a4, 56
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: vsrl.vx v0, v8, a5
-; RV32-NEXT: lui a6, 16
-; RV32-NEXT: addi a6, a6, -256
-; RV32-NEXT: vand.vx v0, v0, a6
-; RV32-NEXT: vsrl.vx v24, v8, a4
-; RV32-NEXT: vor.vv v24, v0, v24
; RV32-NEXT: addi a7, sp, 16
; RV32-NEXT: vl8r.v v0, (a7) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsll.vi v16, v16, 8
-; RV32-NEXT: vand.vx v0, v8, a3
-; RV32-NEXT: vsll.vi v0, v0, 24
-; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: vsll.vx v0, v8, a4
-; RV32-NEXT: vand.vx v8, v8, a6
-; RV32-NEXT: vsll.vx v8, v8, a5
-; RV32-NEXT: vor.vv v8, v0, v8
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v0, v8, a3
+; RV32-NEXT: vand.vx v0, v0, a2
+; RV32-NEXT: vsrl.vx v24, v8, a1
+; RV32-NEXT: vor.vv v24, v0, v24
+; RV32-NEXT: vsrl.vi v0, v8, 8
+; RV32-NEXT: vand.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v8, v8, 24
+; RV32-NEXT: vand.vx v8, v8, a4
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: lui a3, 61681
-; RV32-NEXT: addi a3, a3, -241
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a6
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: li a3, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v24, v8, a3, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t
; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a4, a1, -256
-; RV32-NEXT: vand.vx v24, v8, a4, v0.t
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: vsll.vx v24, v24, a5, v0.t
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: addi a5, a1, -256
+; RV32-NEXT: vand.vx v24, v24, a5, v0.t
; RV32-NEXT: vor.vv v24, v24, v16, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
; RV32-NEXT: lui a6, 4080
-; RV32-NEXT: vand.vx v24, v8, a6, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT: vand.vx v24, v24, a6, v0.t
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: lui a2, 349525
; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: lui a7, 1044480
; RV32-NEXT: vmv.v.x v0, a2
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vxm v24, v24, a7, v0
-; RV32-NEXT: addi a7, sp, 16
-; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill
+; RV32-NEXT: vmerge.vxm v16, v24, a7, v0
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: li t0, 24
+; RV32-NEXT: mul a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 16
+; RV32-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a7, sp, 16
+; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: li t0, 24
+; RV32-NEXT: mul a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 16
+; RV32-NEXT: vl8r.v v16, (a7) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: csrr a7, vlenb
; RV32-NEXT: slli a7, a7, 3
; RV32-NEXT: add a7, sp, a7
; RV32-NEXT: addi a7, a7, 16
; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
; RV32-NEXT: csrr a7, vlenb
; RV32-NEXT: slli a7, a7, 4
; RV32-NEXT: add a7, sp, a7
; RV32-NEXT: addi a7, a7, 16
; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v24, v16, v0.t
+; RV32-NEXT: vor.vv v24, v16, v24, v0.t
; RV32-NEXT: csrr a7, vlenb
; RV32-NEXT: slli a7, a7, 4
; RV32-NEXT: add a7, sp, a7
; RV32-NEXT: addi a7, a7, 16
; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t
+; RV32-NEXT: vsll.vx v16, v8, a3, v0.t
+; RV32-NEXT: vand.vx v24, v8, a5, v0.t
+; RV32-NEXT: vsll.vx v24, v24, a4, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v24, v8, a6, v0.t
+; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v24, v8, a5, v0.t
-; RV32-NEXT: vand.vx v16, v24, a4, v0.t
; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v8, 8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT: addi a3, sp, 16
; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t
-; RV32-NEXT: vand.vx v8, v8, a6, v0.t
; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 16
; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 4, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vor.vv v24, v8, v24, v0.t
+; RV32-NEXT: vsrl.vi v16, v24, 4, v0.t
; RV32-NEXT: lui a3, 61681
; RV32-NEXT: addi a3, a3, -241
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v8, v24, v8, v0.t
; RV32-NEXT: vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vor.vv v24, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t
; RV32-NEXT: lui a3, 209715
; RV32-NEXT: addi a3, a3, 819
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a3
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v8, v24, v8, v0.t
; RV32-NEXT: vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vor.vv v24, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v24, 1, v0.t
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: vand.vv v8, v24, v8, v0.t
; RV32-NEXT: vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vx v16, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v24, v8, a2
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: vsll.vx v24, v24, a3
+; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v16, v8, a4
+; RV32-NEXT: vsll.vi v24, v16, 24
+; RV32-NEXT: li a5, 32
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: lui a6, 349525
+; RV32-NEXT: addi a6, a6, 1365
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: lui a3, 1044480
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vxm v16, v16, a3, v0
+; RV32-NEXT: lui a7, 1044480
+; RV32-NEXT: vmv.v.x v0, a6
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vxm v16, v16, a7, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsrl.vi v0, v8, 24
-; RV32-NEXT: lui a3, 4080
-; RV32-NEXT: vand.vx v0, v0, a3
+; RV32-NEXT: vand.vv v0, v8, v16
+; RV32-NEXT: vsll.vi v0, v0, 8
; RV32-NEXT: vor.vv v24, v24, v0
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: li a4, 56
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: vsrl.vx v0, v8, a5
-; RV32-NEXT: lui a6, 16
-; RV32-NEXT: addi a6, a6, -256
-; RV32-NEXT: vand.vx v0, v0, a6
-; RV32-NEXT: vsrl.vx v24, v8, a4
-; RV32-NEXT: vor.vv v24, v0, v24
; RV32-NEXT: addi a7, sp, 16
; RV32-NEXT: vl8r.v v0, (a7) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsll.vi v16, v16, 8
-; RV32-NEXT: vand.vx v0, v8, a3
-; RV32-NEXT: vsll.vi v0, v0, 24
-; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: vsll.vx v0, v8, a4
-; RV32-NEXT: vand.vx v8, v8, a6
-; RV32-NEXT: vsll.vx v8, v8, a5
-; RV32-NEXT: vor.vv v8, v0, v8
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v0, v8, a3
+; RV32-NEXT: vand.vx v0, v0, a2
+; RV32-NEXT: vsrl.vx v24, v8, a1
+; RV32-NEXT: vor.vv v24, v0, v24
+; RV32-NEXT: vsrl.vi v0, v8, 8
+; RV32-NEXT: vand.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v8, v8, 24
+; RV32-NEXT: vand.vx v8, v8, a4
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: lui a3, 61681
-; RV32-NEXT: addi a3, a3, -241
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsll.vi v8, v8, 4
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: lui a3, 209715
-; RV32-NEXT: addi a3, a3, 819
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: vsll.vi v8, v8, 2
; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a6
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vand.vv v8, v8, v24
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vmv.v.i v9, 0
+; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: vmerge.vxm v9, v9, a1, v0
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX2-RV32: # %bb.0:
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; LMULMAX2-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: li a1, 85
+; LMULMAX2-RV32-NEXT: li a1, 56
+; LMULMAX2-RV32-NEXT: vsrl.vx v10, v8, a1
+; LMULMAX2-RV32-NEXT: li a2, 40
+; LMULMAX2-RV32-NEXT: vsrl.vx v12, v8, a2
+; LMULMAX2-RV32-NEXT: lui a3, 16
+; LMULMAX2-RV32-NEXT: addi a3, a3, -256
+; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a3
+; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10
+; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 24
+; LMULMAX2-RV32-NEXT: lui a4, 4080
+; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a4
+; LMULMAX2-RV32-NEXT: li a5, 85
; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v0, a1
+; LMULMAX2-RV32-NEXT: vmv.v.x v0, a5
; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.i v10, 0
-; LMULMAX2-RV32-NEXT: lui a1, 1044480
-; LMULMAX2-RV32-NEXT: vmerge.vxm v10, v10, a1, v0
+; LMULMAX2-RV32-NEXT: vmv.v.i v14, 0
+; LMULMAX2-RV32-NEXT: lui a5, 1044480
+; LMULMAX2-RV32-NEXT: vmerge.vxm v14, v14, a5, v0
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 8
-; LMULMAX2-RV32-NEXT: vand.vv v12, v12, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v14, v8, 24
-; LMULMAX2-RV32-NEXT: lui a1, 4080
-; LMULMAX2-RV32-NEXT: vand.vx v14, v14, a1
-; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v14
-; LMULMAX2-RV32-NEXT: li a2, 56
-; LMULMAX2-RV32-NEXT: vsrl.vx v14, v8, a2
-; LMULMAX2-RV32-NEXT: li a3, 40
-; LMULMAX2-RV32-NEXT: vsrl.vx v16, v8, a3
-; LMULMAX2-RV32-NEXT: lui a4, 16
-; LMULMAX2-RV32-NEXT: addi a4, a4, -256
-; LMULMAX2-RV32-NEXT: vand.vx v16, v16, a4
-; LMULMAX2-RV32-NEXT: vor.vv v14, v16, v14
-; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v14
-; LMULMAX2-RV32-NEXT: vand.vv v10, v8, v10
-; LMULMAX2-RV32-NEXT: vsll.vi v10, v10, 8
-; LMULMAX2-RV32-NEXT: vand.vx v14, v8, a1
-; LMULMAX2-RV32-NEXT: vsll.vi v14, v14, 24
-; LMULMAX2-RV32-NEXT: vor.vv v10, v14, v10
-; LMULMAX2-RV32-NEXT: vsll.vx v14, v8, a2
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a4
-; LMULMAX2-RV32-NEXT: vsll.vx v8, v8, a3
-; LMULMAX2-RV32-NEXT: vor.vv v8, v14, v8
+; LMULMAX2-RV32-NEXT: vsrl.vi v16, v8, 8
+; LMULMAX2-RV32-NEXT: vand.vv v16, v16, v14
+; LMULMAX2-RV32-NEXT: vor.vv v12, v16, v12
+; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10
+; LMULMAX2-RV32-NEXT: vsll.vx v12, v8, a1
+; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a3
+; LMULMAX2-RV32-NEXT: vsll.vx v16, v16, a2
+; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v16
+; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a4
+; LMULMAX2-RV32-NEXT: vsll.vi v16, v16, 24
+; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 8
+; LMULMAX2-RV32-NEXT: vor.vv v8, v16, v8
+; LMULMAX2-RV32-NEXT: vor.vv v8, v12, v8
; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12
; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4
; LMULMAX2-RV32-NEXT: lui a1, 61681
; LMULMAX2-RV32-NEXT: addi a1, a1, -241
; LMULMAX1-RV32-NEXT: vle64.v v8, (a0)
; LMULMAX1-RV32-NEXT: addi a1, a0, 16
; LMULMAX1-RV32-NEXT: vle64.v v10, (a1)
-; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-RV32-NEXT: vmv.v.i v9, 0
+; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5
; LMULMAX1-RV32-NEXT: lui a2, 1044480
; LMULMAX1-RV32-NEXT: vmerge.vxm v9, v9, a2, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vand.vx v11, v11, a3, v0.t
; RV32-NEXT: vor.vv v10, v11, v10, v0.t
; RV32-NEXT: vsrl.vi v11, v8, 8, v0.t
-; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vmv.v.i v12, 0
+; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: lui a4, 1044480
; RV32-NEXT: vmerge.vxm v12, v12, a4, v0
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vand.vx v10, v10, a3
; RV32-NEXT: vor.vv v9, v10, v9
-; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: lui a4, 1044480
; RV32-NEXT: vmerge.vxm v10, v10, a4, v0
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vand.vx v14, v14, a3, v0.t
; RV32-NEXT: vor.vv v12, v14, v12, v0.t
-; RV32-NEXT: vsrl.vi v14, v8, 8, v0.t
-; RV32-NEXT: li a4, 85
+; RV32-NEXT: vsrl.vi v14, v8, 24, v0.t
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v14, v14, a4, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT: li a5, 85
; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.x v0, a4
+; RV32-NEXT: vmv.v.x v0, a5
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: lui a4, 1044480
-; RV32-NEXT: vmerge.vxm v16, v16, a4, v0
+; RV32-NEXT: vmv.v.i v18, 0
+; RV32-NEXT: lui a5, 1044480
+; RV32-NEXT: vmerge.vxm v18, v18, a5, v0
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmv1r.v v0, v10
-; RV32-NEXT: vand.vv v14, v14, v16, v0.t
-; RV32-NEXT: vsrl.vi v18, v8, 24, v0.t
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v18, v18, a0, v0.t
-; RV32-NEXT: vor.vv v14, v14, v18, v0.t
+; RV32-NEXT: vand.vv v16, v16, v18, v0.t
+; RV32-NEXT: vor.vv v14, v16, v14, v0.t
; RV32-NEXT: vor.vv v12, v14, v12, v0.t
; RV32-NEXT: vsll.vx v14, v8, a1, v0.t
-; RV32-NEXT: vand.vx v18, v8, a3, v0.t
-; RV32-NEXT: vsll.vx v18, v18, a2, v0.t
-; RV32-NEXT: vor.vv v14, v14, v18, v0.t
-; RV32-NEXT: vand.vx v18, v8, a0, v0.t
-; RV32-NEXT: vsll.vi v18, v18, 24, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vx v16, v8, a3, v0.t
+; RV32-NEXT: vsll.vx v16, v16, a2, v0.t
+; RV32-NEXT: vor.vv v14, v14, v16, v0.t
+; RV32-NEXT: vand.vx v16, v8, a4, v0.t
+; RV32-NEXT: vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT: vand.vv v8, v8, v18, v0.t
; RV32-NEXT: vsll.vi v8, v8, 8, v0.t
-; RV32-NEXT: vor.vv v8, v18, v8, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: vor.vv v8, v14, v8, v0.t
; RV32-NEXT: vor.vv v8, v8, v12, v0.t
; RV32-NEXT: ret
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vand.vx v12, v12, a3
; RV32-NEXT: vor.vv v10, v12, v10
+; RV32-NEXT: vsrl.vi v12, v8, 8
; RV32-NEXT: li a4, 85
; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; RV32-NEXT: vmv.v.x v0, a4
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.i v12, 0
+; RV32-NEXT: vmv.v.i v14, 0
; RV32-NEXT: lui a4, 1044480
-; RV32-NEXT: vmerge.vxm v12, v12, a4, v0
+; RV32-NEXT: vmerge.vxm v14, v14, a4, v0
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v14, v8, 8
-; RV32-NEXT: vand.vv v14, v14, v12
+; RV32-NEXT: vand.vv v12, v12, v14
; RV32-NEXT: vsrl.vi v16, v8, 24
; RV32-NEXT: lui a0, 4080
; RV32-NEXT: vand.vx v16, v16, a0
-; RV32-NEXT: vor.vv v14, v14, v16
-; RV32-NEXT: vor.vv v10, v14, v10
-; RV32-NEXT: vsll.vx v14, v8, a1
+; RV32-NEXT: vor.vv v12, v12, v16
+; RV32-NEXT: vor.vv v10, v12, v10
+; RV32-NEXT: vsll.vx v12, v8, a1
; RV32-NEXT: vand.vx v16, v8, a3
; RV32-NEXT: vsll.vx v16, v16, a2
-; RV32-NEXT: vor.vv v14, v14, v16
-; RV32-NEXT: vand.vv v12, v8, v12
-; RV32-NEXT: vsll.vi v12, v12, 8
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vor.vv v8, v14, v8
+; RV32-NEXT: vor.vv v12, v12, v16
+; RV32-NEXT: vand.vx v16, v8, a0
+; RV32-NEXT: vsll.vi v16, v16, 24
+; RV32-NEXT: vand.vv v8, v8, v14
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v16, v8
+; RV32-NEXT: vor.vv v8, v12, v8
; RV32-NEXT: vor.vv v8, v8, v10
; RV32-NEXT: ret
;
; RV32-NEXT: lui a3, 16
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vand.vx v20, v20, a3, v0.t
-; RV32-NEXT: vor.vv v20, v20, v16, v0.t
-; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
-; RV32-NEXT: lui a4, 5
-; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: vor.vv v16, v20, v16, v0.t
+; RV32-NEXT: vsrl.vi v20, v8, 24, v0.t
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v24, v20, a4, v0.t
+; RV32-NEXT: vsrl.vi v28, v8, 8, v0.t
+; RV32-NEXT: lui a5, 5
+; RV32-NEXT: addi a5, a5, 1365
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT: vmv.v.x v0, a4
+; RV32-NEXT: vmv.v.x v0, a5
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: lui a4, 1044480
-; RV32-NEXT: vmerge.vxm v16, v16, a4, v0
+; RV32-NEXT: vmv.v.i v20, 0
+; RV32-NEXT: lui a5, 1044480
+; RV32-NEXT: vmerge.vxm v20, v20, a5, v0
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmv1r.v v0, v12
-; RV32-NEXT: vand.vv v24, v24, v16, v0.t
-; RV32-NEXT: vsrl.vi v28, v8, 24, v0.t
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v28, v28, a0, v0.t
-; RV32-NEXT: vor.vv v24, v24, v28, v0.t
-; RV32-NEXT: vor.vv v20, v24, v20, v0.t
+; RV32-NEXT: vand.vv v28, v28, v20, v0.t
+; RV32-NEXT: vor.vv v24, v28, v24, v0.t
+; RV32-NEXT: vor.vv v16, v24, v16, v0.t
; RV32-NEXT: vsll.vx v24, v8, a1, v0.t
; RV32-NEXT: vand.vx v28, v8, a3, v0.t
; RV32-NEXT: vsll.vx v28, v28, a2, v0.t
; RV32-NEXT: vor.vv v24, v24, v28, v0.t
-; RV32-NEXT: vand.vx v28, v8, a0, v0.t
+; RV32-NEXT: vand.vx v28, v8, a4, v0.t
; RV32-NEXT: vsll.vi v28, v28, 24, v0.t
-; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: vand.vv v8, v8, v20, v0.t
; RV32-NEXT: vsll.vi v8, v8, 8, v0.t
; RV32-NEXT: vor.vv v8, v28, v8, v0.t
; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: vor.vv v8, v8, v20, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vp_bswap_v8i64:
; RV32-NEXT: addi a3, a3, -256
; RV32-NEXT: vand.vx v16, v16, a3
; RV32-NEXT: vor.vv v12, v16, v12
+; RV32-NEXT: vsrl.vi v20, v8, 8
; RV32-NEXT: lui a4, 5
; RV32-NEXT: addi a4, a4, 1365
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV32-NEXT: lui a4, 1044480
; RV32-NEXT: vmerge.vxm v16, v16, a4, v0
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v20, v8, 8
; RV32-NEXT: vand.vv v20, v20, v16
; RV32-NEXT: vsrl.vi v24, v8, 24
; RV32-NEXT: lui a0, 4080
; RV32-NEXT: vand.vx v24, v8, a3
; RV32-NEXT: vsll.vx v24, v24, a2
; RV32-NEXT: vor.vv v20, v20, v24
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsll.vi v16, v16, 8
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vsll.vi v8, v8, 24
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vand.vx v24, v8, a0
+; RV32-NEXT: vsll.vi v24, v24, 24
+; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vsll.vi v8, v8, 8
+; RV32-NEXT: vor.vv v8, v24, v8
; RV32-NEXT: vor.vv v8, v20, v8
; RV32-NEXT: vor.vv v8, v8, v12
; RV32-NEXT: ret
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v24, v8, a1, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: lui a2, 16
-; RV32-NEXT: addi a2, a2, -256
-; RV32-NEXT: vand.vx v24, v8, a2, v0.t
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: vsll.vx v24, v24, a3, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: vsrl.vx v24, v8, a2, v0.t
+; RV32-NEXT: lui a3, 16
+; RV32-NEXT: addi a3, a3, -256
+; RV32-NEXT: vand.vx v24, v24, a3, v0.t
; RV32-NEXT: vor.vv v24, v24, v16, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: vand.vx v24, v8, a4, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT: vand.vx v24, v24, a4, v0.t
; RV32-NEXT: csrr a5, vlenb
; RV32-NEXT: slli a5, a5, 3
; RV32-NEXT: add a5, sp, a5
; RV32-NEXT: addi a5, a5, 16
; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
; RV32-NEXT: li a5, 32
+; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT: addi a6, sp, 16
+; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
; RV32-NEXT: lui a6, 349525
; RV32-NEXT: addi a6, a6, 1365
; RV32-NEXT: lui a7, 1044480
; RV32-NEXT: vmv.v.x v0, a6
; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vxm v24, v24, a7, v0
-; RV32-NEXT: addi a5, sp, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vmerge.vxm v16, v24, a7, v0
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: li a6, 24
+; RV32-NEXT: mul a5, a5, a6
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a0, a0, a5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v24, v16, v0.t
+; RV32-NEXT: vor.vv v24, v16, v24, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT: vand.vx v24, v8, a3, v0.t
+; RV32-NEXT: vsll.vx v24, v24, a2, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v24, v8, a4, v0.t
+; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT: vand.vx v16, v24, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v8, 8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t
-; RV32-NEXT: vand.vx v8, v8, a4, v0.t
; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vx v16, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v24, v8, a2
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: vsll.vx v24, v24, a3
+; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v16, v8, a4
+; RV32-NEXT: vsll.vi v24, v16, 24
+; RV32-NEXT: li a5, 32
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: lui a6, 349525
+; RV32-NEXT: addi a6, a6, 1365
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: lui a2, 1044480
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vxm v16, v16, a2, v0
+; RV32-NEXT: lui a7, 1044480
+; RV32-NEXT: vmv.v.x v0, a6
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vxm v16, v16, a7, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsrl.vi v0, v8, 24
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v0, v0, a0
+; RV32-NEXT: vand.vv v0, v8, v16
+; RV32-NEXT: vsll.vi v0, v0, 8
; RV32-NEXT: vor.vv v24, v24, v0
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: li a1, 56
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: vsrl.vx v0, v8, a2
-; RV32-NEXT: lui a3, 16
-; RV32-NEXT: addi a3, a3, -256
-; RV32-NEXT: vand.vx v0, v0, a3
-; RV32-NEXT: vsrl.vx v24, v8, a1
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v0, v8, a3
+; RV32-NEXT: vand.vx v0, v0, a2
+; RV32-NEXT: vsrl.vx v24, v8, a1
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsll.vi v16, v16, 8
-; RV32-NEXT: vand.vx v0, v8, a0
-; RV32-NEXT: vsll.vi v0, v0, 24
-; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: vsll.vx v0, v8, a1
-; RV32-NEXT: vand.vx v8, v8, a3
-; RV32-NEXT: vsll.vx v8, v8, a2
-; RV32-NEXT: vor.vv v8, v0, v8
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v0, v8, 8
+; RV32-NEXT: vand.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v8, v8, 24
+; RV32-NEXT: vand.vx v8, v8, a4
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 24
-; RV32-NEXT: mul a1, a1, a2
+; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsll.vx v24, v8, a1, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: lui a2, 16
-; RV32-NEXT: addi a2, a2, -256
-; RV32-NEXT: vand.vx v24, v8, a2, v0.t
-; RV32-NEXT: li a3, 40
-; RV32-NEXT: vsll.vx v24, v24, a3, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 4
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 3
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: vsrl.vx v24, v8, a2, v0.t
+; RV32-NEXT: lui a3, 16
+; RV32-NEXT: addi a3, a3, -256
+; RV32-NEXT: vand.vx v24, v24, a3, v0.t
; RV32-NEXT: vor.vv v24, v24, v16, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
; RV32-NEXT: lui a4, 4080
-; RV32-NEXT: vand.vx v24, v8, a4, v0.t
-; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT: vand.vx v24, v24, a4, v0.t
; RV32-NEXT: csrr a5, vlenb
; RV32-NEXT: slli a5, a5, 3
; RV32-NEXT: add a5, sp, a5
; RV32-NEXT: addi a5, a5, 16
; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
; RV32-NEXT: li a5, 32
+; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT: addi a6, sp, 16
+; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
; RV32-NEXT: lui a6, 349525
; RV32-NEXT: addi a6, a6, 1365
; RV32-NEXT: lui a7, 1044480
; RV32-NEXT: vmv.v.x v0, a6
; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vxm v24, v24, a7, v0
-; RV32-NEXT: addi a5, sp, 16
-; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vmerge.vxm v16, v24, a7, v0
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: li a6, 24
+; RV32-NEXT: mul a5, a5, a6
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vand.vv v16, v8, v24, v0.t
-; RV32-NEXT: vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a0, a0, a5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v16, v24, v16, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v24, v16, v0.t
+; RV32-NEXT: vor.vv v24, v16, v24, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t
+; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT: vand.vx v24, v8, a3, v0.t
+; RV32-NEXT: vsll.vx v24, v24, a2, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v24, v8, a4, v0.t
+; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT: vand.vx v16, v24, a2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v24, v16, v24, v0.t
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v8, 8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t
-; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v16, v24, v0.t
-; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t
-; RV32-NEXT: vand.vx v8, v8, a4, v0.t
; RV32-NEXT: vor.vv v8, v24, v8, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 24
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsll.vx v16, v8, a1
+; RV32-NEXT: lui a2, 16
+; RV32-NEXT: addi a2, a2, -256
+; RV32-NEXT: vand.vx v24, v8, a2
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: vsll.vx v24, v24, a3
+; RV32-NEXT: vor.vv v16, v16, v24
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v16, v8, a4
+; RV32-NEXT: vsll.vi v24, v16, 24
+; RV32-NEXT: li a5, 32
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
+; RV32-NEXT: lui a6, 349525
+; RV32-NEXT: addi a6, a6, 1365
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: lui a2, 1044480
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmerge.vxm v16, v16, a2, v0
+; RV32-NEXT: lui a7, 1044480
+; RV32-NEXT: vmv.v.x v0, a6
+; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; RV32-NEXT: vmerge.vxm v16, v16, a7, v0
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v24, v8, 8
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsrl.vi v0, v8, 24
-; RV32-NEXT: lui a0, 4080
-; RV32-NEXT: vand.vx v0, v0, a0
+; RV32-NEXT: vand.vv v0, v8, v16
+; RV32-NEXT: vsll.vi v0, v0, 8
; RV32-NEXT: vor.vv v24, v24, v0
-; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: li a1, 56
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: vsrl.vx v0, v8, a2
-; RV32-NEXT: lui a3, 16
-; RV32-NEXT: addi a3, a3, -256
-; RV32-NEXT: vand.vx v0, v0, a3
-; RV32-NEXT: vsrl.vx v24, v8, a1
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vx v0, v8, a3
+; RV32-NEXT: vand.vx v0, v0, a2
+; RV32-NEXT: vsrl.vx v24, v8, a1
; RV32-NEXT: vor.vv v24, v0, v24
-; RV32-NEXT: vand.vv v16, v8, v16
-; RV32-NEXT: vsll.vi v16, v16, 8
-; RV32-NEXT: vand.vx v0, v8, a0
-; RV32-NEXT: vsll.vi v0, v0, 24
-; RV32-NEXT: vor.vv v16, v0, v16
-; RV32-NEXT: vsll.vx v0, v8, a1
-; RV32-NEXT: vand.vx v8, v8, a3
-; RV32-NEXT: vsll.vx v8, v8, a2
-; RV32-NEXT: vor.vv v8, v0, v8
-; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v0, v8, 8
+; RV32-NEXT: vand.vv v16, v0, v16
+; RV32-NEXT: vsrl.vi v8, v8, 24
+; RV32-NEXT: vand.vx v8, v8, a4
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: vor.vv v8, v8, v24
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v16, v8
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vmv.v.i v9, 0
+; RV32-NEXT: vmv.v.i v0, 5
; RV32-NEXT: lui a1, 1044480
; RV32-NEXT: vmerge.vxm v9, v9, a1, v0
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX2-RV32: # %bb.0:
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; LMULMAX2-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: li a1, 85
+; LMULMAX2-RV32-NEXT: li a1, 56
+; LMULMAX2-RV32-NEXT: vsrl.vx v10, v8, a1
+; LMULMAX2-RV32-NEXT: li a2, 40
+; LMULMAX2-RV32-NEXT: vsrl.vx v12, v8, a2
+; LMULMAX2-RV32-NEXT: lui a3, 16
+; LMULMAX2-RV32-NEXT: addi a3, a3, -256
+; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a3
+; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10
+; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 24
+; LMULMAX2-RV32-NEXT: lui a4, 4080
+; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a4
+; LMULMAX2-RV32-NEXT: li a5, 85
; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v0, a1
+; LMULMAX2-RV32-NEXT: vmv.v.x v0, a5
; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.i v10, 0
-; LMULMAX2-RV32-NEXT: lui a1, 1044480
-; LMULMAX2-RV32-NEXT: vmerge.vxm v10, v10, a1, v0
+; LMULMAX2-RV32-NEXT: vmv.v.i v14, 0
+; LMULMAX2-RV32-NEXT: lui a5, 1044480
+; LMULMAX2-RV32-NEXT: vmerge.vxm v14, v14, a5, v0
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 8
-; LMULMAX2-RV32-NEXT: vand.vv v12, v12, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v14, v8, 24
-; LMULMAX2-RV32-NEXT: lui a1, 4080
-; LMULMAX2-RV32-NEXT: vand.vx v14, v14, a1
-; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v14
-; LMULMAX2-RV32-NEXT: li a2, 56
-; LMULMAX2-RV32-NEXT: vsrl.vx v14, v8, a2
-; LMULMAX2-RV32-NEXT: li a3, 40
-; LMULMAX2-RV32-NEXT: vsrl.vx v16, v8, a3
-; LMULMAX2-RV32-NEXT: lui a4, 16
-; LMULMAX2-RV32-NEXT: addi a4, a4, -256
-; LMULMAX2-RV32-NEXT: vand.vx v16, v16, a4
-; LMULMAX2-RV32-NEXT: vor.vv v14, v16, v14
-; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v14
-; LMULMAX2-RV32-NEXT: vand.vv v10, v8, v10
-; LMULMAX2-RV32-NEXT: vsll.vi v10, v10, 8
-; LMULMAX2-RV32-NEXT: vand.vx v14, v8, a1
-; LMULMAX2-RV32-NEXT: vsll.vi v14, v14, 24
-; LMULMAX2-RV32-NEXT: vor.vv v10, v14, v10
-; LMULMAX2-RV32-NEXT: vsll.vx v14, v8, a2
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a4
-; LMULMAX2-RV32-NEXT: vsll.vx v8, v8, a3
-; LMULMAX2-RV32-NEXT: vor.vv v8, v14, v8
+; LMULMAX2-RV32-NEXT: vsrl.vi v16, v8, 8
+; LMULMAX2-RV32-NEXT: vand.vv v16, v16, v14
+; LMULMAX2-RV32-NEXT: vor.vv v12, v16, v12
+; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10
+; LMULMAX2-RV32-NEXT: vsll.vx v12, v8, a1
+; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a3
+; LMULMAX2-RV32-NEXT: vsll.vx v16, v16, a2
+; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v16
+; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a4
+; LMULMAX2-RV32-NEXT: vsll.vi v16, v16, 24
+; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 8
+; LMULMAX2-RV32-NEXT: vor.vv v8, v16, v8
+; LMULMAX2-RV32-NEXT: vor.vv v8, v12, v8
; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12
; LMULMAX2-RV32-NEXT: vse64.v v8, (a0)
; LMULMAX2-RV32-NEXT: ret
;
; LMULMAX1-RV32-NEXT: vle64.v v8, (a0)
; LMULMAX1-RV32-NEXT: addi a1, a0, 16
; LMULMAX1-RV32-NEXT: vle64.v v9, (a1)
-; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-RV32-NEXT: vmv.v.i v10, 0
+; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5
; LMULMAX1-RV32-NEXT: lui a2, 1044480
; LMULMAX1-RV32-NEXT: vmerge.vxm v10, v10, a2, v0
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX4-NEXT: addi s0, sp, 256
; LMULMAX4-NEXT: .cfi_def_cfa s0, 0
; LMULMAX4-NEXT: andi sp, sp, -128
-; LMULMAX4-NEXT: addi a0, sp, 64
; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; LMULMAX4-NEXT: vmv.v.i v8, 0
+; LMULMAX4-NEXT: addi a0, sp, 64
; LMULMAX4-NEXT: vse32.v v8, (a0)
; LMULMAX4-NEXT: mv a0, sp
; LMULMAX4-NEXT: li a1, 1
; LMULMAX4-NEXT: sd a0, 136(sp)
; LMULMAX4-NEXT: li a0, 13
; LMULMAX4-NEXT: sd a0, 0(sp)
-; LMULMAX4-NEXT: addi a0, sp, 72
; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; LMULMAX4-NEXT: vmv.v.i v8, 0
+; LMULMAX4-NEXT: addi a0, sp, 72
; LMULMAX4-NEXT: vse32.v v8, (a0)
; LMULMAX4-NEXT: addi a0, sp, 8
; LMULMAX4-NEXT: li a1, 1
define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: vmv.v.x v16, a1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vand.vv v16, v24, v16
; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: lui a1, 209715
; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: li a2, 16
; RV32-NEXT: vslidedown.vi v24, v0, 2
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: bltu a0, a2, .LBB34_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: .LBB34_2:
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: li a3, 32
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a2, a2, a4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a2, a2, a4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a2, a1, 1365
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a3, 16
+; RV32-NEXT: vmv.v.x v16, a2
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: li a4, 40
; RV32-NEXT: mul a2, a2, a4
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: mv a2, a0
+; RV32-NEXT: bltu a0, a3, .LBB34_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: .LBB34_2:
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t
-; RV32-NEXT: vadd.vv v16, v16, v8, v0.t
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 5
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 24
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v16, a3
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t
; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 24
+; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: mul a0, a0, a2
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a2, 24
+; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: li a2, 40
-; RV32-NEXT: mul a1, a1, a2
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 5
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: bltu a0, a2, .LBB35_2
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a2, a1, 1365
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a3, 16
+; RV32-NEXT: vmv.v.x v0, a2
+; RV32-NEXT: mv a2, a0
+; RV32-NEXT: bltu a0, a3, .LBB35_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
+; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB35_2:
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: li a3, 32
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a2, a2, a4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v16, v24
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v0, a2
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v8, v0
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: sub sp, sp, a3
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vand.vv v24, v24, v0
+; RV32-NEXT: vsub.vv v8, v8, v24
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vmv.v.x v0, a3
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v24, v8, v0
; RV32-NEXT: vsrl.vi v8, v8, 2
; RV32-NEXT: vand.vv v8, v8, v0
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 4
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v24, v8
+; RV32-NEXT: vadd.vv v8, v24, v8
+; RV32-NEXT: vsrl.vi v24, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v8, v8, v24
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v24, a3
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a1, 56
-; RV32-NEXT: vsrl.vx v8, v24, a1
+; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: and a0, a0, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vsrl.vi v24, v16, 1
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsub.vv v24, v8, v24
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v24, v24, v8
+; RV32-NEXT: vsub.vv v24, v16, v24
; RV32-NEXT: vand.vv v8, v24, v0
; RV32-NEXT: vsrl.vi v24, v24, 2
; RV32-NEXT: vand.vv v24, v24, v0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
; LMULMAX2-RV32: # %bb.0:
; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX2-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
; LMULMAX2-RV32-NEXT: lui a1, 349525
; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v9, v9, v10
+; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV32-NEXT: vand.vv v9, v10, v9
; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9
; LMULMAX2-RV32-NEXT: lui a1, 209715
; LMULMAX2-RV32-NEXT: addi a1, a1, 819
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1
; LMULMAX1-RV32-NEXT: lui a1, 349525
; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX1-RV32-NEXT: vand.vv v9, v10, v9
; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9
; LMULMAX1-RV32-NEXT: lui a1, 209715
; LMULMAX1-RV32-NEXT: addi a1, a1, 819
; LMULMAX2-RV32: # %bb.0:
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; LMULMAX2-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1
; LMULMAX2-RV32-NEXT: lui a1, 349525
; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1
+; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12
+; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 1
+; LMULMAX2-RV32-NEXT: vand.vv v10, v12, v10
; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10
; LMULMAX2-RV32-NEXT: lui a1, 209715
; LMULMAX2-RV32-NEXT: addi a1, a1, 819
; LMULMAX1-RV32-LABEL: ctpop_v4i64:
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a0)
; LMULMAX1-RV32-NEXT: addi a1, a0, 16
-; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
-; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a1)
; LMULMAX1-RV32-NEXT: lui a2, 349525
; LMULMAX1-RV32-NEXT: addi a2, a2, 1365
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v11, a2
+; LMULMAX1-RV32-NEXT: vmv.v.x v10, a2
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vsrl.vi v11, v9, 1
+; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v10
+; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v11
; LMULMAX1-RV32-NEXT: lui a2, 209715
; LMULMAX1-RV32-NEXT: addi a2, a2, 819
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a2
+; LMULMAX1-RV32-NEXT: vmv.v.x v11, a2
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v12, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v12, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v12, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT: vand.vv v12, v9, v11
+; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v11
+; LMULMAX1-RV32-NEXT: vadd.vv v9, v12, v9
+; LMULMAX1-RV32-NEXT: vsrl.vi v12, v9, 4
+; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v12
; LMULMAX1-RV32-NEXT: lui a2, 61681
; LMULMAX1-RV32-NEXT: addi a2, a2, -241
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-RV32-NEXT: vmv.v.x v12, a2
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v12
; LMULMAX1-RV32-NEXT: lui a2, 4112
; LMULMAX1-RV32-NEXT: addi a2, a2, 257
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-RV32-NEXT: vmv.v.x v13, a2
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT: li a2, 56
-; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a2
-; LMULMAX1-RV32-NEXT: vsrl.vi v14, v9, 1
-; LMULMAX1-RV32-NEXT: vand.vv v11, v14, v11
-; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT: vand.vv v11, v9, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v11, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v12
; LMULMAX1-RV32-NEXT: vmul.vv v9, v9, v13
+; LMULMAX1-RV32-NEXT: li a2, 56
; LMULMAX1-RV32-NEXT: vsrl.vx v9, v9, a2
-; LMULMAX1-RV32-NEXT: vse64.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vsrl.vi v14, v8, 1
+; LMULMAX1-RV32-NEXT: vand.vv v10, v14, v10
+; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vand.vv v10, v8, v11
+; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v11
+; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8
+; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v13
+; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a2
+; LMULMAX1-RV32-NEXT: vse64.v v8, (a0)
+; LMULMAX1-RV32-NEXT: vse64.v v9, (a1)
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64-LABEL: ctpop_v4i64:
; RV32-NEXT: slli a1, a1, 6
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
+; RV32-NEXT: vmv1r.v v24, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv8r.v v16, v8
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: vslidedown.vi v24, v0, 2
-; RV32-NEXT: mv a2, a0
-; RV32-NEXT: bltu a0, a1, .LBB34_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: .LBB34_2:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 56
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vslidedown.vi v0, v0, 2
+; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltu a2, a0, a1
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a2, a2, a1
; RV32-NEXT: li a3, 32
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v8, -1
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: li a1, 1
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vxor.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 56
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsub.vx v16, v16, a1, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 48
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a4
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 24
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 56
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 48
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 48
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: addi a4, a4, 819
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a4
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 56
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 48
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 56
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: addi a4, a4, -241
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a4
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: lui a4, 4112
; RV32-NEXT: addi a4, a4, 257
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vmv.v.x v8, a4
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a3, 16
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: bltu a0, a3, .LBB34_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a0, 16
+; RV32-NEXT: .LBB34_2:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vxor.vv v16, v16, v8, v0.t
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsub.vx v16, v16, a1, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 56
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 6
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: sub sp, sp, a2
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; RV32-NEXT: li a2, 1
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a2
-; RV32-NEXT: li a3, 32
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, -1
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.i v16, -1
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vxor.vv v8, v8, v24
-; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vxor.vv v16, v8, v16
+; RV32-NEXT: li a3, 1
+; RV32-NEXT: vsub.vx v8, v8, a3
+; RV32-NEXT: vand.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: lui a4, 349525
; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a4
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 24
; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: lui a4, 209715
; RV32-NEXT: addi a4, a4, 819
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a4
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vadd.vv v8, v8, v24
; RV32-NEXT: lui a4, 61681
; RV32-NEXT: addi a4, a4, -241
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a4
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: lui a4, 4112
; RV32-NEXT: addi a4, a4, 257
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a4
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, a0, -16
+; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: and a0, a0, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vxor.vv v8, v0, v8
-; RV32-NEXT: vsub.vx v0, v0, a2
+; RV32-NEXT: vsub.vx v0, v0, a3
; RV32-NEXT: vand.vv v8, v8, v0
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a1, a1, 6
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
+; RV32-NEXT: vmv1r.v v24, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vmv8r.v v16, v8
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: vslidedown.vi v24, v0, 2
-; RV32-NEXT: mv a2, a0
-; RV32-NEXT: bltu a0, a1, .LBB70_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: .LBB70_2:
-; RV32-NEXT: li a1, 1
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 56
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vslidedown.vi v0, v0, 2
+; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltu a2, a0, a1
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a2, a2, a1
; RV32-NEXT: li a3, 32
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v8, -1
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: li a1, 1
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vxor.vv v16, v16, v8, v0.t
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 56
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsub.vx v16, v16, a1, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 48
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a4
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 24
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 56
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v16, v8, v0.t
+; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 48
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 48
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: addi a4, a4, 819
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a4
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 56
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 48
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: li a5, 56
+; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: addi a4, a4, -241
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a4
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 48
-; RV32-NEXT: mul a4, a4, a5
+; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: lui a4, 4112
; RV32-NEXT: addi a4, a4, 257
; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a4
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vmv.v.x v8, a4
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vmul.vv v8, v16, v8, v0.t
; RV32-NEXT: li a2, 56
+; RV32-NEXT: li a3, 16
; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: bltu a0, a3, .LBB70_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a0, 16
+; RV32-NEXT: .LBB70_2:
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmv1r.v v0, v24
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsub.vx v8, v16, a1, v0.t
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vxor.vv v16, v16, v8, v0.t
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
+; RV32-NEXT: vxor.vv v8, v16, v8, v0.t
+; RV32-NEXT: vsub.vx v16, v16, a1, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
+; RV32-NEXT: li a1, 48
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 40
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 56
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v16, v8, v16, v0.t
+; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 48
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 48
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 6
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: sub sp, sp, a2
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; RV32-NEXT: li a2, 1
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a2
-; RV32-NEXT: li a3, 32
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, -1
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: slli a4, a4, 5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: li a2, 32
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.i v16, -1
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 5
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 16
+; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vxor.vv v8, v8, v24
-; RV32-NEXT: vand.vv v8, v8, v16
+; RV32-NEXT: vxor.vv v16, v8, v16
+; RV32-NEXT: li a3, 1
+; RV32-NEXT: vsub.vx v8, v8, a3
+; RV32-NEXT: vand.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: lui a4, 349525
; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a4
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: li a5, 24
; RV32-NEXT: vsub.vv v8, v8, v16
; RV32-NEXT: lui a4, 209715
; RV32-NEXT: addi a4, a4, 819
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v16, a4
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vadd.vv v8, v8, v24
; RV32-NEXT: lui a4, 61681
; RV32-NEXT: addi a4, a4, -241
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a4
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: vand.vv v8, v8, v24
; RV32-NEXT: lui a4, 4112
; RV32-NEXT: addi a4, a4, 257
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v24, a4
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v24
; RV32-NEXT: li a1, 56
; RV32-NEXT: vsrl.vx v8, v8, a1
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 3
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, a0, -16
-; RV32-NEXT: sltu a0, a0, a3
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, a0, -16
+; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: and a0, a0, a2
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vxor.vv v8, v0, v8
-; RV32-NEXT: vsub.vx v0, v0, a2
+; RV32-NEXT: vsub.vx v0, v0, a3
; RV32-NEXT: vand.vv v8, v8, v0
; RV32-NEXT: vsrl.vi v0, v8, 1
; RV32-NEXT: csrr a0, vlenb
; LMULMAX1-LABEL: sextload_v4i8_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf8 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v9, v8
-; LMULMAX1-NEXT: vsext.vf8 v8, v10
+; LMULMAX1-NEXT: vsext.vf8 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v4i8_v4i64:
; LMULMAX1-LABEL: zextload_v4i8_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf8 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v9, v8
-; LMULMAX1-NEXT: vzext.vf8 v8, v10
+; LMULMAX1-NEXT: vzext.vf8 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v4i8_v4i64:
; LMULMAX1-LABEL: sextload_v8i8_v8i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v9, v8
-; LMULMAX1-NEXT: vsext.vf4 v8, v10
+; LMULMAX1-NEXT: vsext.vf4 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v8i8_v8i32:
; LMULMAX1-LABEL: zextload_v8i8_v8i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v9, v8
-; LMULMAX1-NEXT: vzext.vf4 v8, v10
+; LMULMAX1-NEXT: vzext.vf4 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v8i8_v8i32:
; LMULMAX1-LABEL: sextload_v8i8_v8i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vle8.v v12, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf8 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v10, v8
+; LMULMAX1-NEXT: vsext.vf8 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v9, v11
+; LMULMAX1-NEXT: vsext.vf8 v9, v12
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v11, v8
-; LMULMAX1-NEXT: vsext.vf8 v8, v12
+; LMULMAX1-NEXT: vsext.vf8 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v8i8_v8i64:
; LMULMAX1-LABEL: zextload_v8i8_v8i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vle8.v v12, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf8 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v10, v8
+; LMULMAX1-NEXT: vzext.vf8 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v9, v11
+; LMULMAX1-NEXT: vzext.vf8 v9, v12
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v11, v8
-; LMULMAX1-NEXT: vzext.vf8 v8, v12
+; LMULMAX1-NEXT: vzext.vf8 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v8i8_v8i64:
; LMULMAX1-LABEL: sextload_v16i8_v16i16:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 8
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 8
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i8_v16i16:
; LMULMAX1-LABEL: zextload_v16i8_v16i16:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 8
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 8
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i8_v16i16:
; LMULMAX1-LABEL: sextload_v16i8_v16i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX1-NEXT: vle8.v v12, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 8
+; LMULMAX1-NEXT: vslidedown.vi v11, v9, 8
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v10, v8
+; LMULMAX1-NEXT: vsext.vf4 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v9, v11
+; LMULMAX1-NEXT: vsext.vf4 v9, v12
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v11, v8
-; LMULMAX1-NEXT: vsext.vf4 v8, v12
+; LMULMAX1-NEXT: vsext.vf4 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i8_v16i32:
; LMULMAX1-LABEL: zextload_v16i8_v16i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX1-NEXT: vle8.v v12, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 8
+; LMULMAX1-NEXT: vslidedown.vi v11, v9, 8
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v10, v8
+; LMULMAX1-NEXT: vzext.vf4 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v9, v11
+; LMULMAX1-NEXT: vzext.vf4 v9, v12
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v11, v8
-; LMULMAX1-NEXT: vzext.vf4 v8, v12
+; LMULMAX1-NEXT: vzext.vf4 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i8_v16i32:
; LMULMAX1-LABEL: sextload_v16i8_v16i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX1-NEXT: vle8.v v16, (a0)
+; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf8 v8, v10
; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX1-NEXT: vslidedown.vi v11, v10, 8
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v12, v8
+; LMULMAX1-NEXT: vsext.vf8 v12, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v16, 2
+; LMULMAX1-NEXT: vslidedown.vi v13, v10, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v9, v10
+; LMULMAX1-NEXT: vsext.vf8 v9, v13
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v16, 4
+; LMULMAX1-NEXT: vslidedown.vi v15, v10, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v10, v11
+; LMULMAX1-NEXT: vsext.vf8 v10, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v14, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v14, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf8 v13, v14
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4
+; LMULMAX1-NEXT: vslidedown.vi v16, v11, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v14, v8
+; LMULMAX1-NEXT: vsext.vf8 v14, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v15, v11, 2
+; LMULMAX1-NEXT: vslidedown.vi v15, v15, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf8 v11, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v16, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v15, v8
-; LMULMAX1-NEXT: vsext.vf8 v8, v16
+; LMULMAX1-NEXT: vsext.vf8 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i8_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX4-NEXT: vle8.v v16, (a0)
+; LMULMAX4-NEXT: vle8.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vsext.vf8 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vsext.vf8 v12, v8
-; LMULMAX4-NEXT: vsext.vf8 v8, v16
+; LMULMAX4-NEXT: vsext.vf8 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i8>, ptr %x
%z = sext <16 x i8> %y to <16 x i64>
; LMULMAX1-LABEL: zextload_v16i8_v16i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX1-NEXT: vle8.v v16, (a0)
+; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf8 v8, v10
; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX1-NEXT: vslidedown.vi v11, v10, 8
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v12, v8
+; LMULMAX1-NEXT: vzext.vf8 v12, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v16, 2
+; LMULMAX1-NEXT: vslidedown.vi v13, v10, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v9, v10
+; LMULMAX1-NEXT: vzext.vf8 v9, v13
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v16, 4
+; LMULMAX1-NEXT: vslidedown.vi v15, v10, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v10, v11
+; LMULMAX1-NEXT: vzext.vf8 v10, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v14, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v14, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf8 v13, v14
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4
+; LMULMAX1-NEXT: vslidedown.vi v16, v11, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v14, v8
+; LMULMAX1-NEXT: vzext.vf8 v14, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v15, v11, 2
+; LMULMAX1-NEXT: vslidedown.vi v15, v15, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf8 v11, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v16, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v15, v8
-; LMULMAX1-NEXT: vzext.vf8 v8, v16
+; LMULMAX1-NEXT: vzext.vf8 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i8_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX4-NEXT: vle8.v v16, (a0)
+; LMULMAX4-NEXT: vle8.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vzext.vf8 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vzext.vf8 v12, v8
-; LMULMAX4-NEXT: vzext.vf8 v8, v16
+; LMULMAX4-NEXT: vzext.vf8 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i8>, ptr %x
%z = zext <16 x i8> %y to <16 x i64>
; LMULMAX1-LABEL: sextload_v4i16_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v9, v8
-; LMULMAX1-NEXT: vsext.vf4 v8, v10
+; LMULMAX1-NEXT: vsext.vf4 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v4i16_v4i64:
; LMULMAX1-LABEL: zextload_v4i16_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v9, v8
-; LMULMAX1-NEXT: vzext.vf4 v8, v10
+; LMULMAX1-NEXT: vzext.vf4 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v4i16_v4i64:
; LMULMAX1-LABEL: sextload_v8i16_v8i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v8i16_v8i32:
; LMULMAX1-LABEL: zextload_v8i16_v8i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v8i16_v8i32:
; LMULMAX1-LABEL: sextload_v8i16_v8i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v12, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v10, v8
+; LMULMAX1-NEXT: vsext.vf4 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v9, v11
+; LMULMAX1-NEXT: vsext.vf4 v9, v12
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v11, v8
-; LMULMAX1-NEXT: vsext.vf4 v8, v12
+; LMULMAX1-NEXT: vsext.vf4 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v8i16_v8i64:
; LMULMAX1-LABEL: zextload_v8i16_v8i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v12, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v10, v8
+; LMULMAX1-NEXT: vzext.vf4 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v9, v11
+; LMULMAX1-NEXT: vzext.vf4 v9, v12
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v11, v8
-; LMULMAX1-NEXT: vzext.vf4 v8, v12
+; LMULMAX1-NEXT: vzext.vf4 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v8i16_v8i64:
; LMULMAX1-LABEL: sextload_v16i16_v16i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle16.v v12, (a0)
+; LMULMAX1-NEXT: vle16.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
+; LMULMAX1-NEXT: vsext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v11, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
-; LMULMAX1-NEXT: vsext.vf2 v10, v12
+; LMULMAX1-NEXT: vsext.vf2 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i16_v16i32:
; LMULMAX1-LABEL: zextload_v16i16_v16i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle16.v v12, (a0)
+; LMULMAX1-NEXT: vle16.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
+; LMULMAX1-NEXT: vzext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v11, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
-; LMULMAX1-NEXT: vzext.vf2 v10, v12
+; LMULMAX1-NEXT: vzext.vf2 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i16_v16i32:
; LMULMAX1-LABEL: sextload_v16i16_v16i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v12, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle16.v v16, (a0)
+; LMULMAX1-NEXT: vle16.v v13, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v10, v8
+; LMULMAX1-NEXT: vsext.vf4 v10, v11
+; LMULMAX1-NEXT: vsext.vf4 v12, v13
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v15, v16, 4
+; LMULMAX1-NEXT: vslidedown.vi v15, v13, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf4 v14, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v9, v11
+; LMULMAX1-NEXT: vsext.vf4 v9, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v11, v8
+; LMULMAX1-NEXT: vsext.vf4 v11, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v13, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v13, v8
+; LMULMAX1-NEXT: vsext.vf4 v13, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v15, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v15, v8
-; LMULMAX1-NEXT: vsext.vf4 v8, v12
-; LMULMAX1-NEXT: vsext.vf4 v12, v16
+; LMULMAX1-NEXT: vsext.vf4 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i16_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; LMULMAX4-NEXT: vle16.v v16, (a0)
+; LMULMAX4-NEXT: vle16.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vsext.vf4 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e16, m2, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vsext.vf4 v12, v8
-; LMULMAX4-NEXT: vsext.vf4 v8, v16
+; LMULMAX4-NEXT: vsext.vf4 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i16>, ptr %x
%z = sext <16 x i16> %y to <16 x i64>
; LMULMAX1-LABEL: zextload_v16i16_v16i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v12, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle16.v v16, (a0)
+; LMULMAX1-NEXT: vle16.v v13, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v10, v8
+; LMULMAX1-NEXT: vzext.vf4 v10, v11
+; LMULMAX1-NEXT: vzext.vf4 v12, v13
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v15, v16, 4
+; LMULMAX1-NEXT: vslidedown.vi v15, v13, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf4 v14, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v9, v11
+; LMULMAX1-NEXT: vzext.vf4 v9, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v11, v8
+; LMULMAX1-NEXT: vzext.vf4 v11, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v13, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v13, v8
+; LMULMAX1-NEXT: vzext.vf4 v13, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v15, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v15, v8
-; LMULMAX1-NEXT: vzext.vf4 v8, v12
-; LMULMAX1-NEXT: vzext.vf4 v12, v16
+; LMULMAX1-NEXT: vzext.vf4 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i16_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; LMULMAX4-NEXT: vle16.v v16, (a0)
+; LMULMAX4-NEXT: vle16.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vzext.vf4 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e16, m2, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vzext.vf4 v12, v8
-; LMULMAX4-NEXT: vzext.vf4 v8, v16
+; LMULMAX4-NEXT: vzext.vf4 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i16>, ptr %x
%z = zext <16 x i16> %y to <16 x i64>
; LMULMAX1-LABEL: sextload_v4i32_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v4i32_v4i64:
; LMULMAX1-LABEL: zextload_v4i32_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v4i32_v4i64:
; LMULMAX1-LABEL: sextload_v8i32_v8i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vle32.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
+; LMULMAX1-NEXT: vsext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v11, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
-; LMULMAX1-NEXT: vsext.vf2 v10, v12
+; LMULMAX1-NEXT: vsext.vf2 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v8i32_v8i64:
; LMULMAX1-LABEL: zextload_v8i32_v8i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vle32.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
+; LMULMAX1-NEXT: vzext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v11, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
-; LMULMAX1-NEXT: vzext.vf2 v10, v12
+; LMULMAX1-NEXT: vzext.vf2 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v8i32_v8i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: addi a1, a0, 48
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v16, (a1)
+; LMULMAX1-NEXT: vle32.v v15, (a1)
; LMULMAX1-NEXT: addi a1, a0, 32
-; LMULMAX1-NEXT: vle32.v v14, (a1)
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v13, (a1)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vle32.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
+; LMULMAX1-NEXT: vsext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v11, v8
+; LMULMAX1-NEXT: vsext.vf2 v11, v12
+; LMULMAX1-NEXT: vsext.vf2 v12, v13
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v14, 2
+; LMULMAX1-NEXT: vslidedown.vi v14, v13, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v13, v8
+; LMULMAX1-NEXT: vsext.vf2 v13, v14
+; LMULMAX1-NEXT: vsext.vf2 v14, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v15, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
-; LMULMAX1-NEXT: vsext.vf2 v10, v12
-; LMULMAX1-NEXT: vsext.vf2 v12, v14
-; LMULMAX1-NEXT: vsext.vf2 v14, v16
+; LMULMAX1-NEXT: vsext.vf2 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i32_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; LMULMAX4-NEXT: vle32.v v16, (a0)
+; LMULMAX4-NEXT: vle32.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vsext.vf2 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vsext.vf2 v12, v8
-; LMULMAX4-NEXT: vsext.vf2 v8, v16
+; LMULMAX4-NEXT: vsext.vf2 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i32>, ptr %x
%z = sext <16 x i32> %y to <16 x i64>
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: addi a1, a0, 48
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v16, (a1)
+; LMULMAX1-NEXT: vle32.v v15, (a1)
; LMULMAX1-NEXT: addi a1, a0, 32
-; LMULMAX1-NEXT: vle32.v v14, (a1)
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v13, (a1)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vle32.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
+; LMULMAX1-NEXT: vzext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v11, v8
+; LMULMAX1-NEXT: vzext.vf2 v11, v12
+; LMULMAX1-NEXT: vzext.vf2 v12, v13
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v14, 2
+; LMULMAX1-NEXT: vslidedown.vi v14, v13, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v13, v8
+; LMULMAX1-NEXT: vzext.vf2 v13, v14
+; LMULMAX1-NEXT: vzext.vf2 v14, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v16, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v15, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
-; LMULMAX1-NEXT: vzext.vf2 v10, v12
-; LMULMAX1-NEXT: vzext.vf2 v12, v14
-; LMULMAX1-NEXT: vzext.vf2 v14, v16
+; LMULMAX1-NEXT: vzext.vf2 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i32_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; LMULMAX4-NEXT: vle32.v v16, (a0)
+; LMULMAX4-NEXT: vle32.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vzext.vf2 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vzext.vf2 v12, v8
-; LMULMAX4-NEXT: vzext.vf2 v8, v16
+; LMULMAX4-NEXT: vzext.vf2 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i32>, ptr %x
%z = zext <16 x i32> %y to <16 x i64>
define void @buildvec_dominant0_v4f32(<4 x float>* %x) {
; CHECK-LABEL: buildvec_dominant0_v4f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v8, zero
; CHECK-NEXT: lui a1, 262144
-; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v9, v8, 2
+; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v9, (a0)
+; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
store <4 x float> <float 2.0, float 2.0, float 0.0, float 2.0>, <4 x float>* %x
ret void
; CHECK-LABEL: buildvec_dominant1_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v8, zero
-; CHECK-NEXT: vfmv.v.f v9, fa0
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: vmv.s.x v9, zero
; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v9, v8, 1
+; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v9, (a0)
+; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
%v0 = insertelement <4 x float> poison, float %f, i32 0
%v1 = insertelement <4 x float> %v0, float 0.0, i32 1
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; LMULMAX1-NEXT: vle16.v v8, (a0)
-; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v9, v8, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v9
; LMULMAX1-NEXT: vfwcvt.f.f.v v9, v8
+; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4
+; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v8
; LMULMAX1-NEXT: addi a0, a1, 16
; LMULMAX1-NEXT: vse32.v v10, (a0)
; LMULMAX1-NEXT: vse32.v v9, (a1)
; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; LMULMAX1-NEXT: vfwcvt.f.f.v v11, v12
; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v10
-; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v12
-; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v8
; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; LMULMAX1-NEXT: vfwcvt.f.f.v v8, v12
+; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v10
+; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v12
; LMULMAX1-NEXT: addi a0, a1, 32
; LMULMAX1-NEXT: vse64.v v10, (a0)
; LMULMAX1-NEXT: vse64.v v8, (a1)
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; LMULMAX1-NEXT: vmv.v.i v8, 0
-; LMULMAX1-NEXT: vse16.v v8, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: addi a1, a0, 16
+; LMULMAX1-NEXT: vse16.v v8, (a1)
; LMULMAX1-NEXT: vse16.v v8, (a0)
; LMULMAX1-NEXT: ret
%a = insertelement <16 x half> poison, half 0.0, i32 0
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT: vmv.v.i v8, 0
-; LMULMAX1-NEXT: vse32.v v8, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: addi a1, a0, 16
+; LMULMAX1-NEXT: vse32.v v8, (a1)
; LMULMAX1-NEXT: vse32.v v8, (a0)
; LMULMAX1-NEXT: ret
%a = insertelement <8 x float> poison, float 0.0, i32 0
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vmv.v.i v8, 0
-; LMULMAX1-NEXT: vse64.v v8, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: addi a1, a0, 16
+; LMULMAX1-NEXT: vse64.v v8, (a1)
; LMULMAX1-NEXT: vse64.v v8, (a0)
; LMULMAX1-NEXT: ret
%a = insertelement <4 x double> poison, double 0.0, i32 0
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.x.f.v v9, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v9, 0, v0
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v9, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v9, 0, v0
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.x.f.v v12, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v12, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.f.f.v v9, v8
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfwcvt.rtz.x.f.v v8, v9
+; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
%a = load <2 x half>, ptr %x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.f.f.v v9, v8
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfwcvt.rtz.xu.f.v v8, v9
+; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
%a = load <2 x half>, ptr %x
define void @fp2si_v2f64_v2i32(ptr %x, ptr %y) {
; CHECK-LABEL: fp2si_v2f64_v2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8
; CHECK-NEXT: vmerge.vim v8, v9, 0, v0
; CHECK-NEXT: vse32.v v8, (a1)
; CHECK-NEXT: ret
define void @fp2ui_v2f64_v2i32(ptr %x, ptr %y) {
; CHECK-LABEL: fp2ui_v2f64_v2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8
; CHECK-NEXT: vmerge.vim v8, v9, 0, v0
; CHECK-NEXT: vse32.v v8, (a1)
; CHECK-NEXT: ret
; LMULMAX1-NEXT: addi a2, a0, 16
; LMULMAX1-NEXT: vle32.v v8, (a2)
; LMULMAX1-NEXT: vle32.v v9, (a0)
-; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v8, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v11, v10
+; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v10, v8
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
+; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v12, v10
-; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v10, v8
+; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v11, v8
; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v8, v9
+; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v9, v9, 2
+; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v12, v9
; LMULMAX1-NEXT: addi a0, a1, 16
; LMULMAX1-NEXT: vse64.v v12, (a0)
; LMULMAX1-NEXT: vse64.v v8, (a1)
; LMULMAX1-NEXT: addi a2, a0, 16
; LMULMAX1-NEXT: vle32.v v8, (a2)
; LMULMAX1-NEXT: vle32.v v9, (a0)
-; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v8, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v11, v10
+; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v10, v8
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
+; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v12, v10
-; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v10, v8
+; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v11, v8
; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v8, v9
+; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v9, v9, 2
+; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v12, v9
; LMULMAX1-NEXT: addi a0, a1, 16
; LMULMAX1-NEXT: vse64.v v12, (a0)
; LMULMAX1-NEXT: vse64.v v8, (a1)
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v24, (a0)
; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: li a0, 63
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vx v8, v24, a0, v0.t
-; RV32-NEXT: vsrl.vv v16, v16, v8, v0.t
+; RV32-NEXT: vsrl.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsll.vi v16, v8, 1, v0.t
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v8, -1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
; RV32-NEXT: vand.vx v8, v8, a0, v0.t
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsll.vi v24, v24, 1, v0.t
-; RV32-NEXT: vsll.vv v8, v24, v8, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vsll.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v24, (a0)
; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vx v8, v24, a0, v0.t
; RV32-NEXT: vsll.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v16, -1
+; RV32-NEXT: vmv.v.i v8, -1
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vxor.vv v16, v24, v16, v0.t
-; RV32-NEXT: vand.vx v16, v16, a0, v0.t
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v24, v24, 1, v0.t
-; RV32-NEXT: vsrl.vv v16, v24, v16, v0.t
-; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vxor.vv v8, v24, v8, v0.t
+; RV32-NEXT: vand.vx v8, v8, a0, v0.t
+; RV32-NEXT: vsrl.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX1-NEXT: vsext.vf2 v12, v11
; LMULMAX1-NEXT: vfwcvt.f.x.v v11, v12
-; LMULMAX1-NEXT: vsext.vf2 v12, v10
-; LMULMAX1-NEXT: vfwcvt.f.x.v v10, v12
; LMULMAX1-NEXT: vsext.vf2 v12, v8
; LMULMAX1-NEXT: vfwcvt.f.x.v v8, v12
+; LMULMAX1-NEXT: vsext.vf2 v12, v10
+; LMULMAX1-NEXT: vfwcvt.f.x.v v10, v12
; LMULMAX1-NEXT: addi a0, a1, 32
; LMULMAX1-NEXT: vse64.v v10, (a0)
; LMULMAX1-NEXT: vse64.v v8, (a1)
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX1-NEXT: vzext.vf2 v12, v11
; LMULMAX1-NEXT: vfwcvt.f.xu.v v11, v12
-; LMULMAX1-NEXT: vzext.vf2 v12, v10
-; LMULMAX1-NEXT: vfwcvt.f.xu.v v10, v12
; LMULMAX1-NEXT: vzext.vf2 v12, v8
; LMULMAX1-NEXT: vfwcvt.f.xu.v v8, v12
+; LMULMAX1-NEXT: vzext.vf2 v12, v10
+; LMULMAX1-NEXT: vfwcvt.f.xu.v v10, v12
; LMULMAX1-NEXT: addi a0, a1, 32
; LMULMAX1-NEXT: vse64.v v10, (a0)
; LMULMAX1-NEXT: vse64.v v8, (a1)
; CHECK-LABEL: insertelt_v64i1:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: vmv.s.x v12, a0
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
; CHECK-NEXT: vsetivli zero, 2, e8, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 1
+; CHECK-NEXT: vslideup.vi v12, v8, 1
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
+; CHECK-NEXT: vand.vi v8, v12, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
%y = insertelement <64 x i1> %x, i1 %elt, i64 1
; CHECK-LABEL: insertelt_idx_v64i1:
; CHECK: # %bb.0:
; CHECK-NEXT: li a2, 64
+; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: vmv.s.x v12, a0
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
; CHECK-NEXT: addi a0, a1, 1
; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
-; CHECK-NEXT: vslideup.vx v8, v12, a1
+; CHECK-NEXT: vslideup.vx v12, v8, a1
; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
+; CHECK-NEXT: vand.vi v8, v12, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
%y = insertelement <64 x i1> %x, i1 %elt, i32 %idx
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vlm.v v8, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 0
-; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
-; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
; CHECK-NEXT: vmv.v.i v10, 0
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma
+; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, ma
; CHECK-NEXT: vmv.v.v v9, v8
; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmsne.vi v0, v9, 0
; CHECK-LABEL: buildvec_vid_stepn3_add3_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vid.v v9
-; CHECK-NEXT: vmv.v.i v8, 3
+; CHECK-NEXT: vmv.v.i v9, 3
+; CHECK-NEXT: vid.v v8
; CHECK-NEXT: li a0, -3
-; CHECK-NEXT: vmacc.vx v8, a0, v9
+; CHECK-NEXT: vmadd.vx v8, a0, v9
; CHECK-NEXT: ret
ret <4 x i8> <i8 3, i8 0, i8 -3, i8 -6>
}
; CHECK-LABEL: buildvec_vid_stepn3_addn3_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vid.v v8
-; CHECK-NEXT: vmv.v.i v9, -3
+; CHECK-NEXT: vmv.v.i v8, -3
+; CHECK-NEXT: vid.v v9
; CHECK-NEXT: li a4, -3
-; CHECK-NEXT: vmacc.vx v9, a4, v8
+; CHECK-NEXT: vmadd.vx v9, a4, v8
; CHECK-NEXT: vse32.v v9, (a0)
; CHECK-NEXT: vse32.v v9, (a1)
; CHECK-NEXT: vse32.v v9, (a2)
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; LMULMAX1-NEXT: vle8.v v8, (a0)
-; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v9, v8, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v10, v9
; LMULMAX1-NEXT: vsext.vf4 v9, v8
+; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v10, v8
; LMULMAX1-NEXT: addi a0, a1, 16
; LMULMAX1-NEXT: vse32.v v10, (a0)
; LMULMAX1-NEXT: vse32.v v9, (a1)
; LMULMAX1-NEXT: vslidedown.vi v15, v12, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf4 v16, v15
-; LMULMAX1-NEXT: vsext.vf4 v15, v10
-; LMULMAX1-NEXT: vsext.vf4 v10, v12
-; LMULMAX1-NEXT: vsext.vf4 v12, v8
-; LMULMAX1-NEXT: vsext.vf4 v8, v9
+; LMULMAX1-NEXT: vsext.vf4 v15, v8
+; LMULMAX1-NEXT: vsext.vf4 v8, v10
+; LMULMAX1-NEXT: vsext.vf4 v10, v9
+; LMULMAX1-NEXT: vsext.vf4 v9, v12
; LMULMAX1-NEXT: addi a0, a1, 32
-; LMULMAX1-NEXT: vse32.v v10, (a0)
-; LMULMAX1-NEXT: vse32.v v8, (a1)
+; LMULMAX1-NEXT: vse32.v v9, (a0)
+; LMULMAX1-NEXT: vse32.v v10, (a1)
; LMULMAX1-NEXT: addi a0, a1, 96
-; LMULMAX1-NEXT: vse32.v v15, (a0)
+; LMULMAX1-NEXT: vse32.v v8, (a0)
; LMULMAX1-NEXT: addi a0, a1, 64
-; LMULMAX1-NEXT: vse32.v v12, (a0)
+; LMULMAX1-NEXT: vse32.v v15, (a0)
; LMULMAX1-NEXT: addi a0, a1, 48
; LMULMAX1-NEXT: vse32.v v16, (a0)
; LMULMAX1-NEXT: addi a0, a1, 16
; CHECK-LABEL: vrgather_shuffle_xv_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
-; CHECK-NEXT: vid.v v9
-; CHECK-NEXT: vrsub.vi v10, v9, 4
-; CHECK-NEXT: vmv.v.i v0, 12
; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: vid.v v10
+; CHECK-NEXT: vmv.v.i v0, 12
+; CHECK-NEXT: vrsub.vi v10, v10, 4
; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) {
; RV32-LABEL: vrgather_shuffle_xv_v8i64:
; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT: lui a0, %hi(.LCPI12_0)
; RV32-NEXT: addi a0, a0, %lo(.LCPI12_0)
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT: vle16.v v16, (a0)
; RV32-NEXT: vmv.v.i v20, -1
; RV32-NEXT: vrgatherei16.vv v12, v20, v16
-; RV32-NEXT: lui a0, %hi(.LCPI12_1)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI12_1)
-; RV32-NEXT: vle16.v v16, (a0)
; RV32-NEXT: li a0, 113
; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; RV32-NEXT: vmv.v.x v0, a0
+; RV32-NEXT: lui a0, %hi(.LCPI12_1)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI12_1)
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; RV32-NEXT: vle16.v v16, (a0)
; RV32-NEXT: vrgatherei16.vv v12, v8, v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
; RV64-LABEL: vrgather_shuffle_xv_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI12_0)
-; RV64-NEXT: addi a0, a0, %lo(.LCPI12_0)
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vle64.v v16, (a0)
; RV64-NEXT: li a0, 113
; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; RV64-NEXT: vmv.v.x v0, a0
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT: lui a0, %hi(.LCPI12_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI12_0)
+; RV64-NEXT: vle64.v v16, (a0)
; RV64-NEXT: vmv.v.i v12, -1
; RV64-NEXT: vrgather.vv v12, v8, v16, v0.t
; RV64-NEXT: vmv.v.v v8, v12
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT: vle16.v v16, (a0)
; RV32-NEXT: vrgatherei16.vv v12, v8, v16
-; RV32-NEXT: lui a0, %hi(.LCPI13_1)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1)
-; RV32-NEXT: vle16.v v8, (a0)
; RV32-NEXT: li a0, 140
; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; RV32-NEXT: vmv.v.x v0, a0
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; RV32-NEXT: lui a0, %hi(.LCPI13_1)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1)
+; RV32-NEXT: vle16.v v8, (a0)
; RV32-NEXT: vmv.v.i v16, 5
; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t
; RV32-NEXT: vmv.v.v v8, v12
;
; RV64-LABEL: vrgather_shuffle_vx_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI13_0)
-; RV64-NEXT: addi a0, a0, %lo(.LCPI13_0)
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vle64.v v16, (a0)
; RV64-NEXT: li a0, 115
; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; RV64-NEXT: vmv.v.x v0, a0
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT: lui a0, %hi(.LCPI13_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI13_0)
+; RV64-NEXT: vle64.v v16, (a0)
; RV64-NEXT: vmv.v.i v12, 5
; RV64-NEXT: vrgather.vv v12, v8, v16, v0.t
; RV64-NEXT: vmv.v.v v8, v12
define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: splat_ve2_we0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: li a0, 66
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.x v0, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vrgather.vi v10, v8, 2
; CHECK-NEXT: vrgather.vi v10, v9, 0, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
define <8 x i8> @splat_ve2_we0_ins_i0we4(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: splat_ve2_we0_ins_i0we4:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 67
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.x v0, a0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vrgather.vi v10, v8, 2
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vmv.v.i v8, 4
-; CHECK-NEXT: li a0, 67
-; CHECK-NEXT: vmv.v.x v0, a0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: vmv.v.i v11, 0
; CHECK-NEXT: vsetivli zero, 3, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v11, v10, 2
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: li a0, 70
; CHECK-NEXT: vmv.v.x v0, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vrgather.vi v10, v8, 2
; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; LMULMAX1-NEXT: vmv.v.i v8, 0
-; LMULMAX1-NEXT: vse8.v v8, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: addi a1, a0, 16
+; LMULMAX1-NEXT: vse8.v v8, (a1)
; LMULMAX1-NEXT: vse8.v v8, (a0)
; LMULMAX1-NEXT: ret
%a = insertelement <32 x i8> poison, i8 0, i32 0
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; LMULMAX1-NEXT: vmv.v.i v8, 0
-; LMULMAX1-NEXT: vse16.v v8, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: addi a1, a0, 16
+; LMULMAX1-NEXT: vse16.v v8, (a1)
; LMULMAX1-NEXT: vse16.v v8, (a0)
; LMULMAX1-NEXT: ret
%a = insertelement <16 x i16> poison, i16 0, i32 0
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT: vmv.v.i v8, 0
-; LMULMAX1-NEXT: vse32.v v8, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: addi a1, a0, 16
+; LMULMAX1-NEXT: vse32.v v8, (a1)
; LMULMAX1-NEXT: vse32.v v8, (a0)
; LMULMAX1-NEXT: ret
%a = insertelement <8 x i32> poison, i32 0, i32 0
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-RV32-NEXT: vmv.v.i v8, 0
-; LMULMAX1-RV32-NEXT: vse32.v v8, (a0)
-; LMULMAX1-RV32-NEXT: addi a0, a0, 16
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vse32.v v8, (a1)
; LMULMAX1-RV32-NEXT: vse32.v v8, (a0)
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64: # %bb.0:
; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-RV64-NEXT: vmv.v.i v8, 0
-; LMULMAX1-RV64-NEXT: vse64.v v8, (a0)
-; LMULMAX1-RV64-NEXT: addi a0, a0, 16
+; LMULMAX1-RV64-NEXT: addi a1, a0, 16
+; LMULMAX1-RV64-NEXT: vse64.v v8, (a1)
; LMULMAX1-RV64-NEXT: vse64.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
%a = insertelement <4 x i64> poison, i64 0, i32 0
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; LMULMAX1-NEXT: vmv.v.i v8, -1
-; LMULMAX1-NEXT: vse8.v v8, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: addi a1, a0, 16
+; LMULMAX1-NEXT: vse8.v v8, (a1)
; LMULMAX1-NEXT: vse8.v v8, (a0)
; LMULMAX1-NEXT: ret
%a = insertelement <32 x i8> poison, i8 -1, i32 0
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; LMULMAX1-NEXT: vmv.v.i v8, -1
-; LMULMAX1-NEXT: vse16.v v8, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: addi a1, a0, 16
+; LMULMAX1-NEXT: vse16.v v8, (a1)
; LMULMAX1-NEXT: vse16.v v8, (a0)
; LMULMAX1-NEXT: ret
%a = insertelement <16 x i16> poison, i16 -1, i32 0
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT: vmv.v.i v8, -1
-; LMULMAX1-NEXT: vse32.v v8, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: addi a1, a0, 16
+; LMULMAX1-NEXT: vse32.v v8, (a1)
; LMULMAX1-NEXT: vse32.v v8, (a0)
; LMULMAX1-NEXT: ret
%a = insertelement <8 x i32> poison, i32 -1, i32 0
; LMULMAX1-RV32: # %bb.0:
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-RV32-NEXT: vmv.v.i v8, -1
-; LMULMAX1-RV32-NEXT: vse32.v v8, (a0)
-; LMULMAX1-RV32-NEXT: addi a0, a0, 16
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vse32.v v8, (a1)
; LMULMAX1-RV32-NEXT: vse32.v v8, (a0)
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64: # %bb.0:
; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-RV64-NEXT: vmv.v.i v8, -1
-; LMULMAX1-RV64-NEXT: vse64.v v8, (a0)
-; LMULMAX1-RV64-NEXT: addi a0, a0, 16
+; LMULMAX1-RV64-NEXT: addi a1, a0, 16
+; LMULMAX1-RV64-NEXT: vse64.v v8, (a1)
; LMULMAX1-RV64-NEXT: vse64.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
%a = insertelement <4 x i64> poison, i64 -1, i32 0
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV32-NEXT: vle8.v v8, (a0)
-; RV32-NEXT: lui a1, 3
-; RV32-NEXT: addi a1, a1, -2044
+; RV32-NEXT: li a1, 513
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV32-NEXT: vmv.v.x v0, a1
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT: vmv.v.i v9, 0
-; RV32-NEXT: li a1, -128
-; RV32-NEXT: vmerge.vxm v10, v9, a1, v0
+; RV32-NEXT: vmv.v.i v9, 4
+; RV32-NEXT: vmerge.vim v9, v9, 1, v0
; RV32-NEXT: lui a1, 1
-; RV32-NEXT: addi a2, a1, 32
+; RV32-NEXT: addi a2, a1, 78
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV32-NEXT: vmv.v.x v0, a2
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT: lui a2, %hi(.LCPI65_0)
-; RV32-NEXT: addi a2, a2, %lo(.LCPI65_0)
-; RV32-NEXT: vle8.v v11, (a2)
-; RV32-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32-NEXT: vsrl.vv v9, v8, v9
-; RV32-NEXT: vmulhu.vv v9, v9, v11
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: vmulhu.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: li a2, 513
+; RV32-NEXT: vmerge.vim v9, v9, 3, v0
+; RV32-NEXT: lui a2, 8
+; RV32-NEXT: addi a2, a2, 304
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV32-NEXT: vmv.v.x v0, a2
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT: vmv.v.i v9, 4
-; RV32-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32-NEXT: addi a1, a1, 78
+; RV32-NEXT: vmerge.vim v9, v9, 2, v0
+; RV32-NEXT: lui a2, 3
+; RV32-NEXT: addi a2, a2, -2044
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT: vmv.v.x v0, a1
+; RV32-NEXT: vmv.v.x v0, a2
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT: vmerge.vim v9, v9, 3, v0
-; RV32-NEXT: lui a1, 8
-; RV32-NEXT: addi a1, a1, 304
+; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: li a2, -128
+; RV32-NEXT: vmerge.vxm v11, v10, a2, v0
+; RV32-NEXT: addi a1, a1, 32
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV32-NEXT: vmv.v.x v0, a1
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT: vmerge.vim v9, v9, 2, v0
+; RV32-NEXT: lui a1, %hi(.LCPI65_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI65_0)
+; RV32-NEXT: vle8.v v12, (a1)
+; RV32-NEXT: vmerge.vim v10, v10, 1, v0
+; RV32-NEXT: vsrl.vv v10, v8, v10
+; RV32-NEXT: vmulhu.vv v10, v10, v12
+; RV32-NEXT: vsub.vv v8, v8, v10
+; RV32-NEXT: vmulhu.vv v8, v8, v11
+; RV32-NEXT: vadd.vv v8, v8, v10
; RV32-NEXT: vsrl.vv v8, v8, v9
; RV32-NEXT: vse8.v v8, (a0)
; RV32-NEXT: ret
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV64-NEXT: vle8.v v8, (a0)
-; RV64-NEXT: lui a1, 3
-; RV64-NEXT: addiw a1, a1, -2044
+; RV64-NEXT: li a1, 513
; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV64-NEXT: vmv.v.x v0, a1
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT: vmv.v.i v9, 0
-; RV64-NEXT: li a1, -128
-; RV64-NEXT: vmerge.vxm v10, v9, a1, v0
+; RV64-NEXT: vmv.v.i v9, 4
+; RV64-NEXT: vmerge.vim v9, v9, 1, v0
; RV64-NEXT: lui a1, 1
-; RV64-NEXT: addiw a2, a1, 32
+; RV64-NEXT: addiw a2, a1, 78
; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV64-NEXT: vmv.v.x v0, a2
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT: lui a2, %hi(.LCPI65_0)
-; RV64-NEXT: addi a2, a2, %lo(.LCPI65_0)
-; RV64-NEXT: vle8.v v11, (a2)
-; RV64-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64-NEXT: vsrl.vv v9, v8, v9
-; RV64-NEXT: vmulhu.vv v9, v9, v11
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: vmulhu.vv v8, v8, v10
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: li a2, 513
+; RV64-NEXT: vmerge.vim v9, v9, 3, v0
+; RV64-NEXT: lui a2, 8
+; RV64-NEXT: addiw a2, a2, 304
; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV64-NEXT: vmv.v.x v0, a2
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT: vmv.v.i v9, 4
-; RV64-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64-NEXT: addiw a1, a1, 78
+; RV64-NEXT: vmerge.vim v9, v9, 2, v0
+; RV64-NEXT: lui a2, 3
+; RV64-NEXT: addiw a2, a2, -2044
; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT: vmv.v.x v0, a1
+; RV64-NEXT: vmv.v.x v0, a2
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT: vmerge.vim v9, v9, 3, v0
-; RV64-NEXT: lui a1, 8
-; RV64-NEXT: addiw a1, a1, 304
+; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: li a2, -128
+; RV64-NEXT: vmerge.vxm v11, v10, a2, v0
+; RV64-NEXT: addiw a1, a1, 32
; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV64-NEXT: vmv.v.x v0, a1
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT: vmerge.vim v9, v9, 2, v0
+; RV64-NEXT: lui a1, %hi(.LCPI65_0)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI65_0)
+; RV64-NEXT: vle8.v v12, (a1)
+; RV64-NEXT: vmerge.vim v10, v10, 1, v0
+; RV64-NEXT: vsrl.vv v10, v8, v10
+; RV64-NEXT: vmulhu.vv v10, v10, v12
+; RV64-NEXT: vsub.vv v8, v8, v10
+; RV64-NEXT: vmulhu.vv v8, v8, v11
+; RV64-NEXT: vadd.vv v8, v8, v10
; RV64-NEXT: vsrl.vv v8, v8, v9
; RV64-NEXT: vse8.v v8, (a0)
; RV64-NEXT: ret
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: li a1, 33
-; CHECK-NEXT: vmv.v.x v0, a1
-; CHECK-NEXT: vmv.v.i v9, 3
-; CHECK-NEXT: vmerge.vim v9, v9, 2, v0
-; CHECK-NEXT: vmv.v.i v10, 1
-; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v9, v10, 6
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv.v.i v9, 0
; CHECK-NEXT: lui a1, 1048568
; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma
-; CHECK-NEXT: vmv.v.i v12, 0
-; CHECK-NEXT: vmv.s.x v12, a1
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 1
; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v11, v10, 6
+; CHECK-NEXT: vslideup.vi v9, v11, 6
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: lui a1, %hi(.LCPI66_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI66_0)
-; CHECK-NEXT: vle16.v v10, (a1)
-; CHECK-NEXT: vsrl.vv v11, v8, v11
-; CHECK-NEXT: vmulhu.vv v10, v11, v10
-; CHECK-NEXT: vsub.vv v8, v8, v10
-; CHECK-NEXT: vmulhu.vv v8, v8, v12
-; CHECK-NEXT: vadd.vv v8, v8, v10
+; CHECK-NEXT: vle16.v v12, (a1)
+; CHECK-NEXT: vsrl.vv v9, v8, v9
+; CHECK-NEXT: vmulhu.vv v9, v9, v12
+; CHECK-NEXT: vsub.vv v8, v8, v9
+; CHECK-NEXT: vmulhu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v8, v8, v9
+; CHECK-NEXT: li a1, 33
+; CHECK-NEXT: vmv.v.x v0, a1
+; CHECK-NEXT: vmv.v.i v9, 3
+; CHECK-NEXT: vmerge.vim v9, v9, 2, v0
+; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma
+; CHECK-NEXT: vslideup.vi v9, v11, 6
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vsrl.vv v8, v8, v9
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.v.i v0, 6
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 6
; CHECK-NEXT: vmv.v.i v9, -7
; CHECK-NEXT: vmerge.vim v9, v9, 7, v0
; CHECK-NEXT: vdiv.vv v9, v8, v9
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: vmv.v.i v11, 7
+; CHECK-NEXT: vmv.v.i v10, 7
+; CHECK-NEXT: vid.v v11
; CHECK-NEXT: li a1, -14
-; CHECK-NEXT: vmacc.vx v11, a1, v10
+; CHECK-NEXT: vmadd.vx v11, a1, v10
; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 4
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; LMULMAX2-RV32: # %bb.0:
; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; LMULMAX2-RV32-NEXT: vle16.v v10, (a0)
-; LMULMAX2-RV32-NEXT: li a1, 257
+; LMULMAX2-RV32-NEXT: lui a1, 2
+; LMULMAX2-RV32-NEXT: addi a1, a1, 289
; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; LMULMAX2-RV32-NEXT: vmv.v.x v0, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.i v12, 0
-; LMULMAX2-RV32-NEXT: lui a1, 1048568
-; LMULMAX2-RV32-NEXT: vmerge.vxm v14, v12, a1, v0
+; LMULMAX2-RV32-NEXT: vmv.v.i v8, 3
+; LMULMAX2-RV32-NEXT: vmerge.vim v12, v8, 2, v0
; LMULMAX2-RV32-NEXT: lui a1, 4
; LMULMAX2-RV32-NEXT: addi a1, a1, 64
; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; LMULMAX2-RV32-NEXT: vmv.v.x v8, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI182_0)
-; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI182_0)
-; LMULMAX2-RV32-NEXT: vle16.v v16, (a1)
; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8
; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 1, v0
-; LMULMAX2-RV32-NEXT: vsrl.vv v12, v10, v12
-; LMULMAX2-RV32-NEXT: vmulhu.vv v12, v12, v16
-; LMULMAX2-RV32-NEXT: vsub.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT: vmulhu.vv v10, v10, v14
-; LMULMAX2-RV32-NEXT: vadd.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT: lui a1, 2
-; LMULMAX2-RV32-NEXT: addi a1, a1, 289
+; LMULMAX2-RV32-NEXT: li a1, 257
; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; LMULMAX2-RV32-NEXT: vmv.v.x v0, a1
; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.i v12, 3
-; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 2, v0
+; LMULMAX2-RV32-NEXT: vmv.v.i v14, 0
+; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI182_0)
+; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI182_0)
+; LMULMAX2-RV32-NEXT: vle16.v v16, (a1)
+; LMULMAX2-RV32-NEXT: lui a1, 1048568
+; LMULMAX2-RV32-NEXT: vmerge.vxm v18, v14, a1, v0
; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8
-; LMULMAX2-RV32-NEXT: vmerge.vim v8, v12, 1, v0
+; LMULMAX2-RV32-NEXT: vmerge.vim v8, v14, 1, v0
; LMULMAX2-RV32-NEXT: vsrl.vv v8, v10, v8
+; LMULMAX2-RV32-NEXT: vmulhu.vv v8, v8, v16
+; LMULMAX2-RV32-NEXT: vsub.vv v10, v10, v8
+; LMULMAX2-RV32-NEXT: vmulhu.vv v10, v10, v18
+; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV32-NEXT: vsrl.vv v8, v8, v12
; LMULMAX2-RV32-NEXT: vse16.v v8, (a0)
; LMULMAX2-RV32-NEXT: ret
;
; LMULMAX2-RV64: # %bb.0:
; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; LMULMAX2-RV64-NEXT: vle16.v v10, (a0)
-; LMULMAX2-RV64-NEXT: li a1, 257
+; LMULMAX2-RV64-NEXT: lui a1, 2
+; LMULMAX2-RV64-NEXT: addiw a1, a1, 289
; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; LMULMAX2-RV64-NEXT: vmv.v.x v0, a1
; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; LMULMAX2-RV64-NEXT: vmv.v.i v12, 0
-; LMULMAX2-RV64-NEXT: lui a1, 1048568
-; LMULMAX2-RV64-NEXT: vmerge.vxm v14, v12, a1, v0
+; LMULMAX2-RV64-NEXT: vmv.v.i v8, 3
+; LMULMAX2-RV64-NEXT: vmerge.vim v12, v8, 2, v0
; LMULMAX2-RV64-NEXT: lui a1, 4
; LMULMAX2-RV64-NEXT: addiw a1, a1, 64
; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; LMULMAX2-RV64-NEXT: vmv.v.x v8, a1
; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI182_0)
-; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI182_0)
-; LMULMAX2-RV64-NEXT: vle16.v v16, (a1)
; LMULMAX2-RV64-NEXT: vmv1r.v v0, v8
; LMULMAX2-RV64-NEXT: vmerge.vim v12, v12, 1, v0
-; LMULMAX2-RV64-NEXT: vsrl.vv v12, v10, v12
-; LMULMAX2-RV64-NEXT: vmulhu.vv v12, v12, v16
-; LMULMAX2-RV64-NEXT: vsub.vv v10, v10, v12
-; LMULMAX2-RV64-NEXT: vmulhu.vv v10, v10, v14
-; LMULMAX2-RV64-NEXT: vadd.vv v10, v10, v12
-; LMULMAX2-RV64-NEXT: lui a1, 2
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 289
+; LMULMAX2-RV64-NEXT: li a1, 257
; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; LMULMAX2-RV64-NEXT: vmv.v.x v0, a1
; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; LMULMAX2-RV64-NEXT: vmv.v.i v12, 3
-; LMULMAX2-RV64-NEXT: vmerge.vim v12, v12, 2, v0
+; LMULMAX2-RV64-NEXT: vmv.v.i v14, 0
+; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI182_0)
+; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI182_0)
+; LMULMAX2-RV64-NEXT: vle16.v v16, (a1)
+; LMULMAX2-RV64-NEXT: lui a1, 1048568
+; LMULMAX2-RV64-NEXT: vmerge.vxm v18, v14, a1, v0
; LMULMAX2-RV64-NEXT: vmv1r.v v0, v8
-; LMULMAX2-RV64-NEXT: vmerge.vim v8, v12, 1, v0
+; LMULMAX2-RV64-NEXT: vmerge.vim v8, v14, 1, v0
; LMULMAX2-RV64-NEXT: vsrl.vv v8, v10, v8
+; LMULMAX2-RV64-NEXT: vmulhu.vv v8, v8, v16
+; LMULMAX2-RV64-NEXT: vsub.vv v10, v10, v8
+; LMULMAX2-RV64-NEXT: vmulhu.vv v10, v10, v18
+; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV64-NEXT: vsrl.vv v8, v8, v12
; LMULMAX2-RV64-NEXT: vse16.v v8, (a0)
; LMULMAX2-RV64-NEXT: ret
;
; LMULMAX2-RV64: # %bb.0:
; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; LMULMAX2-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64-NEXT: vmv.v.i v10, 1
; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; LMULMAX2-RV64-NEXT: vmv.v.i v0, 5
; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV64-NEXT: vmv.v.i v10, 1
; LMULMAX2-RV64-NEXT: vmerge.vim v10, v10, 0, v0
; LMULMAX2-RV64-NEXT: lui a1, 349525
; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
;
; RV64V-LABEL: mgather_baseidx_v32i8:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64V-NEXT: vsext.vf8 v16, v8
-; RV64V-NEXT: vsetvli zero, zero, e8, m1, ta, mu
-; RV64V-NEXT: vmv1r.v v12, v10
-; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t
+; RV64V-NEXT: vmv1r.v v12, v0
; RV64V-NEXT: vsetivli zero, 16, e8, m2, ta, ma
-; RV64V-NEXT: vslidedown.vi v10, v10, 16
-; RV64V-NEXT: vslidedown.vi v8, v8, 16
+; RV64V-NEXT: vslidedown.vi v14, v8, 16
; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64V-NEXT: vsext.vf8 v16, v8
+; RV64V-NEXT: vsext.vf8 v16, v14
+; RV64V-NEXT: vsetivli zero, 16, e8, m2, ta, ma
+; RV64V-NEXT: vslidedown.vi v14, v10, 16
; RV64V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64V-NEXT: vslidedown.vi v0, v0, 2
; RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; RV64V-NEXT: vluxei64.v v14, (a0), v16, v0.t
+; RV64V-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64V-NEXT: vsext.vf8 v16, v8
+; RV64V-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV64V-NEXT: vmv1r.v v0, v12
; RV64V-NEXT: vluxei64.v v10, (a0), v16, v0.t
; RV64V-NEXT: li a0, 32
; RV64V-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; RV64V-NEXT: vslideup.vi v12, v10, 16
-; RV64V-NEXT: vmv.v.v v8, v12
+; RV64V-NEXT: vslideup.vi v10, v14, 16
+; RV64V-NEXT: vmv.v.v v8, v10
; RV64V-NEXT: ret
;
; RV64ZVE32F-LABEL: mgather_baseidx_v32i8:
; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma
; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 16
; RV64-NEXT: vslidedown.vi v10, v10, 16
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vsext.vf8 v16, v10
+; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 16
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64-NEXT: vslidedown.vi v0, v0, 2
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vle32.v v8, (a0)
; RV32-NEXT: vsext.vf2 v9, v8
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v8, v9, a0
-; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsrl.vx v8, v9, a1
+; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vwreduce_add_v1i64:
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vle32.v v8, (a0)
; RV32-NEXT: vzext.vf2 v9, v8
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v8, v9, a0
-; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsrl.vx v8, v9, a1
+; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vwreduce_uadd_v1i64:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmand.mm v0, v8, v9
; CHECK-NEXT: ret
%elt.head = insertelement <8 x half> poison, half %b, i32 0
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmand.mm v0, v9, v8
; CHECK-NEXT: ret
%elt.head = insertelement <8 x half> poison, half %b, i32 0
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmor.mm v0, v8, v9
; CHECK-NEXT: ret
%elt.head = insertelement <8 x half> poison, half %b, i32 0
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmor.mm v0, v9, v8
; CHECK-NEXT: ret
%elt.head = insertelement <8 x half> poison, half %b, i32 0
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vfmv.v.f v12, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vmfeq.vf v16, v12, fa0, v0.t
-; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t
-; CHECK-NEXT: vmand.mm v0, v12, v16
+; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: vmand.mm v0, v16, v8
; CHECK-NEXT: ret
%elt.head = insertelement <8 x double> poison, double %b, i32 0
%vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vfmv.v.f v12, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vmfeq.vf v16, v12, fa0, v0.t
-; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t
-; CHECK-NEXT: vmand.mm v0, v16, v12
+; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: vmand.mm v0, v8, v16
; CHECK-NEXT: ret
%elt.head = insertelement <8 x double> poison, double %b, i32 0
%vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vfmv.v.f v12, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vmfne.vf v16, v12, fa0, v0.t
-; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t
-; CHECK-NEXT: vmor.mm v0, v12, v16
+; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: vmor.mm v0, v16, v8
; CHECK-NEXT: ret
%elt.head = insertelement <8 x double> poison, double %b, i32 0
%vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vfmv.v.f v12, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vmfne.vf v16, v12, fa0, v0.t
-; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t
-; CHECK-NEXT: vmor.mm v0, v16, v12
+; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v8, v12, fa0, v0.t
+; CHECK-NEXT: vmor.mm v0, v8, v16
; CHECK-NEXT: ret
%elt.head = insertelement <8 x double> poison, double %b, i32 0
%vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer
; CHECK-LABEL: store_constant_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vid.v v8
-; CHECK-NEXT: vmv.v.i v9, 3
+; CHECK-NEXT: vmv.v.i v8, 3
+; CHECK-NEXT: vid.v v9
; CHECK-NEXT: li a1, 3
-; CHECK-NEXT: vmacc.vx v9, a1, v8
+; CHECK-NEXT: vmadd.vx v9, a1, v8
; CHECK-NEXT: vse32.v v9, (a0)
; CHECK-NEXT: ret
store <2 x i32> <i32 3, i32 6>, ptr %p
; CHECK-LABEL: store_constant_v2i8_align1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT: vid.v v8
-; CHECK-NEXT: vmv.v.i v9, 3
+; CHECK-NEXT: vmv.v.i v8, 3
+; CHECK-NEXT: vid.v v9
; CHECK-NEXT: li a1, 3
-; CHECK-NEXT: vmacc.vx v9, a1, v8
+; CHECK-NEXT: vmadd.vx v9, a1, v8
; CHECK-NEXT: vse8.v v9, (a0)
; CHECK-NEXT: ret
store <2 x i8> <i8 3, i8 6>, ptr %p, align 1
define <32 x i64> @vadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
; RV32-LABEL: vadd_vx_v32i64_evl27:
; RV32: # %bb.0:
+; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: vslidedown.vi v0, v0, 2
; RV32-NEXT: li a0, 32
; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v24, -1
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma
-; RV32-NEXT: vmv1r.v v0, v1
; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vadd_vx_v32i64_evl27:
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-NEXT: vfmv.v.f v12, fa0
-; CHECK-NEXT: vmfle.vv v16, v8, v8
-; CHECK-NEXT: vmfle.vf v8, v12, fa0
-; CHECK-NEXT: vmnot.m v8, v8
-; CHECK-NEXT: vmorn.mm v0, v8, v16
+; CHECK-NEXT: vmfle.vf v16, v12, fa0
+; CHECK-NEXT: vmnot.m v12, v16
+; CHECK-NEXT: vmfle.vv v13, v8, v8
+; CHECK-NEXT: vmorn.mm v0, v12, v13
; CHECK-NEXT: ret
%head = insertelement <32 x half> poison, half %b, i32 0
%splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_v32i8_v32f64:
; RV32: # %bb.0:
-; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vsext.vf4 v16, v8
-; RV32-NEXT: vsll.vi v24, v16, 3
+; RV32-NEXT: li a3, 16
+; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: bltu a1, a3, .LBB87_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: .LBB87_2:
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a3, a1, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v24, 16
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: bltu a1, a2, .LBB87_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: .LBB87_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_v32i8_v32f64:
; RV64: # %bb.0:
; RV64-NEXT: vmv1r.v v10, v0
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsll.vi v24, v16, 3
; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 16
+; RV64-NEXT: vslidedown.vi v8, v8, 16
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf8 v16, v12
+; RV64-NEXT: vsext.vf8 v16, v8
; RV64-NEXT: vsll.vi v16, v16, 3
-; RV64-NEXT: vsext.vf8 v24, v8
-; RV64-NEXT: vsll.vi v24, v24, 3
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a3, a1, a2
; RV64-NEXT: addi a3, a3, -1
define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_v32i8_v32f64:
; RV32: # %bb.0:
-; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vsext.vf4 v16, v8
-; RV32-NEXT: vsll.vi v24, v16, 3
+; RV32-NEXT: li a3, 16
+; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: bltu a1, a3, .LBB88_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: .LBB88_2:
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a3, a1, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v24, 16
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: bltu a1, a2, .LBB88_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: .LBB88_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_sext_v32i8_v32f64:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vsext.vf8 v16, v12
; RV64-NEXT: vsext.vf8 v24, v8
-; RV64-NEXT: vsll.vi v16, v16, 3
; RV64-NEXT: vsll.vi v24, v24, 3
+; RV64-NEXT: vsll.vi v16, v16, 3
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a3, a1, a2
; RV64-NEXT: addi a3, a3, -1
define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %idxs, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_v32i8_v32f64:
; RV32: # %bb.0:
-; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vzext.vf4 v16, v8
-; RV32-NEXT: vsll.vi v24, v16, 3
+; RV32-NEXT: li a3, 16
+; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: bltu a1, a3, .LBB89_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: .LBB89_2:
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a3, a1, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v24, 16
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: bltu a1, a2, .LBB89_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: .LBB89_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_v32i8_v32f64:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vzext.vf8 v16, v12
; RV64-NEXT: vzext.vf8 v24, v8
-; RV64-NEXT: vsll.vi v16, v16, 3
; RV64-NEXT: vsll.vi v24, v24, 3
+; RV64-NEXT: vsll.vi v16, v16, 3
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a3, a1, a2
; RV64-NEXT: addi a3, a3, -1
define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_v32i16_v32f64:
; RV32: # %bb.0:
-; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v24, v16, 3
+; RV32-NEXT: li a3, 16
+; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: bltu a1, a3, .LBB90_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: .LBB90_2:
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a3, a1, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v24, 16
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: bltu a1, a2, .LBB90_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: .LBB90_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_v32i16_v32f64:
; RV64: # %bb.0:
; RV64-NEXT: vmv1r.v v12, v0
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vsext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v24, v16, 3
; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma
-; RV64-NEXT: vslidedown.vi v16, v8, 16
+; RV64-NEXT: vslidedown.vi v8, v8, 16
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf4 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsext.vf4 v24, v8
-; RV64-NEXT: vsll.vi v24, v24, 3
+; RV64-NEXT: vsext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 3
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a3, a1, a2
; RV64-NEXT: addi a3, a3, -1
define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> %idxs, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_v32i16_v32f64:
; RV32: # %bb.0:
-; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v24, v16, 3
+; RV32-NEXT: li a3, 16
+; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: bltu a1, a3, .LBB91_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: .LBB91_2:
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a3, a1, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v24, 16
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: bltu a1, a2, .LBB91_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: .LBB91_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_sext_v32i16_v32f64:
; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma
; RV64-NEXT: vslidedown.vi v16, v8, 16
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf4 v24, v16
-; RV64-NEXT: vsext.vf4 v0, v8
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsll.vi v24, v0, 3
+; RV64-NEXT: vsext.vf4 v0, v16
+; RV64-NEXT: vsext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v24, v16, 3
+; RV64-NEXT: vsll.vi v16, v0, 3
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a3, a1, a2
; RV64-NEXT: addi a3, a3, -1
define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> %idxs, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_v32i16_v32f64:
; RV32: # %bb.0:
-; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vzext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v24, v16, 3
+; RV32-NEXT: li a3, 16
+; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: bltu a1, a3, .LBB92_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: .LBB92_2:
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a3, a1, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v24, 16
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: bltu a1, a2, .LBB92_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: .LBB92_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_v32i16_v32f64:
; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma
; RV64-NEXT: vslidedown.vi v16, v8, 16
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf4 v24, v16
-; RV64-NEXT: vzext.vf4 v0, v8
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsll.vi v24, v0, 3
+; RV64-NEXT: vzext.vf4 v0, v16
+; RV64-NEXT: vzext.vf4 v16, v8
+; RV64-NEXT: vsll.vi v24, v16, 3
+; RV64-NEXT: vsll.vi v16, v0, 3
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a3, a1, a2
; RV64-NEXT: addi a3, a3, -1
;
; RV64-LABEL: vpgather_baseidx_v32i32_v32f64:
; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: sub sp, sp, a2
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT: vmv1r.v v24, v0
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vsext.vf2 v16, v8
+; RV64-NEXT: vsll.vi v16, v16, 3
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v16, v8, 16
+; RV64-NEXT: vslidedown.vi v8, v8, 16
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v0, v16
-; RV64-NEXT: vsll.vi v16, v0, 3
-; RV64-NEXT: vsext.vf2 v0, v8
-; RV64-NEXT: vsll.vi v8, v0, 3
+; RV64-NEXT: vsext.vf2 v16, v8
+; RV64-NEXT: vsll.vi v8, v16, 3
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a3, a1, a2
; RV64-NEXT: addi a3, a3, -1
; RV64-NEXT: and a2, a3, a2
; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v24, 2
+; RV64-NEXT: vslidedown.vi v0, v0, 2
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t
+; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
; RV64-NEXT: li a2, 16
; RV64-NEXT: bltu a1, a2, .LBB93_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: .LBB93_2:
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <32 x i32> %idxs
%v = call <32 x double> @llvm.vp.gather.v32f64.v32p0(<32 x ptr> %ptrs, <32 x i1> %m, i32 %evl)
;
; RV64-LABEL: vpgather_baseidx_sext_v32i32_v32f64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
-; RV64-NEXT: addi a2, sp, 16
-; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v24, v0
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV64-NEXT: vslidedown.vi v16, v8, 16
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vsext.vf2 v0, v16
-; RV64-NEXT: vsext.vf2 v24, v8
+; RV64-NEXT: vsext.vf2 v16, v8
+; RV64-NEXT: vsll.vi v8, v16, 3
; RV64-NEXT: vsll.vi v16, v0, 3
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vl1r.v v24, (a2) # Unknown-size Folded Reload
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a3, a1, a2
; RV64-NEXT: addi a3, a3, -1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmv1r.v v0, v24
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%eidxs = sext <32 x i32> %idxs to <32 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <32 x i64> %eidxs
;
; RV64-LABEL: vpgather_baseidx_zext_v32i32_v32f64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: sub sp, sp, a2
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
-; RV64-NEXT: addi a2, sp, 16
-; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v24, v0
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV64-NEXT: vslidedown.vi v16, v8, 16
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vzext.vf2 v0, v16
-; RV64-NEXT: vzext.vf2 v24, v8
+; RV64-NEXT: vzext.vf2 v16, v8
+; RV64-NEXT: vsll.vi v8, v16, 3
; RV64-NEXT: vsll.vi v16, v0, 3
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vl1r.v v24, (a2) # Unknown-size Folded Reload
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a3, a1, a2
; RV64-NEXT: addi a3, a3, -1
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vmv1r.v v0, v24
; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
%eidxs = zext <32 x i32> %idxs to <32 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <32 x i64> %eidxs
define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
; CHECK-LABEL: test_signed_v4f32_v4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v8
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: ret
%x = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4f32.nxv4i16(<vscale x 4 x float> %f)
define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
; CHECK-LABEL: test_signed_v8f32_v8i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: ret
%x = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8f32.nxv8i16(<vscale x 8 x float> %f)
; CHECK-LABEL: test_signed_v2f32_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: ret
; CHECK-LABEL: test_signed_v4f32_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.x.f.v v12, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: ret
define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
; CHECK-LABEL: test_signed_v2f64_v2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v8
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: ret
%x = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2f64.nxv2i32(<vscale x 2 x double> %f)
define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
; CHECK-LABEL: test_signed_v4f64_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: ret
%x = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4f64.nxv4i32(<vscale x 4 x double> %f)
define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
; CHECK-LABEL: test_signed_v8f64_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v16, v8
; CHECK-NEXT: vmerge.vim v8, v16, 0, v0
; CHECK-NEXT: ret
%x = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8f64.nxv8i32(<vscale x 8 x double> %f)
; CHECK-LABEL: test_signed_v2f16_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.x.f.v v9, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v9, 0, v0
; CHECK-NEXT: ret
; CHECK-LABEL: test_signed_v4f16_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: ret
; CHECK-LABEL: test_signed_v8f16_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.x.f.v v12, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: ret
; CHECK-LABEL: test_signed_v2f16_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.f.f.v v9, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: ret
; CHECK-LABEL: test_signed_v4f16_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.f.f.v v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vfwcvt.rtz.x.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: ret
define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
; CHECK-LABEL: test_signed_v4f32_v4i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvt.rtz.xu.f.w v10, v8
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: ret
%x = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4f32.nxv4i16(<vscale x 4 x float> %f)
define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
; CHECK-LABEL: test_signed_v8f32_v8i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v8
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: ret
%x = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8f32.nxv8i16(<vscale x 8 x float> %f)
; CHECK-LABEL: test_signed_v2f32_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: ret
; CHECK-LABEL: test_signed_v4f32_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v12, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: ret
define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
; CHECK-LABEL: test_signed_v2f64_v2i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v10, v8
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfncvt.rtz.xu.f.w v10, v8
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: ret
%x = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2f64.nxv2i32(<vscale x 2 x double> %f)
define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
; CHECK-LABEL: test_signed_v4f64_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v8
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: ret
%x = call <vscale x 4 x i32> @llvm.fptoui.sat.nxv4f64.nxv4i32(<vscale x 4 x double> %f)
define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
; CHECK-LABEL: test_signed_v8f64_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v16, v8
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vfncvt.rtz.xu.f.w v16, v8
; CHECK-NEXT: vmerge.vim v8, v16, 0, v0
; CHECK-NEXT: ret
%x = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8f64.nxv8i32(<vscale x 8 x double> %f)
; CHECK-LABEL: test_signed_v2f16_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v9, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vmerge.vim v8, v9, 0, v0
; CHECK-NEXT: ret
; CHECK-LABEL: test_signed_v4f16_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: ret
; CHECK-LABEL: test_signed_v8f16_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v12, v8
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: ret
; CHECK-LABEL: test_signed_v2f16_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.f.f.v v9, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
; CHECK-NEXT: ret
; CHECK-LABEL: test_signed_v4f16_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vfwcvt.f.f.v v10, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v12, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmfne.vv v0, v8, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
; CHECK-NEXT: ret
;
; RV64-LABEL: mgather_baseidx_nxv16i8:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf8 v16, v8
-; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
-; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t
+; RV64-NEXT: vmv1r.v v12, v0
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: srli a1, a1, 3
; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
; RV64-NEXT: vsext.vf8 v16, v9
; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
; RV64-NEXT: vluxei64.v v11, (a0), v16, v0.t
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsext.vf8 v16, v8
+; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV64-NEXT: vmv1r.v v0, v12
+; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t
; RV64-NEXT: vmv2r.v v8, v10
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i8, ptr %base, <vscale x 16 x i8> %idxs
define <vscale x 32 x i8> @mgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8> %idxs, <vscale x 32 x i1> %m, <vscale x 32 x i8> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv32i8:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vsext.vf4 v16, v8
-; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu
-; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
+; RV32-NEXT: vmv1r.v v16, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: srli a1, a1, 2
; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
; RV32-NEXT: vslidedown.vx v0, v0, a1
; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vsext.vf4 v16, v10
+; RV32-NEXT: vsext.vf4 v24, v10
+; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; RV32-NEXT: vluxei32.v v14, (a0), v24, v0.t
+; RV32-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; RV32-NEXT: vsext.vf4 v24, v8
; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu
-; RV32-NEXT: vluxei32.v v14, (a0), v16, v0.t
+; RV32-NEXT: vmv1r.v v0, v16
+; RV32-NEXT: vluxei32.v v12, (a0), v24, v0.t
; RV32-NEXT: vmv4r.v v8, v12
; RV32-NEXT: ret
;
; RV64-LABEL: mgather_baseidx_nxv32i8:
; RV64: # %bb.0:
; RV64-NEXT: vmv1r.v v16, v0
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf8 v24, v8
-; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
-; RV64-NEXT: vluxei64.v v12, (a0), v24, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: srli a2, a1, 3
-; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vx v0, v0, a2
-; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma
+; RV64-NEXT: srli a2, a1, 2
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vx v17, v0, a2
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT: vsext.vf8 v24, v10
+; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
+; RV64-NEXT: vmv1r.v v0, v17
+; RV64-NEXT: vluxei64.v v14, (a0), v24, v0.t
+; RV64-NEXT: srli a1, a1, 3
+; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vx v0, v16, a1
+; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV64-NEXT: vsext.vf8 v24, v9
; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
; RV64-NEXT: vluxei64.v v13, (a0), v24, v0.t
-; RV64-NEXT: srli a1, a1, 2
-; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
-; RV64-NEXT: vslidedown.vx v0, v16, a1
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf8 v16, v10
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsext.vf8 v24, v8
; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
-; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t
-; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vx v0, v0, a2
+; RV64-NEXT: vmv1r.v v0, v16
+; RV64-NEXT: vluxei64.v v12, (a0), v24, v0.t
+; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vx v0, v17, a1
; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV64-NEXT: vsext.vf8 v16, v11
; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu
; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, ma
; CHECK-NEXT: vle16.v v8, (zero)
; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmclr.m v0
; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vmclr.m v0
; CHECK-NEXT: vsetivli zero, 0, e8, m4, tu, mu
; CHECK-NEXT: vmv4r.v v20, v16
; CHECK-NEXT: vssubu.vx v20, v16, zero, v0.t
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmand.mm v0, v8, v9
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmand.mm v0, v9, v8
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmor.mm v0, v8, v9
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmor.mm v0, v9, v8
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vfmv.v.f v10, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmfeq.vf v12, v10, fa0, v0.t
-; CHECK-NEXT: vmfeq.vv v10, v8, v8, v0.t
-; CHECK-NEXT: vmand.mm v0, v10, v12
+; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: vmand.mm v0, v12, v8
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vfmv.v.f v10, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmfeq.vf v12, v10, fa0, v0.t
-; CHECK-NEXT: vmfeq.vv v10, v8, v8, v0.t
-; CHECK-NEXT: vmand.mm v0, v12, v10
+; CHECK-NEXT: vmfeq.vv v12, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: vmand.mm v0, v8, v12
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vfmv.v.f v10, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmfne.vf v12, v10, fa0, v0.t
-; CHECK-NEXT: vmfne.vv v10, v8, v8, v0.t
-; CHECK-NEXT: vmor.mm v0, v10, v12
+; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: vmor.mm v0, v12, v8
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vfmv.v.f v10, fa0
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmfne.vf v12, v10, fa0, v0.t
-; CHECK-NEXT: vmfne.vv v10, v8, v8, v0.t
-; CHECK-NEXT: vmor.mm v0, v12, v10
+; CHECK-NEXT: vmfne.vv v12, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v8, v10, fa0, v0.t
+; CHECK-NEXT: vmor.mm v0, v8, v12
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
%vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmand.mm v0, v8, v9
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 1 x double> poison, double %b, i32 0
; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfeq.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmand.mm v0, v9, v8
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 1 x double> poison, double %b, i32 0
; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmor.mm v0, v8, v9
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 1 x double> poison, double %b, i32 0
; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmfne.vv v8, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v9, v9, fa0, v0.t
; CHECK-NEXT: vmor.mm v0, v9, v8
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 1 x double> poison, double %b, i32 0
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vfmv.v.f v16, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vf v24, v16, fa0, v0.t
-; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t
-; CHECK-NEXT: vmand.mm v0, v16, v24
+; CHECK-NEXT: vmfeq.vv v24, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: vmand.mm v0, v24, v8
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 8 x double> poison, double %b, i32 0
%vb = shufflevector <vscale x 8 x double> %elt.head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vfmv.v.f v16, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfeq.vf v24, v16, fa0, v0.t
-; CHECK-NEXT: vmfeq.vv v16, v8, v8, v0.t
-; CHECK-NEXT: vmand.mm v0, v24, v16
+; CHECK-NEXT: vmfeq.vv v24, v8, v8, v0.t
+; CHECK-NEXT: vmfeq.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: vmand.mm v0, v8, v24
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 8 x double> poison, double %b, i32 0
%vb = shufflevector <vscale x 8 x double> %elt.head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vfmv.v.f v16, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfne.vf v24, v16, fa0, v0.t
-; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t
-; CHECK-NEXT: vmor.mm v0, v16, v24
+; CHECK-NEXT: vmfne.vv v24, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: vmor.mm v0, v24, v8
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 8 x double> poison, double %b, i32 0
%vb = shufflevector <vscale x 8 x double> %elt.head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vfmv.v.f v16, fa0
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmfne.vf v24, v16, fa0, v0.t
-; CHECK-NEXT: vmfne.vv v16, v8, v8, v0.t
-; CHECK-NEXT: vmor.mm v0, v24, v16
+; CHECK-NEXT: vmfne.vv v24, v8, v8, v0.t
+; CHECK-NEXT: vmfne.vf v8, v16, fa0, v0.t
+; CHECK-NEXT: vmor.mm v0, v8, v24
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 8 x double> poison, double %b, i32 0
%vb = shufflevector <vscale x 8 x double> %elt.head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: vid.v v11
; CHECK-NEXT: vrsub.vi v12, v11, 15
; CHECK-NEXT: vrgather.vv v10, v8, v12
-; CHECK-NEXT: vrsub.vi v8, v11, 7
; CHECK-NEXT: li a0, 255
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vmv.v.x v0, a0
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; CHECK-NEXT: vrsub.vi v8, v11, 7
; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
; CHECK-NEXT: vid.v v14
; CHECK-NEXT: vrsub.vi v16, v14, 15
; CHECK-NEXT: vrgather.vv v10, v8, v16
-; CHECK-NEXT: vrsub.vi v8, v14, 7
; CHECK-NEXT: li a0, 255
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vmv.v.x v0, a0
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; CHECK-NEXT: vrsub.vi v8, v14, 7
; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
; CHECK-NEXT: vid.v v14
; CHECK-NEXT: vrsub.vi v16, v14, 7
; CHECK-NEXT: vrgather.vv v10, v8, v16
+; CHECK-NEXT: vrsub.vi v8, v14, 3
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.i v0, 15
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; CHECK-NEXT: vrsub.vi v8, v14, 3
; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
; CHECK-NEXT: vid.v v20
; CHECK-NEXT: vrsub.vi v24, v20, 15
; CHECK-NEXT: vrgather.vv v12, v8, v24
-; CHECK-NEXT: vrsub.vi v8, v20, 7
; CHECK-NEXT: li a0, 255
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vmv.v.x v0, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vrsub.vi v8, v20, 7
; CHECK-NEXT: vrgather.vv v12, v16, v8, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
; RV32-NEXT: vrsub.vi v19, v18, 7
; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; RV32-NEXT: vrgatherei16.vv v12, v8, v19
-; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v0, 15
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV32-NEXT: vrsub.vi v8, v18, 3
+; RV32-NEXT: vmv.v.i v0, 15
; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu
; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV64-NEXT: vid.v v20
; RV64-NEXT: vrsub.vi v24, v20, 7
; RV64-NEXT: vrgather.vv v12, v8, v24
+; RV64-NEXT: vrsub.vi v8, v20, 3
; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; RV64-NEXT: vmv.v.i v0, 15
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrsub.vi v8, v20, 3
; RV64-NEXT: vrgather.vv v12, v16, v8, v0.t
; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
; CHECK-NEXT: vid.v v14
; CHECK-NEXT: vrsub.vi v16, v14, 15
; CHECK-NEXT: vrgather.vv v10, v8, v16
-; CHECK-NEXT: vrsub.vi v8, v14, 7
; CHECK-NEXT: li a0, 255
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vmv.v.x v0, a0
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu
+; CHECK-NEXT: vrsub.vi v8, v14, 7
; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
; CHECK-NEXT: vid.v v14
; CHECK-NEXT: vrsub.vi v16, v14, 7
; CHECK-NEXT: vrgather.vv v10, v8, v16
+; CHECK-NEXT: vrsub.vi v8, v14, 3
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.i v0, 15
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; CHECK-NEXT: vrsub.vi v8, v14, 3
; CHECK-NEXT: vrgather.vv v10, v12, v8, v0.t
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
; CHECK-NEXT: vid.v v20
; CHECK-NEXT: vrsub.vi v24, v20, 15
; CHECK-NEXT: vrgather.vv v12, v8, v24
-; CHECK-NEXT: vrsub.vi v8, v20, 7
; CHECK-NEXT: li a0, 255
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vmv.v.x v0, a0
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT: vrsub.vi v8, v20, 7
; CHECK-NEXT: vrgather.vv v12, v16, v8, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
; RV32-NEXT: vrsub.vi v19, v18, 7
; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; RV32-NEXT: vrgatherei16.vv v12, v8, v19
-; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v0, 15
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV32-NEXT: vrsub.vi v8, v18, 3
+; RV32-NEXT: vmv.v.i v0, 15
; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu
; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV64-NEXT: vid.v v20
; RV64-NEXT: vrsub.vi v24, v20, 7
; RV64-NEXT: vrgather.vv v12, v8, v24
+; RV64-NEXT: vrsub.vi v8, v20, 3
; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; RV64-NEXT: vmv.v.i v0, 15
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrsub.vi v8, v20, 3
; RV64-NEXT: vrgather.vv v12, v16, v8, v0.t
; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; CHECK-LABEL: vec_v2i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: srli a1, a0, 1
-; CHECK-NEXT: vsll.vv v10, v8, v9
-; CHECK-NEXT: vsra.vv v9, v10, v9
-; CHECK-NEXT: vmsne.vv v8, v8, v9
-; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a1
+; CHECK-NEXT: vsll.vv v11, v8, v9
+; CHECK-NEXT: vsra.vv v9, v11, v9
+; CHECK-NEXT: vmsne.vv v9, v8, v9
+; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: slli a0, a0, 63
-; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT: vmerge.vxm v8, v10, a0, v0
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
; CHECK-NEXT: ret
%tmp = call <2 x i64> @llvm.sshl.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
ret <2 x i64> %tmp
define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-LABEL: vec_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: addiw a0, a0, -1
-; CHECK-NEXT: vsll.vv v10, v8, v9
-; CHECK-NEXT: vsra.vv v9, v10, v9
-; CHECK-NEXT: vmsne.vv v8, v8, v9
-; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vsll.vv v11, v8, v9
+; CHECK-NEXT: vsra.vv v9, v11, v9
+; CHECK-NEXT: vmsne.vv v9, v8, v9
+; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: li a0, 1
; CHECK-NEXT: slli a0, a0, 31
-; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT: vmerge.vxm v8, v10, a0, v0
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
; CHECK-NEXT: ret
%tmp = call <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %tmp
define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; CHECK-LABEL: vec_v8i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: lui a0, 8
; CHECK-NEXT: addiw a1, a0, -1
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: vsll.vv v10, v8, v9
; CHECK-NEXT: vsra.vv v9, v10, v9
; CHECK-NEXT: vmsne.vv v8, v8, v9
define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; CHECK-LABEL: vec_v16i8:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 127
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vsll.vv v11, v8, v9
+; CHECK-NEXT: vsra.vv v9, v11, v9
+; CHECK-NEXT: vmsne.vv v9, v8, v9
; CHECK-NEXT: vmsle.vi v0, v8, -1
-; CHECK-NEXT: li a0, 127
-; CHECK-NEXT: vsll.vv v10, v8, v9
-; CHECK-NEXT: vsra.vv v9, v10, v9
-; CHECK-NEXT: vmsne.vv v8, v8, v9
-; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: li a0, 128
-; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0
-; CHECK-NEXT: vmv.v.v v0, v8
-; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0
+; CHECK-NEXT: vmerge.vxm v8, v10, a0, v0
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0
; CHECK-NEXT: ret
%tmp = call <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
ret <16 x i8> %tmp
; CHECK-LABEL: vec_nxv2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT: vmsle.vi v0, v8, -1
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: srli a1, a0, 1
; CHECK-NEXT: vsll.vv v12, v8, v10
; CHECK-NEXT: vsra.vv v14, v12, v10
; CHECK-NEXT: vmsne.vv v10, v8, v14
-; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: srli a1, a0, 1
+; CHECK-NEXT: vmv.v.x v14, a1
+; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: slli a0, a0, 63
-; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT: vmerge.vxm v8, v14, a0, v0
; CHECK-NEXT: vmv1r.v v0, v10
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
; CHECK-NEXT: ret
; CHECK-LABEL: vec_nxv4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmsle.vi v0, v8, -1
-; CHECK-NEXT: lui a0, 524288
-; CHECK-NEXT: addiw a0, a0, -1
; CHECK-NEXT: vsll.vv v12, v8, v10
; CHECK-NEXT: vsra.vv v14, v12, v10
; CHECK-NEXT: vmsne.vv v10, v8, v14
-; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addiw a0, a0, -1
+; CHECK-NEXT: vmv.v.x v14, a0
+; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: li a0, 1
; CHECK-NEXT: slli a0, a0, 31
-; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT: vmerge.vxm v8, v14, a0, v0
; CHECK-NEXT: vmv1r.v v0, v10
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
; CHECK-NEXT: ret
; CHECK-LABEL: vec_nxv8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmsle.vi v0, v8, -1
-; CHECK-NEXT: lui a0, 8
-; CHECK-NEXT: addiw a1, a0, -1
; CHECK-NEXT: vsll.vv v12, v8, v10
; CHECK-NEXT: vsra.vv v14, v12, v10
; CHECK-NEXT: vmsne.vv v10, v8, v14
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: addiw a1, a0, -1
+; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: vmv.v.x v8, a1
; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
; CHECK-NEXT: vmv1r.v v0, v10
; CHECK-LABEL: vec_nxv16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmsle.vi v0, v8, -1
-; CHECK-NEXT: li a0, 127
; CHECK-NEXT: vsll.vv v12, v8, v10
; CHECK-NEXT: vsra.vv v14, v12, v10
; CHECK-NEXT: vmsne.vv v10, v8, v14
-; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: li a0, 127
+; CHECK-NEXT: vmv.v.x v14, a0
+; CHECK-NEXT: vmsle.vi v0, v8, -1
; CHECK-NEXT: li a0, 128
-; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT: vmerge.vxm v8, v14, a0, v0
; CHECK-NEXT: vmv1r.v v0, v10
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
; CHECK-NEXT: ret
; RV32-NEXT: vslidedown.vi v0, v0, 2
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV32-NEXT: vmerge.vim v8, v8, 1, v0
-; RV32-NEXT: vadd.vi v12, v11, -16
; RV32-NEXT: lui a0, 16
; RV32-NEXT: addi a0, a0, -256
; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV32-NEXT: vmv.v.x v0, a0
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; RV32-NEXT: vadd.vi v12, v11, -16
; RV32-NEXT: vrgather.vv v9, v8, v12, v0.t
; RV32-NEXT: vmsne.vi v9, v9, 0
; RV32-NEXT: vadd.vi v12, v11, 1
; RV64-NEXT: vslidedown.vi v0, v0, 2
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV64-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64-NEXT: vadd.vi v12, v11, -16
; RV64-NEXT: lui a0, 16
; RV64-NEXT: addiw a0, a0, -256
; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV64-NEXT: vmv.v.x v0, a0
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; RV64-NEXT: vadd.vi v12, v11, -16
; RV64-NEXT: vrgather.vv v9, v8, v12, v0.t
; RV64-NEXT: vmsne.vi v9, v9, 0
; RV64-NEXT: vadd.vi v12, v11, 1
define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) {
; CHECK-LABEL: vector_interleave_v32i1_v16i1:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v0, v8, 2
-; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
; CHECK-NEXT: vsetivli zero, 16, e8, m2, ta, ma
; CHECK-NEXT: vslidedown.vi v10, v8, 16
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype
; CHECK-NEXT: $v28m4 = PseudoVLE32_V_M4 killed $x16, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: $v12m4 = PseudoVMV_V_V_M4 $v28m4, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v12m4 = PseudoVMV_V_V_M4 undef $v12m4, $v28m4, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
$x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype
$v28m4 = PseudoVLE32_V_M4 killed $x16, $noreg, 5, implicit $vl, implicit $vtype
$v12m4 = COPY $v28m4
; CHECK: liveins: $x14
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype
- ; CHECK-NEXT: $v28m4 = PseudoVMV_V_I_M4 0, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: $v12m4 = PseudoVMV_V_I_M4 0, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v28m4 = PseudoVMV_V_I_M4 undef $v28m4, 0, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v12m4 = PseudoVMV_V_I_M4 undef $v12m4, 0, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
$x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype
- $v28m4 = PseudoVMV_V_I_M4 0, $noreg, 5, implicit $vl, implicit $vtype
+ $v28m4 = PseudoVMV_V_I_M4 undef $v28m4, 0, $noreg, 5, 0, implicit $vl, implicit $vtype
$v12m4 = COPY $v28m4
...
---
; CHECK: liveins: $x14, $x16
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype
- ; CHECK-NEXT: $v28m4 = PseudoVMV_V_I_M4 0, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v28m4 = PseudoVMV_V_I_M4 undef $v28m4, 0, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: $v4m4, $x0 = PseudoVLE32FF_V_M4 $x16, $noreg, 5 /* e32 */, implicit-def $vl
; CHECK-NEXT: $v12m4 = VMV4R_V $v28m4
$x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype
- $v28m4 = PseudoVMV_V_I_M4 0, $noreg, 5, implicit $vl, implicit $vtype
+ $v28m4 = PseudoVMV_V_I_M4 undef $v28m4, 0, $noreg, 5, 0, implicit $vl, implicit $vtype
$v4m4,$x0 = PseudoVLE32FF_V_M4 $x16, $noreg, 5, implicit-def $vl
$v12m4 = COPY $v28m4
...
; CHECK-NEXT: $v0m2 = PseudoVLE32_V_M2 $x18, $noreg, 4 /* e16 */, implicit $vl, implicit $vtype
; CHECK-NEXT: $x0 = PseudoVSETVLIX0 $x0, 82 /* e32, m4, ta, mu */, implicit-def $vl, implicit-def $vtype
; CHECK-NEXT: $v4m4 = PseudoVLE32_V_M4 killed $x18, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: $v12m4 = PseudoVMV_V_V_M4 $v28m4, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v12m4 = PseudoVMV_V_V_M4 undef $v12m4, $v28m4, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
$x15 = PseudoVSETVLI $x14, 82, implicit-def $vl, implicit-def $vtype
$v28m4 = PseudoVLE32_V_M4 killed $x16, $noreg, 5, implicit $vl, implicit $vtype
$x0 = PseudoVSETVLIX0 $x0, 73, implicit-def $vl, implicit-def $vtype
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $x15 = PseudoVSETVLI $x14, 80 /* e32, m1, ta, mu */, implicit-def $vl, implicit-def $vtype
; CHECK-NEXT: $v8_v9 = PseudoVLSEG2E32_V_M1 killed $x16, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: $v10 = PseudoVMV_V_V_M1 $v8, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: $v11 = PseudoVMV_V_V_M1 $v9, $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v10 = PseudoVMV_V_V_M1 undef $v10, $v8, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v11 = PseudoVMV_V_V_M1 undef $v11, $v9, $noreg, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
$x15 = PseudoVSETVLI $x14, 80, implicit-def $vl, implicit-def $vtype
$v8_v9 = PseudoVLSEG2E32_V_M1 killed $x16, $noreg, 5, implicit $vl, implicit $vtype
$v10_v11 = COPY $v8_v9
define half @vreduce_ord_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
; CHECK-LABEL: vreduce_ord_fadd_nxv3f16:
; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: slli a1, a0, 1
; CHECK-NEXT: add a1, a1, a0
; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: lui a2, 1048568
-; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a2
; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
; CHECK-NEXT: vslideup.vx v8, v9, a1
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
define half @vreduce_ord_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
; CHECK-LABEL: vreduce_ord_fadd_nxv6f16:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v10, fa0
; CHECK-NEXT: lui a0, 1048568
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vmv.v.x v11, a0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: add a1, a0, a0
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vx v9, v10, a0
+; CHECK-NEXT: vslideup.vx v9, v11, a0
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfmv.s.f v10, fa0
; CHECK-NEXT: vfredosum.vs v8, v8, v10
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
; CHECK-NEXT: vmv.v.x v12, a0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vx v10, v12, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
; CHECK-NEXT: vmv.v.v v11, v12
+; CHECK-NEXT: add a1, a0, a0
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
; CHECK-NEXT: vslideup.vx v11, v12, a0
+; CHECK-NEXT: vslideup.vx v10, v12, a0
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfmv.s.f v12, fa0
; CHECK-NEXT: vfredosum.vs v8, v8, v12
define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
; CHECK-LABEL: vreduce_fadd_nxv3f16:
; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: slli a1, a0, 1
; CHECK-NEXT: add a1, a1, a0
; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: lui a2, 1048568
-; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a2
; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
; CHECK-NEXT: vslideup.vx v8, v9, a1
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
define half @vreduce_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
; CHECK-LABEL: vreduce_fadd_nxv6f16:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v10, fa0
; CHECK-NEXT: lui a0, 1048568
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a0
+; CHECK-NEXT: vmv.v.x v11, a0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: add a1, a0, a0
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vx v9, v10, a0
+; CHECK-NEXT: vslideup.vx v9, v11, a0
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfmv.s.f v10, fa0
; CHECK-NEXT: vfredusum.vs v8, v8, v10
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
define half @vreduce_fmin_nxv10f16(<vscale x 10 x half> %v) {
; CHECK-LABEL: vreduce_fmin_nxv10f16:
; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI73_0)
+; CHECK-NEXT: flh fa5, %lo(.LCPI73_0)(a0)
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfmv.v.f v12, fa5
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: lui a1, %hi(.LCPI73_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI73_0)(a1)
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfmv.v.f v12, fa5
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vx v10, v12, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
; CHECK-NEXT: vmv.v.v v11, v12
+; CHECK-NEXT: add a1, a0, a0
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
; CHECK-NEXT: vslideup.vx v11, v12, a0
+; CHECK-NEXT: vslideup.vx v10, v12, a0
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfmv.s.f v12, fa5
; CHECK-NEXT: vfredmin.vs v8, v8, v12
; CHECK-NEXT: bb.1.if.then:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %dead1:vr = IMPLICIT_DEF
; CHECK-NEXT: dead $x0 = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
- ; CHECK-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 %dead1, [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.if.else:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %dead2:vr = IMPLICIT_DEF
; CHECK-NEXT: dead $x0 = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
- ; CHECK-NEXT: early-clobber %2:vr = PseudoVSEXT_VF2_M1 [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: early-clobber %2:vr = PseudoVSEXT_VF2_M1 %dead2, [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3.if.end:
; CHECK-NEXT: [[PHI:%[0-9]+]]:vr = PHI %1, %bb.1, %2, %bb.2
PseudoBR %bb.1
bb.1.if.then:
- early-clobber %1:vr = PseudoVZEXT_VF2_M1 %0, %7, 6
+ %dead1:vr = IMPLICIT_DEF
+ early-clobber %1:vr = PseudoVZEXT_VF2_M1 %dead1, %0, %7, 6, 0
PseudoBR %bb.3
bb.2.if.else:
- early-clobber %2:vr = PseudoVSEXT_VF2_M1 %0, %7, 6
+ %dead2:vr = IMPLICIT_DEF
+ early-clobber %2:vr = PseudoVSEXT_VF2_M1 %dead2, %0, %7, 6, 0
bb.3.if.end:
%3:vr = PHI %1, %bb.1, %2, %bb.2
; CHECK-NEXT: %pt:vr = IMPLICIT_DEF
; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 223 /* e64, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
; CHECK-NEXT: [[PseudoVID_V_MF2_:%[0-9]+]]:vr = PseudoVID_V_MF2 %pt, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: %pt2:vr = IMPLICIT_DEF
; CHECK-NEXT: dead [[PseudoVSETVLIX0_1:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 215 /* e32, mf2, ta, ma */, implicit-def $vl, implicit-def $vtype
- ; CHECK-NEXT: [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vrnov0 = PseudoVMV_V_I_MF2 0, -1, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vrnov0 = PseudoVMV_V_I_MF2 %pt2, 0, -1, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
%2:gpr = IMPLICIT_DEF
%pt:vr = IMPLICIT_DEF
%3:vr = PseudoVID_V_MF2 %pt, -1, 6, 0
- %4:vrnov0 = PseudoVMV_V_I_MF2 0, -1, 5
+ %pt2:vr = IMPLICIT_DEF
+ %4:vrnov0 = PseudoVMV_V_I_MF2 %pt2, 0, -1, 5, 0
bb.1:
successors: %bb.2(0x40000000), %bb.3(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x12
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x10
+ ; CHECK-NEXT: %dead:vr = IMPLICIT_DEF
; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 4, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
- ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 0, 4, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 %dead, 0, 4, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr = COPY [[PseudoVMV_V_I_M1_]]
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vr = COPY [[COPY2]]
; CHECK-NEXT: [[LUI:%[0-9]+]]:gpr = LUI 1
%8:gpr = COPY $x12
%6:gpr = COPY $x10
- %11:vr = PseudoVMV_V_I_M1 0, 4, 5
+ %dead:vr = IMPLICIT_DEF
+ %11:vr = PseudoVMV_V_I_M1 %dead, 0, 4, 5, 0
%12:vr = COPY %11
%10:vr = COPY %12
%13:gpr = LUI 1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x10
; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY]], 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
; CHECK-NEXT: [[PseudoVLE32_V_MF2_:%[0-9]+]]:vr = PseudoVLE32_V_MF2 [[COPY1]], $noreg, 5 /* e32 */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: early-clobber %3:vr = PseudoVZEXT_VF2_M1 killed [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: %dead:vr = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber %3:vr = PseudoVZEXT_VF2_M1 %dead, killed [[PseudoVLE32_V_MF2_]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: $v8 = COPY %3
; CHECK-NEXT: PseudoRET implicit $v8
%1:gprnox0 = COPY $x11
%0:gpr = COPY $x10
%2:vr = PseudoVLE32_V_MF2 %0, %1, 5
- early-clobber %3:vr = PseudoVZEXT_VF2_M1 killed %2, %1, 6
+ %dead:vr = IMPLICIT_DEF
+ early-clobber %3:vr = PseudoVZEXT_VF2_M1 %dead, killed %2, %1, 6, 0
$v8 = COPY %3
PseudoRET implicit $v8
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10
; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 2, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
; CHECK-NEXT: [[PseudoVLE64_V_M1_:%[0-9]+]]:vr = PseudoVLE64_V_M1 [[COPY]], 2, 6 /* e64 */, implicit $vl, implicit $vtype :: (load (s128) from %ir.x)
- ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
- ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 0, -1, 6 /* e64 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead [[PseudoVSETVLIX0_:%[0-9]+]]:gpr = PseudoVSETVLIX0 $x0, 152 /* e64, m1, tu, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]]:vr = PseudoVMV_V_I_M1 undef $v2, 0, -1, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 2, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
; CHECK-NEXT: [[PseudoVREDSUM_VS_M1_E8_:%[0-9]+]]:vr = PseudoVREDSUM_VS_M1_E8 [[DEF]], killed [[PseudoVLE64_V_M1_]], killed [[PseudoVMV_V_I_M1_]], 2, 6 /* e64 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: PseudoRET implicit $x10
%0:gpr = COPY $x10
%1:vr = PseudoVLE64_V_M1 %0, 2, 6 :: (load (s128) from %ir.x)
- %2:vr = PseudoVMV_V_I_M1 0, -1, 6
+ %2:vr = PseudoVMV_V_I_M1 undef $v2, 0, -1, 6, 0
%4:vr = IMPLICIT_DEF
%3:vr = PseudoVREDSUM_VS_M1_E8 %4, killed %1, killed %2, 2, 6, 1
%5:gpr = PseudoVMV_X_S_M1 killed %3, 6
; CHECK-NEXT: %pt:vrm2 = IMPLICIT_DEF
; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 4, 217 /* e64, m2, ta, ma */, implicit-def $vl, implicit-def $vtype
; CHECK-NEXT: [[PseudoVID_V_M2_:%[0-9]+]]:vrm2 = PseudoVID_V_M2 %pt, 4, 6 /* e64 */, 3 /* ta, ma */, implicit $vl, implicit $vtype
- ; CHECK-NEXT: dead $x0 = PseudoVSETVLIX0 killed $x0, 198 /* e8, mf4, ta, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
- ; CHECK-NEXT: [[PseudoVMV_V_I_MF4_:%[0-9]+]]:vr = PseudoVMV_V_I_MF4 0, 4, 3 /* e8 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLIX0 killed $x0, 134 /* e8, mf4, tu, ma */, implicit-def $vl, implicit-def $vtype, implicit $vl
+ ; CHECK-NEXT: [[PseudoVMV_V_I_MF4_:%[0-9]+]]:vr = PseudoVMV_V_I_MF4 undef [[PseudoVMV_V_I_MF4_]], 0, 4, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: PseudoRET
%pt:vrm2 = IMPLICIT_DEF
%0:vrm2 = PseudoVID_V_M2 %pt, 4, 6, 3
- %4:vr = PseudoVMV_V_I_MF4 0, 4, 3
+ %4:vr = PseudoVMV_V_I_MF4 undef %4, 0, 4, 3, 0
PseudoRET
...
; CHECK-NEXT: $x10 = ADD $x2, killed $x10
; CHECK-NEXT: SD killed renamable $x16, killed $x10, 64 :: (store (s64) into %fixed-stack.1, align 16)
; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 2, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype
- ; CHECK-NEXT: renamable $v8 = PseudoVMV_V_I_MF8 0, 2, 3 /* e8 */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: renamable $v8 = PseudoVMV_V_I_MF8 undef $v8, 0, 2, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
; CHECK-NEXT: $x10 = ADDI $x2, 32
; CHECK-NEXT: VS1R_V killed renamable $v8, killed $x10 :: (store unknown-size into %stack.1, align 8)
; CHECK-NEXT: {{ $}}
SD killed renamable $x17, %fixed-stack.0, 0 :: (store (s64))
SD killed renamable $x16, %fixed-stack.1, 0 :: (store (s64) into %fixed-stack.1, align 16)
dead $x0 = PseudoVSETIVLI 2, 69, implicit-def $vl, implicit-def $vtype
- renamable $v8 = PseudoVMV_V_I_MF8 0, 2, 3, implicit $vl, implicit $vtype
+ renamable $v8 = PseudoVMV_V_I_MF8 undef $v8, 0, 2, 3, 0, implicit $vl, implicit $vtype
VS1R_V killed renamable $v8, %stack.1 :: (store unknown-size into %stack.1, align 8)
bb.1.while.cond: