Other.MaskAgnostic);
}
- // Convert VLMUL to a fixed point value with 3 bits of fraction.
- unsigned getSEWLMULRatio() const {
- assert(isValid() && !isUnknown() &&
- "Can't use VTYPE for uninitialized or unknown");
+ static unsigned getSEWLMULRatio(unsigned SEW, RISCVII::VLMUL VLMul) {
unsigned LMul;
bool Fractional;
std::tie(LMul, Fractional) = RISCVVType::decodeVLMUL(VLMul);
// Convert LMul to a fixed point value with 3 fractional bits.
LMul = Fractional ? (8 / LMul) : (8 * LMul);
return (SEW * 8) / LMul;
}
+ unsigned getSEWLMULRatio() const {
+ assert(isValid() && !isUnknown() &&
+ "Can't use VTYPE for uninitialized or unknown");
+ return getSEWLMULRatio(SEW, VLMul);
+ }
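
A minimal standalone sketch of the ratio math (not part of the patch; sewLMulRatio is a hypothetical stand-in mirroring decodeVLMUL's (LMul, Fractional) convention):

#include <cassert>

// SEW/LMUL ratio, with LMUL first converted to a fixed-point value using
// 3 fractional bits, as getSEWLMULRatio above does.
static unsigned sewLMulRatio(unsigned SEW, unsigned LMul, bool Fractional) {
  unsigned LMulFixed = Fractional ? (8 / LMul) : (8 * LMul);
  return (SEW * 8) / LMulFixed;
}

int main() {
  assert(sewLMulRatio(16, 4, /*Fractional=*/true) == 64);  // e16, mf4
  assert(sewLMulRatio(32, 2, /*Fractional=*/true) == 64);  // e32, mf2
  assert(sewLMulRatio(64, 1, /*Fractional=*/false) == 64); // e64, m1
  // Equal ratios imply equal VLMAX, since VLMAX = VLEN * LMUL / SEW.
  return 0;
}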
+
// Check if the VTYPEs for these two VSETVLIInfos produce the same VLMAX.
bool hasSameVLMAX(const VSETVLIInfo &Other) const {
assert(isValid() && Other.isValid() &&
return hasSameAVL(InstrInfo);
}
+ bool isCompatibleWithLoadStoreEEW(unsigned EEW,
+ const VSETVLIInfo &InstrInfo) const {
+ assert(isValid() && InstrInfo.isValid() &&
+ "Can't compare invalid VSETVLIInfos");
+ assert(!InstrInfo.SEWLMULRatioOnly &&
+ "Expected a valid VTYPE for instruction!");
+ assert(EEW == InstrInfo.SEW && "Mismatched EEW/SEW for load/store");
+
+ if (isUnknown() || hasSEWLMULRatioOnly())
+ return false;
+
+ if (!hasSameAVL(InstrInfo))
+ return false;
+
+ // TODO: This check isn't required for stores. But we should be able to
+ // ignore it for all stores, not just unit-stride and strided ones, so
+ // leaving it as-is for now.
+ if (TailAgnostic != InstrInfo.TailAgnostic ||
+ MaskAgnostic != InstrInfo.MaskAgnostic)
+ return false;
+
+ return getSEWLMULRatio() == getSEWLMULRatio(EEW, InstrInfo.VLMul);
+ }
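
Why comparing ratios suffices (reasoning sketch): a unit-stride or strided access runs with EMUL = (EEW/SEW) * LMUL, so EEW/EMUL = SEW/LMUL, and the access's VLMAX = VLEN * EMUL / EEW equals the current VLEN * LMUL / SEW. If the AVL and the tail/mask policy also match, the load/store already sees the vl it needs and no vtype change is required.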
+
bool operator==(const VSETVLIInfo &Other) const {
// Uninitialized is only equal to another Uninitialized.
if (!isValid())
return !Other.isValid();
}
+bool canSkipVSETVLIForLoadStore(const MachineInstr &MI,
+ const VSETVLIInfo &Require,
+ const VSETVLIInfo &CurInfo) {
+ unsigned EEW;
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case RISCV::PseudoVLE8_V_M1:
+ case RISCV::PseudoVLE8_V_M1_MASK:
+ case RISCV::PseudoVLE8_V_M2:
+ case RISCV::PseudoVLE8_V_M2_MASK:
+ case RISCV::PseudoVLE8_V_M4:
+ case RISCV::PseudoVLE8_V_M4_MASK:
+ case RISCV::PseudoVLE8_V_M8:
+ case RISCV::PseudoVLE8_V_M8_MASK:
+ case RISCV::PseudoVLE8_V_MF2:
+ case RISCV::PseudoVLE8_V_MF2_MASK:
+ case RISCV::PseudoVLE8_V_MF4:
+ case RISCV::PseudoVLE8_V_MF4_MASK:
+ case RISCV::PseudoVLE8_V_MF8:
+ case RISCV::PseudoVLE8_V_MF8_MASK:
+ case RISCV::PseudoVLSE8_V_M1:
+ case RISCV::PseudoVLSE8_V_M1_MASK:
+ case RISCV::PseudoVLSE8_V_M2:
+ case RISCV::PseudoVLSE8_V_M2_MASK:
+ case RISCV::PseudoVLSE8_V_M4:
+ case RISCV::PseudoVLSE8_V_M4_MASK:
+ case RISCV::PseudoVLSE8_V_M8:
+ case RISCV::PseudoVLSE8_V_M8_MASK:
+ case RISCV::PseudoVLSE8_V_MF2:
+ case RISCV::PseudoVLSE8_V_MF2_MASK:
+ case RISCV::PseudoVLSE8_V_MF4:
+ case RISCV::PseudoVLSE8_V_MF4_MASK:
+ case RISCV::PseudoVLSE8_V_MF8:
+ case RISCV::PseudoVLSE8_V_MF8_MASK:
+ case RISCV::PseudoVSE8_V_M1:
+ case RISCV::PseudoVSE8_V_M1_MASK:
+ case RISCV::PseudoVSE8_V_M2:
+ case RISCV::PseudoVSE8_V_M2_MASK:
+ case RISCV::PseudoVSE8_V_M4:
+ case RISCV::PseudoVSE8_V_M4_MASK:
+ case RISCV::PseudoVSE8_V_M8:
+ case RISCV::PseudoVSE8_V_M8_MASK:
+ case RISCV::PseudoVSE8_V_MF2:
+ case RISCV::PseudoVSE8_V_MF2_MASK:
+ case RISCV::PseudoVSE8_V_MF4:
+ case RISCV::PseudoVSE8_V_MF4_MASK:
+ case RISCV::PseudoVSE8_V_MF8:
+ case RISCV::PseudoVSE8_V_MF8_MASK:
+ case RISCV::PseudoVSSE8_V_M1:
+ case RISCV::PseudoVSSE8_V_M1_MASK:
+ case RISCV::PseudoVSSE8_V_M2:
+ case RISCV::PseudoVSSE8_V_M2_MASK:
+ case RISCV::PseudoVSSE8_V_M4:
+ case RISCV::PseudoVSSE8_V_M4_MASK:
+ case RISCV::PseudoVSSE8_V_M8:
+ case RISCV::PseudoVSSE8_V_M8_MASK:
+ case RISCV::PseudoVSSE8_V_MF2:
+ case RISCV::PseudoVSSE8_V_MF2_MASK:
+ case RISCV::PseudoVSSE8_V_MF4:
+ case RISCV::PseudoVSSE8_V_MF4_MASK:
+ case RISCV::PseudoVSSE8_V_MF8:
+ case RISCV::PseudoVSSE8_V_MF8_MASK:
+ EEW = 8;
+ break;
+ case RISCV::PseudoVLE16_V_M1:
+ case RISCV::PseudoVLE16_V_M1_MASK:
+ case RISCV::PseudoVLE16_V_M2:
+ case RISCV::PseudoVLE16_V_M2_MASK:
+ case RISCV::PseudoVLE16_V_M4:
+ case RISCV::PseudoVLE16_V_M4_MASK:
+ case RISCV::PseudoVLE16_V_M8:
+ case RISCV::PseudoVLE16_V_M8_MASK:
+ case RISCV::PseudoVLE16_V_MF2:
+ case RISCV::PseudoVLE16_V_MF2_MASK:
+ case RISCV::PseudoVLE16_V_MF4:
+ case RISCV::PseudoVLE16_V_MF4_MASK:
+ case RISCV::PseudoVLSE16_V_M1:
+ case RISCV::PseudoVLSE16_V_M1_MASK:
+ case RISCV::PseudoVLSE16_V_M2:
+ case RISCV::PseudoVLSE16_V_M2_MASK:
+ case RISCV::PseudoVLSE16_V_M4:
+ case RISCV::PseudoVLSE16_V_M4_MASK:
+ case RISCV::PseudoVLSE16_V_M8:
+ case RISCV::PseudoVLSE16_V_M8_MASK:
+ case RISCV::PseudoVLSE16_V_MF2:
+ case RISCV::PseudoVLSE16_V_MF2_MASK:
+ case RISCV::PseudoVLSE16_V_MF4:
+ case RISCV::PseudoVLSE16_V_MF4_MASK:
+ case RISCV::PseudoVSE16_V_M1:
+ case RISCV::PseudoVSE16_V_M1_MASK:
+ case RISCV::PseudoVSE16_V_M2:
+ case RISCV::PseudoVSE16_V_M2_MASK:
+ case RISCV::PseudoVSE16_V_M4:
+ case RISCV::PseudoVSE16_V_M4_MASK:
+ case RISCV::PseudoVSE16_V_M8:
+ case RISCV::PseudoVSE16_V_M8_MASK:
+ case RISCV::PseudoVSE16_V_MF2:
+ case RISCV::PseudoVSE16_V_MF2_MASK:
+ case RISCV::PseudoVSE16_V_MF4:
+ case RISCV::PseudoVSE16_V_MF4_MASK:
+ case RISCV::PseudoVSSE16_V_M1:
+ case RISCV::PseudoVSSE16_V_M1_MASK:
+ case RISCV::PseudoVSSE16_V_M2:
+ case RISCV::PseudoVSSE16_V_M2_MASK:
+ case RISCV::PseudoVSSE16_V_M4:
+ case RISCV::PseudoVSSE16_V_M4_MASK:
+ case RISCV::PseudoVSSE16_V_M8:
+ case RISCV::PseudoVSSE16_V_M8_MASK:
+ case RISCV::PseudoVSSE16_V_MF2:
+ case RISCV::PseudoVSSE16_V_MF2_MASK:
+ case RISCV::PseudoVSSE16_V_MF4:
+ case RISCV::PseudoVSSE16_V_MF4_MASK:
+ EEW = 16;
+ break;
+ case RISCV::PseudoVLE32_V_M1:
+ case RISCV::PseudoVLE32_V_M1_MASK:
+ case RISCV::PseudoVLE32_V_M2:
+ case RISCV::PseudoVLE32_V_M2_MASK:
+ case RISCV::PseudoVLE32_V_M4:
+ case RISCV::PseudoVLE32_V_M4_MASK:
+ case RISCV::PseudoVLE32_V_M8:
+ case RISCV::PseudoVLE32_V_M8_MASK:
+ case RISCV::PseudoVLE32_V_MF2:
+ case RISCV::PseudoVLE32_V_MF2_MASK:
+ case RISCV::PseudoVLSE32_V_M1:
+ case RISCV::PseudoVLSE32_V_M1_MASK:
+ case RISCV::PseudoVLSE32_V_M2:
+ case RISCV::PseudoVLSE32_V_M2_MASK:
+ case RISCV::PseudoVLSE32_V_M4:
+ case RISCV::PseudoVLSE32_V_M4_MASK:
+ case RISCV::PseudoVLSE32_V_M8:
+ case RISCV::PseudoVLSE32_V_M8_MASK:
+ case RISCV::PseudoVLSE32_V_MF2:
+ case RISCV::PseudoVLSE32_V_MF2_MASK:
+ case RISCV::PseudoVSE32_V_M1:
+ case RISCV::PseudoVSE32_V_M1_MASK:
+ case RISCV::PseudoVSE32_V_M2:
+ case RISCV::PseudoVSE32_V_M2_MASK:
+ case RISCV::PseudoVSE32_V_M4:
+ case RISCV::PseudoVSE32_V_M4_MASK:
+ case RISCV::PseudoVSE32_V_M8:
+ case RISCV::PseudoVSE32_V_M8_MASK:
+ case RISCV::PseudoVSE32_V_MF2:
+ case RISCV::PseudoVSE32_V_MF2_MASK:
+ case RISCV::PseudoVSSE32_V_M1:
+ case RISCV::PseudoVSSE32_V_M1_MASK:
+ case RISCV::PseudoVSSE32_V_M2:
+ case RISCV::PseudoVSSE32_V_M2_MASK:
+ case RISCV::PseudoVSSE32_V_M4:
+ case RISCV::PseudoVSSE32_V_M4_MASK:
+ case RISCV::PseudoVSSE32_V_M8:
+ case RISCV::PseudoVSSE32_V_M8_MASK:
+ case RISCV::PseudoVSSE32_V_MF2:
+ case RISCV::PseudoVSSE32_V_MF2_MASK:
+ EEW = 32;
+ break;
+ case RISCV::PseudoVLE64_V_M1:
+ case RISCV::PseudoVLE64_V_M1_MASK:
+ case RISCV::PseudoVLE64_V_M2:
+ case RISCV::PseudoVLE64_V_M2_MASK:
+ case RISCV::PseudoVLE64_V_M4:
+ case RISCV::PseudoVLE64_V_M4_MASK:
+ case RISCV::PseudoVLE64_V_M8:
+ case RISCV::PseudoVLE64_V_M8_MASK:
+ case RISCV::PseudoVLSE64_V_M1:
+ case RISCV::PseudoVLSE64_V_M1_MASK:
+ case RISCV::PseudoVLSE64_V_M2:
+ case RISCV::PseudoVLSE64_V_M2_MASK:
+ case RISCV::PseudoVLSE64_V_M4:
+ case RISCV::PseudoVLSE64_V_M4_MASK:
+ case RISCV::PseudoVLSE64_V_M8:
+ case RISCV::PseudoVLSE64_V_M8_MASK:
+ case RISCV::PseudoVSE64_V_M1:
+ case RISCV::PseudoVSE64_V_M1_MASK:
+ case RISCV::PseudoVSE64_V_M2:
+ case RISCV::PseudoVSE64_V_M2_MASK:
+ case RISCV::PseudoVSE64_V_M4:
+ case RISCV::PseudoVSE64_V_M4_MASK:
+ case RISCV::PseudoVSE64_V_M8:
+ case RISCV::PseudoVSE64_V_M8_MASK:
+ case RISCV::PseudoVSSE64_V_M1:
+ case RISCV::PseudoVSSE64_V_M1_MASK:
+ case RISCV::PseudoVSSE64_V_M2:
+ case RISCV::PseudoVSSE64_V_M2_MASK:
+ case RISCV::PseudoVSSE64_V_M4:
+ case RISCV::PseudoVSSE64_V_M4_MASK:
+ case RISCV::PseudoVSSE64_V_M8:
+ case RISCV::PseudoVSSE64_V_M8_MASK:
+ EEW = 64;
+ break;
+ }
+
+ return CurInfo.isCompatibleWithLoadStoreEEW(EEW, Require);
+}
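
Worked example of the effect (visible in the test updates below): after "vsetivli zero, 2, e16, mf4, ta, mu", a following vse32.v needs EEW=32 with EMUL = (32/16) * 1/4 = 1/2. Both configurations have SEW/LMUL = 64 and the AVL and tail/mask policy bits are unchanged, so the intervening "vsetvli zero, zero, e32, mf2, ta, mu" can now be deleted.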
+
bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB) {
bool HadVectorOp = false;
} else {
// If this instruction isn't compatible with the previous VL/VTYPE,
// we need to insert a VSETVLI.
- if (needVSETVLI(NewInfo, BBInfo.Change))
+ // If this is a unit-stride or strided load/store, we may be able to use
+ // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
+ // NOTE: We only do this if the vtype we're comparing against was
+ // created in this block. We need the first and third phases to treat
+ // the store the same way.
+ if (!canSkipVSETVLIForLoadStore(MI, NewInfo, BBInfo.Change) &&
+ needVSETVLI(NewInfo, BBInfo.Change))
BBInfo.Change = NewInfo;
}
}
} else {
// If this instruction isn't compatible with the previous VL/VTYPE,
// we need to insert a VSETVLI.
- if (needVSETVLI(NewInfo, CurInfo)) {
+ // If this is a unit-stride or strided load/store, we may be able to use
+ // the EMUL=(EEW/SEW)*LMUL relationship to avoid changing vtype.
+ // NOTE: We can't use predecessor information for the store. We must
+ // treat it the same as the first phase so that we produce the correct
+ // vl/vtype for successor blocks.
+ if (!canSkipVSETVLIForLoadStore(MI, NewInfo, CurInfo) &&
+ needVSETVLI(NewInfo, CurInfo)) {
// If the previous VL/VTYPE is set by VSETVLI and do not use, Merge it
// with current VL/VTYPE.
bool NeedInsertVSETVLI = true;
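
Why phases one and three must agree (per the NOTEs above): phase one computes the vl/vtype state each block exits with, and that state is propagated to successors. If phase three skipped a vsetvli that phase one assumed would be emitted (or vice versa), the recorded exit state would diverge from the instructions actually present, and a successor block could omit a vsetvli it still needs.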
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu
; CHECK-NEXT: vle16.v v25, (a0)
; CHECK-NEXT: vfwcvt.f.f.v v26, v25
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; CHECK-NEXT: vse32.v v26, (a1)
; CHECK-NEXT: ret
%a = load <2 x half>, <2 x half>* %x
; CHECK-NEXT: vfwcvt.f.f.v v26, v25
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; CHECK-NEXT: vfwcvt.f.f.v v25, v26
-; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; CHECK-NEXT: vse64.v v25, (a1)
; CHECK-NEXT: ret
%a = load <2 x half>, <2 x half>* %x
; LMULMAX8-NEXT: vsetivli zero, 8, e16, m1, ta, mu
; LMULMAX8-NEXT: vle16.v v25, (a0)
; LMULMAX8-NEXT: vfwcvt.f.f.v v26, v25
-; LMULMAX8-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; LMULMAX8-NEXT: vse32.v v26, (a1)
; LMULMAX8-NEXT: ret
;
; LMULMAX1-NEXT: vfwcvt.f.f.v v27, v26
; LMULMAX1-NEXT: vfwcvt.f.f.v v26, v25
; LMULMAX1-NEXT: addi a0, a1, 16
-; LMULMAX1-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; LMULMAX1-NEXT: vse32.v v27, (a0)
; LMULMAX1-NEXT: vse32.v v26, (a1)
; LMULMAX1-NEXT: ret
; LMULMAX8-NEXT: vfwcvt.f.f.v v26, v25
; LMULMAX8-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; LMULMAX8-NEXT: vfwcvt.f.f.v v28, v26
-; LMULMAX8-NEXT: vsetvli zero, zero, e64, m4, ta, mu
; LMULMAX8-NEXT: vse64.v v28, (a1)
; LMULMAX8-NEXT: ret
;
; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; LMULMAX1-NEXT: vfwcvt.f.f.v v25, v29
; LMULMAX1-NEXT: addi a0, a1, 32
-; LMULMAX1-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; LMULMAX1-NEXT: vse64.v v27, (a0)
; LMULMAX1-NEXT: vse64.v v25, (a1)
; LMULMAX1-NEXT: addi a0, a1, 48
; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV32-NEXT: vmv.s.x v0, a0
; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
-; RV32-NEXT: vid.v v25
-; RV32-NEXT: vrsub.vi v25, v25, 4
; RV32-NEXT: lui a0, %hi(.LCPI7_0)
; RV32-NEXT: addi a0, a0, %lo(.LCPI7_0)
-; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu
; RV32-NEXT: vlse64.v v26, (a0), zero
+; RV32-NEXT: vid.v v25
+; RV32-NEXT: vrsub.vi v25, v25, 4
; RV32-NEXT: vsetvli zero, zero, e64, m2, tu, mu
; RV32-NEXT: vrgatherei16.vv v26, v8, v25, v0.t
; RV32-NEXT: vmv2r.v v8, v26
; CHECK-LABEL: copysign_neg_trunc_v4f16_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
-; CHECK-NEXT: vle16.v v25, (a0)
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; CHECK-NEXT: vle32.v v26, (a1)
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
-; CHECK-NEXT: vfncvt.f.f.w v27, v26
-; CHECK-NEXT: vfsgnjn.vv v25, v25, v27
+; CHECK-NEXT: vle32.v v25, (a1)
+; CHECK-NEXT: vle16.v v26, (a0)
+; CHECK-NEXT: vfncvt.f.f.w v27, v25
+; CHECK-NEXT: vfsgnjn.vv v25, v26, v27
; CHECK-NEXT: vse16.v v25, (a0)
; CHECK-NEXT: ret
%a = load <4 x half>, <4 x half>* %x
; CHECK-LABEL: copysign_neg_ext_v2f64_v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT: vle64.v v25, (a0)
+; CHECK-NEXT: vle32.v v25, (a1)
+; CHECK-NEXT: vle64.v v26, (a0)
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
-; CHECK-NEXT: vle32.v v26, (a1)
-; CHECK-NEXT: vfwcvt.f.f.v v27, v26
+; CHECK-NEXT: vfwcvt.f.f.v v27, v25
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; CHECK-NEXT: vfsgnjn.vv v25, v25, v27
+; CHECK-NEXT: vfsgnjn.vv v25, v26, v27
; CHECK-NEXT: vse64.v v25, (a0)
; CHECK-NEXT: ret
%a = load <2 x double>, <2 x double>* %x
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; CHECK-NEXT: vle32.v v25, (a0)
; CHECK-NEXT: vfwcvt.rtz.x.f.v v26, v25
-; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; CHECK-NEXT: vse64.v v26, (a1)
; CHECK-NEXT: ret
%a = load <2 x float>, <2 x float>* %x
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; CHECK-NEXT: vle32.v v25, (a0)
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v26, v25
-; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; CHECK-NEXT: vse64.v v26, (a1)
; CHECK-NEXT: ret
%a = load <2 x float>, <2 x float>* %x
; LMULMAX8-NEXT: vsetivli zero, 8, e32, m2, ta, mu
; LMULMAX8-NEXT: vle32.v v26, (a0)
; LMULMAX8-NEXT: vfwcvt.rtz.x.f.v v28, v26
-; LMULMAX8-NEXT: vsetvli zero, zero, e64, m4, ta, mu
; LMULMAX8-NEXT: vse64.v v28, (a1)
; LMULMAX8-NEXT: ret
;
; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v27, v25
; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v25, v26
; LMULMAX1-NEXT: addi a0, a1, 16
-; LMULMAX1-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; LMULMAX1-NEXT: vse64.v v29, (a0)
; LMULMAX1-NEXT: vse64.v v25, (a1)
; LMULMAX1-NEXT: addi a0, a1, 48
; LMULMAX8-NEXT: vsetivli zero, 8, e32, m2, ta, mu
; LMULMAX8-NEXT: vle32.v v26, (a0)
; LMULMAX8-NEXT: vfwcvt.rtz.xu.f.v v28, v26
-; LMULMAX8-NEXT: vsetvli zero, zero, e64, m4, ta, mu
; LMULMAX8-NEXT: vse64.v v28, (a1)
; LMULMAX8-NEXT: ret
;
; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v27, v25
; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v25, v26
; LMULMAX1-NEXT: addi a0, a1, 16
-; LMULMAX1-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; LMULMAX1-NEXT: vse64.v v29, (a0)
; LMULMAX1-NEXT: vse64.v v25, (a1)
; LMULMAX1-NEXT: addi a0, a1, 48
; CHECK-NEXT: vfwcvt.f.f.v v26, v25
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; CHECK-NEXT: vfwcvt.rtz.x.f.v v25, v26
-; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; CHECK-NEXT: vse64.v v25, (a1)
; CHECK-NEXT: ret
%a = load <2 x half>, <2 x half>* %x
; CHECK-NEXT: vfwcvt.f.f.v v26, v25
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v25, v26
-; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; CHECK-NEXT: vse64.v v25, (a1)
; CHECK-NEXT: ret
%a = load <2 x half>, <2 x half>* %x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT: vle8.v v25, (a0)
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
; CHECK-NEXT: vle16.v v26, (a1)
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
; CHECK-NEXT: vsext.vf2 v27, v25
; CHECK-NEXT: vwmul.vv v8, v27, v26
; CHECK-NEXT: ret
; CHECK-LABEL: vwmul_v4i64_v4i32_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vle32.v v25, (a0)
-; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
-; CHECK-NEXT: vle8.v v26, (a1)
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; CHECK-NEXT: vsext.vf4 v27, v26
-; CHECK-NEXT: vwmul.vv v8, v25, v27
+; CHECK-NEXT: vle8.v v25, (a1)
+; CHECK-NEXT: vle32.v v26, (a0)
+; CHECK-NEXT: vsext.vf4 v27, v25
+; CHECK-NEXT: vwmul.vv v8, v26, v27
; CHECK-NEXT: ret
%a = load <4 x i32>, <4 x i32>* %x
%b = load <4 x i8>, <4 x i8>* %y
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT: vle8.v v25, (a0)
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
; CHECK-NEXT: vle16.v v26, (a1)
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
; CHECK-NEXT: vzext.vf2 v27, v25
; CHECK-NEXT: vwmulu.vv v8, v27, v26
; CHECK-NEXT: ret
; CHECK-LABEL: vwmulu_v4i64_v4i32_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vle32.v v25, (a0)
-; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, mu
-; CHECK-NEXT: vle8.v v26, (a1)
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; CHECK-NEXT: vzext.vf4 v27, v26
-; CHECK-NEXT: vwmulu.vv v8, v25, v27
+; CHECK-NEXT: vle8.v v25, (a1)
+; CHECK-NEXT: vle32.v v26, (a0)
+; CHECK-NEXT: vzext.vf4 v27, v25
+; CHECK-NEXT: vwmulu.vv v8, v26, v27
; CHECK-NEXT: ret
%a = load <4 x i32>, <4 x i32>* %x
%b = load <4 x i8>, <4 x i8>* %y