/// (split, spill) during the process and that must be assigned.
/// \p FixedRegisters contains all the virtual registers that cannot be
/// recolored.
+///
+/// \p RecolorStack tracks the original assignments of successfully recolored
+/// registers.
+///
/// \p Depth gives the current depth of the last chance recoloring.
/// \return a physical register that can be used for VirtReg or ~0u if none
/// exists.
AllocationOrder &Order,
SmallVectorImpl<Register> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
+ RecoloringStack &RecolorStack,
unsigned Depth) {
if (!TRI->shouldUseLastChanceRecoloringForVirtReg(*MF, VirtReg))
return ~0u;
LLVM_DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n');
+
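+ // Remember the stack depth on entry so that a failed attempt can unwind
+ // exactly the entries pushed during this recoloring session.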
+ const ssize_t EntryStackSize = RecolorStack.size();
+
// Ranges must be Done.
assert((ExtraInfo->getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) &&
"Last chance recoloring should really be last chance");
// Set of Live intervals that will need to be recolored.
SmallLISet RecoloringCandidates;
- // Record the original mapping virtual register to physical register in case
- // the recoloring fails.
- DenseMap<Register, MCRegister> VirtRegToPhysReg;
+
// Mark VirtReg as fixed, i.e., it will not be recolored past this point in
// this recoloring "session".
assert(!FixedRegisters.count(VirtReg.reg()));
LLVM_DEBUG(dbgs() << "Try to assign: " << VirtReg << " to "
<< printReg(PhysReg, TRI) << '\n');
RecoloringCandidates.clear();
- VirtRegToPhysReg.clear();
CurrentNewVRegs.clear();
// It is only possible to recolor virtual register interference.
"Interferences are supposed to be with allocated variables");
// Record the current allocation.
- VirtRegToPhysReg[ItVirtReg] = VRM->getPhys(ItVirtReg);
+ RecolorStack.push_back(std::make_pair(RC, VRM->getPhys(ItVirtReg)));
+
// unset the related struct.
Matrix->unassign(*RC);
}
// at this point for the next physical register.
SmallVirtRegSet SaveFixedRegisters(FixedRegisters);
if (tryRecoloringCandidates(RecoloringQueue, CurrentNewVRegs,
- FixedRegisters, Depth)) {
+ FixedRegisters, RecolorStack, Depth)) {
// Push the queued vregs into the main queue.
for (Register NewVReg : CurrentNewVRegs)
NewVRegs.push_back(NewVReg);
NewVRegs.push_back(R);
}
- for (const LiveInterval *RC : RecoloringCandidates) {
- Register ItVirtReg = RC->reg();
- if (VRM->hasPhys(ItVirtReg))
- Matrix->unassign(*RC);
- MCRegister ItPhysReg = VirtRegToPhysReg[ItVirtReg];
- Matrix->assign(*RC, ItPhysReg);
+ // Roll back our unsuccessful recoloring. Also roll back any successful
+ // recolorings in any recursive recoloring attempts, since it's possible
+ // they would have introduced conflicts with assignments we will be
+ // restoring further up the stack. Perform all unassignments prior to
+ // reassigning, since sub-recolorings may have conflicted with the registers
+ // we are going to restore to their original assignments.
+ for (ssize_t I = RecolorStack.size() - 1; I >= EntryStackSize; --I) {
+ const LiveInterval *LI;
+ MCRegister PhysReg;
+ std::tie(LI, PhysReg) = RecolorStack[I];
+
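+ // Only the live interval matters in this pass; the recorded PhysReg is
+ // consumed by the restore loop below.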
+ if (VRM->hasPhys(LI->reg()))
+ Matrix->unassign(*LI);
+ }
+
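+ // Now that all conflicting assignments have been cleared, restore the
+ // recorded original assignments.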
+ for (size_t I = EntryStackSize; I != RecolorStack.size(); ++I) {
+ const LiveInterval *LI;
+ MCRegister PhysReg;
+ std::tie(LI, PhysReg) = RecolorStack[I];
+ Matrix->assign(*LI, PhysReg);
}
+
+ // Pop the stack of recoloring attempts.
+ RecolorStack.resize(EntryStackSize);
}
// Last chance recoloring did not work either, give up.
bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue,
SmallVectorImpl<Register> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
+ RecoloringStack &RecolorStack,
unsigned Depth) {
while (!RecoloringQueue.empty()) {
const LiveInterval *LI = dequeue(RecoloringQueue);
LLVM_DEBUG(dbgs() << "Try to recolor: " << *LI << '\n');
- MCRegister PhysReg =
- selectOrSplitImpl(*LI, NewVRegs, FixedRegisters, Depth + 1);
+ MCRegister PhysReg = selectOrSplitImpl(*LI, NewVRegs, FixedRegisters,
+ RecolorStack, Depth + 1);
// When splitting happens, the live-range may actually be empty.
// In that case, it is okay to continue the recoloring even
// if we did not find an alternative color for it. Indeed,
CutOffInfo = CO_None;
LLVMContext &Ctx = MF->getFunction().getContext();
SmallVirtRegSet FixedRegisters;
- MCRegister Reg = selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters);
+ RecoloringStack RecolorStack;
+ MCRegister Reg =
+ selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters, RecolorStack);
if (Reg == ~0U && (CutOffInfo != CO_None)) {
uint8_t CutOffEncountered = CutOffInfo & (CO_Depth | CO_Interf);
if (CutOffEncountered == CO_Depth)
MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg,
SmallVectorImpl<Register> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
+ RecoloringStack &RecolorStack,
unsigned Depth) {
uint8_t CostPerUseLimit = uint8_t(~0u);
// First try assigning a free register.
// If we couldn't allocate a register from spilling, there is probably some
// invalid inline assembly. The base class will report it.
- if (Stage >= RS_Done || !VirtReg.isSpillable())
+ if (Stage >= RS_Done || !VirtReg.isSpillable()) {
return tryLastChanceRecoloring(VirtReg, Order, NewVRegs, FixedRegisters,
- Depth);
+ RecolorStack, Depth);
+ }
// Finally spill VirtReg itself.
if ((EnableDeferredSpilling ||
using PQueue = std::priority_queue<std::pair<unsigned, unsigned>>;
using SmallLISet = SmallPtrSet<const LiveInterval *, 4>;
+ // We need to track all tentative recolorings so we can roll back both
+ // successful and unsuccessful recoloring attempts.
+ using RecoloringStack =
+ SmallVector<std::pair<const LiveInterval *, MCRegister>, 8>;
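+ // For example (hypothetical registers): an outer attempt may push (%a, $r0)
+ // and a nested attempt (%b, $r1); when the outer attempt fails, every entry
+ // pushed past its recorded stack size is unwound.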
+
// context
MachineFunction *MF;
private:
MCRegister selectOrSplitImpl(const LiveInterval &,
SmallVectorImpl<Register> &, SmallVirtRegSet &,
- unsigned = 0);
+ RecoloringStack &, unsigned = 0);
bool LRE_CanEraseVirtReg(Register) override;
void LRE_WillShrinkVirtReg(Register) override;
SmallVectorImpl<Register> &, const SmallVirtRegSet &);
unsigned tryLastChanceRecoloring(const LiveInterval &, AllocationOrder &,
SmallVectorImpl<Register> &,
- SmallVirtRegSet &, unsigned);
+ SmallVirtRegSet &, RecoloringStack &,
+ unsigned);
bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<Register> &,
- SmallVirtRegSet &, unsigned);
+ SmallVirtRegSet &, RecoloringStack &, unsigned);
void tryHintRecoloring(const LiveInterval &);
void tryHintsRecoloring();
--- /dev/null
+# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-machineinstrs -o - 2> %t.err %s | FileCheck %s
+# RUN: FileCheck -check-prefix=ERR %s < %t.err
+
+# ERR: error: register allocation failed: maximum depth for recoloring reached. Use -fexhaustive-register-search to skip cutoffs
+# ERR-NEXT: error: ran out of registers during register allocation
+
+# This testcase used to fail with an "overlapping insert" assertion
+# when trying to roll back an unsuccessful recoloring of %25. One of
+# the interfering vregs is successfully recolored, and the other is
+# not. We need to roll back the successfully recolored interfering
+# vreg in order to avoid conflicting with the original assignment of
+# the original register when rolling back the second.
+
+# %25 initially assigned to $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+# interfering candidates %15 %17
+# assigned %15 to $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+# %15
+# %18 -> normal recolored $sgpr28_sgpr29_sgpr30_sgpr31
+ # %20 -> normal recolored $sgpr60_sgpr61_sgpr62_sgpr63
+# %17 candidates %37 %39
+# tentative assign %17 $sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91
+# %37 to $sgpr72_sgpr73_sgpr74_sgpr75 succeeded
+# %39 last chance recoloring, fails max depth
+# Fail to assign: %17 to $sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 at depth 4
+# %37 reassign to $sgpr84_sgpr85_sgpr86_sgpr87 unassign from $sgpr72_sgpr73_sgpr74_sgpr75
+# %39 reassign to $sgpr88_sgpr89_sgpr90_sgpr91
+# %17 candidates %39 %41
+# Try to assign: %17 to $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95
+# %39 Try assign to $sgpr72_sgpr73_sgpr74_sgpr75 succeeded
+# %41 last chance recoloring, fail max depth
+# %39 reassign to $sgpr88_sgpr89_sgpr90_sgpr91 unassign from $sgpr72_sgpr73_sgpr74_sgpr75
+# %41 reassign to $sgpr92_sgpr93_sgpr94_sgpr95
+# %17 candidates %41 %16
+# Try assign %41 to $sgpr72_sgpr73_sgpr74_sgpr75 succeeded
+# %16 last chance recolor, fail max depth
+# fail to recolor %17
+#
+# Have to roll back the successful recoloring of %15 when %17's
+# recoloring failed. Previously we would leave the recoloring of %18
+# and %20 in place. The recoloring of %20 to
+# $sgpr60_sgpr61_sgpr62_sgpr63 conflicts with the parent restore of
+# %25 to $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+
+# CHECK-LABEL: name: issue48473
+# CHECK: S_NOP 0, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15, implicit killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, implicit killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, implicit killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, implicit killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, implicit killed renamable $sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, implicit killed renamable $sgpr92_sgpr93_sgpr94_sgpr95, implicit killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, implicit killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+
+---
+name: issue48473
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr100_sgpr101_sgpr102_sgpr103'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr4' }
+ occupancy: 20
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5_sgpr6_sgpr7
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sgpr_128 = COPY $sgpr4_sgpr5_sgpr6_sgpr7
+ %2:sgpr_256 = S_LOAD_DWORDX8_IMM undef %3:sgpr_64, 1000, 0 :: (load 32, addrspace 6)
+ %4:sgpr_256 = S_LOAD_DWORDX8_IMM undef %3:sgpr_64, 1088, 0 :: (load 32, addrspace 6)
+ %5:sgpr_256 = S_LOAD_DWORDX8_IMM undef %3:sgpr_64, 1152, 0 :: (load 32, addrspace 6)
+ %6:sgpr_256 = S_LOAD_DWORDX8_IMM undef %3:sgpr_64, 1216, 0 :: (load 32, addrspace 6)
+ %7:sgpr_256 = S_LOAD_DWORDX8_IMM undef %3:sgpr_64, 1280, 0 :: (load 32, addrspace 6)
+ %8:sgpr_256 = S_LOAD_DWORDX8_IMM undef %3:sgpr_64, 1408, 0 :: (load 32, addrspace 6)
+ %9:sgpr_128 = S_LOAD_DWORDX4_IMM undef %3:sgpr_64, 0, 0 :: (load 16, addrspace 6)
+ %10:sgpr_128 = S_LOAD_DWORDX4_IMM undef %3:sgpr_64, 0, 0 :: (load 16, addrspace 6)
+ %11:sgpr_128 = S_LOAD_DWORDX4_IMM undef %3:sgpr_64, 0, 0 :: (load 16, addrspace 6)
+ %12:sgpr_128 = S_LOAD_DWORDX4_IMM undef %3:sgpr_64, 0, 0 :: (load 16, addrspace 6)
+ %13:sgpr_128 = S_LOAD_DWORDX4_IMM undef %3:sgpr_64, 0, 0 :: (load 16, addrspace 6)
+ %14:sgpr_128 = IMPLICIT_DEF
+ S_NOP 0, implicit-def %15:sgpr_256, implicit-def %16:sgpr_128, implicit-def %17:sgpr_256
+ S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %4, implicit %9, implicit %5, implicit %10, implicit %6, implicit %11, implicit %8, implicit %13, implicit %7, implicit %12, implicit %17, implicit %17, implicit %16, implicit %14, implicit %15
+ S_ENDPGM 0
+
+...
--- /dev/null
+# RUN: not llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs=0 -start-before=greedy,1 -stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
+# RUN: not --crash llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -start-before=greedy,1 -stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefixes=ERR,VERIFIER %s
+
+# FIXME: We should not produce a verifier error after erroring
+
+# ERR: error: inline assembly requires more registers than available
+# VERIFIER: *** Bad machine code: Using an undefined physical register ***
+
+# This testcase cannot be compiled with the enforced register
+# budget. Previously, tryLastChanceRecoloring would assert here. It
+# was attempting to recolor a superregister with an overlapping
+# subregister over the same range.
+
+--- |
+ define void @foo() #0 {
+ ret void
+ }
+
+ attributes #0 = { "amdgpu-waves-per-eu"="8,8" }
+
+...
+---
+name: foo
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vreg_512 }
+ - { id: 3, class: vreg_256 }
+ - { id: 4, class: vreg_128 }
+ - { id: 5, class: vreg_96 }
+ - { id: 6, class: vreg_96 }
+ - { id: 7, class: vreg_512 }
+ - { id: 8, class: vreg_256 }
+ - { id: 9, class: vreg_128 }
+ - { id: 10, class: vreg_96 }
+ - { id: 11, class: vreg_96 }
+ - { id: 12, class: sreg_64 }
+ - { id: 13, class: sgpr_64 }
+ - { id: 14, class: vgpr_32 }
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+
+ INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $agpr0
+ %14:vgpr_32 = COPY killed $agpr0
+ INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 11534346 /* regdef:VReg_512 */, def %7, 10158090 /* regdef:VReg_256 */, def %8, 4784138 /* regdef:VReg_128 */, def %9, 3670026 /* regdef:VReg_96 */, def %10, 3670026 /* regdef:VReg_96 */, def %11
+ INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 11534345 /* reguse:VReg_512 */, %7
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10158089 /* reguse:VReg_256 */, %8
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_128 */, %9
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %10
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %11
+ $agpr1 = COPY %14
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9 /* reguse */, killed $agpr1
+ SI_RETURN
+
+...
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+f,+m,+zfh,+experimental-zvfh < %s | FileCheck %s
+
+; This testcase failed to compile after
+; c46aab01c002b7a04135b8b7f1f52d8c9ae23a58, which was reverted.
+
+; FIXME: The failure does not reproduce when the MIR dumped with
+; -stop-before=greedy is rerun with -start-before=greedy.
+
+define void @last_chance_recoloring_failure() {
+; CHECK-LABEL: last_chance_recoloring_failure:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset ra, -8
+; CHECK-NEXT: .cfi_offset s0, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: li a0, 55
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
+; CHECK-NEXT: vloxseg2ei32.v v8, (a0), v8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: vs4r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vs4r.v v12, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, mu
+; CHECK-NEXT: vmclr.m v0
+; CHECK-NEXT: li s0, 36
+; CHECK-NEXT: vsetvli zero, s0, e16, m4, tu, mu
+; CHECK-NEXT: vfwadd.vv v8, v8, v8, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: call func@plt
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
+; CHECK-NEXT: vrgather.vv v4, v8, v8, v0.t
+; CHECK-NEXT: vsetvli zero, s0, e16, m4, ta, mu
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwsub.wv v16, v8, v24
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
+; CHECK-NEXT: vssubu.vv v4, v4, v8, v0.t
+; CHECK-NEXT: vsetvli zero, s0, e32, m8, tu, mu
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfdiv.vv v8, v16, v8, v0.t
+; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 24
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT: addi sp, sp, 32
+; CHECK-NEXT: ret
+entry:
+ %i = call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.riscv.vloxseg2.nxv16f16.nxv16i32.i64(half* nonnull poison, <vscale x 16 x i32> poison, i64 55)
+ %i1 = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %i, 0
+ %i2 = call <vscale x 16 x float> @llvm.riscv.vfwadd.mask.nxv16f32.nxv16f16.nxv16f16.i64(<vscale x 16 x float> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> zeroinitializer, i64 36, i64 0)
+ call void @func()
+ %i3 = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.i64(<vscale x 16 x i16> poison, <vscale x 16 x i16> poison, <vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i64 32, i64 0)
+ %i4 = call <vscale x 16 x float> @llvm.riscv.vfwsub.w.nxv16f32.nxv16f16.i64(<vscale x 16 x float> poison, <vscale x 16 x float> %i2, <vscale x 16 x half> %i1, i64 36)
+ %i5 = call <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.nxv16i16.i64(<vscale x 16 x i16> %i3, <vscale x 16 x i16> %i3, <vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i64 32, i64 0)
+ %i6 = call <vscale x 16 x float> @llvm.riscv.vfdiv.mask.nxv16f32.nxv16f32.i64(<vscale x 16 x float> %i4, <vscale x 16 x float> %i2, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i64 36, i64 0)
+ call void @llvm.riscv.vse.nxv16f32.i64(<vscale x 16 x float> %i6, <vscale x 16 x float>* nonnull poison, i64 36)
+ ret void
+}
+
+declare void @func()
+declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.riscv.vloxseg2.nxv16f16.nxv16i32.i64(half* nocapture, <vscale x 16 x i32>, i64)
+declare <vscale x 16 x float> @llvm.riscv.vfwadd.mask.nxv16f32.nxv16f16.nxv16f16.i64(<vscale x 16 x float>, <vscale x 16 x half>, <vscale x 16 x half>, <vscale x 16 x i1>, i64, i64 immarg)
+declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.i64(<vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i1>, i64, i64 immarg)
+declare <vscale x 16 x float> @llvm.riscv.vfwsub.w.nxv16f32.nxv16f16.i64(<vscale x 16 x float>, <vscale x 16 x float>, <vscale x 16 x half>, i64)
+declare <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.nxv16i16.i64(<vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i1>, i64, i64 immarg)
+declare <vscale x 16 x float> @llvm.riscv.vfdiv.mask.nxv16f32.nxv16f32.i64(<vscale x 16 x float>, <vscale x 16 x float>, <vscale x 16 x float>, <vscale x 16 x i1>, i64, i64 immarg)
+declare void @llvm.riscv.vse.nxv16f32.i64(<vscale x 16 x float>, <vscale x 16 x float>* nocapture, i64) #3