/// use or a live out.
bool isRegUsedAfter(MachineInstr *MI, int PhysReg);
- /// Provides the first instruction before MI that uses PhysReg
- MachineInstr *getInstWithUseBefore(MachineInstr *MI, int PhysReg);
-
- /// Provides all instructions before MI that uses PhysReg
- void getAllInstWithUseBefore(MachineInstr *MI, int PhysReg,
- SmallVectorImpl<MachineInstr*> &Uses);
-
/// Provides the clearance - the number of instructions since the closest
  /// reaching def instruction of PhysReg that reaches MI.
int getClearance(MachineInstr *MI, MCPhysReg PhysReg);
/// Provides the uses, in the same block as MI, of register that MI defines.
/// This does not consider live-outs.
void getReachingLocalUses(MachineInstr *MI, int PhysReg,
- SmallVectorImpl<MachineInstr*> &Uses);
-
- /// Provide the number of uses, in the same block as MI, of the register that
- /// MI defines.
- unsigned getNumUses(MachineInstr *MI, int PhysReg);
+ SmallPtrSetImpl<MachineInstr*> &Uses);
+
+ /// For the given block, collect the instructions that use the live-in
+ /// value of the provided register. Return whether the value is still
+ /// live on exit.
+ bool getLiveInUses(MachineBasicBlock *MBB, int PhysReg,
+ SmallPtrSetImpl<MachineInstr*> &Uses);
+
+ /// Collect the users of the value stored in PhysReg, which is defined
+ /// by MI.
+ void getGlobalUses(MachineInstr *MI, int PhysReg,
+ SmallPtrSetImpl<MachineInstr*> &Uses);
private:
/// Set up LiveRegs by merging predecessor live-out values.
}
void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, int PhysReg,
- SmallVectorImpl<MachineInstr*> &Uses) {
+ SmallPtrSetImpl<MachineInstr*> &Uses) {
MachineBasicBlock *MBB = Def->getParent();
MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def);
while (++MI != MBB->end()) {
if (!MO.isReg() || !MO.isUse() || MO.getReg() != PhysReg)
continue;
- Uses.push_back(&*MI);
+ Uses.insert(&*MI);
if (MO.isKill())
return;
}
}
}
-unsigned ReachingDefAnalysis::getNumUses(MachineInstr *Def, int PhysReg) {
- SmallVector<MachineInstr*, 4> Uses;
- getReachingLocalUses(Def, PhysReg, Uses);
- return Uses.size();
+bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, int PhysReg,
+ SmallPtrSetImpl<MachineInstr*> &Uses) {
+ for (auto &MI : *MBB) {
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isUse() || MO.getReg() != PhysReg)
+ continue;
+ if (getReachingDef(&MI, PhysReg) >= 0)
+ return false;
+ Uses.insert(&MI);
+ }
+ }
+ return isReachingDefLiveOut(&MBB->back(), PhysReg);
+}
+
+void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, int PhysReg,
+ SmallPtrSetImpl<MachineInstr*> &Uses) {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // Collect the uses that each def touches within the block.
+ getReachingLocalUses(MI, PhysReg, Uses);
+
+ // Handle live-out values.
+ if (auto *LiveOut = getLocalLiveOutMIDef(MI->getParent(), PhysReg)) {
+ if (LiveOut != MI)
+ return;
+
+ SmallVector<MachineBasicBlock*, 4> ToVisit;
+ ToVisit.insert(ToVisit.begin(), MBB->successors().begin(),
+ MBB->successors().end());
+    SmallPtrSet<MachineBasicBlock*, 4> Visited;
+ while (!ToVisit.empty()) {
+ MachineBasicBlock *MBB = ToVisit.back();
+ ToVisit.pop_back();
+ if (Visited.count(MBB) || !MBB->isLiveIn(PhysReg))
+ continue;
+ if (getLiveInUses(MBB, PhysReg, Uses))
+ ToVisit.insert(ToVisit.end(), MBB->successors().begin(),
+ MBB->successors().end());
+ Visited.insert(MBB);
+ }
+ }
}
bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) {
return Def < 0 ? nullptr : getInstFromId(MBB, Def);
}
-
-MachineInstr *ReachingDefAnalysis::getInstWithUseBefore(MachineInstr *MI,
- int PhysReg) {
- auto I = MachineBasicBlock::reverse_iterator(MI);
- auto E = MI->getParent()->rend();
- I++;
-
- for ( ; I != E; I++)
- for (auto &MO : I->operands())
- if (MO.isReg() && MO.isUse() && MO.getReg() == PhysReg)
- return &*I;
-
- return nullptr;
-}
-
-void ReachingDefAnalysis::getAllInstWithUseBefore(MachineInstr *MI,
- int PhysReg, SmallVectorImpl<MachineInstr*> &Uses) {
- MachineInstr *Use = nullptr;
- MachineInstr *Pos = MI;
-
- while ((Use = getInstWithUseBefore(Pos, PhysReg))) {
- Uses.push_back(Use);
- Pos = Use;
- }
-}
// This table shows the VPT instruction variants, i.e. the different
// mask field encodings, see also B5.6. Predication/conditional execution in
// the ArmARM.
-enum VPTMaskValue {
- T = 8, // 0b1000
- TT = 4, // 0b0100
- TE = 12, // 0b1100
- TTT = 2, // 0b0010
- TTE = 6, // 0b0110
- TEE = 10, // 0b1010
- TET = 14, // 0b1110
- TTTT = 1, // 0b0001
- TTTE = 3, // 0b0011
- TTEE = 5, // 0b0101
- TTET = 7, // 0b0111
- TEEE = 9, // 0b1001
- TEET = 11, // 0b1011
- TETT = 13, // 0b1101
- TETE = 15 // 0b1111
-};
+
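+// Return the VPT block mask for a block of NumInsts instructions which are
+// all predicated in the 'then' sense: T, TT, TTT or TTTT.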
+inline static unsigned getARMVPTBlockMask(unsigned NumInsts) {
+ switch (NumInsts) {
+ case 1:
+ return ARMVCC::T;
+ case 2:
+ return ARMVCC::TT;
+ case 3:
+ return ARMVCC::TTT;
+ case 4:
+ return ARMVCC::TTTT;
+ default:
+ break;
+ };
+  llvm_unreachable("Unexpected number of instructions in a VPT block");
+}
+
static inline bool isVPTOpcode(int Opc) {
return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 ||
return 0;
}
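+
+// Return the number of elements processed per loop iteration for the given
+// vctp opcode, i.e. 128 bits divided by the element size.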
+static inline unsigned getTailPredVectorWidth(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("unhandled vctp opcode");
+ case ARM::MVE_VCTP8: return 16;
+ case ARM::MVE_VCTP16: return 8;
+ case ARM::MVE_VCTP32: return 4;
+ case ARM::MVE_VCTP64: return 2;
+ }
+ return 0;
+}
+
static inline
bool isVCTP(MachineInstr *MI) {
switch (MI->getOpcode()) {
Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
}
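+
+// Return whether Opc is a sub-immediate instruction, in ARM, Thumb1 or
+// Thumb2 form.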
+static inline bool isSubImmOpcode(int Opc) {
+ return Opc == ARM::SUBri ||
+ Opc == ARM::tSUBi3 || Opc == ARM::tSUBi8 ||
+ Opc == ARM::tSUBSi3 || Opc == ARM::tSUBSi8 ||
+ Opc == ARM::t2SUBri || Opc == ARM::t2SUBri12 || Opc == ARM::t2SUBSri;
+}
+
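+// Return whether Opc is a simple register-to-register move.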
+static inline bool isMovRegOpcode(int Opc) {
+ return Opc == ARM::MOVr || Opc == ARM::tMOVr || Opc == ARM::t2MOVr;
+}
+
/// isValidCoprocessorNumber - decide whether an explicit coprocessor
/// number is legal in generic instructions like CDP. The answer can
/// vary with the subtarget.
t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br,
[(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>;
+let hasSideEffects = 0 in
def t2LoopDec :
t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
4, IIC_Br, []>, Sched<[WriteBr]>;
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineLoopUtils.h"
public:
PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) :
- MI(I) {
- Predicates.insert(Preds.begin(), Preds.end());
- }
+ MI(I) { Predicates.insert(Preds.begin(), Preds.end()); }
};
// Represent a VPT block, a list of instructions that begins with a VPST and
VPTBlock *CurrentBlock = nullptr;
SetVector<MachineInstr*> CurrentPredicate;
SmallVector<VPTBlock, 4> VPTBlocks;
+ SmallPtrSet<MachineInstr*, 4> ToRemove;
bool Revert = false;
bool CannotTailPredicate = false;
void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
- void RemoveLoopUpdate(LowOverheadLoop &LoLoop);
-
void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
return true;
}
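+
+// Recursively check that MI, and the instructions that use the values it
+// defines, are free of side effects and so can be removed. Everything found
+// to be removable is collected in ToRemove; instructions in Ignore are
+// assumed by the caller to be safe.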
+static bool IsSafeToRemove(MachineInstr *MI, ReachingDefAnalysis *RDA,
+ SmallPtrSetImpl<MachineInstr*> &Visited,
+ SmallPtrSetImpl<MachineInstr*> &ToRemove,
+ SmallPtrSetImpl<MachineInstr*> &Ignore) {
+ if (Visited.count(MI) || Ignore.count(MI))
+ return true;
+ else if (MI->mayLoadOrStore() || MI->hasUnmodeledSideEffects() ||
+ MI->isBranch() || MI->isTerminator() || MI->isReturn()) {
+ // Unless told to ignore the instruction, don't remove anything which has
+ // side effects.
+ LLVM_DEBUG(dbgs() << "ARM Loops: Has side effects: " << *MI);
+ return false;
+ }
+
+ Visited.insert(MI);
+ for (auto &MO : MI->operands()) {
+ if (!MO.isReg() || MO.isUse() || MO.getReg() == 0)
+ continue;
+
+ SmallPtrSet<MachineInstr*, 4> Uses;
+ RDA->getGlobalUses(MI, MO.getReg(), Uses);
+
+ for (auto I : Uses) {
+ if (Ignore.count(I) || ToRemove.count(I))
+ continue;
+ if (!IsSafeToRemove(I, RDA, Visited, ToRemove, Ignore)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove " << *I);
+ return false;
+ }
+ }
+ }
+ ToRemove.insert(MI);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Can remove: " << *MI);
+ return true;
+}
+
bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
- ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) {
+ ReachingDefAnalysis *RDA,
+ MachineLoopInfo *MLI) {
assert(VCTP && "VCTP instruction expected but is not set");
// All predication within the loop should be based on vctp. If the block
// isn't predicated on entry, check whether the vctp is within the block
if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI))
continue;
LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
- << " - which is predicated on:\n";
- for (auto *MI : PredMI.Predicates)
- dbgs() << " - " << *MI;
- );
+ << " - which is predicated on:\n";
+ for (auto *MI : PredMI.Predicates)
+ dbgs() << " - " << *MI);
return false;
}
}
  // The element count register may be defined after InsertPt, in which case we
// need to try to move either InsertPt or the def so that the [w|d]lstp can
// use the value.
- MachineBasicBlock *InsertBB = InsertPt->getParent();
- if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) {
+ MachineBasicBlock *InsertBB = StartInsertPt->getParent();
+ if (!RDA->isReachingDefLiveOut(StartInsertPt, NumElements)) {
if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) {
- if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) {
+ if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(
+ ElemDef, StartInsertPt, RDA)) {
ElemDef->removeFromParent();
- InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef);
+ InsertBB->insert(MachineBasicBlock::iterator(StartInsertPt), ElemDef);
LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: "
<< *ElemDef);
- } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) {
- InsertPt->removeFromParent();
- InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt);
+ } else if (IsSafeToMove<MachineBasicBlock::iterator>(
+ StartInsertPt, ElemDef, RDA)) {
+ StartInsertPt->removeFromParent();
+ InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
+ StartInsertPt);
LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
} else {
LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop "
MBB = *MBB->pred_begin();
}
- LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n");
+ // Check that the value change of the element count is what we expect and
+ // that the predication will be equivalent. For this we need:
+  // NumElements = NumElements - VectorWidth. The sub will be a sub-immediate
+  // instruction, and we also allow plain register copies within the chain.
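+  // For example, for MVE_VCTP32 the vector width is 4, so the chain feeding
+  // the element count may contain a single sub of 4, plus any number of
+  // register moves, and nothing else.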
+ auto IsValidSub = [](MachineInstr *MI, unsigned ExpectedVecWidth) {
+ unsigned ImmOpIdx = 0;
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("unhandled sub opcode");
+ case ARM::tSUBi3:
+ case ARM::tSUBi8:
+ ImmOpIdx = 3;
+ break;
+ case ARM::t2SUBri:
+ case ARM::t2SUBri12:
+ ImmOpIdx = 2;
+ break;
+ }
+ return MI->getOperand(ImmOpIdx).getImm() == ExpectedVecWidth;
+ };
+
+ MBB = VCTP->getParent();
+ if (MachineInstr *Def = RDA->getReachingMIDef(&MBB->back(), NumElements)) {
+ SmallPtrSet<MachineInstr*, 2> Visited;
+ SmallPtrSet<MachineInstr*, 2> ElementChain;
+ SmallPtrSet<MachineInstr*, 2> Ignore = { VCTP };
+ unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
+
+ if (IsSafeToRemove(Def, RDA, Visited, ElementChain, Ignore)) {
+ bool FoundSub = false;
+
+ for (auto *MI : ElementChain) {
+ if (isMovRegOpcode(MI->getOpcode()))
+ continue;
+
+ if (isSubImmOpcode(MI->getOpcode())) {
+ if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth))
+ return false;
+ FoundSub = true;
+ } else
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Will remove element count chain:\n";
+ for (auto *MI : ElementChain)
+ dbgs() << " - " << *MI);
+ ToRemove.insert(ElementChain.begin(), ElementChain.end());
+ }
+ }
return true;
}
dbgs() << " - " << Preheader->getName() << "\n";
else if (auto *Preheader = MLI->findLoopPreheader(ML))
dbgs() << " - " << Preheader->getName() << "\n";
+ else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
+ dbgs() << " - " << Preheader->getName() << "\n";
for (auto *MBB : ML->getBlocks())
dbgs() << " - " << MBB->getName() << "\n";
);
// Check we know how to tail predicate any mve instructions.
LoLoop.AnalyseMVEInst(&MI);
}
-
- // We need to ensure that LR is not used or defined inbetween LoopDec and
- // LoopEnd.
- if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert)
- continue;
-
- // If we find that LR has been written or read between LoopDec and
- // LoopEnd, expect that the decremented value is being used else where.
- // Because this value isn't actually going to be produced until the
- // latch, by LE, we would need to generate a real sub. The value is also
- // likely to be copied/reloaded for use of LoopEnd - in which in case
- // we'd need to perform an add because it gets subtracted again by LE!
- // The other option is to then generate the other form of LE which doesn't
- // perform the sub.
- for (auto &MO : MI.operands()) {
- if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
- MO.getReg() == ARM::LR) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
- LoLoop.Revert = true;
- break;
- }
- }
}
}
return false;
}
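+
+  // Check that the t2LoopDec, and anything that uses the counter it defines
+  // (other than the LoopEnd), can be removed; if not, revert to a normal
+  // loop.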
+ SmallPtrSet<MachineInstr*, 2> Visited;
+ SmallPtrSet<MachineInstr*, 2> Ignore = { LoLoop.End };
+ SmallPtrSet<MachineInstr*, 4> Remove;
+ if (!IsSafeToRemove(LoLoop.Dec, RDA, Visited, Remove, Ignore)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove loop count chain.\n");
+ LoLoop.Revert = true;
+ } else {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Will need to remove:\n";
+ for (auto *I : Remove)
+ dbgs() << " - " << *I);
+ LoLoop.ToRemove.insert(Remove.begin(), Remove.end());
+ }
+
LoLoop.CheckLegality(BBUtils.get(), RDA, MLI);
Expand(LoLoop);
return true;
}
MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
+ // When using tail-predication, try to delete the dead code that was used to
+ // calculate the number of loop iterations.
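+  // Starting from the def of the iteration count, walk back through the
+  // operands of each removable instruction so that the whole calculation
+  // can be erased.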
+ if (LoLoop.IsTailPredicationLegal()) {
+ SmallVector<MachineInstr*, 4> Killed;
+ SmallVector<MachineInstr*, 4> Dead;
+ if (auto *Def = RDA->getReachingMIDef(LoLoop.Start,
+ LoLoop.Start->getOperand(0).getReg())) {
+ SmallPtrSet<MachineInstr*, 4> Visited;
+ SmallPtrSet<MachineInstr*, 4> Remove;
+ SmallPtrSet<MachineInstr*, 4> Ignore = { LoLoop.Start, LoLoop.Dec,
+ LoLoop.End, LoLoop.VCTP,
+ LoLoop.InsertPt };
+ SmallVector<MachineInstr*, 4> Chain = { Def };
+ while (!Chain.empty()) {
+ MachineInstr *MI = Chain.back();
+ Chain.pop_back();
+ if (IsSafeToRemove(MI, RDA, Visited, Remove, Ignore)) {
+ for (auto &MO : MI->operands()) {
+ if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0)
+ continue;
+ if (auto *Op = RDA->getReachingMIDef(MI, MO.getReg()))
+ Chain.push_back(Op);
+ }
+ Ignore.insert(MI);
+ }
+ }
+ LoLoop.ToRemove.insert(Remove.begin(), Remove.end());
+ }
+ }
+
MachineInstr *InsertPt = LoLoop.InsertPt;
MachineInstr *Start = LoLoop.Start;
MachineBasicBlock *MBB = InsertPt->getParent();
if (!IsDo)
MIB.add(Start->getOperand(1));
- // When using tail-predication, try to delete the dead code that was used to
- // calculate the number of loop iterations.
- if (LoLoop.IsTailPredicationLegal()) {
- SmallVector<MachineInstr*, 4> Killed;
- SmallVector<MachineInstr*, 4> Dead;
- if (auto *Def = RDA->getReachingMIDef(Start,
- Start->getOperand(0).getReg())) {
- Killed.push_back(Def);
-
- while (!Killed.empty()) {
- MachineInstr *Def = Killed.back();
- Killed.pop_back();
- Dead.push_back(Def);
- for (auto &MO : Def->operands()) {
- if (!MO.isReg() || !MO.isKill())
- continue;
-
- MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg());
- if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1)
- Killed.push_back(Kill);
- }
- }
- for (auto *MI : Dead)
- MI->eraseFromParent();
- }
- }
-
// If we're inserting at a mov lr, then remove it as it's redundant.
if (InsertPt != Start)
- InsertPt->eraseFromParent();
- Start->eraseFromParent();
+ LoLoop.ToRemove.insert(InsertPt);
+ LoLoop.ToRemove.insert(Start);
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
return &*MIB;
}
-// Goal is to optimise and clean-up these loops:
-//
-// vector.body:
-// renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
-// renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4
-// ..
-// $lr = MVE_DLSTP_32 renamable $r3
-//
-// The SUB is the old update of the loop iteration count expression, which
-// is no longer needed. This sub is removed when the element count, which is in
-// r3 in this example, is defined by an instruction in the loop, and it has
-// no uses.
-//
-void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) {
- Register ElemCount = LoLoop.VCTP->getOperand(1).getReg();
- MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back();
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n");
-
- if (LoLoop.ML->getNumBlocks() != 1) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n");
- return;
- }
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: ";
- LoLoop.VCTP->getOperand(1).dump());
-
- // Find the definition we are interested in removing, if there is one.
- MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount);
- if (!Def) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n");
- return;
- }
-
- // Bail if we define CPSR and it is not dead
- if (!Def->registerDefIsDead(ARM::CPSR, TRI)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n");
- return;
- }
-
- // Bail if elemcount is used in exit blocks, i.e. if it is live-in.
- if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n");
- return;
- }
-
- // Bail if there are uses after this Def in the block.
- SmallVector<MachineInstr*, 4> Uses;
- RDA->getReachingLocalUses(Def, ElemCount, Uses);
- if (Uses.size()) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n");
- return;
- }
-
- Uses.clear();
- RDA->getAllInstWithUseBefore(Def, ElemCount, Uses);
-
- // Remove Def if there are no uses, or if the only use is the VCTP
- // instruction.
- if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: ";
- Def->dump());
- Def->eraseFromParent();
- return;
- }
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n";
- for (auto U : Uses) U->dump());
-}
-
void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
auto RemovePredicate = [](MachineInstr *MI) {
LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
MIB.addImm(getARMVPTBlockMask(Size));
LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
- Block.getVPST()->eraseFromParent();
+ LoLoop.ToRemove.insert(Block.getVPST());
}
} else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
// A vpt block which is only predicated upon vctp and has no internal vpr
// - Remove vpst.
// - Unpredicate the remaining instructions.
LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
- Block.getVPST()->eraseFromParent();
+ LoLoop.ToRemove.insert(Block.getVPST());
for (auto &PredMI : Insts)
RemovePredicate(PredMI.MI);
}
}
-
LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
- LoLoop.VCTP->eraseFromParent();
+ LoLoop.ToRemove.insert(LoLoop.VCTP);
}
void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
MIB.add(End->getOperand(0));
MIB.add(End->getOperand(1));
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
-
- LoLoop.End->eraseFromParent();
- LoLoop.Dec->eraseFromParent();
+ End->eraseFromParent();
return &*MIB;
};
RemoveDeadBranch(LoLoop.Start);
LoLoop.End = ExpandLoopEnd(LoLoop);
RemoveDeadBranch(LoLoop.End);
- if (LoLoop.IsTailPredicationLegal()) {
- RemoveLoopUpdate(LoLoop);
+ if (LoLoop.IsTailPredicationLegal())
ConvertVPTBlocks(LoLoop);
+ for (auto *I : LoLoop.ToRemove) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
+ I->eraseFromParent();
}
}
};
}
-inline static unsigned getARMVPTBlockMask(unsigned NumInsts) {
- switch (NumInsts) {
- case 1:
- return ARMVCC::T;
- case 2:
- return ARMVCC::TT;
- case 3:
- return ARMVCC::TTT;
- case 4:
- return ARMVCC::TTTT;
- default:
- break;
- };
- llvm_unreachable("Unexpected number of instruction in a VPT block");
-}
-
inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) {
switch (CC) {
case ARMVCC::None: return "none";
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
-# There are 2 SUBS, and the 2nd one is identified as the def.
-# Thus, the 1st is a use, and we shouldn't optimise away the SUBS.
-
-# CHECK: bb.1.vector.body:
-# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
-# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
-# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
+# There are 2 SUBS, so don't use tail predication
--- |
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
%8 = call <4 x i1> @llvm.arm.vctp32(i32 %7)
%9 = sub i32 %7, 4
- %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3
- %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef)
+ %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef)
%10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
- call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8)
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
%11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
%12 = icmp ne i32 %11, 0
- br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7
+ br i1 %12, label %vector.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
- declare void @llvm.set.loop.iterations.i32(i32) #1
- declare <4 x i1> @llvm.arm.vctp32(i32) #2
- declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
- declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3
- declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4
- declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3
- declare void @llvm.stackprotector(i8*, i8**) #5
-
- attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" }
- attributes #1 = { noduplicate nounwind }
- attributes #2 = { nounwind readnone }
- attributes #3 = { argmemonly nounwind willreturn }
- attributes #4 = { argmemonly nounwind readonly willreturn }
- attributes #5 = { nounwind }
-
- !llvm.module.flags = !{!0, !1}
- !llvm.ident = !{!2}
-
- !0 = !{i32 1, !"wchar_size", i32 4}
- !1 = !{i32 1, !"min_enum_size", i32 4}
- !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"}
- !3 = !{!4, !4, i64 0}
- !4 = !{!"int", !5, i64 0}
- !5 = !{!"omnipotent char", !6, i64 0}
- !6 = !{!"Simple C++ TBAA"}
- !7 = distinct !{!7, !8}
- !8 = !{!"llvm.loop.isvectorized", i32 1}
+ declare void @llvm.set.loop.iterations.i32(i32)
+ declare <4 x i1> @llvm.arm.vctp32(i32)
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+ declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
+ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
...
---
constants: []
machineFunctionInfo: {}
body: |
+ ; CHECK-LABEL: name: use_before_def
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14, $noreg
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7
+ ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ ; CHECK: t2IT 11, 8, implicit-def $itstate
+ ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
+ ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ ; CHECK: $lr = t2DLS killed renamable $lr
+ ; CHECK: bb.1.vector.body:
+ ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
+ ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+ ; CHECK: MVE_VPST 4, implicit $vpr
+ ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
+ ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ ; CHECK: MVE_VPST 8, implicit $vpr
+ ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
+ ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc
bb.0.entry:
successors: %bb.1(0x80000000)
- liveins: $r0, $r1, $r2, $r3, $lr
+ liveins: $r0, $r1, $r2, $r3, $r7, $lr
- frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
MVE_VPST 4, implicit $vpr
- renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3)
- renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3)
+ renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
MVE_VPST 8, implicit $vpr
- renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3)
+ renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
renamable $lr = t2LoopDec killed renamable $lr, 1
renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
-# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
-
-# The CPSR is not dead:
-#
-# renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
-#
-# We shouldn't optimise away the SUB.
-
-# CHECK: bb.1.vector.body:
-# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
-# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s
--- |
- target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
- target triple = "thumbv8.1m.main-arm-unknown-eabi"
-
define dso_local void @CPSR_not_dead(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
%cmp8 = icmp sgt i32 %N, 0
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
+ %lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %5, %vector.ph ]
%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
- %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
- %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
+ %6 = phi i32 [ %N, %vector.ph ], [ %8, %vector.body ]
%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
- %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7)
- %9 = sub i32 %7, 4
- %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3
- %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3
- %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
- call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3
+ %7 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
+ %8 = sub i32 %6, 4
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %7, <4 x i32> undef)
+ %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %7, <4 x i32> undef)
+ %9 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %9, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %7)
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
- %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
- %12 = icmp ne i32 %11, 0
- br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7
+ %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
+ %11 = icmp ne i32 %10, 0
+ %lsr.iv.next = add nsw i32 %lsr.iv1, -1
+ br i1 %11, label %vector.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
- declare void @llvm.set.loop.iterations.i32(i32) #1
- declare <4 x i1> @llvm.arm.vctp32(i32) #2
- declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
- declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3
- declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4
- declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3
- declare void @llvm.stackprotector(i8*, i8**) #5
-
- attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" }
- attributes #1 = { noduplicate nounwind }
- attributes #2 = { nounwind readnone }
- attributes #3 = { argmemonly nounwind willreturn }
- attributes #4 = { argmemonly nounwind readonly willreturn }
- attributes #5 = { nounwind }
-
- !llvm.module.flags = !{!0, !1}
- !llvm.ident = !{!2}
-
- !0 = !{i32 1, !"wchar_size", i32 4}
- !1 = !{i32 1, !"min_enum_size", i32 4}
- !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"}
- !3 = !{!4, !4, i64 0}
- !4 = !{!"int", !5, i64 0}
- !5 = !{!"omnipotent char", !6, i64 0}
- !6 = !{!"Simple C++ TBAA"}
- !7 = distinct !{!7, !8}
- !8 = !{!"llvm.loop.isvectorized", i32 1}
+ declare void @llvm.set.loop.iterations.i32(i32)
+ declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
...
---
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
- stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
constants: []
machineFunctionInfo: {}
body: |
+ ; CHECK-LABEL: name: CPSR_not_dead
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8
+ ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ ; CHECK: t2IT 11, 8, implicit-def $itstate
+ ; CHECK: tPOP_RET 11, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+ ; CHECK: $lr = MVE_DLSTP_32 renamable $r3
+ ; CHECK: bb.1.vector.body:
+ ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
+ ; CHECK: renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
+ ; CHECK: t2IT 11, 8, implicit-def $itstate
+ ; CHECK: tPOP_RET 11, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+ ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4)
+ ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4)
+ ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4)
+ ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: t2IT 11, 8, implicit-def dead $itstate
+ ; CHECK: tPOP_RET 14, $noreg, def $r4, def $pc
bb.0.entry:
successors: %bb.1(0x80000000)
- liveins: $r0, $r1, $r2, $r3, $lr
+ liveins: $r0, $r1, $r2, $r3, $r4, $lr
- frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
- frame-setup CFI_INSTRUCTION offset $r7, -8
- $r7 = frame-setup tMOVr $sp, 14, $noreg
- frame-setup CFI_INSTRUCTION def_cfa_register $r7
+ frame-setup CFI_INSTRUCTION offset $r4, -8
tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
t2IT 11, 8, implicit-def $itstate
- tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ tPOP_RET 11, killed $cpsr, def $r4, def $pc, implicit killed $itstate
renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
renamable $lr = t2MOVi 1, 14, $noreg, $noreg
renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
- renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
- t2DoLoopStart renamable $lr
+ renamable $r4 = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ t2DoLoopStart renamable $r4
+ $r12 = tMOVr killed $r4, 14, $noreg
bb.1.vector.body:
successors: %bb.1(0x7c000000), %bb.2(0x04000000)
- liveins: $lr, $r0, $r1, $r2, $r3
+ liveins: $r0, $r1, $r2, $r3, $r12
renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
- MVE_VPST 4, implicit $vpr
- renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3)
- renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3)
+ $lr = tMOVr $r12, 14, $noreg
+ renamable $r12 = nsw t2SUBri killed $r12, 1, 14, $noreg, $noreg
renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
+ t2IT 11, 8, implicit-def $itstate
+ tPOP_RET 11, killed $cpsr, def $r4, def $pc, implicit killed $itstate
+ MVE_VPST 4, implicit $vpr
+ renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ renamable $lr = t2LoopDec killed renamable $lr, 1
renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
MVE_VPST 8, implicit $vpr
- renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3)
- renamable $lr = t2LoopDec killed renamable $lr, 1
- t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+ renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr
tB %bb.2, 14, $noreg
bb.2.for.cond.cleanup:
- tPOP_RET 14, $noreg, def $r7, def $pc
+ t2IT 11, 8, implicit-def $itstate
+ tPOP_RET 14, $noreg, def $r4, def $pc
...
+++ /dev/null
-# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
-
-# Local use after def, this mov is using r3:
-#
-# $r2 = tMOVr killed $r3, 14, $noreg
-#
-# We should optimise away the SUB
-
-# CHECK: bb.1.vector.body:
-# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
-# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
-
---- |
- target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
- target triple = "thumbv8.1m.main-arm-unknown-eabi"
-
- define dso_local void @local_use_after_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
- entry:
- %cmp8 = icmp sgt i32 %N, 0
- %0 = add i32 %N, 3
- %1 = lshr i32 %0, 2
- %2 = shl nuw i32 %1, 2
- %3 = add i32 %2, -4
- %4 = lshr i32 %3, 2
- %5 = add nuw nsw i32 %4, 1
- br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
-
- vector.ph: ; preds = %entry
- call void @llvm.set.loop.iterations.i32(i32 %5)
- br label %vector.body
-
- vector.body: ; preds = %vector.body, %vector.ph
- %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
- %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
- %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
- %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
- %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
- %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
- %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
- %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
- %8 = call <4 x i1> @llvm.arm.vctp32(i32 %7)
- %9 = sub i32 %7, 4
- %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3
- %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3
- %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
- call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3
- %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
- %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
- %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
- %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
- %12 = icmp ne i32 %11, 0
- br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7
-
- for.cond.cleanup: ; preds = %vector.body, %entry
- ret void
- }
- declare void @llvm.set.loop.iterations.i32(i32) #1
- declare <4 x i1> @llvm.arm.vctp32(i32) #2
- declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
- declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3
- declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4
- declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3
- declare void @llvm.stackprotector(i8*, i8**) #5
-
- attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" }
- attributes #1 = { noduplicate nounwind }
- attributes #2 = { nounwind readnone }
- attributes #3 = { argmemonly nounwind willreturn }
- attributes #4 = { argmemonly nounwind readonly willreturn }
- attributes #5 = { nounwind }
-
- !llvm.module.flags = !{!0, !1}
- !llvm.ident = !{!2}
-
- !0 = !{i32 1, !"wchar_size", i32 4}
- !1 = !{i32 1, !"min_enum_size", i32 4}
- !2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"}
- !3 = !{!4, !4, i64 0}
- !4 = !{!"int", !5, i64 0}
- !5 = !{!"omnipotent char", !6, i64 0}
- !6 = !{!"Simple C++ TBAA"}
- !7 = distinct !{!7, !8}
- !8 = !{!"llvm.loop.isvectorized", i32 1}
-
-...
----
-name: local_use_after_def
-alignment: 2
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-registers: []
-liveins:
- - { reg: '$r0', virtual-reg: '' }
- - { reg: '$r1', virtual-reg: '' }
- - { reg: '$r2', virtual-reg: '' }
- - { reg: '$r3', virtual-reg: '' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 8
- offsetAdjustment: 0
- maxAlignment: 4
- adjustsStack: false
- hasCalls: false
- stackProtector: ''
- maxCallFrameSize: 0
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- localFrameSize: 0
- savePoint: ''
- restorePoint: ''
-fixedStack: []
-stack:
- - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
- stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
- stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites: []
-constants: []
-machineFunctionInfo: {}
-body: |
- bb.0.entry:
- successors: %bb.1(0x80000000)
- liveins: $r0, $r1, $r2, $r3, $lr
-
- frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
- frame-setup CFI_INSTRUCTION def_cfa_offset 8
- frame-setup CFI_INSTRUCTION offset $lr, -4
- frame-setup CFI_INSTRUCTION offset $r7, -8
- $r7 = frame-setup tMOVr $sp, 14, $noreg
- frame-setup CFI_INSTRUCTION def_cfa_register $r7
- tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
- t2IT 11, 8, implicit-def $itstate
- tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
- renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
- renamable $lr = t2MOVi 1, 14, $noreg, $noreg
- renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
- renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
- renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
- t2DoLoopStart renamable $lr
-
- bb.1.vector.body:
- successors: %bb.1(0x7c000000), %bb.2(0x04000000)
- liveins: $lr, $r0, $r1, $r2, $r3
-
- renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
- MVE_VPST 4, implicit $vpr
- renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3)
- renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3)
- renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
- renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
- MVE_VPST 8, implicit $vpr
- renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3)
- renamable $lr = t2LoopDec killed renamable $lr, 1
- $r2 = tMOVr killed $r3, 14, $noreg
- t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
- tB %bb.2, 14, $noreg
-
- bb.2.for.cond.cleanup:
- tPOP_RET 14, $noreg, def $r7, def $pc
-
-...
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vldrwt.u32 q3, [r1], #16
-; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vfma.f32 q0, q3, q2
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: .LBB2_3: @ %else25
; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT: vmul.f16 q5, q6, q5
-; CHECK-NEXT: sub.w lr, lr, #1
+; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: vmovx.f16 s2, s21
; CHECK-NEXT: vmovx.f16 s0, s20
; CHECK-NEXT: vcvtb.f32.f16 s27, s2
-; CHECK-NEXT: adds r0, #8
-; CHECK-NEXT: vcvtb.f32.f16 s26, s21
; CHECK-NEXT: adds r1, #8
-; CHECK-NEXT: vcvtb.f32.f16 s25, s0
+; CHECK-NEXT: vcvtb.f32.f16 s26, s21
; CHECK-NEXT: adds r3, #4
+; CHECK-NEXT: vcvtb.f32.f16 s25, s0
+; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: vcvtb.f32.f16 s24, s20
; CHECK-NEXT: vadd.f32 q5, q3, q6
-; CHECK-NEXT: cmp.w lr, #0
; CHECK-NEXT: bne .LBB2_4
; CHECK-NEXT: b .LBB2_21
; CHECK-NEXT: .LBB2_4: @ %vector.body
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
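+
+# The sub of the element count is 7, but MVE_VCTP16 processes 8 elements per
+# iteration, so the update chain isn't what we expect and the loop is not
+# tail predicated.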
+--- |
+ define dso_local void @incorrect_sub_16(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+ entry:
+ %cmp8 = icmp sgt i32 %N, 0
+ %0 = add i32 %N, 3
+ %1 = lshr i32 %0, 2
+ %2 = shl nuw i32 %1, 2
+ %3 = add i32 %2, -4
+ %4 = lshr i32 %3, 2
+ %5 = add nuw nsw i32 %4, 1
+ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+ vector.ph: ; preds = %entry
+ call void @llvm.set.loop.iterations.i32(i32 %5)
+ br label %vector.body
+
+ vector.body: ; preds = %vector.body, %vector.ph
+ %lsr.iv17 = phi i16* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+ %lsr.iv14 = phi i16* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+ %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+ %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+ %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
+ %lsr.iv13 = bitcast i16* %lsr.iv to <8 x i16>*
+ %lsr.iv1416 = bitcast i16* %lsr.iv14 to <8 x i16>*
+ %lsr.iv1719 = bitcast i16* %lsr.iv17 to <8 x i16>*
+ %8 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7)
+ %9 = sub i32 %7, 7
+ %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv13, i32 4, <8 x i1> %8, <8 x i16> undef)
+ %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv1416, i32 4, <8 x i1> %8, <8 x i16> undef)
+ %10 = add nsw <8 x i16> %wide.masked.load12, %wide.masked.load
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %10, <8 x i16>* %lsr.iv1719, i32 4, <8 x i1> %8)
+ %scevgep = getelementptr i16, i16* %lsr.iv, i32 8
+ %scevgep15 = getelementptr i16, i16* %lsr.iv14, i32 8
+ %scevgep18 = getelementptr i16, i16* %lsr.iv17, i32 8
+ %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+ %12 = icmp ne i32 %11, 0
+ br i1 %12, label %vector.body, label %for.cond.cleanup
+
+ for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+ }
+ declare void @llvm.set.loop.iterations.i32(i32)
+ declare <8 x i1> @llvm.arm.mve.vctp16(i32)
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+ declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
+ declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
+...
+---
+name: incorrect_sub_16
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$r0', virtual-reg: '' }
+ - { reg: '$r1', virtual-reg: '' }
+ - { reg: '$r2', virtual-reg: '' }
+ - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 8
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ ; CHECK-LABEL: name: incorrect_sub_16
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ ; CHECK: t2IT 11, 8, implicit-def $itstate
+ ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
+ ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ ; CHECK: $lr = t2DLS killed renamable $lr
+ ; CHECK: bb.1.vector.body:
+ ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
+ ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg
+ ; CHECK: MVE_VPST 4, implicit $vpr
+ ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRHU16_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 7, 14, $noreg
+ ; CHECK: renamable $q0 = nsw MVE_VADDi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ ; CHECK: MVE_VPST 8, implicit $vpr
+ ; CHECK: renamable $r0 = MVE_VSTRHU16_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc
+ bb.0.entry:
+ successors: %bb.1(0x80000000)
+ liveins: $r0, $r1, $r2, $r3, $r7, $lr
+
+ frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ frame-setup CFI_INSTRUCTION offset $lr, -4
+ frame-setup CFI_INSTRUCTION offset $r7, -8
+ tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ t2IT 11, 8, implicit-def $itstate
+ tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
+ renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+ renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ t2DoLoopStart renamable $lr
+
+ bb.1.vector.body:
+ successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ liveins: $lr, $r0, $r1, $r2, $r3
+
+ renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg
+ MVE_VPST 4, implicit $vpr
+ renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ renamable $r2, renamable $q1 = MVE_VLDRHU16_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 7, 14, $noreg
+ renamable $q0 = nsw MVE_VADDi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ MVE_VPST 8, implicit $vpr
+ renamable $r0 = MVE_VSTRHU16_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ renamable $lr = t2LoopDec killed renamable $lr, 1
+ t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+ tB %bb.2, 14, $noreg
+
+ bb.2.for.cond.cleanup:
+ tPOP_RET 14, $noreg, def $r7, def $pc
+
+...
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+
+# The induction variable feeding the vctp32 is decremented by 5 rather than
+# by the vector width of 4, so tail-predication is not safe here: the VCTP
+# and the SUB must be retained, and the loop should only be converted to the
+# DLS/LE form.
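+#
+# The mismatch is visible in the IR below: vctp32 covers 4 elements per
+# iteration, but the count is decremented by 5:
+#
+#   %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
+#   %9 = sub i32 %7, 5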
+
+--- |
+ define dso_local void @incorrect_sub_32(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+ entry:
+ %cmp8 = icmp sgt i32 %N, 0
+ %0 = add i32 %N, 3
+ %1 = lshr i32 %0, 2
+ %2 = shl nuw i32 %1, 2
+ %3 = add i32 %2, -4
+ %4 = lshr i32 %3, 2
+ %5 = add nuw nsw i32 %4, 1
+ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+ vector.ph: ; preds = %entry
+ call void @llvm.set.loop.iterations.i32(i32 %5)
+ br label %vector.body
+
+ vector.body: ; preds = %vector.body, %vector.ph
+ %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+ %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+ %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+ %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
+ %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+ %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+ %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+ %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
+ %9 = sub i32 %7, 5
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef)
+ %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef)
+ %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8)
+ %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+ %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+ %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+ %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+ %12 = icmp ne i32 %11, 0
+ br i1 %12, label %vector.body, label %for.cond.cleanup
+
+ for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+ }
+ declare void @llvm.set.loop.iterations.i32(i32)
+ declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+
+...
+---
+name: incorrect_sub_32
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$r0', virtual-reg: '' }
+ - { reg: '$r1', virtual-reg: '' }
+ - { reg: '$r2', virtual-reg: '' }
+ - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 8
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ ; CHECK-LABEL: name: incorrect_sub_32
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ ; CHECK: t2IT 11, 8, implicit-def $itstate
+ ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
+ ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ ; CHECK: $lr = t2DLS killed renamable $lr
+ ; CHECK: bb.1.vector.body:
+ ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
+ ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+ ; CHECK: MVE_VPST 4, implicit $vpr
+ ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 5, 14, $noreg
+ ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ ; CHECK: MVE_VPST 8, implicit $vpr
+ ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc
+ bb.0.entry:
+ successors: %bb.1(0x80000000)
+ liveins: $r0, $r1, $r2, $r3, $r7, $lr
+
+ frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ frame-setup CFI_INSTRUCTION offset $lr, -4
+ frame-setup CFI_INSTRUCTION offset $r7, -8
+ tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ t2IT 11, 8, implicit-def $itstate
+ tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
+ renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+ renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ t2DoLoopStart renamable $lr
+
+ bb.1.vector.body:
+ successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ liveins: $lr, $r0, $r1, $r2, $r3
+
+ renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+ MVE_VPST 4, implicit $vpr
+ renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 5, 14, $noreg
+ renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ MVE_VPST 8, implicit $vpr
+ renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ renamable $lr = t2LoopDec killed renamable $lr, 1
+ t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+ tB %bb.2, 14, $noreg
+
+ bb.2.for.cond.cleanup:
+ tPOP_RET 14, $noreg, def $r7, def $pc
+
+...
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+--- |
+ define dso_local void @incorrect_sub_8(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i8* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+ entry:
+ %cmp8 = icmp sgt i32 %N, 0
+ %0 = add i32 %N, 3
+ %1 = lshr i32 %0, 2
+ %2 = shl nuw i32 %1, 2
+ %3 = add i32 %2, -4
+ %4 = lshr i32 %3, 2
+ %5 = add nuw nsw i32 %4, 1
+ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+ vector.ph: ; preds = %entry
+ call void @llvm.set.loop.iterations.i32(i32 %5)
+ br label %vector.body
+
+ vector.body: ; preds = %vector.body, %vector.ph
+ %lsr.iv17 = phi i8* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+ %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+ %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+ %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+ %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
+ %lsr.iv13 = bitcast i8* %lsr.iv to <16 x i8>*
+ %lsr.iv1416 = bitcast i8* %lsr.iv14 to <16 x i8>*
+ %lsr.iv1719 = bitcast i8* %lsr.iv17 to <16 x i8>*
+ %8 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %7)
+ %9 = sub i32 %7, 15
+ %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv13, i32 4, <16 x i1> %8, <16 x i8> undef)
+ %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv1416, i32 4, <16 x i1> %8, <16 x i8> undef)
+ %10 = add nsw <16 x i8> %wide.masked.load12, %wide.masked.load
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %10, <16 x i8>* %lsr.iv1719, i32 4, <16 x i1> %8)
+ %scevgep = getelementptr i8, i8* %lsr.iv, i32 16
+ %scevgep15 = getelementptr i8, i8* %lsr.iv14, i32 16
+ %scevgep18 = getelementptr i8, i8* %lsr.iv17, i32 16
+ %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+ %12 = icmp ne i32 %11, 0
+ br i1 %12, label %vector.body, label %for.cond.cleanup
+
+ for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+ }
+ declare void @llvm.set.loop.iterations.i32(i32)
+ declare <16 x i1> @llvm.arm.mve.vctp8(i32)
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+ declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
+ declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
+ declare void @llvm.stackprotector(i8*, i8**)
+...
+---
+name: incorrect_sub_8
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$r0', virtual-reg: '' }
+ - { reg: '$r1', virtual-reg: '' }
+ - { reg: '$r2', virtual-reg: '' }
+ - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 8
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ ; CHECK-LABEL: name: incorrect_sub_8
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ ; CHECK: t2IT 11, 8, implicit-def $itstate
+ ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ ; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
+ ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ ; CHECK: $lr = t2DLS killed renamable $lr
+ ; CHECK: bb.1.vector.body:
+ ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
+ ; CHECK: renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg
+ ; CHECK: MVE_VPST 4, implicit $vpr
+ ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 15, 14, $noreg
+ ; CHECK: renamable $q0 = nsw MVE_VADDi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ ; CHECK: MVE_VPST 8, implicit $vpr
+ ; CHECK: renamable $r0 = MVE_VSTRBU8_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc
+ bb.0.entry:
+ successors: %bb.1(0x80000000)
+ liveins: $r0, $r1, $r2, $r3, $r7, $lr
+
+ frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ frame-setup CFI_INSTRUCTION offset $lr, -4
+ frame-setup CFI_INSTRUCTION offset $r7, -8
+ tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ t2IT 11, 8, implicit-def $itstate
+ tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
+ renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+ renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ t2DoLoopStart renamable $lr
+
+ bb.1.vector.body:
+ successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ liveins: $lr, $r0, $r1, $r2, $r3
+
+ renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg
+ MVE_VPST 4, implicit $vpr
+ renamable $r1, renamable $q0 = MVE_VLDRBU8_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ renamable $r2, renamable $q1 = MVE_VLDRBU8_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 15, 14, $noreg
+ renamable $q0 = nsw MVE_VADDi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ MVE_VPST 8, implicit $vpr
+ renamable $r0 = MVE_VSTRBU8_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ renamable $lr = t2LoopDec killed renamable $lr, 1
+ t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+ tB %bb.2, 14, $noreg
+
+ bb.2.for.cond.cleanup:
+ tPOP_RET 14, $noreg, def $r7, def $pc
+
+...
; CHECK: tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate
; CHECK: renamable $r12 = t2LSRri killed renamable $r3, 1, 14, $noreg, $noreg
; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
- ; CHECK: $lr = MVE_DLSTP_32 renamable $r12
+ ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12
; CHECK: bb.1.vector.body:
; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
- ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg
; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1)
; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg
; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg
- ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1)
; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4)
; CHECK: tPOP_RET 0, killed $cpsr, def $r4, def $pc, implicit killed $itstate
; CHECK: $r12 = t2MOVr killed $r3, 14, $noreg, $noreg
; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg
- ; CHECK: $lr = MVE_DLSTP_32 renamable $r12
+ ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12
; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
; CHECK: bb.1.vector.body:
; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
- ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg
; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1)
; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg
; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg
- ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1)
; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4)
; CHECK: $r12 = t2MOVr killed $r3, 14, $noreg, $noreg
; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg
; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14, $noreg, $noreg
- ; CHECK: $lr = MVE_DLSTP_32 renamable $r12
+ ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12
; CHECK: bb.1.vector.body:
; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
- ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg
; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1)
; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg
; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14, $noreg
- ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1)
; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4)
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vldrh.s32 q2, [r1], #8
; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vldrh.s32 q2, [r1], #8
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: letp lr, .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vldrh.u32 q2, [r1], #8
; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vldrh.u32 q2, [r1], #8
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: letp lr, .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: letp lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
; CHECK-NEXT: adds r5, r0, r4
; CHECK-NEXT: vldrb.u32 q0, [r5]
; CHECK-NEXT: adds r5, r1, r4
+; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrb.u32 q1, [r5]
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: adds r4, #4
-; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vadd.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB5_5
; CHECK-NEXT: vldrh.s32 q0, [r0], #8
; CHECK-NEXT: vldrh.s32 q1, [r1], #8
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vadd.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB6_1
; CHECK-NEXT: adds r5, r0, r4
; CHECK-NEXT: vldrb.u32 q0, [r5]
; CHECK-NEXT: adds r5, r1, r4
+; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vldrb.u32 q1, [r5]
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: adds r4, #4
-; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vadd.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB7_5
; CHECK-NEXT: vldrh.u32 q0, [r0], #8
; CHECK-NEXT: vldrh.u32 q1, [r1], #8
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vadd.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB8_1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vadd.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB9_5
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+ define dso_local arm_aapcs_vfpcc void @remove_mov_lr_chain(float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) #0 {
+ entry:
+ %cmp5 = icmp eq i32 %blockSize, 0
+ br i1 %cmp5, label %while.end, label %while.body.preheader
+
+ while.body.preheader: ; preds = %entry
+ %min.iters.check = icmp ult i32 %blockSize, 4
+ br i1 %min.iters.check, label %while.body.preheader19, label %vector.memcheck
+
+ vector.memcheck: ; preds = %while.body.preheader
+ %scevgep = getelementptr float, float* %pDst, i32 %blockSize
+ %scevgep12 = getelementptr float, float* %pSrc, i32 %blockSize
+ %bound0 = icmp ugt float* %scevgep12, %pDst
+ %bound1 = icmp ugt float* %scevgep, %pSrc
+ %found.conflict = and i1 %bound0, %bound1
+ %0 = lshr i32 %blockSize, 2
+ %1 = shl nuw i32 %0, 2
+ %2 = add i32 %1, -4
+ %3 = lshr i32 %2, 2
+ %4 = add nuw nsw i32 %3, 1
+ br i1 %found.conflict, label %while.body.preheader19, label %vector.ph
+
+ vector.ph: ; preds = %vector.memcheck
+ %n.vec = and i32 %blockSize, -4
+ %ind.end = sub i32 %blockSize, %n.vec
+ %ind.end15 = getelementptr float, float* %pSrc, i32 %n.vec
+ %ind.end17 = getelementptr float, float* %pDst, i32 %n.vec
+ %scevgep9 = getelementptr float, float* %pDst, i32 -4
+ %scevgep14 = getelementptr float, float* %pSrc, i32 -4
+ call void @llvm.set.loop.iterations.i32(i32 %4)
+ br label %vector.body
+
+ vector.body: ; preds = %vector.body, %vector.ph
+ %lsr.iv15 = phi float* [ %scevgep16, %vector.body ], [ %scevgep14, %vector.ph ]
+ %lsr.iv10 = phi float* [ %scevgep11, %vector.body ], [ %scevgep9, %vector.ph ]
+ %5 = phi i32 [ %4, %vector.ph ], [ %7, %vector.body ]
+ %lsr.iv1517 = bitcast float* %lsr.iv15 to <4 x float>*
+ %lsr.iv1012 = bitcast float* %lsr.iv10 to <4 x float>*
+ %scevgep18 = getelementptr <4 x float>, <4 x float>* %lsr.iv1517, i32 1
+ %wide.load = load <4 x float>, <4 x float>* %scevgep18, align 4
+ %6 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %wide.load)
+ %scevgep13 = getelementptr <4 x float>, <4 x float>* %lsr.iv1012, i32 1
+ store <4 x float> %6, <4 x float>* %scevgep13, align 4
+ %scevgep11 = getelementptr float, float* %lsr.iv10, i32 4
+ %scevgep16 = getelementptr float, float* %lsr.iv15, i32 4
+ %7 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %5, i32 1)
+ %8 = icmp ne i32 %7, 0
+ br i1 %8, label %vector.body, label %middle.block
+
+ middle.block: ; preds = %vector.body
+ %cmp.n = icmp eq i32 %n.vec, %blockSize
+ br i1 %cmp.n, label %while.end, label %while.body.preheader19
+
+ while.body.preheader19: ; preds = %middle.block, %vector.memcheck, %while.body.preheader
+ %blkCnt.08.ph = phi i32 [ %blockSize, %vector.memcheck ], [ %blockSize, %while.body.preheader ], [ %ind.end, %middle.block ]
+ %pSrc.addr.07.ph = phi float* [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end15, %middle.block ]
+ %pDst.addr.06.ph = phi float* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end17, %middle.block ]
+ %scevgep1 = getelementptr float, float* %pSrc.addr.07.ph, i32 -1
+ %scevgep4 = getelementptr float, float* %pDst.addr.06.ph, i32 -1
+ call void @llvm.set.loop.iterations.i32(i32 %blkCnt.08.ph)
+ br label %while.body
+
+ while.body: ; preds = %while.body, %while.body.preheader19
+ %lsr.iv5 = phi float* [ %scevgep6, %while.body ], [ %scevgep4, %while.body.preheader19 ]
+ %lsr.iv = phi float* [ %scevgep2, %while.body ], [ %scevgep1, %while.body.preheader19 ]
+ %9 = phi i32 [ %blkCnt.08.ph, %while.body.preheader19 ], [ %12, %while.body ]
+ %scevgep3 = getelementptr float, float* %lsr.iv, i32 1
+ %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1
+ %10 = load float, float* %scevgep3, align 4
+ %11 = tail call fast float @llvm.fabs.f32(float %10)
+ store float %11, float* %scevgep7, align 4
+ %scevgep2 = getelementptr float, float* %lsr.iv, i32 1
+ %scevgep6 = getelementptr float, float* %lsr.iv5, i32 1
+ %12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1)
+ %13 = icmp ne i32 %12, 0
+ br i1 %13, label %while.body, label %while.end
+
+ while.end: ; preds = %while.body, %middle.block, %entry
+ ret void
+ }
+ declare float @llvm.fabs.f32(float)
+ declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+ declare void @llvm.set.loop.iterations.i32(i32)
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+
+...
+---
+name: remove_mov_lr_chain
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$r0', virtual-reg: '' }
+ - { reg: '$r1', virtual-reg: '' }
+ - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 16
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ ; CHECK-LABEL: name: remove_mov_lr_chain
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.9(0x30000000), %bb.1(0x50000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r4, $r5, $r7
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16
+ ; CHECK: tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr
+ ; CHECK: tBcc %bb.9, 0, killed $cpsr
+ ; CHECK: bb.1.while.body.preheader:
+ ; CHECK: successors: %bb.6(0x40000000), %bb.2(0x40000000)
+ ; CHECK: liveins: $r0, $r1, $r2
+ ; CHECK: tCMPi8 renamable $r2, 4, 14, $noreg, implicit-def $cpsr
+ ; CHECK: tBcc %bb.6, 3, killed $cpsr
+ ; CHECK: bb.2.vector.memcheck:
+ ; CHECK: successors: %bb.3(0x40000000), %bb.6(0x40000000)
+ ; CHECK: liveins: $r0, $r1, $r2
+ ; CHECK: renamable $r3 = t2ADDrs renamable $r0, renamable $r2, 18, 14, $noreg, $noreg
+ ; CHECK: tCMPr killed renamable $r3, renamable $r1, 14, $noreg, implicit-def $cpsr
+ ; CHECK: t2IT 8, 4, implicit-def $itstate
+ ; CHECK: renamable $r3 = t2ADDrs renamable $r1, renamable $r2, 18, 8, $cpsr, $noreg, implicit $itstate
+ ; CHECK: tCMPr killed renamable $r3, renamable $r0, 8, killed $cpsr, implicit-def $cpsr, implicit killed $itstate
+ ; CHECK: tBcc %bb.6, 8, killed $cpsr
+ ; CHECK: bb.3.vector.ph:
+ ; CHECK: successors: %bb.4(0x80000000)
+ ; CHECK: liveins: $r0, $r1, $r2
+ ; CHECK: renamable $r4 = t2BICri renamable $r2, 3, 14, $noreg, $noreg
+ ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
+ ; CHECK: renamable $r12 = t2SUBri renamable $r4, 4, 14, $noreg, $noreg
+ ; CHECK: renamable $r7, dead $cpsr = tSUBrr renamable $r2, renamable $r4, 14, $noreg
+ ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2ADDrs renamable $r0, renamable $r4, 18, 14, $noreg, $noreg
+ ; CHECK: $lr = t2DLS renamable $r3
+ ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 16, 14, $noreg
+ ; CHECK: dead $r5 = tMOVr killed $r3, 14, $noreg
+ ; CHECK: renamable $r3 = t2ADDrs renamable $r1, renamable $r4, 18, 14, $noreg, $noreg
+ ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 16, 14, $noreg
+ ; CHECK: bb.4.vector.body:
+ ; CHECK: successors: %bb.4(0x7c000000), %bb.5(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r7, $r12
+ ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_pre killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.scevgep18, align 4)
+ ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VABSf32 killed renamable $q0, 0, $noreg, undef renamable $q0
+ ; CHECK: renamable $r1 = MVE_VSTRBU8_pre killed renamable $q0, killed renamable $r1, 16, 0, $noreg :: (store 16 into %ir.scevgep13, align 4)
+ ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.4
+ ; CHECK: bb.5.middle.block:
+ ; CHECK: successors: %bb.7(0x80000000)
+ ; CHECK: liveins: $r2, $r3, $r4, $r7, $r12
+ ; CHECK: tCMPr killed renamable $r4, killed renamable $r2, 14, $noreg, implicit-def $cpsr
+ ; CHECK: $lr = tMOVr killed $r7, 14, $noreg
+ ; CHECK: t2IT 0, 8, implicit-def $itstate
+ ; CHECK: tPOP_RET 0, killed $cpsr, def $r4, def $r5, def $r7, def $pc, implicit killed $itstate
+ ; CHECK: tB %bb.7, 14, $noreg
+ ; CHECK: bb.6:
+ ; CHECK: successors: %bb.7(0x80000000)
+ ; CHECK: liveins: $r0, $r1, $r2
+ ; CHECK: $lr = tMOVr killed $r2, 14, $noreg
+ ; CHECK: $r12 = tMOVr killed $r0, 14, $noreg
+ ; CHECK: $r3 = tMOVr killed $r1, 14, $noreg
+ ; CHECK: bb.7.while.body.preheader19:
+ ; CHECK: successors: %bb.8(0x80000000)
+ ; CHECK: liveins: $lr, $r3, $r12
+ ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg
+ ; CHECK: renamable $r1 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ ; CHECK: $lr = t2DLS killed renamable $lr
+ ; CHECK: bb.8.while.body:
+ ; CHECK: successors: %bb.8(0x7c000000), %bb.9(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1
+ ; CHECK: renamable $s0 = VLDRS renamable $r1, 1, 14, $noreg :: (load 4 from %ir.scevgep3)
+ ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 4, 14, $noreg
+ ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VABSS killed renamable $s0, 14, $noreg
+ ; CHECK: VSTRS killed renamable $s0, renamable $r0, 1, 14, $noreg :: (store 4 into %ir.scevgep7)
+ ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 4, 14, $noreg
+ ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.8
+ ; CHECK: bb.9.while.end:
+ ; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc
+ bb.0.entry:
+ successors: %bb.9(0x30000000), %bb.1(0x50000000)
+ liveins: $r0, $r1, $r2, $r4, $r5, $r7, $lr
+
+ frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 16
+ frame-setup CFI_INSTRUCTION offset $lr, -4
+ frame-setup CFI_INSTRUCTION offset $r7, -8
+ frame-setup CFI_INSTRUCTION offset $r5, -12
+ frame-setup CFI_INSTRUCTION offset $r4, -16
+ tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr
+ tBcc %bb.9, 0, killed $cpsr
+
+ bb.1.while.body.preheader:
+ successors: %bb.6(0x40000000), %bb.2(0x40000000)
+ liveins: $r0, $r1, $r2
+
+ tCMPi8 renamable $r2, 4, 14, $noreg, implicit-def $cpsr
+ tBcc %bb.6, 3, killed $cpsr
+
+ bb.2.vector.memcheck:
+ successors: %bb.3(0x40000000), %bb.6(0x40000000)
+ liveins: $r0, $r1, $r2
+
+ renamable $r3 = t2ADDrs renamable $r0, renamable $r2, 18, 14, $noreg, $noreg
+ tCMPr killed renamable $r3, renamable $r1, 14, $noreg, implicit-def $cpsr
+ t2IT 8, 4, implicit-def $itstate
+ renamable $r3 = t2ADDrs renamable $r1, renamable $r2, 18, 8, $cpsr, $noreg, implicit $itstate
+ tCMPr killed renamable $r3, renamable $r0, 8, killed $cpsr, implicit-def $cpsr, implicit killed $itstate
+ tBcc %bb.6, 8, killed $cpsr
+
+ bb.3.vector.ph:
+ successors: %bb.4(0x80000000)
+ liveins: $r0, $r1, $r2
+
+ renamable $r4 = t2BICri renamable $r2, 3, 14, $noreg, $noreg
+ renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
+ renamable $r12 = t2SUBri renamable $r4, 4, 14, $noreg, $noreg
+ renamable $r7, dead $cpsr = tSUBrr renamable $r2, renamable $r4, 14, $noreg
+ renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
+ renamable $r12 = t2ADDrs renamable $r0, renamable $r4, 18, 14, $noreg, $noreg
+ t2DoLoopStart renamable $r3
+ renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 16, 14, $noreg
+ $r5 = tMOVr killed $r3, 14, $noreg
+ renamable $r3 = t2ADDrs renamable $r1, renamable $r4, 18, 14, $noreg, $noreg
+ renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 16, 14, $noreg
+
+ bb.4.vector.body:
+ successors: %bb.4(0x7c000000), %bb.5(0x04000000)
+ liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $r12
+
+ renamable $r0, renamable $q0 = MVE_VLDRWU32_pre killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.scevgep18, align 4)
+ $lr = tMOVr killed $r5, 14, $noreg
+ renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VABSf32 killed renamable $q0, 0, $noreg, undef renamable $q0
+ renamable $r1 = MVE_VSTRBU8_pre killed renamable $q0, killed renamable $r1, 16, 0, $noreg :: (store 16 into %ir.scevgep13, align 4)
+ renamable $lr = t2LoopDec killed renamable $lr, 1
+ $r5 = tMOVr $lr, 14, $noreg
+ t2LoopEnd killed renamable $lr, %bb.4, implicit-def dead $cpsr
+ tB %bb.5, 14, $noreg
+
+ bb.5.middle.block:
+ successors: %bb.7(0x80000000)
+ liveins: $r2, $r3, $r4, $r7, $r12
+
+ tCMPr killed renamable $r4, killed renamable $r2, 14, $noreg, implicit-def $cpsr
+ $lr = tMOVr killed $r7, 14, $noreg
+ t2IT 0, 8, implicit-def $itstate
+ tPOP_RET 0, killed $cpsr, def $r4, def $r5, def $r7, def $pc, implicit killed $itstate
+ tB %bb.7, 14, $noreg
+
+ bb.6:
+ successors: %bb.7(0x80000000)
+ liveins: $r0, $r1, $r2
+
+ $lr = tMOVr killed $r2, 14, $noreg
+ $r12 = tMOVr killed $r0, 14, $noreg
+ $r3 = tMOVr killed $r1, 14, $noreg
+
+ bb.7.while.body.preheader19:
+ successors: %bb.8(0x80000000)
+ liveins: $lr, $r3, $r12
+
+ renamable $r0, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg
+ renamable $r1 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ t2DoLoopStart renamable $lr
+
+ bb.8.while.body:
+ successors: %bb.8(0x7c000000), %bb.9(0x04000000)
+ liveins: $lr, $r0, $r1
+
+ renamable $s0 = VLDRS renamable $r1, 1, 14, $noreg :: (load 4 from %ir.scevgep3)
+ renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 4, 14, $noreg
+ renamable $s0 = nnan ninf nsz arcp contract afn reassoc VABSS killed renamable $s0, 14, $noreg
+ VSTRS killed renamable $s0, renamable $r0, 1, 14, $noreg :: (store 4 into %ir.scevgep7)
+ renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 4, 14, $noreg
+ renamable $lr = t2LoopDec killed renamable $lr, 1
+ t2LoopEnd renamable $lr, %bb.8, implicit-def dead $cpsr
+ tB %bb.9, 14, $noreg
+
+ bb.9.while.end:
+ tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc
+
+...
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s
-# CHECK-NOT: $lr = t2DLS
-# CHECK: $lr = tMOVr $r0, 14
-# CHECK-NOT: $lr = t2LEUpdate
--- |
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main"
-
+
define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) {
entry:
%scevgep = getelementptr i32, i32* %q, i32 -1
preheader:
br label %while.body
-
+
while.body: ; preds = %while.body, %entry
%lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %preheader ]
%lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %preheader ]
%2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
%3 = icmp ne i32 %2, 0
br i1 %3, label %while.body, label %while.end
-
+
while.end: ; preds = %while.body
ret i32 0
}
-
+
declare void @llvm.set.loop.iterations.i32(i32) #0
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0
-
+
attributes #0 = { noduplicate nounwind }
attributes #1 = { nounwind }
restorePoint: ''
fixedStack: []
stack:
- - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
- stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+ - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
- stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
constants: []
machineFunctionInfo: {}
body: |
+ ; CHECK-LABEL: name: do_copy
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $lr, $r2, $r7
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK: renamable $r0 = t2SUBri killed renamable $lr, 4, 14, $noreg, def dead $cpsr
+ ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg
+ ; CHECK: bb.1.preheader:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: liveins: $r0, $r1
+ ; CHECK: $lr = tMOVr $r0, 14, $noreg
+ ; CHECK: bb.2.while.body:
+ ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1
+ ; CHECK: renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6)
+ ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2)
+ ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14, $noreg, def $cpsr
+ ; CHECK: tBcc %bb.2, 1, killed $cpsr
+ ; CHECK: tB %bb.3, 14, $noreg
+ ; CHECK: bb.3.while.end:
+ ; CHECK: $r0, dead $cpsr = tMOVi8 0, 14, $noreg
+ ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0
bb.0.entry:
successors: %bb.1(0x80000000)
liveins: $r0, $r1, $r2, $r7, $lr
-
+
frame-setup tPUSH 14, $noreg, killed $r7, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
successors: %bb.2(0x80000000)
liveins: $r0
$lr = tMOVr $r0, 14, $noreg
-
+
bb.2.while.body:
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
liveins: $lr, $r0, $r1
-
+
renamable $r2, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep6)
early-clobber renamable $r0 = t2STR_PRE killed renamable $r2, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep2)
renamable $lr = t2LoopDec killed renamable $lr, 1
t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
tB %bb.3, 14, $noreg
-
+
bb.3.while.end:
$r0, dead $cpsr = tMOVi8 0, 14, $noreg
tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0
%vctp = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp8)
%and = and <4 x i1> %vctp, %invariant.mask
%tmp11 = sub i32 %tmp8, 4
- %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef), !tbaa !3
+ %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef)
%tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer
%tmp20 = and <4 x i1> %tmp18, %vctp
- %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20, <4 x i32> undef), !tbaa !3
+ %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20, <4 x i32> undef)
%tmp23 = mul nsw <4 x i32> %tmp22, %tmp17
- call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20), !tbaa !3
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %tmp20)
%tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp7, i32 1)
%tmp13 = icmp ne i32 %tmp12, 0
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
%scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
- br i1 %tmp13, label %bb9, label %bb27, !llvm.loop !7
+ br i1 %tmp13, label %bb9, label %bb27
bb27: ; preds = %bb9, %bb
ret void
}
- ; Function Attrs: argmemonly nounwind readonly willreturn
- declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
- ; Function Attrs: argmemonly nounwind willreturn
- declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
- ; Function Attrs: noduplicate nounwind
- declare void @llvm.set.loop.iterations.i32(i32) #3
- ; Function Attrs: noduplicate nounwind
- declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
- ; Function Attrs: nounwind readnone
- declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
- ; Function Attrs: nounwind readnone
- declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #4
- ; Function Attrs: nounwind
- declare void @llvm.stackprotector(i8*, i8**) #5
-
- attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mve" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { argmemonly nounwind readonly willreturn "target-features"="+mve" }
- attributes #2 = { argmemonly nounwind willreturn "target-features"="+mve" }
- attributes #3 = { noduplicate nounwind "target-features"="+mve" }
- attributes #4 = { nounwind readnone "target-features"="+mve" }
- attributes #5 = { nounwind }
-
- !llvm.module.flags = !{!0, !1}
- !llvm.ident = !{!2}
-
- !0 = !{i32 1, !"wchar_size", i32 4}
- !1 = !{i32 1, !"min_enum_size", i32 4}
- !2 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 8f92f97150cbdd3b9f569570b8377db78ed61a9e)"}
- !3 = !{!4, !4, i64 0}
- !4 = !{!"int", !5, i64 0}
- !5 = !{!"omnipotent char", !6, i64 0}
- !6 = !{!"Simple C/C++ TBAA"}
- !7 = distinct !{!7, !8}
- !8 = !{!"llvm.loop.isvectorized", i32 1}
+ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+ declare void @llvm.set.loop.iterations.i32(i32)
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+ declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+ declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
...
---
; CHECK: bb.0.bb:
; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000)
; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
- ; CHECK: frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0)
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
- ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3)
+ ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
; CHECK: MVE_VPST 4, implicit $vpr
; CHECK: renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr
- ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3)
+ ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
; CHECK: renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
; CHECK: MVE_VPST 8, implicit $vpr
- ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3)
+ ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
; CHECK: $r0 = tMOVr $r3, 14, $noreg
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
; CHECK: bb.3.bb27:
successors: %bb.3(0x30000000), %bb.1(0x50000000)
liveins: $r0, $r1, $r2, $r3, $lr
- frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0)
MVE_VPST 4, implicit $vpr
renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
- renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3)
+ renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
MVE_VPST 4, implicit $vpr
renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr
- renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3)
+ renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
MVE_VPST 8, implicit $vpr
- MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3)
+ MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
renamable $lr = t2LoopDec killed renamable $lr, 1
$r0 = tMOVr $r3, 14, $noreg
t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
%vctp = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp8)
%and = and <4 x i1> %vctp, %invariant.mask
%tmp11 = sub i32 %tmp8, 4
- %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef), !tbaa !3
- %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and, <4 x i32> undef), !tbaa !3
+ %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef)
+ %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and, <4 x i32> undef)
%tmp23 = mul nsw <4 x i32> %tmp22, %tmp17
- call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and), !tbaa !3
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and)
%tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp7, i32 1)
%tmp13 = icmp ne i32 %tmp12, 0
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
%scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
- br i1 %tmp13, label %bb9, label %bb27, !llvm.loop !7
+ br i1 %tmp13, label %bb9, label %bb27
bb27: ; preds = %bb9, %bb
ret void
}
- ; Function Attrs: argmemonly nounwind readonly willreturn
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
- ; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
- ; Function Attrs: noduplicate nounwind
declare void @llvm.set.loop.iterations.i32(i32) #3
- ; Function Attrs: noduplicate nounwind
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
- ; Function Attrs: nounwind readnone
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4
- ; Function Attrs: nounwind readnone
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #4
- ; Function Attrs: nounwind
- declare void @llvm.stackprotector(i8*, i8**) #5
-
- attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mve" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { argmemonly nounwind readonly willreturn "target-features"="+mve" }
- attributes #2 = { argmemonly nounwind willreturn "target-features"="+mve" }
- attributes #3 = { noduplicate nounwind "target-features"="+mve" }
- attributes #4 = { nounwind readnone "target-features"="+mve" }
- attributes #5 = { nounwind }
-
- !llvm.module.flags = !{!0, !1}
- !llvm.ident = !{!2}
-
- !0 = !{i32 1, !"wchar_size", i32 4}
- !1 = !{i32 1, !"min_enum_size", i32 4}
- !2 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 8f92f97150cbdd3b9f569570b8377db78ed61a9e)"}
- !3 = !{!4, !4, i64 0}
- !4 = !{!"int", !5, i64 0}
- !5 = !{!"omnipotent char", !6, i64 0}
- !6 = !{!"Simple C/C++ TBAA"}
- !7 = distinct !{!7, !8}
- !8 = !{!"llvm.loop.isvectorized", i32 1}
...
---
; CHECK: bb.0.bb:
; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000)
; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
- ; CHECK: frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: liveins: $lr, $r0, $r1, $r3
; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0)
; CHECK: MVE_VPST 4, implicit $vpr
- ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3)
- ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3)
+ ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
+ ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
; CHECK: renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
; CHECK: MVE_VPST 8, implicit $vpr
- ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3)
+ ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
; CHECK: $r0 = tMOVr $r3, 14, $noreg
; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2
; CHECK: bb.3.bb27:
successors: %bb.3(0x30000000), %bb.1(0x50000000)
liveins: $r0, $r1, $r2, $r3, $lr
- frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup tPUSH 14, $noreg, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0)
MVE_VPST 2, implicit $vpr
renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
- renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3)
- renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4, !tbaa !3)
+ renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
+ renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
renamable $q0 = nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
MVE_VPST 8, implicit $vpr
- MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3)
+ MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
renamable $lr = t2LoopDec killed renamable $lr, 1
$r0 = tMOVr $r3, 14, $noreg
t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+ define dso_local void @vctp_tsubi3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+ entry:
+ %cmp8 = icmp sgt i32 %N, 0
+ %0 = add i32 %N, 3
+ %1 = lshr i32 %0, 2
+ %2 = shl nuw i32 %1, 2
+ %3 = add i32 %2, -4
+ %4 = lshr i32 %3, 2
+ %5 = add nuw nsw i32 %4, 1
+ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+ vector.ph: ; preds = %entry
+ call void @llvm.set.loop.iterations.i32(i32 %5)
+ br label %vector.body
+
+ vector.body: ; preds = %vector.body, %vector.ph
+ %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+ %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+ %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+ %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
+ %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+ %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+ %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+ %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
+ %9 = sub i32 %7, 5
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef)
+ %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef)
+ %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8)
+ %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+ %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+ %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+ %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+ %12 = icmp ne i32 %11, 0
+ br i1 %12, label %vector.body, label %for.cond.cleanup
+
+ for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+ }
+ declare void @llvm.set.loop.iterations.i32(i32) #1
+ declare <4 x i1> @llvm.arm.mve.vctp32(i32) #2
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
+ declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3
+ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4
+ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3
+
+...
+---
+name: vctp_tsubi3
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$r0', virtual-reg: '' }
+ - { reg: '$r1', virtual-reg: '' }
+ - { reg: '$r2', virtual-reg: '' }
+ - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 8
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ ; CHECK-LABEL: name: vctp_tsubi3
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ ; CHECK: t2IT 11, 8, implicit-def $itstate
+ ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3
+ ; CHECK: bb.1.vector.body:
+ ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2
+ ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4)
+ ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4)
+ ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4)
+ ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc
+ bb.0.entry:
+ successors: %bb.1(0x80000000)
+ liveins: $r0, $r1, $r2, $r3, $r7, $lr
+
+ frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ frame-setup CFI_INSTRUCTION offset $lr, -4
+ frame-setup CFI_INSTRUCTION offset $r7, -8
+ tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ t2IT 11, 8, implicit-def $itstate
+ tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
+ renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+ renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ t2DoLoopStart renamable $lr
+
+ bb.1.vector.body:
+ successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ liveins: $lr, $r0, $r1, $r2, $r3
+
+ renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+ MVE_VPST 4, implicit $vpr
+ renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ renamable $r3, dead $cpsr = tSUBi3 killed renamable $r3, 4, 14, $noreg
+ renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ MVE_VPST 8, implicit $vpr
+ renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ renamable $lr = t2LoopDec killed renamable $lr, 1
+ t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+ tB %bb.2, 14, $noreg
+
+ bb.2.for.cond.cleanup:
+ tPOP_RET 14, $noreg, def $r7, def $pc
+
+...
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
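+# As above, but the element count is decremented with t2SUBri. The loop should
+# still be converted to a tail-predicated MVE_DLSTP_32/MVE_LETP loop, with the
+# VCTP and the scalar subtract removed.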
+
+--- |
+ define dso_local void @vctp_tsubi3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+ entry:
+ %cmp8 = icmp sgt i32 %N, 0
+ %0 = add i32 %N, 3
+ %1 = lshr i32 %0, 2
+ %2 = shl nuw i32 %1, 2
+ %3 = add i32 %2, -4
+ %4 = lshr i32 %3, 2
+ %5 = add nuw nsw i32 %4, 1
+ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+ vector.ph: ; preds = %entry
+ call void @llvm.set.loop.iterations.i32(i32 %5)
+ br label %vector.body
+
+ vector.body: ; preds = %vector.body, %vector.ph
+ %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+ %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+ %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+ %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
+ %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+ %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+ %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+ %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
+ %9 = sub i32 %7, 5
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef)
+ %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef)
+ %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8)
+ %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+ %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+ %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+ %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+ %12 = icmp ne i32 %11, 0
+ br i1 %12, label %vector.body, label %for.cond.cleanup
+
+ for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+ }
+ declare void @llvm.set.loop.iterations.i32(i32)
+ declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+
+...
+---
+name: vctp_tsubi3
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$r0', virtual-reg: '' }
+ - { reg: '$r1', virtual-reg: '' }
+ - { reg: '$r2', virtual-reg: '' }
+ - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 8
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ ; CHECK-LABEL: name: vctp_tsubi3
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ ; CHECK: t2IT 11, 8, implicit-def $itstate
+ ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3
+ ; CHECK: bb.1.vector.body:
+ ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2
+ ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4)
+ ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4)
+ ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4)
+ ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc
+ bb.0.entry:
+ successors: %bb.1(0x80000000)
+ liveins: $r0, $r1, $r2, $r3, $r7, $lr
+
+ frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ frame-setup CFI_INSTRUCTION offset $lr, -4
+ frame-setup CFI_INSTRUCTION offset $r7, -8
+ tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ t2IT 11, 8, implicit-def $itstate
+ tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
+ renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+ renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ t2DoLoopStart renamable $lr
+
+ bb.1.vector.body:
+ successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ liveins: $lr, $r0, $r1, $r2, $r3
+
+ renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+ MVE_VPST 4, implicit $vpr
+ renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ renamable $r3 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg
+ renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ MVE_VPST 8, implicit $vpr
+ renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ renamable $lr = t2LoopDec killed renamable $lr, 1
+ t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+ tB %bb.2, 14, $noreg
+
+ bb.2.for.cond.cleanup:
+ tPOP_RET 14, $noreg, def $r7, def $pc
+
+...
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
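+# As above, but the element count is decremented with t2SUBri12. The loop
+# should still be converted to a tail-predicated MVE_DLSTP_32/MVE_LETP loop,
+# with the VCTP and the scalar subtract removed.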
+
+--- |
+ define dso_local void @vctp_tsubi3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+ entry:
+ %cmp8 = icmp sgt i32 %N, 0
+ %0 = add i32 %N, 3
+ %1 = lshr i32 %0, 2
+ %2 = shl nuw i32 %1, 2
+ %3 = add i32 %2, -4
+ %4 = lshr i32 %3, 2
+ %5 = add nuw nsw i32 %4, 1
+ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+ vector.ph: ; preds = %entry
+ call void @llvm.set.loop.iterations.i32(i32 %5)
+ br label %vector.body
+
+ vector.body: ; preds = %vector.body, %vector.ph
+ %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+ %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+ %6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
+ %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
+ %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+ %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+ %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+ %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7)
+ %9 = sub i32 %7, 5
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef)
+ %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef)
+ %10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8)
+ %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+ %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+ %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+ %11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+ %12 = icmp ne i32 %11, 0
+ br i1 %12, label %vector.body, label %for.cond.cleanup
+
+ for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+ }
+ declare void @llvm.set.loop.iterations.i32(i32)
+ declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+
+...
+---
+name: vctp_tsubi3
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$r0', virtual-reg: '' }
+ - { reg: '$r1', virtual-reg: '' }
+ - { reg: '$r2', virtual-reg: '' }
+ - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 8
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ ; CHECK-LABEL: name: vctp_tsubi3
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
+ ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
+ ; CHECK: tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ ; CHECK: t2IT 11, 8, implicit-def $itstate
+ ; CHECK: tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3
+ ; CHECK: bb.1.vector.body:
+ ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK: liveins: $lr, $r0, $r1, $r2
+ ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4)
+ ; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4)
+ ; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1719, align 4)
+ ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
+ ; CHECK: bb.2.for.cond.cleanup:
+ ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc
+ bb.0.entry:
+ successors: %bb.1(0x80000000)
+ liveins: $r0, $r1, $r2, $r3, $r7, $lr
+
+ frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ frame-setup CFI_INSTRUCTION offset $lr, -4
+ frame-setup CFI_INSTRUCTION offset $r7, -8
+ tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
+ t2IT 11, 8, implicit-def $itstate
+ tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
+ renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
+ renamable $lr = t2MOVi 1, 14, $noreg, $noreg
+ renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
+ renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
+ renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
+ t2DoLoopStart renamable $lr
+
+ bb.1.vector.body:
+ successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ liveins: $lr, $r0, $r1, $r2, $r3
+
+ renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+ MVE_VPST 4, implicit $vpr
+ renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ renamable $r3 = t2SUBri12 killed renamable $r3, 4, 14, $noreg
+ renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
+ MVE_VPST 8, implicit $vpr
+ renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4)
+ renamable $lr = t2LoopDec killed renamable $lr, 1
+ t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
+ tB %bb.2, 14, $noreg
+
+ bb.2.for.cond.cleanup:
+ tPOP_RET 14, $noreg, def $r7, def $pc
+
+...
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -16
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -20
; CHECK: renamable $r12 = t2LDRi12 $sp, 44, 14, $noreg :: (load 4 from %fixed-stack.6, align 8)
- ; CHECK: $lr = MVE_WLSTP_32 renamable $r12, %bb.3
+ ; CHECK: $lr = MVE_WLSTP_32 killed renamable $r12, %bb.3
; CHECK: bb.1.for.body.lr.ph:
; CHECK: successors: %bb.2(0x80000000)
- ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
; CHECK: $r7, $r6 = t2LDRDi8 $sp, 36, 14, $noreg :: (load 4 from %fixed-stack.4, align 8), (load 4 from %fixed-stack.5)
; CHECK: $r5, $r4 = t2LDRDi8 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.0, align 8), (load 4 from %fixed-stack.1)
; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r6, 0, $noreg, undef renamable $q0
; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r7, 0, $noreg, undef renamable $q1
; CHECK: bb.2.for.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
- ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r5, $r12
+ ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r5
; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 4, 0, $noreg :: (load 16 from %ir.input_2_cast, align 4)
; CHECK: renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 4, 0, $noreg :: (load 16 from %ir.input_1_cast, align 4)
; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r3, 0, $noreg, undef renamable $q2
; CHECK: renamable $q3 = MVE_VADD_qr_i32 killed renamable $q3, renamable $r2, 0, $noreg, undef renamable $q3
- ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
; CHECK: renamable $q2 = MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r4, 0, $noreg, undef renamable $q2
; CHECK: renamable $q2 = MVE_VMAXu32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2
; CHECK: bb.0.entry:
; CHECK: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7
- ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, $r5, killed $r6, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 20
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -20
; CHECK: renamable $r7 = tLDRspi $sp, 10, 14, $noreg :: (load 4 from %fixed-stack.5)
; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
- ; CHECK: dead $lr = MVE_WLSTP_32 killed renamable $r7, %bb.3
+ ; CHECK: $lr = MVE_WLSTP_32 killed renamable $r7, %bb.3
; CHECK: bb.1.for.body.lr.ph:
; CHECK: successors: %bb.2(0x80000000)
- ; CHECK: liveins: $r0, $r1, $r2, $r3, $r5
- ; CHECK: $r6 = tMOVr killed $r5, 14, $noreg
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
; CHECK: $r5, $r12 = t2LDRDi8 $sp, 32, 14, $noreg :: (load 4 from %fixed-stack.3), (load 4 from %fixed-stack.4, align 8)
; CHECK: renamable $r4 = tLDRspi $sp, 5, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r12, 0, $noreg, undef renamable $q0
; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
; CHECK: bb.2.for.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
- ; CHECK: liveins: $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r6, $r12
+ ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r12
; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 4, 0, $noreg :: (load 16 from %ir.input_2_cast, align 4)
; CHECK: renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 4, 0, $noreg :: (load 16 from %ir.input_1_cast, align 4)
; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r3, 0, $noreg, undef renamable $q2
; CHECK: renamable $q3 = MVE_VADD_qr_i32 killed renamable $q3, renamable $r2, 0, $noreg, undef renamable $q3
- ; CHECK: $lr = tMOVr $r6, 14, $noreg
; CHECK: renamable $q2 = MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
- ; CHECK: renamable $r6, dead $cpsr = tSUBi8 killed $r6, 1, 14, $noreg
; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r4, 0, $noreg, undef renamable $q2
; CHECK: renamable $q2 = MVE_VMAXu32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2
; CHECK: renamable $q3 = MVE_VMINu32 renamable $q2, renamable $q0, 0, $noreg, undef renamable $q3
; CHECK: renamable $r12 = MVE_VMLADAVas32 killed renamable $r12, killed renamable $q3, killed renamable $q2, 0, killed $noreg
- ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.2
+ ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2
; CHECK: bb.3.for.cond.cleanup:
; CHECK: liveins: $r12
; CHECK: $r0 = tMOVr killed $r12, 14, $noreg
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8
- ; CHECK: $lr = MVE_WLSTP_8 renamable $r3, %bb.1
+ ; CHECK: $lr = MVE_WLSTP_8 killed renamable $r3, %bb.1
; CHECK: tB %bb.3, 14, $noreg
; CHECK: bb.1.vector.ph:
; CHECK: successors: %bb.2(0x80000000)
- ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
+ ; CHECK: liveins: $lr, $r0, $r1, $r2
; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
; CHECK: bb.2.vector.body:
; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
- ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12
+ ; CHECK: liveins: $lr, $r0, $r1, $r2, $r12
; CHECK: renamable $r4 = t2ADDrr renamable $r1, renamable $r12, 14, $noreg, $noreg
; CHECK: renamable $q0 = MVE_VLDRBU8 killed renamable $r4, 0, 0, $noreg :: (load 16 from %ir.scevgep45, align 1)
; CHECK: renamable $r4 = t2ADDrr renamable $r2, renamable $r12, 14, $noreg, $noreg
; CHECK: renamable $q1 = MVE_VLDRBU8 killed renamable $r4, 0, 0, $noreg :: (load 16 from %ir.scevgep23, align 1)
; CHECK: renamable $r4 = t2ADDrr renamable $r0, renamable $r12, 14, $noreg, $noreg
; CHECK: renamable $r12 = t2ADDri killed renamable $r12, 16, 14, $noreg, $noreg
- ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg
; CHECK: renamable $q0 = MVE_VMULi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
; CHECK: MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 0, killed $noreg :: (store 16 into %ir.scevgep1, align 1)
; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
- ; CHECK: $lr = MVE_WLSTP_16 renamable $r3, %bb.1
+ ; CHECK: $lr = MVE_WLSTP_16 killed renamable $r3, %bb.1
; CHECK: tB %bb.2, 14, $noreg
; CHECK: bb.1.vector.body:
; CHECK: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
- ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
+ ; CHECK: liveins: $lr, $r0, $r1, $r2
; CHECK: renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv57, align 2)
; CHECK: renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 2)
; CHECK: renamable $q0 = MVE_VMULi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg
; CHECK: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 16, 14, $noreg
; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg
- ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14, $noreg
; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
; CHECK: bb.2.for.cond.cleanup:
; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc