From: Sam Tebbs Date: Mon, 5 Jul 2021 15:08:58 +0000 (+0100) Subject: [ARM][LowOverheadLoops] Make some stack spills valid for tail predication X-Git-Tag: llvmorg-14-init~1339 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ff0ef6a518578231d963c9cdeeae51411efb97d5;p=platform%2Fupstream%2Fllvm.git [ARM][LowOverheadLoops] Make some stack spills valid for tail predication This patch makes vector spills valid for tail predication when all loads from the same stack slot are within the loop Differential Revision: https://reviews.llvm.org/D105443 --- diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index ecdb380..ea41442 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1084,7 +1084,85 @@ bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { return true; } -bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { +static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) { + + auto GetFrameIndex = [](MachineMemOperand *Operand) { + const PseudoSourceValue *PseudoValue = Operand->getPseudoValue(); + if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) { + if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) { + return FS->getFrameIndex(); + } + } + return -1; + }; + + auto IsStackOp = [GetFrameIndex](MachineInstr *I) { + switch (I->getOpcode()) { + case ARM::MVE_VSTRWU32: + case ARM::MVE_VLDRWU32: { + return I->getOperand(1).getReg() == ARM::SP && + I->memoperands().size() == 1 && + GetFrameIndex(I->memoperands().front()) >= 0; + } + default: + return false; + } + }; + + // An unpredicated vector register spill is allowed if all of the uses of the + // stack slot are within the loop + if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI)) + return false; + + // Search all blocks after the loop for accesses to the same stack slot. 
+ // ReachingDefAnalysis doesn't work for sp as it relies on registers being + // live-out (which sp never is) to know what blocks to look in + if (MI->memoperands().size() == 0) + return false; + int FI = GetFrameIndex(MI->memoperands().front()); + + MachineFrameInfo FrameInfo = MI->getParent()->getParent()->getFrameInfo(); + if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI)) + return false; + + SmallVector<MachineBasicBlock *> Frontier; + ML->getExitBlocks(Frontier); + SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()}; + unsigned Idx = 0; + while (Idx < Frontier.size()) { + MachineBasicBlock *BB = Frontier[Idx]; + bool LookAtSuccessors = true; + for (auto &I : *BB) { + if (!IsStackOp(&I) || I.memoperands().size() == 0) + continue; + if (GetFrameIndex(I.memoperands().front()) != FI) + continue; + // If this block has a store to the stack slot before any loads then we + // can ignore the block + if (I.getOpcode() == ARM::MVE_VSTRWU32) { + LookAtSuccessors = false; + break; + } + // If the store and the load are using the same stack slot then the + // store isn't valid for tail predication + if (I.getOpcode() == ARM::MVE_VLDRWU32) + return false; + } + + if (LookAtSuccessors) { + for (auto Succ : BB->successors()) { + if (!Visited.contains(Succ) && !is_contained(Frontier, Succ)) + Frontier.push_back(Succ); + } + } + Visited.insert(BB); + Idx++; + } + + return true; +} + +bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) { if (CannotTailPredicate) return false; @@ -1140,7 +1218,7 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { // If the instruction is already explicitly predicated, then the conversion // will be fine, but ensure that all store operations are predicated. 
- if (MI->mayStore()) + if (MI->mayStore() && !ValidateMVEStore(MI, &ML)) return IsUse; // If this instruction defines the VPR, update the predicate for the diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector_spill_in_loop.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector_spill_in_loop.mir new file mode 100644 index 0000000..8501727 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector_spill_in_loop.mir @@ -0,0 +1,166 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s +--- | + define hidden void @vector_spill_in_loop() { + entry: + ret void + } + + define hidden void @vector_spill_load_outside() { + entry: + ret void + } +... +--- +name: vector_spill_in_loop +tracksRegLiveness: true +stack: + - { id: 0, name: '', type: spill-slot, offset: -120, size: 16, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + ; CHECK-LABEL: name: vector_spill_in_loop + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12 + ; CHECK: $r6 = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK: $r0 = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_16 renamable $r3 + ; CHECK: bb.1: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r10, $r11, $r12 + ; CHECK: renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 0, $noreg + ; CHECK: renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 0, $noreg + ; CHECK: renamable $q5 = MVE_VSHR_immu16 killed renamable $q3, 11, 0, $noreg, undef renamable $q5 + ; CHECK: MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8) + ; CHECK: dead 
renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8) + ; CHECK: dead renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 0, killed $noreg + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + bb.0: + successors: %bb.1(0x80000000) + liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12 + + $r6 = tMOVr $r2, 14 /* CC::al */, $noreg + $r0 = tMOVr $r12, 14 /* CC::al */, $noreg + $r9 = tMOVr $r3, 14 /* CC::al */, $noreg + renamable $lr = t2DoLoopStartTP renamable $r1, renamable $r3 + + bb.1: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12 + + renamable $vpr = MVE_VCTP16 renamable $r9, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr + renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 1, renamable $vpr + MVE_VPST 2, implicit $vpr + renamable $q5 = MVE_VSHR_immu16 renamable $q3, 11, 1, renamable $vpr, undef renamable $q5 + renamable $r9 = nsw t2SUBri killed renamable $r9, 8, 14 /* CC::al */, $noreg, $noreg + MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8) + MVE_VPST 8, implicit $vpr + renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8) + MVE_VPST 1, implicit $vpr + renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 1, killed renamable $vpr + renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2: + successors: %bb.3(0x04000000), %bb.0(0x7c000000) + liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12 + + renamable $r0 = tLDRspi $sp, 1, 14 /* CC::al */, $noreg + renamable $r10 = nuw t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2ADDrs killed renamable $r12, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg + 
renamable $r0 = tLDRspi $sp, 3, 14 /* CC::al */, $noreg + renamable $r2 = t2ADDrs killed renamable $r2, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg + renamable $r0 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg + tCMPhir renamable $r10, killed renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr + tBcc %bb.0, 1 /* CC::ne */, killed $cpsr + + bb.3: + $sp = frame-destroy tADDspi $sp, 24, 14 /* CC::al */, $noreg + $sp = frame-destroy VLDMDIA_UPD $sp, 14 /* CC::al */, $noreg, def $d8, def $d9, def $d10, def $d11, def $d12, def $d13, def $d14, def $d15 + $sp = frame-destroy tADDspi $sp, 1, 14 /* CC::al */, $noreg + $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc +... +--- +name: vector_spill_load_outside +stack: + - { id: 0, name: '', type: spill-slot, offset: -120, size: 16, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: vector_spill_load_outside + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12 + ; CHECK: $r6 = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK: $r0 = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: $r9 = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2DLS renamable $r1 + ; CHECK: bb.1: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12 + ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r9, 0, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr + ; CHECK: renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 1, renamable $vpr + ; CHECK: MVE_VPST 2, implicit $vpr + ; CHECK: renamable $q5 = 
MVE_VSHR_immu16 killed renamable $q3, 11, 1, renamable $vpr, undef renamable $q5 + ; CHECK: renamable $r9 = nsw t2SUBri killed renamable $r9, 8, 14 /* CC::al */, $noreg, $noreg + ; CHECK: MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8) + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: dead renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8) + ; CHECK: MVE_VPST 1, implicit $vpr + ; CHECK: dead renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 1, killed renamable $vpr + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + bb.0: + successors: %bb.1(0x80000000) + liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12 + + $r6 = tMOVr $r2, 14 /* CC::al */, $noreg + $r0 = tMOVr $r12, 14 /* CC::al */, $noreg + $r9 = tMOVr $r3, 14 /* CC::al */, $noreg + renamable $lr = t2DoLoopStartTP renamable $r1, renamable $r3 + + bb.1: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12 + + renamable $vpr = MVE_VCTP16 renamable $r9, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr + renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 1, renamable $vpr + MVE_VPST 2, implicit $vpr + renamable $q5 = MVE_VSHR_immu16 renamable $q3, 11, 1, renamable $vpr, undef renamable $q5 + renamable $r9 = nsw t2SUBri killed renamable $r9, 8, 14 /* CC::al */, $noreg, $noreg + MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8) + MVE_VPST 8, implicit $vpr + renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8) + MVE_VPST 1, implicit $vpr + renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 1, killed renamable $vpr + renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + 
bb.2: + successors: %bb.3(0x04000000), %bb.0(0x7c000000) + liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12 + + renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8) + renamable $r0 = tLDRspi $sp, 1, 14 /* CC::al */, $noreg + renamable $r10 = nuw t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2ADDrs killed renamable $r12, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg + renamable $r0 = tLDRspi $sp, 3, 14 /* CC::al */, $noreg + renamable $r2 = t2ADDrs killed renamable $r2, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg + renamable $r0 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg + tCMPhir renamable $r10, killed renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr + tBcc %bb.0, 1 /* CC::ne */, killed $cpsr + + bb.3: + $sp = frame-destroy tADDspi $sp, 24, 14 /* CC::al */, $noreg + $sp = frame-destroy VLDMDIA_UPD $sp, 14 /* CC::al */, $noreg, def $d8, def $d9, def $d10, def $d11, def $d12, def $d13, def $d14, def $d15 + $sp = frame-destroy tADDspi $sp, 1, 14 /* CC::al */, $noreg + $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc + +...