From 120a5e9a745f931d805d63c6c5313a1aa24d98f5 Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 29 Sep 2019 08:38:48 +0000 Subject: [PATCH] [ARM] Cortex-M4 schedule additions This is an attempt to fill in some of the missing instructions from the Cortex-M4 schedule, and make it easier to do the same for other ARM CPUs. - Some instructions are marked as hasNoSchedulingInfo as they are pseudos or otherwise do not require scheduling info - A lot of features have been marked as not supported - Some WriteRes's have been added for cvt instructions. - Some extra instruction latencies have been added, notably by relaxing the regex for DSP instructions to catch more cases, and for some FP instructions. This goes a long way towards getting the CompleteModel working for this CPU. It does not go far enough to get all scheduling info for all output operands correct. Differential Revision: https://reviews.llvm.org/D67957 llvm-svn: 373163 --- llvm/lib/Target/ARM/ARMInstrFormats.td | 1 + llvm/lib/Target/ARM/ARMInstrInfo.td | 18 ++++---- llvm/lib/Target/ARM/ARMInstrThumb2.td | 3 +- llvm/lib/Target/ARM/ARMInstrVFP.td | 11 +++-- llvm/lib/Target/ARM/ARMScheduleM4.td | 24 ++++++++-- .../CodeGen/ARM/ParallelDSP/complex_dot_prod.ll | 6 +-- .../CodeGen/ARM/ParallelDSP/multi-use-loads.ll | 52 +++++++++++----------- .../CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll | 1 - 8 files changed, 69 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td index 53af05c..dd14ebd 100644 --- a/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -465,6 +465,7 @@ class AsmPseudoInst let isCodeGenOnly = 0; // So we get asm matcher for it. 
let AsmString = asm; let isPseudo = 1; + let hasNoSchedulingInfo = 1; } class ARMAsmPseudo diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 766c87e..e25260d 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -1967,7 +1967,7 @@ multiclass AI_str1nopc; @@ -4895,14 +4895,13 @@ def SB : AInoP<(outs), (ins), MiscFrm, NoItinerary, "sb", "", []>, let hasSideEffects = 1; } -let usesCustomInserter = 1, Defs = [CPSR] in { - -// Pseudo instruction that combines movs + predicated rsbmi -// to implement integer ABS +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { + // Pseudo instruction that combines movs + predicated rsbmi + // to implement integer ABS def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>; } -let usesCustomInserter = 1, Defs = [CPSR] in { +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { def COPY_STRUCT_BYVAL_I32 : PseudoInst< (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment), NoItinerary, @@ -5604,12 +5603,12 @@ def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn), def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone, [SDNPHasChain, SDNPSideEffect]>; -let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in +let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP], hasNoSchedulingInfo = 1 in def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>; def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK, [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; -let usesCustomInserter = 1, Defs = [CPSR] in +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary, [(win__dbzchk tGPR:$divisor)]>; @@ -6156,7 +6155,7 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm", def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>, 
ComplexDeprecationPredicate<"IT">; -let mayLoad = 1, mayStore =1, hasSideEffects = 1 in +let mayLoad = 1, mayStore =1, hasSideEffects = 1, hasNoSchedulingInfo = 1 in def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn), NoItinerary, [(set GPR:$Rd, (int_arm_space timm:$size, GPR:$Rn))]>; @@ -6199,4 +6198,5 @@ def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary, let hasSideEffects = 1; let Size = 0; let AsmString = "@ COMPILER BARRIER"; + let hasNoSchedulingInfo = 1; } diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index b165b85..ef5d090 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -4086,7 +4086,7 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp), // Pseudo isntruction that combines movs + predicated rsbmi // to implement integer ABS -let usesCustomInserter = 1, Defs = [CPSR] in { +let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src), NoItinerary, []>, Requires<[IsThumb2]>; } @@ -5088,6 +5088,7 @@ def t2BF_LabelPseudo : t2PseudoInst<(outs ), (ins pclabel:$cp), 0, NoItinerary, []> { let isTerminator = 1; let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB]; + let hasNoSchedulingInfo = 1; } def t2BFi : t2BF<(ins bflabel_u4:$b_label, bflabel_s16:$label, pred:$p), diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 4a95630..d3380ab 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -324,7 +324,7 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r", // However, there is no UAL syntax for them, so we keep them around for // (dis)assembly only. 
multiclass vfp_ldstx_mult { - let Predicates = [HasFPRegs] in { + let Predicates = [HasFPRegs], hasNoSchedulingInfo = 1 in { // Unknown precision def XIA : AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), @@ -1732,7 +1732,8 @@ def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0, def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), - IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> { + IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []>, + Sched<[WriteFPCVT]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -1740,7 +1741,8 @@ def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0, def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), - IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> { + IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []>, + Sched<[WriteFPCVT]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -1748,7 +1750,8 @@ def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1, def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), - IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> { + IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []>, + Sched<[WriteFPCVT]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
let D = VFPNeonA8Domain; diff --git a/llvm/lib/Target/ARM/ARMScheduleM4.td b/llvm/lib/Target/ARM/ARMScheduleM4.td index 38c8ea2..bfa5fc0 100644 --- a/llvm/lib/Target/ARM/ARMScheduleM4.td +++ b/llvm/lib/Target/ARM/ARMScheduleM4.td @@ -18,6 +18,9 @@ def CortexM4Model : SchedMachineModel { let PostRAScheduler = 1; let CompleteModel = 0; + let UnsupportedFeatures = [IsARM, HasNEON, HasDotProd, HasZCZ, HasMVEInt, + IsNotMClass, HasDPVFP, HasFPARMv8, HasFullFP16, Has8MSecExt, HasV8, + HasV8_3a, HasTrustZone, HasDFB, IsWindows]; } @@ -50,6 +53,7 @@ def : M4UnitL2; def : M4UnitL2; def : M4UnitL2I<(instregex "(t|t2)LDM")>; +def : M4UnitL2I<(instregex "(t|t2)LDR")>; // Stores we use a latency of 1 as they have no outputs @@ -78,9 +82,20 @@ def : M4UnitL1; def : M4UnitL1; def : M4UnitL1I<(instregex "(t|t2)MOV")>; def : M4UnitL1I<(instrs COPY)>; -def : M4UnitL1I<(instregex "t2IT")>; -def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", - "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)", "t2USADA8", "(t|t2)REV")>; +def : M4UnitL1I<(instregex "t2IT", "t2MSR", "t2MRS")>; +def : M4UnitL1I<(instregex "t2CLREX")>; +def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", "t2SML[AS]", + "t2(S|Q|SH|U|UQ|UH|QD)(ADD|ASX|SAX|SUB)", "t2USADA8", "(t|t2)REV")>; + +// These instructions are not of much interest to scheduling as they will not +// be generated or it is not very useful to schedule them. They are here to make +// the model more complete. 
+def : M4UnitL1I<(instregex "t2CDP", "t2LDC", "t2MCR", "t2MRC", "t2MRRC", "t2STC")>; +def : M4UnitL1I<(instregex "tCPS", "t2ISB", "t2DSB", "t2DMB", "t2?HINT$")>; +def : M4UnitL1I<(instregex "t2?UDF$", "tBKPT", "t2DBG")>; +def : M4UnitL1I<(instregex "t?2?Int_eh_sjlj_", "tADDframe", "t?ADJCALL")>; +def : M4UnitL1I<(instregex "CMP_SWAP", "JUMPTABLE", "MEMCPY")>; +def : M4UnitL1I<(instregex "VSETLNi32", "VGETLNi32")>; def : ReadAdvance; def : ReadAdvance; @@ -112,6 +127,9 @@ def : M4UnitL1; def : M4UnitL1; def : M4UnitL1; def : M4UnitL1; +def : M4UnitL1I<(instregex "VMOVS", "FCONSTS", "VCMP", "VNEG", "VABS")>; +def : M4UnitL2I<(instregex "VMOVD")>; +def : M4UnitL1I<(instregex "VMRS", "VMSR", "FMSTAT")>; def : ReadAdvance; def : ReadAdvance; diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll b/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll index 669972c..68702b7 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll @@ -8,13 +8,13 @@ ; CHECK: smultt ; CHECK: smlalbb ; CHECK: smultt -; CHECK: smlaldx ; CHECK: smlalbb -; CHECK: smlaldx ; CHECK: smultt ; CHECK: smlalbb -; CHECK: smlaldx ; CHECK: smultt +; CHECK: smlaldx +; CHECK: smlaldx +; CHECK: smlaldx ; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} define dso_local arm_aapcscc void @complex_dot_prod(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i32* nocapture %realResult, i32* nocapture %imagResult) { entry: diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll index 66170aa..37e39a0 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll @@ -19,8 +19,8 @@ define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture rea ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-LE-NEXT: ldr lr, [r3, #2]! ; CHECK-LE-NEXT: ldr r4, [r2, #2]! 
-; CHECK-LE-NEXT: subs r0, #1 ; CHECK-LE-NEXT: sxtah r1, r1, lr +; CHECK-LE-NEXT: subs r0, #1 ; CHECK-LE-NEXT: smlad r12, r4, lr, r12 ; CHECK-LE-NEXT: bne .LBB0_2 ; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup @@ -47,13 +47,13 @@ define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture rea ; CHECK-BE-NEXT: .LBB0_2: @ %for.body ; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ldrsh lr, [r3, #2]! -; CHECK-BE-NEXT: ldrsh r4, [r2, #2]! -; CHECK-BE-NEXT: ldrsh.w r5, [r3, #2] +; CHECK-BE-NEXT: ldrsh r5, [r2, #2]! +; CHECK-BE-NEXT: ldrsh.w r4, [r3, #2] ; CHECK-BE-NEXT: ldrsh.w r6, [r2, #2] -; CHECK-BE-NEXT: smlabb r4, r4, lr, r12 -; CHECK-BE-NEXT: subs r0, #1 -; CHECK-BE-NEXT: smlabb r12, r6, r5, r4 +; CHECK-BE-NEXT: smlabb r5, r5, lr, r12 ; CHECK-BE-NEXT: add r1, lr +; CHECK-BE-NEXT: subs r0, #1 +; CHECK-BE-NEXT: smlabb r12, r6, r4, r5 ; CHECK-BE-NEXT: bne .LBB0_2 ; CHECK-BE-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-BE-NEXT: add.w r0, r12, r1 @@ -154,8 +154,8 @@ define i32 @mul_bottom_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapt ; CHECK-BE-NEXT: ldrsh.w r5, [r3, #2] ; CHECK-BE-NEXT: ldrsh.w r6, [r2, #2] ; CHECK-BE-NEXT: smlabb r4, r4, lr, r12 -; CHECK-BE-NEXT: subs r0, #1 ; CHECK-BE-NEXT: smlabb r12, r6, r5, r4 +; CHECK-BE-NEXT: subs r0, #1 ; CHECK-BE-NEXT: mul r1, lr, r1 ; CHECK-BE-NEXT: bne .LBB1_2 ; CHECK-BE-NEXT: @ %bb.3: @ %for.cond.cleanup @@ -215,17 +215,17 @@ define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture ; CHECK-LE-NEXT: cmp r0, #1 ; CHECK-LE-NEXT: blt .LBB2_4 ; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-LE-NEXT: subs r2, #2 +; CHECK-LE-NEXT: sub.w lr, r2, #2 ; CHECK-LE-NEXT: subs r3, #2 ; CHECK-LE-NEXT: mov.w r12, #0 ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: .p2align 2 ; CHECK-LE-NEXT: .LBB2_2: @ %for.body ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-LE-NEXT: ldr r4, [r2, #2]! -; CHECK-LE-NEXT: ldr lr, [r3, #2]! 
-; CHECK-LE-NEXT: asrs r5, r4, #16 -; CHECK-LE-NEXT: smlad r12, r4, lr, r12 +; CHECK-LE-NEXT: ldr r2, [lr, #2]! +; CHECK-LE-NEXT: ldr r4, [r3, #2]! +; CHECK-LE-NEXT: asrs r5, r2, #16 +; CHECK-LE-NEXT: smlad r12, r2, r4, r12 ; CHECK-LE-NEXT: subs r0, #1 ; CHECK-LE-NEXT: mul r1, r5, r1 ; CHECK-LE-NEXT: bne .LBB2_2 @@ -257,8 +257,8 @@ define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture ; CHECK-BE-NEXT: ldrsh.w r5, [r3, #2] ; CHECK-BE-NEXT: ldrsh.w r6, [r2, #2] ; CHECK-BE-NEXT: smlabb r4, r4, lr, r12 -; CHECK-BE-NEXT: subs r0, #1 ; CHECK-BE-NEXT: smlabb r12, r6, r5, r4 +; CHECK-BE-NEXT: subs r0, #1 ; CHECK-BE-NEXT: mul r1, r6, r1 ; CHECK-BE-NEXT: bne .LBB2_2 ; CHECK-BE-NEXT: @ %bb.3: @ %for.cond.cleanup @@ -343,8 +343,8 @@ define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture rea ; ; CHECK-BE-LABEL: and_user: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .save {r4, r5, r6, lr} -; CHECK-BE-NEXT: push {r4, r5, r6, lr} +; CHECK-BE-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-BE-NEXT: cmp r0, #1 ; CHECK-BE-NEXT: blt .LBB3_4 ; CHECK-BE-NEXT: @ %bb.1: @ %for.body.preheader @@ -356,23 +356,23 @@ define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture rea ; CHECK-BE-NEXT: .LBB3_2: @ %for.body ; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ldrsh lr, [r3, #2]! -; CHECK-BE-NEXT: ldrsh r4, [r2, #2]! -; CHECK-BE-NEXT: ldrsh.w r5, [r3, #2] -; CHECK-BE-NEXT: ldrsh.w r6, [r2, #2] -; CHECK-BE-NEXT: smlabb r4, r4, lr, r12 -; CHECK-BE-NEXT: uxth.w lr, lr -; CHECK-BE-NEXT: smlabb r12, r6, r5, r4 +; CHECK-BE-NEXT: ldrsh r5, [r2, #2]! 
+; CHECK-BE-NEXT: ldrsh.w r4, [r3, #2] +; CHECK-BE-NEXT: ldrsh.w r7, [r2, #2] +; CHECK-BE-NEXT: uxth.w r6, lr +; CHECK-BE-NEXT: smlabb r5, r5, lr, r12 +; CHECK-BE-NEXT: smlabb r12, r7, r4, r5 ; CHECK-BE-NEXT: subs r0, #1 -; CHECK-BE-NEXT: mul r1, lr, r1 +; CHECK-BE-NEXT: mul r1, r6, r1 ; CHECK-BE-NEXT: bne .LBB3_2 ; CHECK-BE-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-BE-NEXT: add.w r0, r12, r1 -; CHECK-BE-NEXT: pop {r4, r5, r6, pc} +; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-BE-NEXT: .LBB3_4: ; CHECK-BE-NEXT: mov.w r12, #0 ; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: add.w r0, r12, r1 -; CHECK-BE-NEXT: pop {r4, r5, r6, pc} +; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc} entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -466,10 +466,10 @@ define i32 @multi_uses(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture r ; CHECK-BE-NEXT: ldrsh.w r5, [r3, #2] ; CHECK-BE-NEXT: ldrsh.w r6, [r2, #2] ; CHECK-BE-NEXT: smlabb r4, r4, r1, r12 -; CHECK-BE-NEXT: subs r0, #1 ; CHECK-BE-NEXT: smlabb r12, r6, r5, r4 ; CHECK-BE-NEXT: eor.w r6, r1, lr -; CHECK-BE-NEXT: mul r1, r6, r1 +; CHECK-BE-NEXT: muls r1, r6, r1 +; CHECK-BE-NEXT: subs r0, #1 ; CHECK-BE-NEXT: lsl.w lr, r1, #16 ; CHECK-BE-NEXT: bne .LBB4_2 ; CHECK-BE-NEXT: @ %bb.3: @ %for.cond.cleanup diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll b/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll index 83418dc..c72d458 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll @@ -46,7 +46,6 @@ entry: ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp -; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: bne .LBB0_1 for.body: -- 2.7.4