From b9887ef32a5d06108dfabbbe181bd8e4ea7abbfe Mon Sep 17 00:00:00 2001
From: Renato Golin
Date: Wed, 25 Feb 2015 14:41:06 +0000
Subject: [PATCH] Improve handling of stack accesses in Thumb-1

Thumb-1 only allows SP-based LDR and STR to be word-sized, and SP-based
LDR, STR, and ADD only allow offsets that are a multiple of 4. Make some
changes to better make use of these instructions:

* Use word loads for anyext byte and halfword loads from the stack.
* Enforce 4-byte alignment on objects accessed in this way, to ensure
  that the offset is valid.
* Do the same for objects whose frame index is used, in order to avoid
  having to use more than one ADD to generate the frame index.
* Correct how many bits of offset we think AddrModeT1_s has.

Patch by John Brawn.

llvm-svn: 230496
---
 llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp      |  7 +-
 llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp          | 15 ++++
 llvm/lib/Target/ARM/ARMInstrThumb.td             | 11 +++
 llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp    | 36 ++++++---
 .../CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll | 91 ++++++++++------------
 llvm/test/CodeGen/ARM/atomic-ops-v8.ll           |  2 +-
 llvm/test/CodeGen/ARM/debug-frame-vararg.ll      | 14 ++--
 llvm/test/CodeGen/ARM/frame-register.ll          |  6 +-
 llvm/test/CodeGen/ARM/thumb1-varalloc.ll         | 32 ++------
 llvm/test/CodeGen/ARM/thumb1_return_sequence.ll  | 48 ++++--------
 llvm/test/CodeGen/Thumb/stack-access.ll          | 74 ++++++++++++++++++
 llvm/test/CodeGen/Thumb/stm-merge.ll             |  9 ++-
 llvm/test/CodeGen/Thumb/vargs.ll                 | 16 ++--
 13 files changed, 223 insertions(+), 138 deletions(-)
 create mode 100644 llvm/test/CodeGen/Thumb/stack-access.ll

diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index b3fd403..7574727 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -546,12 +546,13 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   // and pick a real one.
   Offset += 128; // 128 bytes of spill slots

-  // If there is a frame pointer, try using it.
+  // If there's a frame pointer and the addressing mode allows it, try using it.
   // The FP is only available if there is no dynamic realignment. We
   // don't know for sure yet whether we'll need that, so we guess based
   // on whether there are any local variables that would trigger it.
   unsigned StackAlign = TFI->getStackAlignment();
-  if (TFI->hasFP(MF) &&
+  if (TFI->hasFP(MF) &&
+      (MI->getDesc().TSFlags & ARMII::AddrModeMask) != ARMII::AddrModeT1_s &&
       !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
     if (isFrameOffsetLegal(MI, FPOffset))
       return false;
@@ -668,7 +669,7 @@ bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
     NumBits = 8;
     break;
   case ARMII::AddrModeT1_s:
-    NumBits = 5;
+    NumBits = 8;
     Scale = 4;
     isSigned = false;
     break;
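The AddrModeT1_s change above is the key correctness fix: tLDRspi/tSTRspi
encode an unsigned 8-bit immediate scaled by 4, so legal SP-relative offsets
run from 0 to 1020, not 0 to 124 as the old NumBits = 5 implied. A minimal
sketch of the resulting legality check (illustrative only; the helper name is
made up, and the real logic lives in isFrameOffsetLegal):

    #include <cstdint>

    // Is Offset encodable under AddrModeT1_s after this patch?
    // The field is unsigned and word-scaled: 0, 4, 8, ..., 1020.
    bool isLegalT1sSPOffset(int64_t Offset) {
      const unsigned NumBits = 8, Scale = 4;     // values set in isFrameOffsetLegal
      if (Offset < 0 || Offset % Scale != 0)     // unsigned, multiple of 4
        return false;
      return (Offset / Scale) < (1u << NumBits); // max 255 * 4 = 1020
    }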
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 77464bd..6ebf640 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1197,6 +1197,11 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
                                             SDValue &Base, SDValue &OffImm) {
   if (N.getOpcode() == ISD::FrameIndex) {
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    // Only multiples of 4 are allowed for the offset, so the frame object
+    // alignment must be at least 4.
+    MachineFrameInfo *MFI = MF->getFrameInfo();
+    if (MFI->getObjectAlignment(FI) < 4)
+      MFI->setObjectAlignment(FI, 4);
     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     OffImm = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
@@ -1214,6 +1219,11 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
     Base = N.getOperand(0);
     if (Base.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      // For LHS+RHS to result in an offset that's a multiple of 4 the object
+      // indexed by the LHS must be 4-byte aligned.
+      MachineFrameInfo *MFI = MF->getFrameInfo();
+      if (MFI->getObjectAlignment(FI) < 4)
+        MFI->setObjectAlignment(FI, 4);
       Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     }
     OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
@@ -2502,6 +2512,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
     SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     if (Subtarget->isThumb1Only()) {
+      // Set the alignment of the frame object to 4, to avoid having to
+      // generate more than one ADD.
+      MachineFrameInfo *MFI = MF->getFrameInfo();
+      if (MFI->getObjectAlignment(FI) < 4)
+        MFI->setObjectAlignment(FI, 4);
       return CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
                                   CurDAG->getTargetConstant(0, MVT::i32));
     } else {
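For context on why the frame-object alignment is forced to 4 in all three
places: the Thumb-1 SP-relative load stores its offset in words, so an object
at, say, sp+6 cannot be addressed in a single instruction. A hypothetical
encoder makes the constraint concrete (a sketch, not LLVM's MC layer):

    #include <cassert>

    // Thumb-1 "ldr Rt, [sp, #imm]" (encoding T1): 0b10011'ttt'iiiiiiii,
    // where the byte offset is stored as imm8 in units of words.
    unsigned encodeLdrSpImm(unsigned Rt, unsigned ByteOff) {
      assert(Rt < 8 && "only low registers can be encoded");
      assert(ByteOff % 4 == 0 && ByteOff <= 1020 && "offset must be imm8 * 4");
      return 0x9800u | (Rt << 8) | (ByteOff / 4);
    }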
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index cc953c6..3c62e0e 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1375,6 +1375,17 @@ def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr),
 def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
             (tLDRBi t_addrmode_is1:$addr)>;

+// extload from the stack -> word load from the stack, as it avoids having to
+// materialize the base in a separate register. This only works when a word
+// load puts the byte/halfword value in the same place in the register that the
+// byte/halfword load would, i.e. when little-endian.
+def : T1Pat<(extloadi1 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+      Requires<[IsThumb, IsThumb1Only, IsLE]>;
+def : T1Pat<(extloadi8 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+      Requires<[IsThumb, IsThumb1Only, IsLE]>;
+def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+      Requires<[IsThumb, IsThumb1Only, IsLE]>;
+
 // extload -> zextload
 def : T1Pat<(extloadi1 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
 def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
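The IsLE predicate is what keeps these patterns sound: on a little-endian
target the low byte/halfword of a word lives at the word's lowest address, so
the widened word load leaves the value in exactly the bits an anyext
byte/halfword load is allowed to produce. A small host-side illustration of
the same fact (assumes a little-endian host; the helper name is invented):

    #include <cstdint>
    #include <cstring>

    // On little-endian, the byte written at the start of a 4-byte slot is
    // bits [7:0] of a word load of that slot - where ldrb would put it.
    uint8_t lowByteViaWordLoad(const void *Slot) {
      uint32_t Word;
      std::memcpy(&Word, Slot, 4);       // the widened "ldr"
      return static_cast<uint8_t>(Word); // same result as "ldrb" (LE only)
    }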
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index bd40658..a8d0981 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -170,7 +170,8 @@ static int getMemoryOpOffset(const MachineInstr *MI) {
     return OffField;

   // Thumb1 immediate offsets are scaled by 4
-  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi)
+  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi ||
+      Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
     return OffField * 4;

   int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
@@ -206,6 +207,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
     case ARM_AM::ib: return ARM::STMIB;
     }
   case ARM::tLDRi:
+  case ARM::tLDRspi:
     // tLDMIA is writeback-only - unless the base register is in the input
     // reglist.
     ++NumLDMGened;
@@ -214,6 +216,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
     case ARM_AM::ia: return ARM::tLDMIA;
     }
   case ARM::tSTRi:
+  case ARM::tSTRspi:
     // There is no non-writeback tSTMIA either.
     ++NumSTMGened;
     switch (Mode) {
@@ -328,7 +331,7 @@ AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
 } // end namespace llvm

 static bool isT1i32Load(unsigned Opc) {
-  return Opc == ARM::tLDRi;
+  return Opc == ARM::tLDRi || Opc == ARM::tLDRspi;
 }

 static bool isT2i32Load(unsigned Opc) {
@@ -340,7 +343,7 @@ static bool isi32Load(unsigned Opc) {
 }

 static bool isT1i32Store(unsigned Opc) {
-  return Opc == ARM::tSTRi;
+  return Opc == ARM::tSTRi || Opc == ARM::tSTRspi;
 }

 static bool isT2i32Store(unsigned Opc) {
@@ -356,6 +359,8 @@ static unsigned getImmScale(unsigned Opc) {
   default: llvm_unreachable("Unhandled opcode!");
   case ARM::tLDRi:
   case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
     return 1;
   case ARM::tLDRHi:
   case ARM::tSTRHi:
@@ -495,6 +500,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   if (isThumb1)
     for (unsigned I = 0; I < NumRegs; ++I)
       if (Base == Regs[I].first) {
+        assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
         if (Opcode == ARM::tLDRi) {
           Writeback = false;
           break;
@@ -515,7 +521,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
     // VLDM/VSTM do not support DB mode without also updating the base reg.
     Mode = ARM_AM::db;
-  } else if (Offset != 0) {
+  } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
     // Check if this is a supported opcode before inserting instructions to
     // calculate a new base register.
     if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;
@@ -545,6 +551,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,

     int BaseOpc =
       isThumb2 ? ARM::t2ADDri :
+      (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi :
       (isThumb1 && Offset < 8) ? ARM::tADDi3 :
       isThumb1 ? ARM::tADDi8 : ARM::ADDri;

@@ -552,7 +559,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
       Offset = - Offset;
       BaseOpc =
         isThumb2 ? ARM::t2SUBri :
-        (isThumb1 && Offset < 8) ? ARM::tSUBi3 :
+        (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 :
         isThumb1 ? ARM::tSUBi8 : ARM::SUBri;
     }

@@ -566,7 +573,8 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
     // or
     //   MOV  NewBase, Base
     //   ADDS NewBase, #imm8.
-    if (Base != NewBase && Offset >= 8) {
+    if (Base != NewBase &&
+        (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) {
       // Need to insert a MOV to the new base first.
       if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
           !STI->hasV6Ops()) {
@@ -584,9 +592,15 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
       Base = NewBase;
       BaseKill = false;
     }
-    AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase), true)
-      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
-      .addImm(Pred).addReg(PredReg);
+    if (BaseOpc == ARM::tADDrSPi) {
+      assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
+      BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
+        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset/4)
+        .addImm(Pred).addReg(PredReg);
+    } else
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase), true)
+        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+        .addImm(Pred).addReg(PredReg);
   } else {
     BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
       .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
@@ -967,6 +981,8 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
   case ARM::STRi12:
   case ARM::tLDRi:
   case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
   case ARM::t2STRi8:
@@ -1402,6 +1418,8 @@ static bool isMemoryOp(const MachineInstr *MI) {
   case ARM::STRi12:
   case ARM::tLDRi:
   case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
   case ARM::t2STRi8:
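The base-register logic above is easier to follow as a decision table. When
the merged LDM/STM needs a new base, the add is materialized roughly as
follows (positive offsets only; a simplified paraphrase of the code, not a
helper that exists in LLVM):

    // Sketch: which Thumb-1 add materializes NewBase = Base + Offset.
    // "BaseIsSP" stands for Base == ARM::SP in the real code.
    enum AddOpc { tADDrSPi, tADDi3, tADDi8 };
    AddOpc pickThumb1AddOpc(bool BaseIsSP, int Offset) {
      if (BaseIsSP)
        return tADDrSPi; // immediate is encoded as Offset / 4
      if (Offset < 8)
        return tADDi3;   // 3-bit immediate, a single instruction
      return tADDi8;     // 8-bit immediate, needs MOV NewBase, Base first
    }

SP can never appear in a Thumb-1 register list, which is why SP-based
tLDRspi/tSTRspi merges always take the new-base path, even when the offset
is zero.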
diff --git a/llvm/test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll b/llvm/test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll
index f3cc3d8..de2dead 100644
--- a/llvm/test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll
+++ b/llvm/test/CodeGen/ARM/2015-01-21-thumbv4t-ldstr-opt.ll
@@ -1,55 +1,48 @@
 ; RUN: llc -mtriple=thumbv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V4T
 ; RUN: llc -mtriple=thumbv6m-none--eabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V6M

-; CHECK-LABEL: foo
-define i32 @foo(i32 %z, ...) #0 {
-entry:
-  %a = alloca i32, align 4
-  %b = alloca i32, align 4
-  %c = alloca i32, align 4
-  %d = alloca i32, align 4
-  %e = alloca i32, align 4
-  %f = alloca i32, align 4
-  %g = alloca i32, align 4
-  %h = alloca i32, align 4
-
-  store i32 1, i32* %a, align 4
-  store i32 2, i32* %b, align 4
-  store i32 3, i32* %c, align 4
-  store i32 4, i32* %d, align 4
-  store i32 5, i32* %e, align 4
-  store i32 6, i32* %f, align 4
-  store i32 7, i32* %g, align 4
-  store i32 8, i32* %h, align 4
-
-  %0 = load i32* %a, align 4
-  %1 = load i32* %b, align 4
-  %2 = load i32* %c, align 4
-  %3 = load i32* %d, align 4
-  %4 = load i32* %e, align 4
-  %5 = load i32* %f, align 4
-  %6 = load i32* %g, align 4
-  %7 = load i32* %h, align 4
-
-  %add = add nsw i32 %0, %1
-  %add4 = add nsw i32 %add, %2
-  %add5 = add nsw i32 %add4, %3
-  %add6 = add nsw i32 %add5, %4
-  %add7 = add nsw i32 %add6, %5
-  %add8 = add nsw i32 %add7, %6
-  %add9 = add nsw i32 %add8, %7
-
-  %addz = add nsw i32 %add9, %z
-  call void @llvm.va_start(i8* null)
-  ret i32 %addz
-
-; CHECK: sub sp, #40
-; CHECK-NEXT: add [[BASE:r[0-9]]], sp, #8
-
-; CHECK-V4T: movs [[NEWBASE:r[0-9]]], [[BASE]]
-; CHECK-V6M: mov [[NEWBASE:r[0-9]]], [[BASE]]
-; CHECK-NEXT: adds [[NEWBASE]], #8
+; CHECK-LABEL: test1
+define i32 @test1(i32* %p) {
+
+; Offsets less than 8 can be generated in a single add
+; CHECK: adds [[NEWBASE:r[0-9]]], r0, #4
+  %1 = getelementptr inbounds i32* %p, i32 1
+  %2 = getelementptr inbounds i32* %p, i32 2
+  %3 = getelementptr inbounds i32* %p, i32 3
+  %4 = getelementptr inbounds i32* %p, i32 4
+
+; CHECK-NEXT: ldm [[NEWBASE]],
+  %5 = load i32* %1, align 4
+  %6 = load i32* %2, align 4
+  %7 = load i32* %3, align 4
+  %8 = load i32* %4, align 4
+
+  %9 = add nsw i32 %5, %6
+  %10 = add nsw i32 %9, %7
+  %11 = add nsw i32 %10, %8
+  ret i32 %11
 }

-declare void @llvm.va_start(i8*) nounwind
+; CHECK-LABEL: test2
+define i32 @test2(i32* %p) {
+
+; Offsets >=8 require a mov and an add
+; CHECK-V4T: movs [[NEWBASE:r[0-9]]], r0
+; CHECK-V6M: mov [[NEWBASE:r[0-9]]], r0
+; CHECK-NEXT: adds [[NEWBASE]], #8
+  %1 = getelementptr inbounds i32* %p, i32 2
+  %2 = getelementptr inbounds i32* %p, i32 3
+  %3 = getelementptr inbounds i32* %p, i32 4
+  %4 = getelementptr inbounds i32* %p, i32 5
+
+; CHECK-NEXT: ldm [[NEWBASE]],
+  %5 = load i32* %1, align 4
+  %6 = load i32* %2, align 4
+  %7 = load i32* %3, align 4
+  %8 = load i32* %4, align 4
+
+  %9 = add nsw i32 %5, %6
+  %10 = add nsw i32 %9, %7
+  %11 = add nsw i32 %10, %8
+  ret i32 %11
+}
diff --git a/llvm/test/CodeGen/ARM/atomic-ops-v8.ll b/llvm/test/CodeGen/ARM/atomic-ops-v8.ll
index 7072aaa..6ba1352 100644
--- a/llvm/test/CodeGen/ARM/atomic-ops-v8.ll
+++ b/llvm/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -1296,7 +1296,7 @@ define void @test_atomic_store_monotonic_regoff_i8(i64 %base, i64 %off, i8 %val)
   %addr = inttoptr i64 %addr_int to i8*
   store atomic i8 %val, i8* %addr monotonic, align 1

-; CHECK-LE: ldrb{{(\.w)?}} [[VAL:r[0-9]+]], [sp]
+; CHECK-LE: ldr{{b?(\.w)?}} [[VAL:r[0-9]+]], [sp]
 ; CHECK-LE: strb [[VAL]], [r0, r2]
 ; CHECK-BE: ldrb{{(\.w)?}} [[VAL:r[0-9]+]], [sp, #3]
 ; CHECK-BE: strb [[VAL]], [r1, r3]
diff --git a/llvm/test/CodeGen/ARM/debug-frame-vararg.ll b/llvm/test/CodeGen/ARM/debug-frame-vararg.ll
index 05521d8..65be2db 100644
--- a/llvm/test/CodeGen/ARM/debug-frame-vararg.ll
+++ b/llvm/test/CodeGen/ARM/debug-frame-vararg.ll
@@ -88,24 +88,22 @@
 ; CHECK-THUMB-FP: .cfi_startproc
 ; CHECK-THUMB-FP: sub sp, #16
 ; CHECK-THUMB-FP: .cfi_def_cfa_offset 16
-; CHECK-THUMB-FP: push {r4, r5, r7, lr}
-; CHECK-THUMB-FP: .cfi_def_cfa_offset 32
+; CHECK-THUMB-FP: push {r4, lr}
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 24
 ; CHECK-THUMB-FP: .cfi_offset lr, -20
-; CHECK-THUMB-FP: .cfi_offset r7, -24
-; CHECK-THUMB-FP: .cfi_offset r5, -28
-; CHECK-THUMB-FP: .cfi_offset r4, -32
+; CHECK-THUMB-FP: .cfi_offset r4, -24
 ; CHECK-THUMB-FP: sub sp, #8
-; CHECK-THUMB-FP: .cfi_def_cfa_offset 40
+; CHECK-THUMB-FP: .cfi_def_cfa_offset 32

 ; CHECK-THUMB-FP-ELIM-LABEL: sum
 ; CHECK-THUMB-FP-ELIM: .cfi_startproc
 ; CHECK-THUMB-FP-ELIM: sub sp, #16
 ; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 16
-; CHECK-THUMB-FP-ELIM: push {r4, r5, r7, lr}
+; CHECK-THUMB-FP-ELIM: push {r4, r6, r7, lr}
 ; CHECK-THUMB-FP-ELIM: .cfi_def_cfa_offset 32
 ; CHECK-THUMB-FP-ELIM: .cfi_offset lr, -20
 ; CHECK-THUMB-FP-ELIM: .cfi_offset r7, -24
-; CHECK-THUMB-FP-ELIM: .cfi_offset r5, -28
+; CHECK-THUMB-FP-ELIM: .cfi_offset r6, -28
 ; CHECK-THUMB-FP-ELIM: .cfi_offset r4, -32
 ; CHECK-THUMB-FP-ELIM: add r7, sp, #8
 ; CHECK-THUMB-FP-ELIM: .cfi_def_cfa r7, 24
diff --git a/llvm/test/CodeGen/ARM/frame-register.ll b/llvm/test/CodeGen/ARM/frame-register.ll
index e6a55bd..b04e376 100644
--- a/llvm/test/CodeGen/ARM/frame-register.ll
+++ b/llvm/test/CodeGen/ARM/frame-register.ll
@@ -30,9 +30,9 @@ entry:
 ; CHECK-ARM: push {r11, lr}
 ; CHECK-ARM: mov r11, sp

-; CHECK-THUMB: push {r4, r6, r7, lr}
-; CHECK-THUMB: add r7, sp, #8
+; CHECK-THUMB: push {r7, lr}
+; CHECK-THUMB: add r7, sp, #0

 ; CHECK-DARWIN-ARM: push {r7, lr}
-; CHECK-DARWIN-THUMB: push {r4, r7, lr}
+; CHECK-DARWIN-THUMB: push {r7, lr}
diff --git a/llvm/test/CodeGen/ARM/thumb1-varalloc.ll b/llvm/test/CodeGen/ARM/thumb1-varalloc.ll
index 8d5888d..82c4ad5 100644
--- a/llvm/test/CodeGen/ARM/thumb1-varalloc.ll
+++ b/llvm/test/CodeGen/ARM/thumb1-varalloc.ll
@@ -43,26 +43,6 @@ bb3:
 declare noalias i8* @strdup(i8* nocapture) nounwind
 declare i32 @_called_func(i8*, i32*) nounwind

-; Variable ending up at unaligned offset from sp (i.e. not a multiple of 4)
-define void @test_local_var_addr() {
-; CHECK-LABEL: test_local_var_addr:
-
-  %addr1 = alloca i8
-  %addr2 = alloca i8
-
-; CHECK: mov r0, sp
-; CHECK: adds r0, #{{[0-9]+}}
-; CHECK: blx
-  call void @take_ptr(i8* %addr1)
-
-; CHECK: mov r0, sp
-; CHECK: adds r0, #{{[0-9]+}}
-; CHECK: blx
-  call void @take_ptr(i8* %addr2)
-
-  ret void
-}
-
 ; Simple variable ending up *at* sp.
 define void @test_simple_var() {
 ; CHECK-LABEL: test_simple_var:
@@ -126,14 +106,16 @@ define void @test_local_var_offset_1020() {
   ret void
 }

-; Max range addressable with tADDrSPi + tADDi8
-define void @test_local_var_offset_1275() {
-; CHECK-LABEL: test_local_var_offset_1275
+; Max range addressable with tADDrSPi + tADDi8 is 1275, but the automatic
+; 4-byte aligning of objects on the stack combined with 8-byte stack alignment
+; means that 1268 is the max offset we can use.
+define void @test_local_var_offset_1268() {
+; CHECK-LABEL: test_local_var_offset_1268
   %addr1 = alloca i8, i32 1
-  %addr2 = alloca i8, i32 1275
+  %addr2 = alloca i8, i32 1268

 ; CHECK: add r0, sp, #1020
-; CHECK: adds r0, #255
+; CHECK: adds r0, #248
 ; CHECK-NEXT: blx
   call void @take_ptr(i8* %addr1)
diff --git a/llvm/test/CodeGen/ARM/thumb1_return_sequence.ll b/llvm/test/CodeGen/ARM/thumb1_return_sequence.ll
index 318e6e4..c831260 100644
--- a/llvm/test/CodeGen/ARM/thumb1_return_sequence.ll
+++ b/llvm/test/CodeGen/ARM/thumb1_return_sequence.ll
@@ -3,7 +3,7 @@

 ; CHECK-V4T-LABEL: clobberframe
 ; CHECK-V5T-LABEL: clobberframe
-define <4 x i32> @clobberframe() #0 {
+define <4 x i32> @clobberframe(<6 x i32>* %p) #0 {
 entry:
 ; Prologue
 ; --------
 ; CHECK-V4T: push {[[SAVED:(r[4567](, )?)+]], lr}
 ; CHECK-V4T: sub sp,
 ; CHECK-V5T: push {[[SAVED:(r[4567](, )?)+]], lr}

-  %b = alloca <4 x i32>, align 16
+  %b = alloca <6 x i32>, align 16
   %a = alloca <4 x i32>, align 16
-  store <4 x i32> , <4 x i32>* %b, align 16
+  %stuff = load <6 x i32>* %p, align 16
+  store <6 x i32> %stuff, <6 x i32>* %b, align 16
   store <4 x i32> , <4 x i32>* %a, align 16
   %0 = load <4 x i32>* %a, align 16
   ret <4 x i32> %0
@@ -70,40 +71,25 @@ entry:

 ; CHECK-V4T-LABEL: simpleframe
 ; CHECK-V5T-LABEL: simpleframe
-define i32 @simpleframe() #0 {
+define i32 @simpleframe(<6 x i32>* %p) #0 {
 entry:
 ; Prologue
 ; --------
 ; CHECK-V4T: push {[[SAVED:(r[4567](, )?)+]], lr}
 ; CHECK-V5T: push {[[SAVED:(r[4567](, )?)+]], lr}

-  %a = alloca i32, align 4
-  %b = alloca i32, align 4
-  %c = alloca i32, align 4
-  %d = alloca i32, align 4
-  store i32 1, i32* %a, align 4
-  store i32 2, i32* %b, align 4
-  store i32 3, i32* %c, align 4
-  store i32 4, i32* %d, align 4
-  %0 = load i32* %a, align 4
-  %inc = add nsw i32 %0, 1
-  store i32 %inc, i32* %a, align 4
-  %1 = load i32* %b, align 4
-  %inc1 = add nsw i32 %1, 1
-  store i32 %inc1, i32* %b, align 4
-  %2 = load i32* %c, align 4
-  %inc2 = add nsw i32 %2, 1
-  store i32 %inc2, i32* %c, align 4
-  %3 = load i32* %d, align 4
-  %inc3 = add nsw i32 %3, 1
-  store i32 %inc3, i32* %d, align 4
-  %4 = load i32* %a, align 4
-  %5 = load i32* %b, align 4
-  %add = add nsw i32 %4, %5
-  %6 = load i32* %c, align 4
-  %add4 = add nsw i32 %add, %6
-  %7 = load i32* %d, align 4
-  %add5 = add nsw i32 %add4, %7
+  %0 = load <6 x i32>* %p, align 16
+  %1 = extractelement <6 x i32> %0, i32 0
+  %2 = extractelement <6 x i32> %0, i32 1
+  %3 = extractelement <6 x i32> %0, i32 2
+  %4 = extractelement <6 x i32> %0, i32 3
+  %5 = extractelement <6 x i32> %0, i32 4
+  %6 = extractelement <6 x i32> %0, i32 5
+  %add1 = add nsw i32 %1, %2
+  %add2 = add nsw i32 %add1, %3
+  %add3 = add nsw i32 %add2, %4
+  %add4 = add nsw i32 %add3, %5
+  %add5 = add nsw i32 %add4, %6
   ret i32 %add5

 ; Epilogue
diff --git a/llvm/test/CodeGen/Thumb/stack-access.ll b/llvm/test/CodeGen/Thumb/stack-access.ll
new file mode 100644
index 0000000..bcffda2
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/stack-access.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=thumb-eabi < %s -o - | FileCheck %s
+
+; Check that stack addresses are generated using a single ADD
+define void @test1(i8** %p) {
+  %x = alloca i8, align 1
+  %y = alloca i8, align 1
+  %z = alloca i8, align 1
+; CHECK: add r1, sp, #8
+; CHECK: str r1, [r0]
+  store i8* %x, i8** %p, align 4
+; CHECK: add r1, sp, #4
+; CHECK: str r1, [r0]
+  store i8* %y, i8** %p, align 4
+; CHECK: mov r1, sp
+; CHECK: str r1, [r0]
+  store i8* %z, i8** %p, align 4
+  ret void
+}
+
+; Stack offsets larger than 1020 still need two ADDs
+define void @test2([1024 x i8]** %p) {
+  %arr1 = alloca [1024 x i8], align 1
+  %arr2 = alloca [1024 x i8], align 1
+; CHECK: add r1, sp, #1020
+; CHECK: adds r1, #4
+; CHECK: str r1, [r0]
+  store [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
+; CHECK: mov r1, sp
+; CHECK: str r1, [r0]
+  store [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
+  ret void
+}
+
+; If possible, stack-based ldrb/ldrh are widened to use SP-based addressing
+define i32 @test3() #0 {
+  %x = alloca i8, align 1
+  %y = alloca i8, align 1
+; CHECK: ldr r0, [sp]
+  %1 = load i8* %x, align 1
+; CHECK: ldr r1, [sp, #4]
+  %2 = load i8* %y, align 1
+  %3 = add nsw i8 %1, %2
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @test4() #0 {
+  %x = alloca i16, align 2
+  %y = alloca i16, align 2
+; CHECK: ldr r0, [sp]
+  %1 = load i16* %x, align 2
+; CHECK: ldr r1, [sp, #4]
+  %2 = load i16* %y, align 2
+  %3 = add nsw i16 %1, %2
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+; Don't widen if the value needs to be zero-extended
+define zeroext i8 @test5() {
+  %x = alloca i8, align 1
+; CHECK: mov r0, sp
+; CHECK: ldrb r0, [r0]
+  %1 = load i8* %x, align 1
+  ret i8 %1
+}
+
+define zeroext i16 @test6() {
+  %x = alloca i16, align 2
+; CHECK: mov r0, sp
+; CHECK: ldrh r0, [r0]
+  %1 = load i16* %x, align 2
+  ret i16 %1
+}
diff --git a/llvm/test/CodeGen/Thumb/stm-merge.ll b/llvm/test/CodeGen/Thumb/stm-merge.ll
index 76e71f4..d4b4cd2 100644
--- a/llvm/test/CodeGen/Thumb/stm-merge.ll
+++ b/llvm/test/CodeGen/Thumb/stm-merge.ll
@@ -7,16 +7,17 @@ target triple = "thumbv6m--linux-gnueabi"
 @e = internal unnamed_addr global i32* null, align 4

 ; Function Attrs: nounwind optsize
-define void @fn1() #0 {
+define void @fn1(i32 %x, i32 %y, i32 %z) #0 {
 entry:
 ; CHECK-LABEL: fn1:
 ; CHECK: stm r[[BASE:[0-9]]]!, {{.*}}
 ; CHECK-NOT: {{.*}} r[[BASE]]
-; CHECK: ldr r[[BASE]], {{.*}}
   %g = alloca i32, align 4
   %h = alloca i32, align 4
-  store i32 1, i32* %g, align 4
-  store i32 0, i32* %h, align 4
+  %i = alloca i32, align 4
+  store i32 %x, i32* %i, align 4
+  store i32 %y, i32* %h, align 4
+  store i32 %z, i32* %g, align 4
   %.pr = load i32* @d, align 4
   %cmp11 = icmp slt i32 %.pr, 1
   br i1 %cmp11, label %for.inc.lr.ph, label %for.body5
diff --git a/llvm/test/CodeGen/Thumb/vargs.ll b/llvm/test/CodeGen/Thumb/vargs.ll
index 4078b01..71e8afa 100644
--- a/llvm/test/CodeGen/Thumb/vargs.ll
+++ b/llvm/test/CodeGen/Thumb/vargs.ll
@@ -6,6 +6,10 @@

 define void @f(i32 %a, ...) {
 entry:
+; Check that space is reserved above the pushed lr for variadic argument
+; registers to be stored in.
+; CHECK: sub sp, #[[IMM:[0-9]+]]
+; CHECK: push
   %va = alloca i8*, align 4            ; <i8**> [#uses=4]
   %va.upgrd.1 = bitcast i8** %va to i8* ; <i8*> [#uses=1]
   call void @llvm.va_start( i8* %va.upgrd.1 )
@@ -27,6 +31,13 @@ bb7:    ; preds = %bb
   %va.upgrd.4 = bitcast i8** %va to i8* ; <i8*> [#uses=1]
   call void @llvm.va_end( i8* %va.upgrd.4 )
   ret void
+
+; The return sequence should pop lr into r3, recover the stack space used to
+; store the variadic argument registers, then return via r3. Possibly there is
+; a pop before this, but only if the function happened to use callee-saved
+; registers.
+; CHECK: pop {r3}
+; CHECK: add sp, #[[IMM]]
+; CHECK: bx r3
 }

 declare void @llvm.va_start(i8*)
@@ -34,8 +45,3 @@ declare void @llvm.va_start(i8*)
 declare i32 @printf(i8*, ...)

 declare void @llvm.va_end(i8*)
-
-; CHECK: pop
-; CHECK: pop
-; CHECK-NOT: pop
-
-- 
2.7.4