From 970b0d576c35c296111ffd9be6ffd04c294760db Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Mon, 17 Nov 2014 11:18:10 +0000 Subject: [PATCH] [Thumb1] Re-write emitThumbRegPlusImmediate This was motivated by a bug which caused code like this to be miscompiled: declare void @take_ptr(i8*) define void @test() { %addr1.32 = alloca i8 %addr2.32 = alloca i32, i32 1028 call void @take_ptr(i8* %addr1) ret void } This was emitting the following assembly to get the value of %addr1: add r0, sp, #1020 add r0, r0, #8 However, "add r0, r0, #8" is not a valid Thumb1 instruction, and this could not be assembled. The generated object file contained this, resulting in r0 holding SP+8 rather tha SP+1028: add r0, sp, #1020 add r0, sp, #8 This function looked like it could have caused miscompilations for other combinations of registers and offsets (though I don't think it is currently called with these), and the heuristic it used did not match the emitted code in all cases. llvm-svn: 222125 --- llvm/include/llvm/Support/MathExtras.h | 4 +- llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp | 270 ++++++++++++++--------------- llvm/test/CodeGen/ARM/thumb1-varalloc.ll | 47 ++++- llvm/test/CodeGen/Thumb/large-stack.ll | 50 ++++-- 4 files changed, 212 insertions(+), 159 deletions(-) diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h index 9abc57a..9d16182 100644 --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -595,10 +595,10 @@ inline uint64_t PowerOf2Floor(uint64_t A) { /// RoundUpToAlignment(5, 8) = 8 /// RoundUpToAlignment(17, 8) = 24 /// RoundUpToAlignment(~0LL, 8) = 0 +/// RoundUpToAlignment(321, 255) = 510 /// \endcode inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align) { - assert(isPowerOf2_64(Align) && "Alignment must be power of 2!"); - return (Value + Align - 1) & ~uint64_t(Align - 1); + return (Value + Align - 1) / Align * Align; } /// Returns the offset to the next integer (mod 2**64) that is greater than diff --git a/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp b/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp index 9a01b8a..c10c809 100644 --- a/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -66,6 +66,10 @@ Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, int Val, ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const { + assert((isARMLowRegister(DestReg) || + isVirtualRegister(DestReg)) && + "Thumb1 does not have ldr to high register"); + MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineConstantPool *ConstantPool = MF.getConstantPool(); @@ -106,15 +110,15 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, NumBytes = -NumBytes; } unsigned LdReg = DestReg; - if (DestReg == ARM::SP) { + if (DestReg == ARM::SP) assert(BaseReg == ARM::SP && "Unexpected!"); + if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg)) LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); - } - if (NumBytes <= 255 && NumBytes >= 0) + if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) { AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) .addImm(NumBytes).setMIFlags(MIFlags); - else if (NumBytes < 0 && NumBytes >= -255) { + } else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) { AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) .addImm(NumBytes).setMIFlags(MIFlags); AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg)) @@ -124,7 +128,8 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, ARMCC::AL, 0, MIFlags); // Emit add / sub. - int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr); + int Opc = (isSub) ? ARM::tSUBrr : ((isHigh || !CanChangeCC) ? ARM::tADDhirr + : ARM::tADDrr); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); if (Opc != ARM::tADDhirr) @@ -136,32 +141,10 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, AddDefaultPred(MIB); } -/// calcNumMI - Returns the number of instructions required to materialize -/// the specific add / sub r, c instruction. -static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes, - unsigned NumBits, unsigned Scale) { - unsigned NumMIs = 0; - unsigned Chunk = ((1 << NumBits) - 1) * Scale; - - if (Opc == ARM::tADDrSPi) { - unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; - Bytes -= ThisVal; - NumMIs++; - NumBits = 8; - Scale = 1; // Followed by a number of tADDi8. - Chunk = ((1 << NumBits) - 1) * Scale; - } - - NumMIs += Bytes / Chunk; - if ((Bytes % Chunk) != 0) - NumMIs++; - if (ExtraOpc) - NumMIs++; - return NumMIs; -} - /// emitThumbRegPlusImmediate - Emits a series of instructions to materialize -/// a destreg = basereg + immediate in Thumb code. +/// a destreg = basereg + immediate in Thumb code. Tries a series of ADDs or +/// SUBs first, and uses a constant pool value if the instruction sequence would +/// be too long. This is allowed to modify the condition flags. void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, @@ -172,131 +155,146 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, bool isSub = NumBytes < 0; unsigned Bytes = (unsigned)NumBytes; if (isSub) Bytes = -NumBytes; - bool isMul4 = (Bytes & 3) == 0; - bool isTwoAddr = false; - bool DstNotEqBase = false; - unsigned NumBits = 1; - unsigned Scale = 1; - int Opc = 0; + + int CopyOpc = 0; + unsigned CopyBits = 0; + unsigned CopyScale = 1; + bool CopyNeedsCC = false; int ExtraOpc = 0; - bool NeedCC = false; - - if (DestReg == BaseReg && BaseReg == ARM::SP) { - assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!"); - NumBits = 7; - Scale = 4; - Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; - isTwoAddr = true; - } else if (!isSub && BaseReg == ARM::SP) { - // r1 = add sp, 403 - // => - // r1 = add sp, 100 * 4 - // r1 = add r1, 3 - if (!isMul4) { - Bytes &= ~3; - ExtraOpc = ARM::tADDi3; + unsigned ExtraBits = 0; + unsigned ExtraScale = 1; + bool ExtraNeedsCC = false; + + // Strategy: + // We need to select two types of instruction, maximizing the available + // immediate range of each. The instructions we use will depend on whether + // DestReg and BaseReg are low, high or the stack pointer. + // * CopyOpc - DestReg = BaseReg + imm + // This will be emitted once if DestReg != BaseReg, and never if + // DestReg == BaseReg. + // * ExtraOpc - DestReg = DestReg + imm + // This will be emitted as many times as necessary to add the + // full immediate. + // If the immediate ranges of these instructions are not large enough to cover + // NumBytes with a reasonable number of instructions, we fall back to using a + // value loaded from a constant pool. + if (DestReg == ARM::SP) { + if (BaseReg == ARM::SP) { + // sp -> sp + // Already in right reg, no copy needed + } else { + // low -> sp or high -> sp + CopyOpc = ARM::tMOVr; + CopyBits = 0; } - DstNotEqBase = true; - NumBits = 8; - Scale = 4; - Opc = ARM::tADDrSPi; - } else { - // sp = sub sp, c - // r1 = sub sp, c - // r8 = sub sp, c - if (DestReg != BaseReg) - DstNotEqBase = true; - if (DestReg == ARM::SP) { - Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; - assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!"); - NumBits = 7; - Scale = 4; + ExtraOpc = isSub ? ARM::tSUBspi : ARM::tADDspi; + ExtraBits = 7; + ExtraScale = 4; + } else if (isARMLowRegister(DestReg)) { + if (BaseReg == ARM::SP) { + // sp -> low + assert(!isSub && "Thumb1 does not have tSUBrSPi"); + CopyOpc = ARM::tADDrSPi; + CopyBits = 8; + CopyScale = 4; + } else if (DestReg == BaseReg) { + // low -> same low + // Already in right reg, no copy needed + } else if (isARMLowRegister(BaseReg)) { + // low -> different low + CopyOpc = isSub ? ARM::tSUBi3 : ARM::tADDi3; + CopyBits = 3; + CopyNeedsCC = true; + } else { + // high -> low + CopyOpc = ARM::tMOVr; + CopyBits = 0; + } + ExtraOpc = isSub ? ARM::tSUBi8 : ARM::tADDi8; + ExtraBits = 8; + ExtraNeedsCC = true; + } else /* DestReg is high */ { + if (DestReg == BaseReg) { + // high -> same high + // Already in right reg, no copy needed } else { - Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8; - NumBits = 8; - NeedCC = true; + // {low,high,sp} -> high + CopyOpc = ARM::tMOVr; + CopyBits = 0; } - isTwoAddr = true; + ExtraOpc = 0; } - unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale); + // We could handle an unaligned immediate with an unaligned copy instruction + // and an aligned extra instruction, but this case is not currently needed. + assert(((Bytes & 3) == 0 || ExtraScale == 1) && + "Unaligned offset, but all instructions require alignment"); + + unsigned CopyRange = ((1 << CopyBits) - 1) * CopyScale; + // If we would emit the copy with an immediate of 0, just use tMOVr. + if (CopyOpc && Bytes < CopyScale) { + CopyOpc = ARM::tMOVr; + CopyBits = 0; + CopyScale = 1; + CopyNeedsCC = false; + CopyRange = 0; + } + unsigned ExtraRange = ((1 << ExtraBits) - 1) * ExtraScale; // per instruction + unsigned RequiredCopyInstrs = CopyOpc ? 1 : 0; + unsigned RangeAfterCopy = (CopyRange > Bytes) ? 0 : (Bytes - CopyRange); + + // We could handle this case when the copy instruction does not require an + // aligned immediate, but we do not currently do this. + assert(RangeAfterCopy % ExtraScale == 0 && + "Extra instruction requires immediate to be aligned"); + + unsigned RequiredExtraInstrs; + if (ExtraRange) + RequiredExtraInstrs = RoundUpToAlignment(RangeAfterCopy, ExtraRange) / ExtraRange; + else if (RangeAfterCopy > 0) + // We need an extra instruction but none is available + RequiredExtraInstrs = 1000000; + else + RequiredExtraInstrs = 0; + unsigned RequiredInstrs = RequiredCopyInstrs + RequiredExtraInstrs; unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2; - if (NumMIs > Threshold) { - // This will expand into too many instructions. Load the immediate from a - // constpool entry. + + // Use a constant pool, if the sequence of ADDs/SUBs is too expensive. + if (RequiredInstrs > Threshold) { emitThumbRegPlusImmInReg(MBB, MBBI, dl, DestReg, BaseReg, NumBytes, true, TII, MRI, MIFlags); return; } - if (DstNotEqBase) { - if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) { - // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7) - unsigned Chunk = (1 << 3) - 1; - unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; - Bytes -= ThisVal; - const MCInstrDesc &MCID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3); - const MachineInstrBuilder MIB = - AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg) - .setMIFlags(MIFlags)); - AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal)); - } else if (isARMLowRegister(DestReg) && BaseReg == ARM::SP && Bytes > 0) { - unsigned ThisVal = std::min(1020U, Bytes / 4 * 4); - Bytes -= ThisVal; - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), DestReg) - .addReg(BaseReg, RegState::Kill).addImm(ThisVal / 4)) - .setMIFlags(MIFlags); - } else { - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg) - .addReg(BaseReg, RegState::Kill)) - .setMIFlags(MIFlags); + // Emit zero or one copy instructions + if (CopyOpc) { + unsigned CopyImm = std::min(Bytes, CopyRange) / CopyScale; + Bytes -= CopyImm * CopyScale; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(CopyOpc), DestReg); + if (CopyNeedsCC) + MIB = AddDefaultT1CC(MIB); + MIB.addReg(BaseReg, RegState::Kill); + if (CopyOpc != ARM::tMOVr) { + MIB.addImm(CopyImm); } + AddDefaultPred(MIB.setMIFlags(MIFlags)); + BaseReg = DestReg; } - unsigned Chunk = ((1 << NumBits) - 1) * Scale; + // Emit zero or more in-place add/sub instructions while (Bytes) { - unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; - Bytes -= ThisVal; - ThisVal /= Scale; - // Build the new tADD / tSUB. - if (isTwoAddr) { - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); - if (NeedCC) - MIB = AddDefaultT1CC(MIB); - MIB.addReg(DestReg).addImm(ThisVal); - MIB = AddDefaultPred(MIB); - MIB.setMIFlags(MIFlags); - } else { - bool isKill = BaseReg != ARM::SP; - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); - if (NeedCC) - MIB = AddDefaultT1CC(MIB); - MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal); - MIB = AddDefaultPred(MIB); - MIB.setMIFlags(MIFlags); - - BaseReg = DestReg; - if (Opc == ARM::tADDrSPi) { - // r4 = add sp, imm - // r4 = add r4, imm - // ... - NumBits = 8; - Scale = 1; - Chunk = ((1 << NumBits) - 1) * Scale; - Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8; - NeedCC = isTwoAddr = true; - } - } - } + unsigned ExtraImm = std::min(Bytes, ExtraRange) / ExtraScale; + Bytes -= ExtraImm * ExtraScale; - if (ExtraOpc) { - const MCInstrDesc &MCID = TII.get(ExtraOpc); - AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg)) - .addReg(DestReg, RegState::Kill) - .addImm(((unsigned)NumBytes) & 3) - .setMIFlags(MIFlags)); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg); + if (ExtraNeedsCC) + MIB = AddDefaultT1CC(MIB); + MIB.addReg(BaseReg).addImm(ExtraImm); + MIB = AddDefaultPred(MIB); + MIB.setMIFlags(MIFlags); } } diff --git a/llvm/test/CodeGen/ARM/thumb1-varalloc.ll b/llvm/test/CodeGen/ARM/thumb1-varalloc.ll index 19c5dbece..8d5888d 100644 --- a/llvm/test/CodeGen/ARM/thumb1-varalloc.ll +++ b/llvm/test/CodeGen/ARM/thumb1-varalloc.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -mtriple=thumbv6-apple-darwin | FileCheck %s ; RUN: llc < %s -mtriple=thumbv6-apple-darwin -regalloc=basic | FileCheck %s +; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-apple-darwin +; RUN: llvm-objdump -triple=thumbv6-apple-darwin -d %t | FileCheck %s @__bar = external hidden global i8* @__baz = external hidden global i8* @@ -49,13 +51,13 @@ define void @test_local_var_addr() { %addr2 = alloca i8 ; CHECK: mov r0, sp -; CHECK: adds r0, r0, #{{[0-9]+}} -; CHECK: blx _take_ptr +; CHECK: adds r0, #{{[0-9]+}} +; CHECK: blx call void @take_ptr(i8* %addr1) ; CHECK: mov r0, sp -; CHECK: adds r0, r0, #{{[0-9]+}} -; CHECK: blx _take_ptr +; CHECK: adds r0, #{{[0-9]+}} +; CHECK: blx call void @take_ptr(i8* %addr2) ret void @@ -70,7 +72,7 @@ define void @test_simple_var() { ; CHECK: mov r0, sp ; CHECK-NOT: adds r0 -; CHECK: blx _take_ptr +; CHECK: blx call void @take_ptr(i8* %addr8) ret void } @@ -85,12 +87,12 @@ define void @test_local_var_addr_aligned() { %addr2 = bitcast i32* %addr2.32 to i8* ; CHECK: add r0, sp, #{{[0-9]+}} -; CHECK: blx _take_ptr +; CHECK: blx call void @take_ptr(i8* %addr1) ; CHECK: mov r0, sp ; CHECK-NOT: add r0 -; CHECK: blx _take_ptr +; CHECK: blx call void @take_ptr(i8* %addr2) ret void @@ -104,8 +106,35 @@ define void @test_local_var_big_offset() { %addr2.32 = alloca i32, i32 257 ; CHECK: add [[RTMP:r[0-9]+]], sp, #1020 -; CHECL: add r0, [[RTMP]], #8 -; CHECK: blx _take_ptr +; CHECK: adds [[RTMP]], #8 +; CHECK: blx + call void @take_ptr(i8* %addr1) + + ret void +} + +; Max range addressable with tADDrSPi +define void @test_local_var_offset_1020() { +; CHECK-LABEL: test_local_var_offset_1020 + %addr1 = alloca i8, i32 4 + %addr2 = alloca i8, i32 1020 + +; CHECK: add r0, sp, #1020 +; CHECK-NEXT: blx + call void @take_ptr(i8* %addr1) + + ret void +} + +; Max range addressable with tADDrSPi + tADDi8 +define void @test_local_var_offset_1275() { +; CHECK-LABEL: test_local_var_offset_1275 + %addr1 = alloca i8, i32 1 + %addr2 = alloca i8, i32 1275 + +; CHECK: add r0, sp, #1020 +; CHECK: adds r0, #255 +; CHECK-NEXT: blx call void @take_ptr(i8* %addr1) ret void diff --git a/llvm/test/CodeGen/Thumb/large-stack.ll b/llvm/test/CodeGen/Thumb/large-stack.ll index c371ac6..269bdd9 100644 --- a/llvm/test/CodeGen/Thumb/large-stack.ll +++ b/llvm/test/CodeGen/Thumb/large-stack.ll @@ -1,31 +1,57 @@ -; RUN: llc < %s -mtriple=thumb-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=thumb-apple-ios | FileCheck %s --check-prefix=CHECK --check-prefix=IOS +; RUN: llc < %s -mtriple=thumb-none-eabi | FileCheck %s --check-prefix=CHECK --check-prefix=EABI +; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-apple-ios +; RUN: llvm-objdump -triple=thumbv6-apple-ios -d %t | FileCheck %s --check-prefix=CHECK --check-prefix=IOS +; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-none-eabi +; RUN: llvm-objdump -triple=thumbv6-none-eabi -d %t | FileCheck %s --check-prefix=CHECK --check-prefix=EABI +; Largest stack for which a single tADDspi/tSUBspi is enough define void @test1() { ; CHECK-LABEL: test1: -; CHECK: sub sp, #256 -; CHECK: add sp, #256 - %tmp = alloca [ 64 x i32 ] , align 4 +; CHECK: sub sp, #508 +; CHECK: add sp, #508 + %tmp = alloca [ 508 x i8 ] , align 4 ret void } +; Largest stack for which three tADDspi/tSUBspis are enough +define void @test100() { +; CHECK-LABEL: test100: +; CHECK: sub sp, #508 +; CHECK: sub sp, #508 +; CHECK: sub sp, #508 +; EABI: add sp, #508 +; EABI: add sp, #508 +; EABI: add sp, #508 +; IOS: subs r4, r7, #4 +; IOS: mov sp, r4 + %tmp = alloca [ 1524 x i8 ] , align 4 + ret void +} + +; Smallest stack for which we use a constant pool define void @test2() { ; CHECK-LABEL: test2: -; CHECK: ldr r0, LCPI +; CHECK: ldr r0, ; CHECK: add sp, r0 -; CHECK: subs r4, r7, #4 -; CHECK: mov sp, r4 - %tmp = alloca [ 4168 x i8 ] , align 4 +; EABI: ldr r0, +; EABI: add sp, r0 +; IOS: subs r4, r7, #4 +; IOS: mov sp, r4 + %tmp = alloca [ 1528 x i8 ] , align 4 ret void } define i32 @test3() { ; CHECK-LABEL: test3: -; CHECK: ldr r1, LCPI +; CHECK: ldr r1, ; CHECK: add sp, r1 -; CHECK: ldr r1, LCPI +; CHECK: ldr r1, ; CHECK: add r1, sp -; CHECK: subs r4, r7, #4 -; CHECK: mov sp, r4 +; EABI: ldr r1, +; EABI: add sp, r1 +; IOS: subs r4, r7, #4 +; IOS: mov sp, r4 %retval = alloca i32, align 4 %tmp = alloca i32, align 4 %a = alloca [805306369 x i8], align 16 -- 2.7.4