From 1b12b1a33520fc3a1cb7b22274312a86ce7a3718 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Thu, 6 Jul 2023 10:43:40 +0100 Subject: [PATCH] [ARM] Restructure MOVi32imm expansion to not do pointless instructions The expansion of the various MOVi32imm pseudo-instructions works by splitting the operand into components (either halfwords or bytes) and emitting instructions to combine those components into the final result. When the operand is an immediate with some components being zero this can result in pointless instructions that just add zero. Avoid this by restructuring things so that a separate function handles splitting the operand into components, then don't emit the component if it is a zero immediate. This is straightforward for movw/movt, where we just don't emit the movt if it's zero, but the thumb1 expansion using mov/add/lsl is more complex, as even when we don't emit a given byte we still need to get the shift correct. Differential Revision: https://reviews.llvm.org/D154943 --- llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 237 ++++++++++----------- .../CodeGen/ARM/execute-only-big-stack-frame.ll | 2 - llvm/test/CodeGen/ARM/execute-only.ll | 65 ++++++ llvm/test/CodeGen/ARM/large-stack.ll | 6 +- llvm/test/CodeGen/Thumb2/segmented-stacks.ll | 8 +- 5 files changed, 175 insertions(+), 143 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 1adaa6a..2f9236b 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -954,113 +954,109 @@ static MachineOperand makeImplicit(const MachineOperand &MO) { return NewMO; } +static MachineOperand getMovOperand(const MachineOperand &MO, + unsigned TargetFlag) { + unsigned TF = MO.getTargetFlags() | TargetFlag; + switch (MO.getType()) { + case MachineOperand::MO_Immediate: { + unsigned Imm = MO.getImm(); + switch (TargetFlag) { + case ARMII::MO_HI_8_15: + Imm = (Imm >> 24) & 0xff; + break; + case ARMII::MO_HI_0_7: + Imm = (Imm >> 16) & 0xff; + break; + case ARMII::MO_LO_8_15: + Imm = (Imm >> 8) & 0xff; + break; + case ARMII::MO_LO_0_7: + Imm = Imm & 0xff; + break; + case ARMII::MO_HI16: + Imm = (Imm >> 16) & 0xffff; + break; + case ARMII::MO_LO16: + Imm = Imm & 0xffff; + break; + default: + llvm_unreachable("Only HI/LO target flags are expected"); + } + return MachineOperand::CreateImm(Imm); + } + case MachineOperand::MO_ExternalSymbol: + return MachineOperand::CreateES(MO.getSymbolName(), TF); + case MachineOperand::MO_JumpTableIndex: + return MachineOperand::CreateJTI(MO.getIndex(), TF); + default: + return MachineOperand::CreateGA(MO.getGlobal(), MO.getOffset(), TF); + } +} + void ARMExpandPseudo::ExpandTMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); const MachineOperand &MO = MI.getOperand(1); - MachineInstrBuilder Upper8_15, LSL_U8_15, Upper0_7, Lower8_15, Lower0_7; unsigned MIFlags = MI.getFlags(); LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump()); - Upper8_15 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tMOVi8), DstReg) - .add(t1CondCodeOp(true)); - - LSL_U8_15 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tLSLri), DstReg) - .add(t1CondCodeOp(true)) - .addReg(DstReg) - .addImm(8) - .add(predOps(ARMCC::AL)) - .setMIFlags(MIFlags); - - Upper0_7 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tADDi8), DstReg) - .add(t1CondCodeOp(true)) - .addReg(DstReg); - - MachineInstr *LSL_U0_7 = MBB.getParent()->CloneMachineInstr(LSL_U8_15); - MBB.insert(MBBI, LSL_U0_7); - - Lower8_15 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tADDi8), DstReg) - .add(t1CondCodeOp(true)) - .addReg(DstReg); - - MachineInstr *LSL_L8_15 = MBB.getParent()->CloneMachineInstr(LSL_U8_15); - MBB.insert(MBBI, LSL_L8_15); - - Lower0_7 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tADDi8)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .add(t1CondCodeOp(true)) - .addReg(DstReg); + // Expand the mov into a sequence of mov/add+lsl of the individual bytes. We + // want to avoid emitting any zero bytes, as they won't change the result, and + // also don't want any pointless shifts, so instead of immediately emitting + // the shift for a byte we keep track of how much we will need to shift and do + // it before the next nonzero byte. + unsigned PendingShift = 0; + for (unsigned Byte = 0; Byte < 4; ++Byte) { + unsigned Flag = Byte == 0 ? ARMII::MO_HI_8_15 + : Byte == 1 ? ARMII::MO_HI_0_7 + : Byte == 2 ? ARMII::MO_LO_8_15 + : ARMII::MO_LO_0_7; + MachineOperand Operand = getMovOperand(MO, Flag); + bool ZeroImm = Operand.isImm() && Operand.getImm() == 0; + unsigned Op = PendingShift ? ARM::tADDi8 : ARM::tMOVi8; + + // Emit the pending shift if we're going to emit this byte or if we've + // reached the end. + if (PendingShift && (!ZeroImm || Byte == 3)) { + MachineInstr *Lsl = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tLSLri), DstReg) + .add(t1CondCodeOp(true)) + .addReg(DstReg) + .addImm(PendingShift) + .add(predOps(ARMCC::AL)) + .setMIFlags(MIFlags); + (void)Lsl; + LLVM_DEBUG(dbgs() << "And: "; Lsl->dump();); + PendingShift = 0; + } - Upper8_15.setMIFlags(MIFlags); - Upper0_7.setMIFlags(MIFlags); - Lower8_15.setMIFlags(MIFlags); - Lower0_7.setMIFlags(MIFlags); + // Emit this byte if it's nonzero. + if (!ZeroImm) { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Op), DstReg) + .add(t1CondCodeOp(true)); + if (Op == ARM::tADDi8) + MIB.addReg(DstReg); + MIB.add(Operand); + MIB.add(predOps(ARMCC::AL)); + MIB.setMIFlags(MIFlags); + LLVM_DEBUG(dbgs() << (Op == ARM::tMOVi8 ? "To: " : "And:") << " "; + MIB.getInstr()->dump();); + } - switch (MO.getType()) { - case MachineOperand::MO_Immediate: { - unsigned Imm = MO.getImm(); - unsigned Hi8_15 = (Imm >> 24) & 0xff; - unsigned Hi0_7 = (Imm >> 16) & 0xff; - unsigned Lo8_15 = (Imm >> 8) & 0xff; - unsigned Lo0_7 = Imm & 0xff; - Upper8_15 = Upper8_15.addImm(Hi8_15); - Upper0_7 = Upper0_7.addImm(Hi0_7); - Lower8_15 = Lower8_15.addImm(Lo8_15); - Lower0_7 = Lower0_7.addImm(Lo0_7); - break; - } - case MachineOperand::MO_ExternalSymbol: { - const char *ES = MO.getSymbolName(); - unsigned TF = MO.getTargetFlags(); - Upper8_15 = Upper8_15.addExternalSymbol(ES, TF | ARMII::MO_HI_8_15); - Upper0_7 = Upper0_7.addExternalSymbol(ES, TF | ARMII::MO_HI_0_7); - Lower8_15 = Lower8_15.addExternalSymbol(ES, TF | ARMII::MO_LO_8_15); - Lower0_7 = Lower0_7.addExternalSymbol(ES, TF | ARMII::MO_LO_0_7); - break; - } - case MachineOperand::MO_JumpTableIndex: { - unsigned Idx = MO.getIndex(); - unsigned TF = MO.getTargetFlags(); - Upper8_15 = Upper8_15.addJumpTableIndex(Idx, TF | ARMII::MO_HI_8_15); - Upper0_7 = Upper0_7.addJumpTableIndex(Idx, TF | ARMII::MO_HI_0_7); - Lower8_15 = Lower8_15.addJumpTableIndex(Idx, TF | ARMII::MO_LO_8_15); - Lower0_7 = Lower0_7.addJumpTableIndex(Idx, TF | ARMII::MO_LO_0_7); - break; - } - default: { - const GlobalValue *GV = MO.getGlobal(); - unsigned TF = MO.getTargetFlags(); - Upper8_15 = - Upper8_15.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI_8_15); - Upper0_7 = - Upper0_7.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI_0_7); - Lower8_15 = - Lower8_15.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO_8_15); - Lower0_7 = - Lower0_7.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO_0_7); - break; - } + // Don't accumulate the shift value if we've not yet seen a nonzero byte. + if (PendingShift || !ZeroImm) + PendingShift += 8; } - Upper8_15 = Upper8_15.add(predOps(ARMCC::AL)); - Upper0_7 = Upper0_7.add(predOps(ARMCC::AL)); - Lower8_15 = Lower8_15.add(predOps(ARMCC::AL)); - Lower0_7 = Lower0_7.add(predOps(ARMCC::AL)); + // The dest is dead on the last instruction we emitted if it was dead on the + // original instruction. + (--MBBI)->getOperand(0).setIsDead(DstIsDead); MI.eraseFromParent(); - LLVM_DEBUG(dbgs() << "To: "; Upper8_15.getInstr()->dump();); - LLVM_DEBUG(dbgs() << "And: "; LSL_U8_15.getInstr()->dump();); - LLVM_DEBUG(dbgs() << "And: "; Upper0_7.getInstr()->dump();); - LLVM_DEBUG(dbgs() << "And: "; LSL_U0_7->dump();); - LLVM_DEBUG(dbgs() << "And: "; Lower8_15.getInstr()->dump();); - LLVM_DEBUG(dbgs() << "And: "; LSL_L8_15->dump();); - LLVM_DEBUG(dbgs() << "And: "; Lower0_7.getInstr()->dump();); } void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, @@ -1132,53 +1128,34 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, } LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg); - HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg); - LO16.setMIFlags(MIFlags); - HI16.setMIFlags(MIFlags); - - switch (MO.getType()) { - case MachineOperand::MO_Immediate: { - unsigned Imm = MO.getImm(); - unsigned Lo16 = Imm & 0xffff; - unsigned Hi16 = (Imm >> 16) & 0xffff; - LO16 = LO16.addImm(Lo16); - HI16 = HI16.addImm(Hi16); - break; - } - case MachineOperand::MO_ExternalSymbol: { - const char *ES = MO.getSymbolName(); - unsigned TF = MO.getTargetFlags(); - LO16 = LO16.addExternalSymbol(ES, TF | ARMII::MO_LO16); - HI16 = HI16.addExternalSymbol(ES, TF | ARMII::MO_HI16); - break; - } - default: { - const GlobalValue *GV = MO.getGlobal(); - unsigned TF = MO.getTargetFlags(); - LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16); - HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); - break; - } - } - + LO16.add(getMovOperand(MO, ARMII::MO_LO16)); LO16.cloneMemRefs(MI); - HI16.cloneMemRefs(MI); LO16.addImm(Pred).addReg(PredReg); - HI16.addImm(Pred).addReg(PredReg); + if (isCC) + LO16.add(makeImplicit(MI.getOperand(1))); + LO16.copyImplicitOps(MI); + LLVM_DEBUG(dbgs() << "To: "; LO16.getInstr()->dump();); + + MachineOperand HIOperand = getMovOperand(MO, ARMII::MO_HI16); + if (!(HIOperand.isImm() && HIOperand.getImm() == 0)) { + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + HI16.setMIFlags(MIFlags); + HI16.add(HIOperand); + HI16.cloneMemRefs(MI); + HI16.addImm(Pred).addReg(PredReg); + HI16.copyImplicitOps(MI); + LLVM_DEBUG(dbgs() << "And: "; HI16.getInstr()->dump();); + } else { + LO16->getOperand(0).setIsDead(DstIsDead); + } if (RequiresBundling) finalizeBundle(MBB, LO16->getIterator(), MBBI->getIterator()); - if (isCC) - LO16.add(makeImplicit(MI.getOperand(1))); - LO16.copyImplicitOps(MI); - HI16.copyImplicitOps(MI); MI.eraseFromParent(); - LLVM_DEBUG(dbgs() << "To: "; LO16.getInstr()->dump();); - LLVM_DEBUG(dbgs() << "And: "; HI16.getInstr()->dump();); } // The size of the area, accessed by that VLSTM/VLLDM diff --git a/llvm/test/CodeGen/ARM/execute-only-big-stack-frame.ll b/llvm/test/CodeGen/ARM/execute-only-big-stack-frame.ll index 439f1a4..07c206e 100644 --- a/llvm/test/CodeGen/ARM/execute-only-big-stack-frame.ll +++ b/llvm/test/CodeGen/ARM/execute-only-big-stack-frame.ll @@ -26,11 +26,9 @@ define i8 @test_big_stack_frame() { ; CHECK-MOVW-MOVT-ADD: add sp, [[REG1]] ; CHECK-MOVW-MOVT-ADD-NOT: ldr {{r[0-9]+}}, .{{.*}} ; CHECK-MOVW-MOVT-ADD: movw [[REG2:r[0-9]+]], #65532 -; CHECK-MOVW-MOVT-ADD: movt [[REG2]], #0 ; CHECK-MOVW-MOVT-ADD: add [[REG2]], sp ; CHECK-MOVW-MOVT-ADD-NOT: ldr {{r[0-9]+}}, .{{.*}} ; CHECK-MOVW-MOVT-ADD: movw [[REG3:r[0-9]+]], #65532 -; CHECK-MOVW-MOVT-ADD: movt [[REG3]], #0 ; CHECK-MOVW-MOVT-ADD: add [[REG3]], sp ; CHECK-MOVW-MOVT-ADD-NOT: ldr {{r[0-9]+}}, .{{.*}} ; CHECK-MOVW-MOVT-ADD: movw [[REG4:r[0-9]+]], #0 diff --git a/llvm/test/CodeGen/ARM/execute-only.ll b/llvm/test/CodeGen/ARM/execute-only.ll index 304b855..94efc4c 100644 --- a/llvm/test/CodeGen/ARM/execute-only.ll +++ b/llvm/test/CodeGen/ARM/execute-only.ll @@ -142,3 +142,68 @@ entry: %v = load i32, ptr @external_global ret i32 %v } + +define i32 @test_imm() { +entry: +; CHECK-LABEL: test_imm: +; CHECK: movw [[IMMDEST:r[0-9]+]], #13124 +; CHECK-NEXT: movt [[IMMDEST]], #4386 +; CHECK-NEXT: bx lr +; CHECK-T1-LABEL: test_imm: +; CHECK-T1: movs [[IMMDEST:r[0-9]+]], #17 +; CHECK-T1-NEXT: lsls [[IMMDEST]], [[IMMDEST]], #8 +; CHECK-T1-NEXT: adds [[IMMDEST]], #34 +; CHECK-T1-NEXT: lsls [[IMMDEST]], [[IMMDEST]], #8 +; CHECK-T1-NEXT: adds [[IMMDEST]], #51 +; CHECK-T1-NEXT: lsls [[IMMDEST]], [[IMMDEST]], #8 +; CHECK-T1-NEXT: adds [[IMMDEST]], #68 +; CHECK-T1-NEXT: bx lr + + ret i32 u0x11223344 +} + +define i32 @test_imm_high_half() { +entry: +; CHECK-LABEL: test_imm_high_half: +; CHECK-T2BASE: movw [[IMMDEST:r[0-9]+]], #0 +; CHECK-T2: movs [[IMMDEST:r[0-9]+]], #0 +; CHECK-NEXT: movt [[IMMDEST]], #4386 +; CHECK-NEXT: bx lr +; CHECK-T1-LABEL: test_imm_high_half: +; CHECK-T1: movs [[IMMDEST:r[0-9]+]], #17 +; CHECK-T1-NEXT: lsls [[IMMDEST]], [[IMMDEST]], #8 +; CHECK-T1-NEXT: adds [[IMMDEST]], #34 +; CHECK-T1-NEXT: lsls [[IMMDEST]], [[IMMDEST]], #16 +; CHECK-T1-NEXT: bx lr + + ret i32 u0x11220000 +} + +define i32 @test_imm_low_half() { +; CHECK-LABEL: test_imm_low_half: +; CHECK: movw [[IMMDEST:r[0-9]+]], #13124 +; CHECK-NEXT: bx lr +; CHECK-T1-LABEL: test_imm_low_half: +; CHECK-T1: movs [[IMMDEST]], #51 +; CHECK-T1-NEXT: lsls [[IMMDEST]], [[IMMDEST]], #8 +; CHECK-T1-NEXT: adds [[IMMDEST]], #68 +; CHECK-T1-NEXT: bx lr + +entry: + ret i32 u0x3344 +} + +define i32 @test_imm_middle_bytes() { +; CHECK-LABEL: test_imm_middle_bytes: +; CHECK: movw [[IMMDEST:r[0-9]+]], #13056 +; CHECK-NEXT: movt [[IMMDEST]], #34 +; CHECK-NEXT: bx lr +; CHECK-T1-LABEL: test_imm_middle_bytes: +; CHECK-T1: movs [[IMMDEST]], #34 +; CHECK-T1-NEXT: lsls [[IMMDEST]], [[IMMDEST]], #8 +; CHECK-T1-NEXT: adds [[IMMDEST]], #51 +; CHECK-T1-NEXT: lsls [[IMMDEST]], [[IMMDEST]], #8 +; CHECK-T1-NEXT: bx lr + + ret i32 u0x223300 +} diff --git a/llvm/test/CodeGen/ARM/large-stack.ll b/llvm/test/CodeGen/ARM/large-stack.ll index 9e59aff..a1adf5d 100644 --- a/llvm/test/CodeGen/ARM/large-stack.ll +++ b/llvm/test/CodeGen/ARM/large-stack.ll @@ -43,11 +43,7 @@ define i32 @test3() { store i32 0, ptr %tmp ;; are we choosing correct store/tSTRspi pattern for execute-only ; CHECK: movs [[REG:r[0-9]+]], #0x30 -; CHECK-NEXT: lsls [[REG]], [[REG]], #0x8 -; CHECK-NEXT: adds [[REG]], #0x0 -; CHECK-NEXT: lsls [[REG]], [[REG]], #0x8 -; CHECK-NEXT: adds [[REG]], #0x0 -; CHECK-NEXT: lsls [[REG]], [[REG]], #0x8 +; CHECK-NEXT: lsls [[REG]], [[REG]], #0x18 ; CHECK-NEXT: adds [[REG]], #0x8 %tmp1 = load i32, ptr %tmp ret i32 %tmp1 diff --git a/llvm/test/CodeGen/Thumb2/segmented-stacks.ll b/llvm/test/CodeGen/Thumb2/segmented-stacks.ll index 60f029c..b837d63 100644 --- a/llvm/test/CodeGen/Thumb2/segmented-stacks.ll +++ b/llvm/test/CodeGen/Thumb2/segmented-stacks.ll @@ -75,9 +75,8 @@ define void @test_large() #0 { ; THUMB-LABEL: test_large: ; THUMB: push {r4, r5} -; THUMB-NEXT: movw r4, #40192 ; THUMB-NEXT: mov r5, sp -; THUMB-NEXT: movt r4, #0 +; THUMB-NEXT: movw r4, #40192 ; THUMB-NEXT: sub r5, r5, r4 ; THUMB-NEXT: mrc p15, #0, r4, c13, c0, #3 ; THUMB-NEXT: ldr.w r4, [r4, #252] @@ -85,7 +84,6 @@ define void @test_large() #0 { ; THUMB-NEXT: bls .LBB1_2 ; THUMB: movw r4, #40192 -; THUMB-NEXT: movt r4, #0 ; THUMB-NEXT: mov r5, #0 ; THUMB-NEXT: push {lr} ; THUMB-NEXT: bl __morestack @@ -129,9 +127,8 @@ define fastcc void @test_fastcc_large() #0 { ; THUMB-LABEL: test_fastcc_large: ; THUMB: push {r4, r5} -; THUMB-NEXT: movw r4, #40192 ; THUMB-NEXT: mov r5, sp -; THUMB-NEXT: movt r4, #0 +; THUMB-NEXT: movw r4, #40192 ; THUMB-NEXT: sub r5, r5, r4 ; THUMB-NEXT: mrc p15, #0, r4, c13, c0, #3 ; THUMB-NEXT: ldr.w r4, [r4, #252] @@ -139,7 +136,6 @@ define fastcc void @test_fastcc_large() #0 { ; THUMB-NEXT: bls .LBB2_2 ; THUMB: movw r4, #40192 -; THUMB-NEXT: movt r4, #0 ; THUMB-NEXT: mov r5, #0 ; THUMB-NEXT: push {lr} ; THUMB-NEXT: bl __morestack -- 2.7.4