From d0dbc991c0983613c14a8f3ec606cd572cda8d23 Mon Sep 17 00:00:00 2001 From: Ben Shi Date: Sat, 16 Oct 2021 22:14:59 +0000 Subject: [PATCH] Revert "[AArch64] Optimize add/sub with immediate" This reverts commit 9bf6bef9951a1c230796ccad2c5c0195ce4c4dff. --- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp | 188 +++------------------ llvm/test/CodeGen/AArch64/addsub.ll | 96 ++++------- .../CodeGenPrepare/AArch64/large-offset-gep.ll | 5 +- 3 files changed, 60 insertions(+), 229 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 9ff92e6..d091c8f 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -11,17 +11,10 @@ // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri // MOVi64imm + ANDXrr ==> ANDXri + ANDXri // -// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi -// MOVi64imm + ADDXrr ==> ANDXri + ANDXri -// -// 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi -// MOVi64imm + SUBXrr ==> SUBXri + SUBXri -// // The mov pseudo instruction could be expanded to multiple mov instructions // later. In this case, we could try to split the constant operand of mov -// instruction into two immediates which can be directly encoded into -// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of -// multiple `mov` + `and/add/sub` instructions. +// instruction into two bitmask immediates. It makes two AND instructions +// instead of multiple `mov` + `and` instructions. 
//===----------------------------------------------------------------------===// #include "AArch64ExpandImm.h" @@ -48,13 +41,6 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { MachineLoopInfo *MLI; MachineRegisterInfo *MRI; - bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI, - MachineInstr *&SubregToRegMI); - - template - bool visitADDSUB(MachineInstr &MI, - SmallSetVector &ToBeRemoved, bool IsAdd); - template bool visitAND(MachineInstr &MI, SmallSetVector &ToBeRemoved); @@ -133,9 +119,31 @@ bool AArch64MIPeepholeOpt::visitAND( assert((RegSize == 32 || RegSize == 64) && "Invalid RegSize for AND bitmask peephole optimization"); - // Perform several essential checks against current MI. - MachineInstr *MovMI, *SubregToRegMI; - if (!checkMovImmInstr(MI, MovMI, SubregToRegMI)) + // Check whether AND's MBB is in loop and the AND is loop invariant. + MachineBasicBlock *MBB = MI.getParent(); + MachineLoop *L = MLI->getLoopFor(MBB); + if (L && !L->isLoopInvariant(MI)) + return false; + + // Check whether AND's operand is MOV with immediate. + MachineInstr *MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); + MachineInstr *SubregToRegMI = nullptr; + // If it is SUBREG_TO_REG, check its operand. + if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) { + SubregToRegMI = MovMI; + MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg()); + } + + if (MovMI->getOpcode() != AArch64::MOVi32imm && + MovMI->getOpcode() != AArch64::MOVi64imm) + return false; + + // If the MOV has multiple uses, do not split the immediate because it causes + // more instructions. + if (!MRI->hasOneUse(MovMI->getOperand(0).getReg())) + return false; + + if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg())) return false; // Split the bitmask immediate into two. @@ -152,7 +160,6 @@ bool AArch64MIPeepholeOpt::visitAND( // Create new AND MIs. 
DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock *MBB = MI.getParent(); const TargetRegisterClass *ANDImmRC = (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass; Register DstReg = MI.getOperand(0).getReg(); @@ -178,135 +185,6 @@ bool AArch64MIPeepholeOpt::visitAND( return true; } -template -static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) { - // The immediate must be in the form of ((imm0 << 12) + imm1), in which both - // imm0 and imm1 are non-zero 12-bit unsigned int. - if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 || - (Imm & ~static_cast(0xffffff)) != 0) - return false; - - // The immediate can not be composed via a single instruction. - SmallVector Insn; - AArch64_IMM::expandMOVImm(Imm, RegSize, Insn); - if (Insn.size() == 1) - return false; - - // Split Imm into (Imm0 << 12) + Imm1; - Imm0 = (Imm >> 12) & 0xfff; - Imm1 = Imm & 0xfff; - return true; -} - -template -bool AArch64MIPeepholeOpt::visitADDSUB( - MachineInstr &MI, SmallSetVector &ToBeRemoved, - bool IsAdd) { - // Try below transformation. - // - // MOVi32imm + ADDWrr ==> ANDWri + ANDWri - // MOVi64imm + ADDXrr ==> ANDXri + ANDXri - // - // MOVi32imm + SUBWrr ==> SUBWri + SUBWri - // MOVi64imm + SUBXrr ==> SUBXri + SUBXri - // - // The mov pseudo instruction could be expanded to multiple mov instructions - // later. Let's try to split the constant operand of mov instruction into two - // legal add/sub immediates. It makes only two ADD/SUB instructions intead of - // multiple `mov` + `and/sub` instructions. - - unsigned RegSize = sizeof(T) * 8; - assert((RegSize == 32 || RegSize == 64) && - "Invalid RegSize for legal add/sub immediate peephole optimization"); - - // Perform several essential checks against current MI. - MachineInstr *MovMI, *SubregToRegMI; - if (!checkMovImmInstr(MI, MovMI, SubregToRegMI)) - return false; - - // Split the immediate to Imm0 and Imm1, and calculate the Opcode. 
- T Imm = static_cast(MovMI->getOperand(1).getImm()), Imm0, Imm1; - unsigned Opcode; - if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) { - if (IsAdd) - Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri; - else - Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri; - } else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) { - if (IsAdd) - Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri; - else - Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri; - } else { - return false; - } - - // Create new ADD/SUB MIs. - DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock *MBB = MI.getParent(); - const TargetRegisterClass *RC = - (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass; - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - Register TmpReg = MRI->createVirtualRegister(RC); - - MRI->constrainRegClass(SrcReg, RC); - BuildMI(*MBB, MI, DL, TII->get(Opcode), TmpReg) - .addReg(SrcReg) - .addImm(Imm0) - .addImm(12); - - MRI->constrainRegClass(DstReg, RC); - BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg) - .addReg(TmpReg) - .addImm(Imm1) - .addImm(0); - - // Record the MIs need to be removed. - ToBeRemoved.insert(&MI); - if (SubregToRegMI) - ToBeRemoved.insert(SubregToRegMI); - ToBeRemoved.insert(MovMI); - - return true; -} - -// Checks if the corresponding MOV immediate instruction is applicable for -// this peephole optimization. -bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI, - MachineInstr *&MovMI, - MachineInstr *&SubregToRegMI) { - // Check whether current MI is in loop and is loop invariant. - MachineBasicBlock *MBB = MI.getParent(); - MachineLoop *L = MLI->getLoopFor(MBB); - if (L && !L->isLoopInvariant(MI)) - return false; - - // Check whether current MI's operand is MOV with immediate. - MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); - SubregToRegMI = nullptr; - // If it is SUBREG_TO_REG, check its operand. 
- if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) { - SubregToRegMI = MovMI; - MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg()); - } - - if (MovMI->getOpcode() != AArch64::MOVi32imm && - MovMI->getOpcode() != AArch64::MOVi64imm) - return false; - - // If the MOV has multiple uses, do not split the immediate because it causes - // more instructions. - if (!MRI->hasOneUse(MovMI->getOperand(0).getReg())) - return false; - - if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg())) - return false; - - // It is OK to perform this peephole optimization. - return true; -} - bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -332,18 +210,6 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { case AArch64::ANDXrr: Changed = visitAND(MI, ToBeRemoved); break; - case AArch64::ADDWrr: - Changed = visitADDSUB(MI, ToBeRemoved, true); - break; - case AArch64::SUBWrr: - Changed = visitADDSUB(MI, ToBeRemoved, false); - break; - case AArch64::ADDXrr: - Changed = visitADDSUB(MI, ToBeRemoved, true); - break; - case AArch64::SUBXrr: - Changed = visitADDSUB(MI, ToBeRemoved, false); - break; } } } diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll index 37c9e4c..f0857fe 100644 --- a/llvm/test/CodeGen/AArch64/addsub.ll +++ b/llvm/test/CodeGen/AArch64/addsub.ll @@ -152,8 +152,9 @@ define void @sub_med() { define i64 @add_two_parts_imm_i64(i64 %a) { ; CHECK-LABEL: add_two_parts_imm_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2730, lsl #12 // =11182080 -; CHECK-NEXT: add x0, x8, #1365 +; CHECK-NEXT: mov w8, #42325 +; CHECK-NEXT: movk w8, #170, lsl #16 +; CHECK-NEXT: add x0, x0, x8 ; CHECK-NEXT: ret %b = add i64 %a, 11183445 ret i64 %b @@ -162,8 +163,9 @@ define i64 @add_two_parts_imm_i64(i64 %a) { define i32 @add_two_parts_imm_i32(i32 %a) { ; CHECK-LABEL: add_two_parts_imm_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, 
w0, #2730, lsl #12 // =11182080 -; CHECK-NEXT: add w0, w8, #1365 +; CHECK-NEXT: mov w8, #42325 +; CHECK-NEXT: movk w8, #170, lsl #16 +; CHECK-NEXT: add w0, w0, w8 ; CHECK-NEXT: ret %b = add i32 %a, 11183445 ret i32 %b @@ -172,8 +174,9 @@ define i32 @add_two_parts_imm_i32(i32 %a) { define i64 @add_two_parts_imm_i64_neg(i64 %a) { ; CHECK-LABEL: add_two_parts_imm_i64_neg: ; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #2730, lsl #12 // =11182080 -; CHECK-NEXT: sub x0, x8, #1365 +; CHECK-NEXT: mov x8, #-42325 +; CHECK-NEXT: movk x8, #65365, lsl #16 +; CHECK-NEXT: add x0, x0, x8 ; CHECK-NEXT: ret %b = add i64 %a, -11183445 ret i64 %b @@ -182,8 +185,9 @@ define i64 @add_two_parts_imm_i64_neg(i64 %a) { define i32 @add_two_parts_imm_i32_neg(i32 %a) { ; CHECK-LABEL: add_two_parts_imm_i32_neg: ; CHECK: // %bb.0: -; CHECK-NEXT: sub w8, w0, #2730, lsl #12 // =11182080 -; CHECK-NEXT: sub w0, w8, #1365 +; CHECK-NEXT: mov w8, #23211 +; CHECK-NEXT: movk w8, #65365, lsl #16 +; CHECK-NEXT: add w0, w0, w8 ; CHECK-NEXT: ret %b = add i32 %a, -11183445 ret i32 %b @@ -192,8 +196,9 @@ define i32 @add_two_parts_imm_i32_neg(i32 %a) { define i64 @sub_two_parts_imm_i64(i64 %a) { ; CHECK-LABEL: sub_two_parts_imm_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: sub x8, x0, #2730, lsl #12 // =11182080 -; CHECK-NEXT: sub x0, x8, #1365 +; CHECK-NEXT: mov x8, #-42325 +; CHECK-NEXT: movk x8, #65365, lsl #16 +; CHECK-NEXT: add x0, x0, x8 ; CHECK-NEXT: ret %b = sub i64 %a, 11183445 ret i64 %b @@ -202,8 +207,9 @@ define i64 @sub_two_parts_imm_i64(i64 %a) { define i32 @sub_two_parts_imm_i32(i32 %a) { ; CHECK-LABEL: sub_two_parts_imm_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sub w8, w0, #2730, lsl #12 // =11182080 -; CHECK-NEXT: sub w0, w8, #1365 +; CHECK-NEXT: mov w8, #23211 +; CHECK-NEXT: movk w8, #65365, lsl #16 +; CHECK-NEXT: add w0, w0, w8 ; CHECK-NEXT: ret %b = sub i32 %a, 11183445 ret i32 %b @@ -212,8 +218,9 @@ define i32 @sub_two_parts_imm_i32(i32 %a) { define i64 @sub_two_parts_imm_i64_neg(i64 %a) { ; 
CHECK-LABEL: sub_two_parts_imm_i64_neg: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2730, lsl #12 // =11182080 -; CHECK-NEXT: add x0, x8, #1365 +; CHECK-NEXT: mov w8, #42325 +; CHECK-NEXT: movk w8, #170, lsl #16 +; CHECK-NEXT: add x0, x0, x8 ; CHECK-NEXT: ret %b = sub i64 %a, -11183445 ret i64 %b @@ -222,57 +229,14 @@ define i64 @sub_two_parts_imm_i64_neg(i64 %a) { define i32 @sub_two_parts_imm_i32_neg(i32 %a) { ; CHECK-LABEL: sub_two_parts_imm_i32_neg: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, #2730, lsl #12 // =11182080 -; CHECK-NEXT: add w0, w8, #1365 -; CHECK-NEXT: ret - %b = sub i32 %a, -11183445 - ret i32 %b -} - -define i32 @add_27962026(i32 %a) { -; CHECK-LABEL: add_27962026: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43690 -; CHECK-NEXT: movk w8, #426, lsl #16 -; CHECK-NEXT: add w0, w0, w8 -; CHECK-NEXT: ret - %b = add i32 %a, 27962026 - ret i32 %b -} - -define i32 @add_65534(i32 %a) { -; CHECK-LABEL: add_65534: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65534 +; CHECK-NEXT: mov w8, #42325 +; CHECK-NEXT: movk w8, #170, lsl #16 ; CHECK-NEXT: add w0, w0, w8 ; CHECK-NEXT: ret - %b = add i32 %a, 65534 + %b = sub i32 %a, -11183445 ret i32 %b } -declare i32 @foox(i32) - -define void @add_in_loop(i32 %0) { -; CHECK-LABEL: add_in_loop: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x30, x19, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov w19, #43690 -; CHECK-NEXT: movk w19, #170, lsl #16 -; CHECK-NEXT: .LBB15_1: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add w0, w0, w19 -; CHECK-NEXT: bl foox -; CHECK-NEXT: b .LBB15_1 - br label %2 -2: - %3 = phi i32 [ %0, %1 ], [ %5, %2 ] - %4 = add nsw i32 %3, 11184810 - %5 = tail call i32 @foox(i32 %4) #2 - br label %2 -} - define void @testing() { ; CHECK-LABEL: testing: ; CHECK: // %bb.0: @@ -280,7 +244,7 @@ define void @testing() { ; CHECK-NEXT: ldr x8, [x8, :got_lo12:var_i32] ; CHECK-NEXT: ldr w9, [x8] ; CHECK-NEXT: cmp w9, #4095 -; CHECK-NEXT: b.ne .LBB16_6 +; CHECK-NEXT: b.ne .LBB13_6 ; CHECK-NEXT: // %bb.1: // %test2 ; CHECK-NEXT: adrp x10, :got:var2_i32 ; CHECK-NEXT: add w11, w9, #1 @@ -288,26 +252,26 @@ define void @testing() { ; CHECK-NEXT: str w11, [x8] ; CHECK-NEXT: ldr w10, [x10] ; CHECK-NEXT: cmp w10, #3567, lsl #12 // =14610432 -; CHECK-NEXT: b.lo .LBB16_6 +; CHECK-NEXT: b.lo .LBB13_6 ; CHECK-NEXT: // %bb.2: // %test3 ; CHECK-NEXT: add w11, w9, #2 ; CHECK-NEXT: cmp w9, #123 ; CHECK-NEXT: str w11, [x8] -; CHECK-NEXT: b.lt .LBB16_6 +; CHECK-NEXT: b.lt .LBB13_6 ; CHECK-NEXT: // %bb.3: // %test4 ; CHECK-NEXT: add w11, w9, #3 ; CHECK-NEXT: cmp w10, #321 ; CHECK-NEXT: str w11, [x8] -; CHECK-NEXT: b.gt .LBB16_6 +; CHECK-NEXT: b.gt .LBB13_6 ; CHECK-NEXT: // %bb.4: // %test5 ; CHECK-NEXT: add w11, w9, #4 ; CHECK-NEXT: cmn w10, #443 ; CHECK-NEXT: str w11, [x8] -; CHECK-NEXT: b.ge .LBB16_6 +; CHECK-NEXT: b.ge .LBB13_6 ; CHECK-NEXT: // %bb.5: // %test6 ; CHECK-NEXT: add w9, w9, #5 ; CHECK-NEXT: str w9, [x8] -; CHECK-NEXT: .LBB16_6: // %common.ret +; CHECK-NEXT: .LBB13_6: // %common.ret ; CHECK-NEXT: ret %val = load i32, i32* @var_i32 %val2 = load i32, i32* @var2_i32 diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll 
b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll index 97e8772..1c58708 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll @@ -214,9 +214,10 @@ define void @test5([65536 x i32]** %s, i32 %n) { ; CHECK-LABEL: test5: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: mov w10, #14464 +; CHECK-NEXT: movk w10, #1, lsl #16 ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: add x9, x9, #19, lsl #12 // =77824 -; CHECK-NEXT: add x9, x9, #2176 +; CHECK-NEXT: add x9, x9, x10 ; CHECK-NEXT: cmp w8, w1 ; CHECK-NEXT: b.ge .LBB4_2 ; CHECK-NEXT: .LBB4_1: // %while_body -- 2.7.4