From 75358f060c099c9a290cea43eb15f872ab7f2343 Mon Sep 17 00:00:00 2001
From: zhongyunde
Date: Fri, 7 Oct 2022 19:31:48 +0800
Subject: [PATCH] [AArch64] Lower multiplication by a constant int to madd

Lower a = b * C - 1 into madd:
  a) instcombine rewrites b * C - 1 --> b * C + (-1)
  b) the machine combiner rewrites b * C + (-1) --> madd

The assembler will transform the negative immediate of a sub into an
add, see https://gcc.godbolt.org/z/cTcxePPf4.

Fixes the AArch64 part of https://github.com/llvm/llvm-project/issues/57255.

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D134336
---
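Note for reviewers (illustrative, not part of the diff): the combine now
accepts any add/sub immediate that AArch64_IMM::expandMOVImm can build with
a single MOVZ, MOVN, or ORR (the "Insn.size() != 1" bail-out below), rather
than only the ORR-encodable logical immediates that processLogicalImmediate
accepted. Below is a minimal standalone C++ model of the MOVZ/MOVN half of
that test; fitsOneMovzOrMovn is a made-up name for illustration, and the
ORR (bitmask immediate) case, e.g. #16773120 in mull6_sub_orr, is
deliberately omitted.

  #include <cstdint>
  #include <cstdio>

  // One MOVZ suffices when at most one 16-bit chunk is non-zero; one MOVN
  // suffices when at most one 16-bit chunk differs from 0xffff.
  static bool fitsOneMovzOrMovn(uint64_t Imm, unsigned BitSize) {
    if (BitSize == 32)
      Imm &= 0xffffffffULL;
    unsigned NonZero = 0, NonOnes = 0;
    for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
      uint64_t Chunk = (Imm >> Shift) & 0xffff;
      NonZero += (Chunk != 0);
      NonOnes += (Chunk != 0xffff);
    }
    return NonZero <= 1 || NonOnes <= 1;
  }

  int main() {
    // -1 (mull6_sub) and -1147 (addimm_mulimm_accept_01) are single MOVNs,
    // so the combine now fires where the old ORR-only check gave up.
    printf("%d\n", fitsOneMovzOrMovn(UINT64_C(0xffffffffffffffff), 64)); // 1
    printf("%d\n", fitsOneMovzOrMovn(UINT64_C(0xfffffffffffffb85), 64)); // 1
    printf("%d\n", fitsOneMovzOrMovn(UINT64_C(0x0001000100010001), 64)); // 0
  }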
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp       | 64 +++++++++++++++++-----
 llvm/test/CodeGen/AArch64/addimm-mulimm.ll         | 16 +++---
 .../test/CodeGen/AArch64/machine-outliner-throw.ll |  4 +-
 llvm/test/CodeGen/AArch64/madd-combiner.ll         |  4 +-
 llvm/test/CodeGen/AArch64/mul_pow2.ll              | 47 ++++++++++++++--
 llvm/test/CodeGen/AArch64/srem-seteq.ll            |  8 +--
 llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll    | 10 ++--
 7 files changed, 114 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 4167875..3994c82 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5796,7 +5796,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
   case MachineCombinerPattern::MULADDXI_OP1: {
     // MUL I=A,B,0
     // ADD R,I,Imm
-    // ==> ORR V, ZR, Imm
+    // ==> MOV V, Imm
     // ==> MADD R,A,B,V
     // --- Create(MADD);
     const TargetRegisterClass *OrrRC;
@@ -5824,13 +5824,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
       Imm = Imm << Val;
     }
     uint64_t UImm = SignExtend64(Imm, BitSize);
-    uint64_t Encoding;
-    if (!AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding))
+    // The immediate can be composed via a single instruction.
+    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+    AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
+    if (Insn.size() != 1)
       return;
-    MachineInstrBuilder MIB1 =
-        BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
-            .addReg(ZeroReg)
-            .addImm(Encoding);
+    auto MovI = Insn.begin();
+    MachineInstrBuilder MIB1;
+    // MOV is an alias for one of three instructions: movz, movn, and orr.
+    if (MovI->Opcode == OrrOpc)
+      MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+                 .addReg(ZeroReg)
+                 .addImm(MovI->Op2);
+    else {
+      if (BitSize == 32)
+        assert((MovI->Opcode == AArch64::MOVNWi ||
+                MovI->Opcode == AArch64::MOVZWi) &&
+               "Expected opcode");
+      else
+        assert((MovI->Opcode == AArch64::MOVNXi ||
+                MovI->Opcode == AArch64::MOVZXi) &&
+               "Expected opcode");
+      MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(MovI->Opcode), NewVR)
+                 .addImm(MovI->Op1)
+                 .addImm(MovI->Op2);
+    }
     InsInstrs.push_back(MIB1);
     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
@@ -5888,7 +5906,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
   case MachineCombinerPattern::MULSUBXI_OP1: {
     // MUL I=A,B,0
     // SUB R,I, Imm
-    // ==> ORR V, ZR, -Imm
+    // ==> MOV V, -Imm
     // ==> MADD R,A,B,V // = -Imm + A*B
     // --- Create(MADD);
     const TargetRegisterClass *OrrRC;
@@ -5915,13 +5933,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
       Imm = Imm << Val;
     }
     uint64_t UImm = SignExtend64(-Imm, BitSize);
-    uint64_t Encoding;
-    if (!AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding))
+    // The immediate can be composed via a single instruction.
+    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+    AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
+    if (Insn.size() != 1)
       return;
-    MachineInstrBuilder MIB1 =
-        BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
-            .addReg(ZeroReg)
-            .addImm(Encoding);
+    auto MovI = Insn.begin();
+    MachineInstrBuilder MIB1;
+    // MOV is an alias for one of three instructions: movz, movn, and orr.
+    if (MovI->Opcode == OrrOpc)
+      MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+                 .addReg(ZeroReg)
+                 .addImm(MovI->Op2);
+    else {
+      if (BitSize == 32)
+        assert((MovI->Opcode == AArch64::MOVNWi ||
+                MovI->Opcode == AArch64::MOVZWi) &&
+               "Expected opcode");
+      else
+        assert((MovI->Opcode == AArch64::MOVNXi ||
+                MovI->Opcode == AArch64::MOVZXi) &&
+               "Expected opcode");
+      MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(MovI->Opcode), NewVR)
+                 .addImm(MovI->Op1)
+                 .addImm(MovI->Op2);
+    }
     InsInstrs.push_back(MIB1);
     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
diff --git a/llvm/test/CodeGen/AArch64/addimm-mulimm.ll b/llvm/test/CodeGen/AArch64/addimm-mulimm.ll
index ef17ff1..cc6523d 100644
--- a/llvm/test/CodeGen/AArch64/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/AArch64/addimm-mulimm.ll
@@ -5,8 +5,8 @@ define i64 @addimm_mulimm_accept_00(i64 %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_00:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mul x8, x0, x8
-; CHECK-NEXT:    add x0, x8, #1147
+; CHECK-NEXT:    mov x9, #1147
+; CHECK-NEXT:    madd x0, x0, x8, x9
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %a, 31
   %tmp1 = mul i64 %tmp0, 37
@@ -17,8 +17,8 @@ define i64 @addimm_mulimm_accept_01(i64 %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_01:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mul x8, x0, x8
-; CHECK-NEXT:    sub x0, x8, #1147
+; CHECK-NEXT:    mov x9, #-1147
+; CHECK-NEXT:    madd x0, x0, x8, x9
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %a, -31
   %tmp1 = mul i64 %tmp0, 37
@@ -29,8 +29,8 @@ define signext i32 @addimm_mulimm_accept_02(i32 signext %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_02:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mul w8, w0, w8
-; CHECK-NEXT:    add w0, w8, #1147
+; CHECK-NEXT:    mov w9, #1147
+; CHECK-NEXT:    madd w0, w0, w8, w9
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %a, 31
   %tmp1 = mul i32 %tmp0, 37
@@ -41,8 +41,8 @@ define signext i32 @addimm_mulimm_accept_03(i32 signext %a) {
 ; CHECK-LABEL: addimm_mulimm_accept_03:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #37
-; CHECK-NEXT:    mul w8, w0, w8
-; CHECK-NEXT:    sub w0, w8, #1147
+; CHECK-NEXT:    mov w9, #-1147
+; CHECK-NEXT:    madd w0, w0, w8, w9
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %a, -31
   %tmp1 = mul i32 %tmp0, 37
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-throw.ll b/llvm/test/CodeGen/AArch64/machine-outliner-throw.ll
index 2b03fa3..b852089 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-throw.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-throw.ll
@@ -13,7 +13,7 @@ define dso_local i32 @_Z5func1i(i32 %x) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    orr w8, wzr, #0x1
+; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    madd w19, w0, w0, w8
 ; CHECK-NEXT:    mov w0, #4
 ; CHECK-NEXT:    bl __cxa_allocate_exception
@@ -37,7 +37,7 @@ define dso_local i32 @_Z5func2c(i8 %x) #0 {
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    mov w0, #4
-; CHECK-NEXT:    orr w9, wzr, #0x1
+; CHECK-NEXT:    mov w9, #1
 ; CHECK-NEXT:    madd w19, w8, w8, w9
 ; CHECK-NEXT:    bl __cxa_allocate_exception
 ; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
diff --git a/llvm/test/CodeGen/AArch64/madd-combiner.ll b/llvm/test/CodeGen/AArch64/madd-combiner.ll
index 07fbcdd..28e80b1 100644
--- a/llvm/test/CodeGen/AArch64/madd-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/madd-combiner.ll
@@ -6,7 +6,7 @@ define i32 @mul_add_imm(i32 %a, i32 %b) {
 ; CHECK-LABEL: mul_add_imm:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    orr w8, wzr, #0x4
+; CHECK-NEXT:    mov w8, #4
 ; CHECK-NEXT:    madd w0, w0, w1, w8
 ; CHECK-NEXT:    ret
   %1 = mul i32 %a, %b
   %2 = add i32 %1, 4
@@ -39,7 +39,7 @@ define void @mul_add_imm2() {
 ; CHECK-FAST-LABEL: mul_add_imm2:
 ; CHECK-FAST:       ; %bb.0: ; %entry
 ; CHECK-FAST-NEXT:    mov x8, #-3
-; CHECK-FAST-NEXT:    orr x9, xzr, #0xfffffffffffffffd
+; CHECK-FAST-NEXT:    mov x9, #-3
 ; CHECK-FAST-NEXT:    madd x8, x8, x8, x9
 ; CHECK-FAST-NEXT:    mov x9, #45968
 ; CHECK-FAST-NEXT:    movk x9, #48484, lsl #16
diff --git a/llvm/test/CodeGen/AArch64/mul_pow2.ll b/llvm/test/CodeGen/AArch64/mul_pow2.ll
index 30c639a..6ec0b62 100644
--- a/llvm/test/CodeGen/AArch64/mul_pow2.ll
+++ b/llvm/test/CodeGen/AArch64/mul_pow2.ll
@@ -290,6 +290,45 @@ define i64 @test6_smnegl(i32 %x) {
   ret i64 %sub
 }
 
+; We may hoist the "mov" instructions out of a loop
+define i32 @mull6_sub(i32 %x) {
+; CHECK-LABEL: mull6_sub:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    madd w0, w0, w8, w9
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: mull6_sub:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w9, #-1
+; GISEL-NEXT:    madd w0, w0, w8, w9
+; GISEL-NEXT:    ret
+  %mul = mul nsw i32 %x, 6
+  %sub = add nsw i32 %mul, -1
+  ret i32 %sub
+}
+
+define i64 @mull6_sub_orr(i64 %x) {
+; CHECK-LABEL: mull6_sub_orr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov x9, #16773120
+; CHECK-NEXT:    madd x0, x0, x8, x9
+; CHECK-NEXT:    ret
+;
+; GISEL-LABEL: mull6_sub_orr:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov x9, #16773120
+; GISEL-NEXT:    madd x0, x0, x8, x9
+; GISEL-NEXT:    ret
+  %mul = mul nsw i64 %x, 6
+  %sub = add nsw i64 %mul, 16773120
+  ret i64 %sub
+}
+
 define i32 @test7(i32 %x) {
 ; CHECK-LABEL: test7:
 ; CHECK:       // %bb.0:
@@ -731,11 +770,11 @@ define <4 x i32> @muladd_demand_commute(<4 x i32> %x, <4 x i32> %y) {
 ;
 ; GISEL-LABEL: muladd_demand_commute:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    adrp x8, .LCPI42_1
-; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI42_1]
-; GISEL-NEXT:    adrp x8, .LCPI42_0
+; GISEL-NEXT:    adrp x8, .LCPI44_1
+; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI44_1]
+; GISEL-NEXT:    adrp x8, .LCPI44_0
 ; GISEL-NEXT:    mla v1.4s, v0.4s, v2.4s
-; GISEL-NEXT:    ldr q0, [x8, :lo12:.LCPI42_0]
+; GISEL-NEXT:    ldr q0, [x8, :lo12:.LCPI44_0]
 ; GISEL-NEXT:    and v0.16b, v1.16b, v0.16b
 ; GISEL-NEXT:    ret
   %m = mul <4 x i32> %x,
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll
index 5192de1..4bb29d3 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll
@@ -47,7 +47,7 @@ define i32 @test_srem_odd_bit30(i32 %X) nounwind {
 ; CHECK-LABEL: test_srem_odd_bit30:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #43691
-; CHECK-NEXT:    orr w9, wzr, #0x1
+; CHECK-NEXT:    mov w9, #1
 ; CHECK-NEXT:    movk w8, #27306, lsl #16
 ; CHECK-NEXT:    madd w8, w0, w8, w9
 ; CHECK-NEXT:    cmp w8, #3
@@ -64,7 +64,7 @@ define i32 @test_srem_odd_bit31(i32 %X) nounwind {
 ; CHECK-LABEL: test_srem_odd_bit31:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #21845
-; CHECK-NEXT:    orr w9, wzr, #0x1
+; CHECK-NEXT:    mov w9, #1
 ; CHECK-NEXT:    movk w8, #54613, lsl #16
 ; CHECK-NEXT:    madd w8, w0, w8, w9
 ; CHECK-NEXT:    cmp w8, #3
@@ -122,7 +122,7 @@ define i32 @test_srem_even_bit30(i32 %X) nounwind {
 ; CHECK-LABEL: test_srem_even_bit30:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #20165
-; CHECK-NEXT:    orr w9, wzr, #0x8
+; CHECK-NEXT:    mov w9, #8
 ; CHECK-NEXT:    movk w8, #64748, lsl #16
 ; CHECK-NEXT:    madd w8, w0, w8, w9
 ; CHECK-NEXT:    ror w8, w8, #3
@@ -140,7 +140,7 @@ define i32 @test_srem_even_bit31(i32 %X) nounwind {
 ; CHECK-LABEL: test_srem_even_bit31:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #1285
-; CHECK-NEXT:    orr w9, wzr, #0x2
+; CHECK-NEXT:    mov w9, #2
 ; CHECK-NEXT:    movk w8, #50437, lsl #16
 ; CHECK-NEXT:    madd w8, w0, w8, w9
 ; CHECK-NEXT:    ror w8, w8, #1
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
index c8627b8..b3be59d 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll
@@ -137,11 +137,11 @@ define i1 @t32_6_3(i32 %X) nounwind {
 ; CHECK-LABEL: t32_6_3:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #43691
-; CHECK-NEXT:    mov w9, #43691
+; CHECK-NEXT:    mov w9, #-1
 ; CHECK-NEXT:    movk w8, #43690, lsl #16
+; CHECK-NEXT:    madd w8, w0, w8, w9
+; CHECK-NEXT:    mov w9, #43691
 ; CHECK-NEXT:    movk w9, #10922, lsl #16
-; CHECK-NEXT:    mul w8, w0, w8
-; CHECK-NEXT:    sub w8, w8, #1
 ; CHECK-NEXT:    ror w8, w8, #1
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    cset w0, lo
@@ -209,8 +209,8 @@ define i1 @t8_3_2(i8 %X) nounwind {
 ; CHECK-LABEL: t8_3_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #-85
-; CHECK-NEXT:    mul w8, w0, w8
-; CHECK-NEXT:    sub w8, w8, #86
+; CHECK-NEXT:    mov w9, #-86
+; CHECK-NEXT:    madd w8, w0, w8, w9
 ; CHECK-NEXT:    and w8, w8, #0xff
 ; CHECK-NEXT:    cmp w8, #85
 ; CHECK-NEXT:    cset w0, lo
-- 
2.7.4
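P.S. (illustrative, not part of the patch): the commit message's point that
the assembler turns a sub with a negative immediate into an add follows from
the ADD/SUB (immediate) encoding, which holds a 12-bit unsigned value,
optionally shifted left by 12 bits; "sub x0, x8, #-1147" is therefore only
expressible as "add x0, x8, #1147" (see the godbolt link above). A small
standalone sketch; isAddSubImm and the driver below are made-up names, not
LLVM APIs.

  #include <cstdint>
  #include <cstdio>

  // Encodable as a plain imm12, or as an imm12 shifted left by 12 bits.
  static bool isAddSubImm(uint64_t Imm) {
    return (Imm >> 12) == 0 || ((Imm & 0xfff) == 0 && (Imm >> 24) == 0);
  }

  int main() {
    int64_t Imm = -1147;   // the immediate from addimm_mulimm_accept_01
    const char *Mnemonic = "sub";
    if (Imm < 0) {         // sub #-imm is printed and encoded as add #imm
      Mnemonic = "add";
      Imm = -Imm;
    }
    printf("%s x0, x8, #%lld (encodable: %d)\n", Mnemonic, (long long)Imm,
           isAddSubImm((uint64_t)Imm));
  }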