From ae51a828685d088aca5b5be9dbb3d0695312f2e1 Mon Sep 17 00:00:00 2001
From: Owen Anderson
Date: Tue, 3 Jan 2023 23:23:31 -0700
Subject: [PATCH] Teach the AArch64 backend to materialize immediates using a
 pair of ORR-immediate instructions.

Credit to czwarich for figuring out the algorithm to test for this.

Re-applied with fix for ubsan error on out-of-range shift.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D140952
---
 llvm/lib/Target/AArch64/AArch64ExpandImm.cpp     | 145 +++++++++++++++++++++
 .../Target/AArch64/AArch64ExpandPseudoInsts.cpp  |  38 +++++-
 llvm/test/CodeGen/AArch64/arm64-movi.ll          |  49 +++----
 3 files changed, 194 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
index 6b6fd65..ff85e21 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
@@ -239,6 +239,143 @@ static bool trySequenceOfOnes(uint64_t UImm,
                              SmallVectorImpl<ImmInsnModel> &Insn) {
   return true;
 }
 
+static uint64_t GetRunOfOnesStartingAt(uint64_t V, uint64_t StartPosition) {
+  uint64_t NumOnes = llvm::countTrailingOnes(V >> StartPosition);
+
+  uint64_t UnshiftedOnes;
+  if (NumOnes == 64) {
+    UnshiftedOnes = ~0ULL;
+  } else {
+    UnshiftedOnes = (1ULL << NumOnes) - 1;
+  }
+  return UnshiftedOnes << StartPosition;
+}
+
+static uint64_t rotl(uint64_t n, uint64_t d) {
+  d %= 64;
+  if (d == 0)
+    return n;
+  return (n << d) | (n >> (64 - d));
+}
+
+static uint64_t rotr(uint64_t n, uint64_t d) {
+  d %= 64;
+  if (d == 0)
+    return n;
+  return (n >> d) | (n << (64 - d));
+}
+
+static uint64_t MaximallyReplicateSubImmediate(uint64_t V, uint64_t Subset) {
+  uint64_t Result = Subset;
+
+  // 64, 32, 16, 8, 4, 2
+  for (uint64_t i = 0; i < 6; ++i) {
+    uint64_t Rotation = 1 << (6 - i);
+    uint64_t Closure = Result | rotl(Result, Rotation);
+    if (Closure != (Closure & V)) {
+      break;
+    }
+    Result = Closure;
+  }
+
+  return Result;
+}
+
+// Find the logical immediate that covers the most bits in RemainingBits,
+// allowing for additional bits to be set that were set in OriginalBits.
+static uint64_t maximalLogicalImmWithin(uint64_t RemainingBits,
+                                        uint64_t OriginalBits) {
+  // Find the first set bit.
+  uint32_t Position = llvm::countTrailingZeros(RemainingBits);
+
+  // Get the first run of set bits.
+  uint64_t FirstRun = GetRunOfOnesStartingAt(OriginalBits, Position);
+
+  // Replicate the run as many times as possible, as long as the bits are set
+  // in OriginalBits.
+  uint64_t MaximalImm = MaximallyReplicateSubImmediate(OriginalBits, FirstRun);
+
+  return MaximalImm;
+}
+
+static std::optional<std::pair<uint64_t, uint64_t>>
+decomposeIntoOrrOfLogicalImmediates(uint64_t UImm) {
+  if (UImm == 0 || ~UImm == 0)
+    return std::nullopt;
+
+  // Make sure we don't have a run of ones split around the rotation boundary.
+  uint32_t InitialTrailingOnes = llvm::countTrailingOnes(UImm);
+  uint64_t RotatedBits = rotr(UImm, InitialTrailingOnes);
+
+  // Find the largest logical immediate that fits within the full immediate.
+  uint64_t MaximalImm1 = maximalLogicalImmWithin(RotatedBits, RotatedBits);
+
+  // Remove all bits that are set by this mask.
+  uint64_t RemainingBits = RotatedBits & ~MaximalImm1;
+
+  // Find the largest logical immediate covering the remaining bits, allowing
+  // for additional bits to be set that were also set in the original immediate.
+  uint64_t MaximalImm2 = maximalLogicalImmWithin(RemainingBits, RotatedBits);
+
+  // If any bits still haven't been covered, then give up.
+  if (RemainingBits & ~MaximalImm2)
+    return std::nullopt;
+
+  // Make sure to un-rotate the immediates.
+  return std::make_pair(rotl(MaximalImm1, InitialTrailingOnes),
+                        rotl(MaximalImm2, InitialTrailingOnes));
+}
+
+// Attempt to expand an immediate as the ORR of a pair of logical immediates.
+static bool tryOrrOfLogicalImmediates(uint64_t UImm,
+                                      SmallVectorImpl<ImmInsnModel> &Insn) {
+  auto MaybeDecomposition = decomposeIntoOrrOfLogicalImmediates(UImm);
+  if (MaybeDecomposition == std::nullopt)
+    return false;
+  uint64_t Imm1 = MaybeDecomposition->first;
+  uint64_t Imm2 = MaybeDecomposition->second;
+
+  uint64_t Encoding1, Encoding2;
+  bool Imm1Success = AArch64_AM::processLogicalImmediate(Imm1, 64, Encoding1);
+  bool Imm2Success = AArch64_AM::processLogicalImmediate(Imm2, 64, Encoding2);
+
+  if (Imm1Success && Imm2Success) {
+    // Create the ORR-immediate instructions.
+    Insn.push_back({AArch64::ORRXri, 0, Encoding1});
+    Insn.push_back({AArch64::ORRXri, 1, Encoding2});
+    return true;
+  }
+
+  return false;
+}
+
+// Attempt to expand an immediate as the AND of a pair of logical immediates.
+// This is done by applying DeMorgan's law, under which logical immediates
+// are closed.
+static bool tryAndOfLogicalImmediates(uint64_t UImm,
+                                      SmallVectorImpl<ImmInsnModel> &Insn) {
+  // Apply DeMorgan's law to turn this into an ORR problem.
+  auto MaybeDecomposition = decomposeIntoOrrOfLogicalImmediates(~UImm);
+  if (MaybeDecomposition == std::nullopt)
+    return false;
+  uint64_t Imm1 = MaybeDecomposition->first;
+  uint64_t Imm2 = MaybeDecomposition->second;
+
+  uint64_t Encoding1, Encoding2;
+  bool Imm1Success = AArch64_AM::processLogicalImmediate(~Imm1, 64, Encoding1);
+  bool Imm2Success = AArch64_AM::processLogicalImmediate(~Imm2, 64, Encoding2);
+
+  if (Imm1Success && Imm2Success) {
+    // Materialize Imm1, the LHS of the AND
+    Insn.push_back({AArch64::ORRXri, 0, Encoding1});
+    // AND Imm1 with Imm2
+    Insn.push_back({AArch64::ANDXri, 1, Encoding2});
+    return true;
+  }
+
+  return false;
+}
+
 /// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a
 /// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
 static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
@@ -372,6 +509,14 @@ void AArch64_IMM::expandMOVImm(uint64_t Imm, unsigned BitSize,
     }
   }
 
+  // Attempt to use a sequence of two ORR-immediate instructions.
+  if (tryOrrOfLogicalImmediates(Imm, Insn))
+    return;
+
+  // Attempt to use a sequence of ORR-immediate followed by AND-immediate.
+  if (tryAndOfLogicalImmediates(Imm, Insn))
+    return;
+
   // FIXME: Add more two-instruction sequences.
 
   // Three instruction sequences.
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 3370da4..0da93d0 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -148,10 +148,40 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
 
   case AArch64::ORRWri:
   case AArch64::ORRXri:
-    MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
-                       .add(MI.getOperand(0))
-                       .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
-                       .addImm(I->Op2));
+    if (I->Op1 == 0) {
+      MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
+                         .add(MI.getOperand(0))
+                         .addReg(BitSize == 32 ?
AArch64::WZR : AArch64::XZR) + .addImm(I->Op2)); + } else { + Register DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + MIBS.push_back( + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) + .addReg(DstReg, RegState::Define | + getDeadRegState(DstIsDead && LastItem) | + RenamableState) + .addReg(DstReg) + .addImm(I->Op2)); + } + break; + case AArch64::ANDXri: + if (I->Op1 == 0) { + MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) + .add(MI.getOperand(0)) + .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR) + .addImm(I->Op2)); + } else { + Register DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + MIBS.push_back( + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) + .addReg(DstReg, RegState::Define | + getDeadRegState(DstIsDead && LastItem) | + RenamableState) + .addReg(DstReg) + .addImm(I->Op2)); + } break; case AArch64::MOVNWi: case AArch64::MOVNXi: diff --git a/llvm/test/CodeGen/AArch64/arm64-movi.ll b/llvm/test/CodeGen/AArch64/arm64-movi.ll index 9a1efba..2ec58db 100644 --- a/llvm/test/CodeGen/AArch64/arm64-movi.ll +++ b/llvm/test/CodeGen/AArch64/arm64-movi.ll @@ -109,14 +109,11 @@ define i64 @movz_skip1_movk() nounwind { ret i64 147695335379508 } -; FIXME: Prefer "mov w0, #2863311530; lsl x0, x0, #4" -; or "mov x0, #-6148914691236517206; and x0, x0, #45812984480" define i64 @orr_lsl_pattern() nounwind { ; CHECK-LABEL: orr_lsl_pattern: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x0, #43680 -; CHECK-NEXT: movk x0, #43690, lsl #16 -; CHECK-NEXT: movk x0, #10, lsl #32 +; CHECK-NEXT: mov x0, #-6148914691236517206 +; CHECK-NEXT: and x0, x0, #0x1fffffffe0 ; CHECK-NEXT: ret ret i64 45812984480 } @@ -322,13 +319,11 @@ define i64 @orr_movk15() nounwind { ret i64 549621596159 } -; FIXME: prefer "mov x0, #2147483646; orr x0, x0, #36028659580010496" define i64 @orr_movk16() nounwind { ; CHECK-LABEL: orr_movk16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x0, #36028659580010496 -; CHECK-NEXT: movk x0, #65534 -; CHECK-NEXT: movk x0, #32767, lsl #16 +; CHECK-NEXT: mov x0, #2147483646 +; CHECK-NEXT: orr x0, x0, #0x7fffe0007fffe0 ; CHECK-NEXT: ret ret i64 36028661727494142 } @@ -351,13 +346,11 @@ define i64 @orr_movk18() nounwind { ret i64 137438953409 } -; FIXME: prefer "mov x0, #72340172838076673; and x0, x0, #2199023255296" define i64 @orr_and() nounwind { ; CHECK-LABEL: orr_and: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x0, #256 -; CHECK-NEXT: movk x0, #257, lsl #16 -; CHECK-NEXT: movk x0, #257, lsl #32 +; CHECK-NEXT: mov x0, #72340172838076673 +; CHECK-NEXT: and x0, x0, #0xffffffffff00 ; CHECK-NEXT: ret ret i64 1103823438080 } @@ -395,59 +388,47 @@ define i64 @movn_eor() nounwind { ret i64 3689348814437076172 } -; FIXME: prefer "mov x0, #536866816; orr x0, x0, #0x3fff800000000000" define i64 @orr_orr_64() nounwind { ; CHECK-LABEL: orr_orr_64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x0, #4611545280939032576 -; CHECK-NEXT: movk x0, #61440 -; CHECK-NEXT: movk x0, #8191, lsl #16 +; CHECK-NEXT: mov x0, #536866816 +; CHECK-NEXT: orr x0, x0, #0x3fff800000000000 ; CHECK-NEXT: ret ret i64 4611545281475899392 } -; FIXME: prefer "mov x0, #558551907040256; orr x0, x0, #0x1000100010001000" define i64 @orr_orr_32() nounwind { ; CHECK-LABEL: orr_orr_32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x0, #-287953294993589248 -; CHECK-NEXT: movk x0, #7169, lsl #16 -; CHECK-NEXT: movk x0, #7169, lsl #48 +; CHECK-NEXT: mov x0, #558551907040256 +; CHECK-NEXT: orr x0, x0, #0x1c001c001c001c00 ; CHECK-NEXT: ret ret i64 2018171185438784512 
} -; FIXME: prefer "mov x0, #281479271743489; orr x0, x0, #0x1000100010001000" define i64 @orr_orr_16() nounwind { ; CHECK-LABEL: orr_orr_16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x0, #4097 -; CHECK-NEXT: movk x0, #4097, lsl #16 -; CHECK-NEXT: movk x0, #4097, lsl #32 -; CHECK-NEXT: movk x0, #4097, lsl #48 +; CHECK-NEXT: mov x0, #1152939097061330944 +; CHECK-NEXT: orr x0, x0, #0x1000100010001 ; CHECK-NEXT: ret ret i64 1153220576333074433 } -; FIXME: prefer "mov x0, #144680345676153346; orr x0, x0, #0x1818181818181818" define i64 @orr_orr_8() nounwind { ; CHECK-LABEL: orr_orr_8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x0, #6682 -; CHECK-NEXT: movk x0, #6682, lsl #16 -; CHECK-NEXT: movk x0, #6682, lsl #32 -; CHECK-NEXT: movk x0, #6682, lsl #48 +; CHECK-NEXT: mov x0, #144680345676153346 +; CHECK-NEXT: orr x0, x0, #0x1818181818181818 ; CHECK-NEXT: ret ret i64 1880844493789993498 } -; FIXME: prefer "mov x0, #-6148914691236517206; orr x0, x0, #0x0FFFFF0000000000" define i64 @orr_64_orr_8() nounwind { ; CHECK-LABEL: orr_64_orr_8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x0, #-6148914691236517206 -; CHECK-NEXT: movk x0, #65450, lsl #32 -; CHECK-NEXT: movk x0, #45055, lsl #48 +; CHECK-NEXT: orr x0, x0, #0xfffff0000000000 ; CHECK-NEXT: ret ret i64 -5764607889538110806 } -- 2.7.4
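
For anyone who wants to experiment with the decomposition outside of the
backend, below is a minimal standalone sketch of the same idea. It is not the
LLVM source above: decompose() mirrors decomposeIntoOrrOfLogicalImmediates
using C++20 <bit> helpers, maximalCover() loosely stands in for
maximalLogicalImmWithin/MaximallyReplicateSubImmediate, and isLogicalImm64()
is a simplified yes/no substitute for AArch64_AM::processLogicalImmediate
that does not compute the actual N:immr:imms encoding. Fed constants from the
updated arm64-movi.ll checks, it reproduces the pairs the CHECK lines expect,
e.g. 36028661727494142 = 0x7ffffffe | 0x7fffe0007fffe0; the AND form handled
by tryAndOfLogicalImmediates is the same decomposition applied to ~Imm.
Build with -std=c++20.

#include <bit>
#include <cstdint>
#include <cstdio>
#include <optional>
#include <utility>

// True if X is a single contiguous (non-wrapping) run of ones.
static bool isShiftedMask(uint64_t X) {
  if (X == 0)
    return false;
  uint64_t M = X >> std::countr_zero(X);
  return (M & (M + 1)) == 0;
}

// Yes/no check: is V encodable as an AArch64 logical immediate (64-bit regs)?
static bool isLogicalImm64(uint64_t V) {
  if (V == 0 || ~V == 0)
    return false;
  // Find the smallest power-of-two element size at which V repeats.
  unsigned Size = 64;
  do {
    unsigned Half = Size / 2;
    uint64_t HalfMask = (1ULL << Half) - 1; // Half <= 32 here, no overflow.
    if ((V & HalfMask) != ((V >> Half) & HalfMask))
      break;
    Size = Half;
  } while (Size > 2);
  uint64_t EltMask = Size == 64 ? ~0ULL : (1ULL << Size) - 1;
  uint64_t Elt = V & EltMask;
  // The repeating element must be one run of ones, possibly wrapping around.
  return isShiftedMask(Elt) || isShiftedMask(~Elt & EltMask);
}

// Largest logical-immediate-shaped mask that covers the lowest set bit of
// Remaining, allowed to also use any other bits that are set in Original.
static uint64_t maximalCover(uint64_t Remaining, uint64_t Original) {
  int Pos = std::countr_zero(Remaining);
  int Ones = std::countr_one(Original >> Pos);
  uint64_t Result = (Ones == 64 ? ~0ULL : (1ULL << Ones) - 1) << Pos;
  // Replicate the run at strides 64, 32, 16, 8, 4, 2 while staying in Original.
  for (int Rot = 64; Rot >= 2; Rot /= 2) {
    uint64_t Closure = Result | std::rotl(Result, Rot);
    if ((Closure & Original) != Closure)
      break;
    Result = Closure;
  }
  return Result;
}

// Try to write Imm as the OR of two AArch64 logical immediates.
static std::optional<std::pair<uint64_t, uint64_t>> decompose(uint64_t Imm) {
  if (Imm == 0 || ~Imm == 0)
    return std::nullopt;
  // Rotate so that no run of ones straddles the bit 63 -> bit 0 boundary.
  int TrailingOnes = std::countr_one(Imm);
  uint64_t R = std::rotr(Imm, TrailingOnes);
  uint64_t Imm1 = maximalCover(R, R);
  uint64_t Remaining = R & ~Imm1;
  if (Remaining == 0)
    return std::nullopt; // Imm is itself a logical immediate; one ORR suffices.
  uint64_t Imm2 = maximalCover(Remaining, R);
  if (Remaining & ~Imm2)
    return std::nullopt; // Two logical immediates are not enough.
  return std::make_pair(std::rotl(Imm1, TrailingOnes),
                        std::rotl(Imm2, TrailingOnes));
}

int main() {
  // Constants taken from the updated arm64-movi.ll checks above.
  const uint64_t Tests[] = {36028661727494142ULL, 4611545281475899392ULL,
                            1880844493789993498ULL};
  for (uint64_t V : Tests) {
    auto D = decompose(V);
    if (!D) {
      std::printf("%#018llx: no two-ORR decomposition\n",
                  (unsigned long long)V);
      continue;
    }
    bool OK = isLogicalImm64(D->first) && isLogicalImm64(D->second) &&
              (D->first | D->second) == V;
    std::printf("%#018llx = %#018llx | %#018llx (%s)\n", (unsigned long long)V,
                (unsigned long long)D->first, (unsigned long long)D->second,
                OK ? "both logical immediates" : "MISMATCH");
  }
  return 0;
}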