From 840d10a1d2c939016f387a041b0fbb3a9d592b17 Mon Sep 17 00:00:00 2001
From: Ayke van Laethem
Date: Tue, 6 Dec 2022 13:43:23 +0100
Subject: [PATCH] [AVR] Custom lower 32-bit shift instructions

32-bit shift instructions were previously expanded using the default
SelectionDAG expander, which lowered them to 16-bit constant shifts
ORed together. This works, but is far from optimal.

I've optimized 32-bit shifts on AVR using a custom inserter. This is
done using three new pseudo-instructions that take the upper and lower
halves of the value in two separate 16-bit registers and output two
16-bit registers.

This is the first commit in a series. When the series is complete,
constant 32-bit shifts will need around 31% fewer instructions on
average, and the result is in all cases equal to or better than the
old behavior. It also tends to match or outperform avr-gcc: the only
cases where avr-gcc does better are when it uses a loop to shift, or
when the LLVM register allocator inserts some unnecessary movs. It
even outperforms avr-gcc in some cases where avr-gcc does not use a
loop.

As a side effect, non-constant 32-bit shifts also become more
efficient.

For some real-world numbers: the build of compiler-rt that I use in
TinyGo becomes 2.7% smaller, and the build of picolibc that I use
becomes 0.9% smaller. I think picolibc is more representative of
real-world code, but even a ~1% reduction in code size is really
significant.

The current patch just lays the groundwork; on its own it is actually
a small regression in code size. Later patches will use it as a basis
to optimize these shift instructions.
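
For example, a constant shift by two (taken from the shift32.ll test
added in this patch; the i32 argument is passed in r25:r22):

    %res = shl i32 %a, 2

now compiles to a chain of single-bit shifts through carry:

    lsl r22
    rol r23
    rol r24
    rol r25
    lsl r22
    rol r23
    rol r24
    rol r25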
+ report_fatal_error("Expected a constant shift amount!"); + } + SDVTList ResTys = DAG.getVTList(MVT::i16, MVT::i16); + SDValue SrcLo = + DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0), + DAG.getConstant(0, dl, MVT::i16)); + SDValue SrcHi = + DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0), + DAG.getConstant(1, dl, MVT::i16)); + uint64_t ShiftAmount = + cast(N->getOperand(1))->getZExtValue(); + if (ShiftAmount == 16) { + // Special case these two operations because they appear to be used by the + // generic codegen parts to lower 32-bit numbers. + // TODO: perhaps we can lower shift amounts bigger than 16 to a 16-bit + // shift of a part of the 32-bit value? + switch (Op.getOpcode()) { + case ISD::SHL: { + SDValue Zero = DAG.getConstant(0, dl, MVT::i16); + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Zero, SrcLo); + } + case ISD::SRL: { + SDValue Zero = DAG.getConstant(0, dl, MVT::i16); + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, SrcHi, Zero); + } + } + } + SDValue Cnt = DAG.getTargetConstant(ShiftAmount, dl, MVT::i8); + unsigned Opc; + switch (Op.getOpcode()) { + default: + llvm_unreachable("Invalid 32-bit shift opcode!"); + case ISD::SHL: + Opc = AVRISD::LSLW; + break; + case ISD::SRL: + Opc = AVRISD::LSRW; + break; + case ISD::SRA: + Opc = AVRISD::ASRW; + break; + } + SDValue Result = DAG.getNode(Opc, dl, ResTys, SrcLo, SrcHi, Cnt); + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Result.getValue(0), + Result.getValue(1)); + } + // Expand non-constant shifts to loops. if (!isa(N->getOperand(1))) { switch (Op.getOpcode()) { @@ -1789,6 +1846,114 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI, return RemBB; } +// Do a multibyte AVR shift. Insert shift instructions and put the output +// registers in the Regs array. +// Because AVR does not have a normal shift instruction (only a single bit shift +// instruction), we have to emulate this behavior with other instructions. +static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB, + MutableArrayRef> Regs, + ISD::NodeType Opc, int64_t ShiftAmt) { + const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const DebugLoc &dl = MI.getDebugLoc(); + + const bool ShiftLeft = Opc == ISD::SHL; + const bool ArithmeticShift = Opc == ISD::SRA; + + // Shift by one. This is the fallback that always works, and the shift + // operation that is used for 1, 2, and 3 bit shifts. + while (ShiftLeft && ShiftAmt) { + // Shift one to the left. + for (ssize_t I = Regs.size() - 1; I >= 0; I--) { + Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass); + Register In = Regs[I].first; + Register InSubreg = Regs[I].second; + if (I == (ssize_t)Regs.size() - 1) { // first iteration + BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Out) + .addReg(In, 0, InSubreg) + .addReg(In, 0, InSubreg); + } else { + BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), Out) + .addReg(In, 0, InSubreg) + .addReg(In, 0, InSubreg); + } + Regs[I] = std::pair(Out, 0); + } + ShiftAmt--; + } + while (!ShiftLeft && ShiftAmt) { + // Shift one to the right. + for (size_t I = 0; I < Regs.size(); I++) { + Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass); + Register In = Regs[I].first; + Register InSubreg = Regs[I].second; + if (I == 0) { + unsigned Opc = ArithmeticShift ? 
+  // Shift by one. This is the fallback that always works, and the shift
+  // operation that is used for 1, 2, and 3 bit shifts.
+  while (ShiftLeft && ShiftAmt) {
+    // Shift one to the left.
+    for (ssize_t I = Regs.size() - 1; I >= 0; I--) {
+      Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      Register In = Regs[I].first;
+      Register InSubreg = Regs[I].second;
+      if (I == (ssize_t)Regs.size() - 1) { // first iteration
+        BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Out)
+            .addReg(In, 0, InSubreg)
+            .addReg(In, 0, InSubreg);
+      } else {
+        BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), Out)
+            .addReg(In, 0, InSubreg)
+            .addReg(In, 0, InSubreg);
+      }
+      Regs[I] = std::pair(Out, 0);
+    }
+    ShiftAmt--;
+  }
+  while (!ShiftLeft && ShiftAmt) {
+    // Shift one to the right.
+    for (size_t I = 0; I < Regs.size(); I++) {
+      Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      Register In = Regs[I].first;
+      Register InSubreg = Regs[I].second;
+      if (I == 0) {
+        unsigned Opc = ArithmeticShift ? AVR::ASRRd : AVR::LSRRd;
+        BuildMI(*BB, MI, dl, TII.get(Opc), Out).addReg(In, 0, InSubreg);
+      } else {
+        BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), Out).addReg(In, 0, InSubreg);
+      }
+      Regs[I] = std::pair(Out, 0);
+    }
+    ShiftAmt--;
+  }
+
+  if (ShiftAmt != 0) {
+    llvm_unreachable("don't know how to shift!"); // sanity check
+  }
+}
+
+// Do a wide (32-bit) shift.
+MachineBasicBlock *
+AVRTargetLowering::insertWideShift(MachineInstr &MI,
+                                   MachineBasicBlock *BB) const {
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  const DebugLoc &dl = MI.getDebugLoc();
+
+  // How much to shift to the right (meaning: a negative number indicates a
+  // left shift).
+  int64_t ShiftAmt = MI.getOperand(4).getImm();
+  ISD::NodeType Opc;
+  switch (MI.getOpcode()) {
+  case AVR::Lsl32:
+    Opc = ISD::SHL;
+    break;
+  case AVR::Lsr32:
+    Opc = ISD::SRL;
+    break;
+  case AVR::Asr32:
+    Opc = ISD::SRA;
+    break;
+  }
+
+  // Read the input registers, with the most significant register at index 0.
+  std::array<std::pair<Register, int>, 4> Registers = {
+      std::pair(MI.getOperand(3).getReg(), AVR::sub_hi),
+      std::pair(MI.getOperand(3).getReg(), AVR::sub_lo),
+      std::pair(MI.getOperand(2).getReg(), AVR::sub_hi),
+      std::pair(MI.getOperand(2).getReg(), AVR::sub_lo),
+  };
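+  // For example, for an i32 value that ends up in r25:r22 (operand 2 being
+  // the pair r23:r22 and operand 3 the pair r25:r24), Registers[0] refers to
+  // r25 and Registers[3] to r22.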
+
+  // Do the shift. The registers are modified in-place.
+  insertMultibyteShift(MI, BB, Registers, Opc, ShiftAmt);
+
+  // Combine the 8-bit registers into 16-bit register pairs.
+  BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
+      .addReg(Registers[0].first, 0, Registers[0].second)
+      .addImm(AVR::sub_hi)
+      .addReg(Registers[1].first, 0, Registers[1].second)
+      .addImm(AVR::sub_lo);
+  BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
+      .addReg(Registers[2].first, 0, Registers[2].second)
+      .addImm(AVR::sub_hi)
+      .addReg(Registers[3].first, 0, Registers[3].second)
+      .addImm(AVR::sub_lo);
+
+  // Remove the pseudo instruction.
+  MI.eraseFromParent();
+  return BB;
+}
+
 static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
   if (I->getOpcode() == AVR::COPY) {
     Register SrcReg = I->getOperand(1).getReg();
@@ -1901,6 +2066,10 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case AVR::Asr8:
   case AVR::Asr16:
     return insertShift(MI, MBB);
+  case AVR::Lsl32:
+  case AVR::Lsr32:
+  case AVR::Asr32:
+    return insertWideShift(MI, MBB);
   case AVR::MULRdRr:
   case AVR::MULSRdRr:
     return insertMul(MI, MBB);
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h
index 8d91c1b..74092ca 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -39,14 +39,17 @@ enum NodeType {
   LSLBN,   ///< Byte logical shift left N bits.
   LSLWN,   ///< Word logical shift left N bits.
   LSLHI,   ///< Higher 8-bit of word logical shift left.
+  LSLW,    ///< Wide logical shift left.
   LSR,     ///< Logical shift right.
   LSRBN,   ///< Byte logical shift right N bits.
   LSRWN,   ///< Word logical shift right N bits.
   LSRLO,   ///< Lower 8-bit of word logical shift right.
+  LSRW,    ///< Wide logical shift right.
   ASR,     ///< Arithmetic shift right.
   ASRBN,   ///< Byte arithmetic shift right N bits.
   ASRWN,   ///< Word arithmetic shift right N bits.
   ASRLO,   ///< Lower 8-bit of word arithmetic shift right.
+  ASRW,    ///< Wide arithmetic shift right.
   ROR,     ///< Bit rotate right.
   ROL,     ///< Bit rotate left.
   LSLLOOP, ///< A loop of single logical shift left instructions.
@@ -186,6 +189,8 @@ protected:
 
 private:
   MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
+  MachineBasicBlock *insertWideShift(MachineInstr &MI,
+                                     MachineBasicBlock *BB) const;
   MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
   MachineBasicBlock *insertCopyZero(MachineInstr &MI,
                                     MachineBasicBlock *BB) const;
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index 2a0f320..05ee94b 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -69,6 +69,9 @@ def AVRasrbn : SDNode<"AVRISD::ASRBN", SDTIntBinOp>;
 def AVRlslwn : SDNode<"AVRISD::LSLWN", SDTIntBinOp>;
 def AVRlsrwn : SDNode<"AVRISD::LSRWN", SDTIntBinOp>;
 def AVRasrwn : SDNode<"AVRISD::ASRWN", SDTIntBinOp>;
+def AVRlslw : SDNode<"AVRISD::LSLW", SDTIntShiftDOp>;
+def AVRlsrw : SDNode<"AVRISD::LSRW", SDTIntShiftDOp>;
+def AVRasrw : SDNode<"AVRISD::ASRW", SDTIntShiftDOp>;
 
 // Pseudo shift nodes for non-constant shift amounts.
 def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
@@ -2337,6 +2340,11 @@ def Lsl16 : ShiftPseudo<(outs DREGS
                                    : $src, i8
                                    : $cnt))]>;
 
+def Lsl32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
+                        (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
+                        "# Lsl32 PSEUDO",
+                        [(set i16:$dstlo, i16:$dsthi, (AVRlslw i16:$srclo, i16:$srchi, i8:$cnt))]>;
+
 def Lsr8 : ShiftPseudo<(outs GPR8
                         : $dst),
                        (ins GPR8
@@ -2357,6 +2365,11 @@ def Lsr16 : ShiftPseudo<(outs DREGS
                                    : $src, i8
                                    : $cnt))]>;
 
+def Lsr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
+                        (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
+                        "# Lsr32 PSEUDO",
+                        [(set i16:$dstlo, i16:$dsthi, (AVRlsrw i16:$srclo, i16:$srchi, i8:$cnt))]>;
+
 def Rol8 : ShiftPseudo<(outs GPR8
                         : $dst),
                        (ins GPR8
@@ -2417,6 +2430,11 @@ def Asr16 : ShiftPseudo<(outs DREGS
                                    : $src, i8
                                    : $cnt))]>;
 
+def Asr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
+                        (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
+                        "# Asr32 PSEUDO",
+                        [(set i16:$dstlo, i16:$dsthi, (AVRasrw i16:$srclo, i16:$srchi, i8:$cnt))]>;
+
 // lowered to a copy from the zero register.
 let usesCustomInserter=1 in
 def CopyZero : Pseudo<(outs GPR8:$rd), (ins), "clrz\t$rd", [(set i8:$rd, 0)]>;
diff --git a/llvm/test/CodeGen/AVR/shift32.ll b/llvm/test/CodeGen/AVR/shift32.ll
new file mode 100644
index 0000000..bc25e40
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/shift32.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=avr -mattr=movw -verify-machineinstrs | FileCheck %s
+
+define i32 @shl_i32_1(i32 %a) {
+; CHECK-LABEL: shl_i32_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    lsl r22
+; CHECK-NEXT:    rol r23
+; CHECK-NEXT:    rol r24
+; CHECK-NEXT:    rol r25
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 1
+  ret i32 %res
+}
+
+define i32 @shl_i32_2(i32 %a) {
+; CHECK-LABEL: shl_i32_2:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    lsl r22
+; CHECK-NEXT:    rol r23
+; CHECK-NEXT:    rol r24
+; CHECK-NEXT:    rol r25
+; CHECK-NEXT:    lsl r22
+; CHECK-NEXT:    rol r23
+; CHECK-NEXT:    rol r24
+; CHECK-NEXT:    rol r25
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 2
+  ret i32 %res
+}
+
+; This is a special case: this shift is performed directly inside SelectionDAG
+; instead of as a custom lowering like the other shift operations.
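+; A left shift by exactly 16 needs no shift instructions at all: the low
+; register pair is moved into the high pair with a single movw (available via
+; -mattr=movw) and the low registers are zeroed.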
+define i32 @shl_i32_16(i32 %a) {
+; CHECK-LABEL: shl_i32_16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    movw r24, r22
+; CHECK-NEXT:    ldi r22, 0
+; CHECK-NEXT:    ldi r23, 0
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 16
+  ret i32 %res
+}
+
+; Combined with the register allocator, shift instructions can sometimes be
+; optimized away entirely. The least significant registers are simply stored
+; directly instead of being moved first.
+define void @shl_i32_16_ptr(i32 %a, ptr %ptr) {
+; CHECK-LABEL: shl_i32_16_ptr:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    movw r30, r20
+; CHECK-NEXT:    std Z+2, r22
+; CHECK-NEXT:    std Z+3, r23
+; CHECK-NEXT:    ldi r24, 0
+; CHECK-NEXT:    ldi r25, 0
+; CHECK-NEXT:    st Z, r24
+; CHECK-NEXT:    std Z+1, r25
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 16
+  store i32 %res, ptr %ptr
+  ret void
+}
+
+define i32 @lshr_i32_1(i32 %a) {
+; CHECK-LABEL: lshr_i32_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    lsr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    ret
+  %res = lshr i32 %a, 1
+  ret i32 %res
+}
+
+define i32 @lshr_i32_2(i32 %a) {
+; CHECK-LABEL: lshr_i32_2:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    lsr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    lsr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    ret
+  %res = lshr i32 %a, 2
+  ret i32 %res
+}
+
+define i32 @lshr_i32_16(i32 %a) {
+; CHECK-LABEL: lshr_i32_16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    ldi r24, 0
+; CHECK-NEXT:    ldi r25, 0
+; CHECK-NEXT:    ret
+  %res = lshr i32 %a, 16
+  ret i32 %res
+}
+
+define i32 @ashr_i32_1(i32 %a) {
+; CHECK-LABEL: ashr_i32_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    asr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    ret
+  %res = ashr i32 %a, 1
+  ret i32 %res
+}
+
+define i32 @ashr_i32_2(i32 %a) {
+; CHECK-LABEL: ashr_i32_2:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    asr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    asr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    ret
+  %res = ashr i32 %a, 2
+  ret i32 %res
+}
-- 
2.7.4