From fad5e0cf50f119f083dfc82e08994825cae5001f Mon Sep 17 00:00:00 2001
From: Ayke van Laethem
Date: Tue, 6 Dec 2022 14:39:37 +0100
Subject: [PATCH] [AVR] Optimize 32-bit shifts: reverse shift + move

This optimization turns shifts by almost a multiple of 8 (shift amounts
of 6 or 7 modulo 8) into a short shift in the opposite direction
followed by a register move. Unfortunately it doesn't compose well with
the other optimizations (I've tried), so it is kept separate from them.

Differential Revision: https://reviews.llvm.org/D140572
---
 llvm/lib/Target/AVR/AVRISelLowering.cpp | 106 ++++++++++++++++
 llvm/test/CodeGen/AVR/shift32.ll        | 211 ++++++++++++++++++++++++++++++++
 2 files changed, 317 insertions(+)

diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 7690a5f..d2391b7 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -1873,6 +1873,112 @@ static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
   BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ZeroReg)
       .addReg(STI.getZeroRegister());
 
+  // Do a shift modulo 6 or 7. This is a bit more complicated than most shifts
+  // and is hard to compose with the rest, so these are special cased.
+  // The basic idea is to shift one or two bits in the opposite direction and
+  // then move registers around to get the correct end result.
+  if (ShiftLeft && (ShiftAmt % 8) >= 6) {
+    // Left shift modulo 6 or 7.
+
+    // Create a slice of the registers we're going to modify, to ease working
+    // with them.
+    size_t ShiftRegsOffset = ShiftAmt / 8;
+    size_t ShiftRegsSize = Regs.size() - ShiftRegsOffset;
+    MutableArrayRef<std::pair<Register, int>> ShiftRegs =
+        Regs.slice(ShiftRegsOffset, ShiftRegsSize);
+
+    // Shift one to the right, keeping the least significant bit as the carry
+    // bit.
+    insertMultibyteShift(MI, BB, ShiftRegs, ISD::SRL, 1);
+
+    // Rotate the least significant bit from the carry bit into a new register
+    // (that starts out zero).
+    Register LowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+    BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), LowByte).addReg(ZeroReg);
+
+    // Shift one more to the right if this is a modulo-6 shift.
+    if (ShiftAmt % 8 == 6) {
+      insertMultibyteShift(MI, BB, ShiftRegs, ISD::SRL, 1);
+      Register NewLowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), NewLowByte).addReg(LowByte);
+      LowByte = NewLowByte;
+    }
+
+    // Move all registers to the left, zeroing the bottom registers as needed.
+    for (size_t I = 0; I < Regs.size(); I++) {
+      int ShiftRegsIdx = I + 1;
+      if (ShiftRegsIdx < (int)ShiftRegs.size()) {
+        Regs[I] = ShiftRegs[ShiftRegsIdx];
+      } else if (ShiftRegsIdx == (int)ShiftRegs.size()) {
+        Regs[I] = std::pair(LowByte, 0);
+      } else {
+        Regs[I] = std::pair(ZeroReg, 0);
+      }
+    }
+
+    return;
+  }
+
+  // Right shift modulo 6 or 7.
+  if (!ShiftLeft && (ShiftAmt % 8) >= 6) {
+    // Create a view on the registers we're going to modify, to ease working
+    // with them.
+    size_t ShiftRegsSize = Regs.size() - (ShiftAmt / 8);
+    MutableArrayRef<std::pair<Register, int>> ShiftRegs =
+        Regs.slice(0, ShiftRegsSize);
+
+    // Shift one to the left.
+    insertMultibyteShift(MI, BB, ShiftRegs, ISD::SHL, 1);
+
+    // Sign or zero extend the most significant register into a new register.
+    // The HighByte is the byte that still has one (or two) bits from the
+    // original value. The ExtByte is purely a zero/sign extend byte (all bits
+    // are either 0 or 1).
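+    // For example, in a logical right shift by 7 the HighByte ends up holding
+    // only the original bit 31 (now in bit position 0) and the ExtByte is the
+    // zero register; in an arithmetic shift by 7 the single remaining bit is
+    // the sign bit itself, so HighByte and ExtByte can be the same register.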
+ Register HighByte = MRI.createVirtualRegister(&AVR::GPR8RegClass); + Register ExtByte = 0; + if (ArithmeticShift) { + // Sign-extend bit that was shifted out last. + BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), HighByte) + .addReg(HighByte, RegState::Undef) + .addReg(HighByte, RegState::Undef); + ExtByte = HighByte; + // The highest bit of the original value is the same as the zero-extend + // byte, so HighByte and ExtByte are the same. + } else { + // Use the zero register for zero extending. + ExtByte = ZeroReg; + // Rotate most significant bit into a new register (that starts out zero). + BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), HighByte) + .addReg(ExtByte) + .addReg(ExtByte); + } + + // Shift one more to the left for modulo 6 shifts. + if (ShiftAmt % 8 == 6) { + insertMultibyteShift(MI, BB, ShiftRegs, ISD::SHL, 1); + // Shift the topmost bit into the HighByte. + Register NewExt = MRI.createVirtualRegister(&AVR::GPR8RegClass); + BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), NewExt) + .addReg(HighByte) + .addReg(HighByte); + HighByte = NewExt; + } + + // Move all to the right, while sign or zero extending. + for (int I = Regs.size() - 1; I >= 0; I--) { + int ShiftRegsIdx = I - (Regs.size() - ShiftRegs.size()) - 1; + if (ShiftRegsIdx >= 0) { + Regs[I] = ShiftRegs[ShiftRegsIdx]; + } else if (ShiftRegsIdx == -1) { + Regs[I] = std::pair(HighByte, 0); + } else { + Regs[I] = std::pair(ExtByte, 0); + } + } + + return; + } + // For shift amounts of at least one register, simply rename the registers and // zero the bottom registers. while (ShiftLeft && ShiftAmt >= 8) { diff --git a/llvm/test/CodeGen/AVR/shift32.ll b/llvm/test/CodeGen/AVR/shift32.ll index cc83810..3b58c1e 100644 --- a/llvm/test/CodeGen/AVR/shift32.ll +++ b/llvm/test/CodeGen/AVR/shift32.ll @@ -78,6 +78,50 @@ define i32 @shl_i32_5(i32 %a) { ret i32 %res } +; shift two to the right and move the registers around +define i32 @shl_i32_6(i32 %a) { +; CHECK-LABEL: shl_i32_6: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: mov r18, r1 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: mov r19, r22 +; CHECK-NEXT: movw r22, r18 +; CHECK-NEXT: ret + %res = shl i32 %a, 6 + ret i32 %res +} + + +; shift one to the right and move registers around +define i32 @shl_i32_7(i32 %a) { +; CHECK-LABEL: shl_i32_7: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: mov r18, r1 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: mov r19, r22 +; CHECK-NEXT: movw r22, r18 +; CHECK-NEXT: ret + %res = shl i32 %a, 7 + ret i32 %res +} + define i32 @shl_i32_8(i32 %a) { ; CHECK-LABEL: shl_i32_8: ; CHECK: ; %bb.0: @@ -128,6 +172,22 @@ define i32 @shl_i32_12(i32 %a) { ret i32 %res } +define i32 @shl_i32_15(i32 %a) { +; CHECK-LABEL: shl_i32_15: +; CHECK: ; %bb.0: +; CHECK-NEXT: movw r18, r22 +; CHECK-NEXT: lsr r24 +; CHECK-NEXT: ror r19 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: mov r23, r1 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = shl i32 %a, 15 + ret i32 %res +} + ; This is a special case: this shift is performed directly inside SelectionDAG ; instead of as a custom lowering like the other shift operations. 
define i32 @shl_i32_16(i32 %a) { @@ -175,6 +235,21 @@ define i32 @shl_i32_28(i32 %a) { ret i32 %res } +; move the rightmost bit to the leftmost bit and clear the rest +define i32 @shl_i32_31(i32 %a) { +; CHECK-LABEL: shl_i32_31: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r22 +; CHECK-NEXT: mov r25, r1 +; CHECK-NEXT: ror r25 +; CHECK-NEXT: mov r24, r1 +; CHECK-NEXT: mov r23, r1 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: ret + %res = shl i32 %a, 31 + ret i32 %res +} + define i32 @lshr_i32_1(i32 %a) { ; CHECK-LABEL: lshr_i32_1: ; CHECK: ; %bb.0: @@ -225,6 +300,49 @@ define i32 @lshr_i32_4(i32 %a) { ret i32 %res } +define i32 @lshr_i32_6(i32 %a) { +; CHECK-LABEL: lshr_i32_6: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: mov r19, r1 +; CHECK-NEXT: rol r19 +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: rol r19 +; CHECK-NEXT: mov r18, r25 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: movw r22, r24 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = lshr i32 %a, 6 + ret i32 %res +} + +define i32 @lshr_i32_7(i32 %a) { +; CHECK-LABEL: lshr_i32_7: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: mov r19, r1 +; CHECK-NEXT: rol r19 +; CHECK-NEXT: mov r18, r25 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: movw r22, r24 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = lshr i32 %a, 7 + ret i32 %res +} + define i32 @lshr_i32_8(i32 %a) { ; CHECK-LABEL: lshr_i32_8: ; CHECK: ; %bb.0: @@ -280,6 +398,20 @@ define i32 @lshr_i32_24(i32 %a) { ret i32 %res } +define i32 @lshr_i32_31(i32 %a) { +; CHECK-LABEL: lshr_i32_31: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: rol r22 +; CHECK-NEXT: mov r25, r1 +; CHECK-NEXT: mov r24, r1 +; CHECK-NEXT: mov r23, r1 +; CHECK-NEXT: ret + %res = lshr i32 %a, 31 + ret i32 %res +} + define i32 @ashr_i32_1(i32 %a) { ; CHECK-LABEL: ashr_i32_1: ; CHECK: ; %bb.0: @@ -333,6 +465,24 @@ define i32 @ashr_i32_4(i32 %a) { ret i32 %res } +define i32 @ashr_i32_7(i32 %a) { +; CHECK-LABEL: ashr_i32_7: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: mov r18, r25 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: movw r22, r24 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 7 + ret i32 %res +} + ; TODO: this could be optimized to 4 movs, instead of 6. 
define i32 @ashr_i32_8(i32 %a) { ; CHECK-LABEL: ashr_i32_8: @@ -375,3 +525,64 @@ define i32 @ashr_i32_17(i32 %a) { %res = ashr i32 %a, 17 ret i32 %res } + +define i32 @ashr_i32_22(i32 %a) { +; CHECK-LABEL: ashr_i32_22: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: lsl r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: mov r23, r19 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: mov r18, r19 +; CHECK-NEXT: mov r22, r25 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 22 + ret i32 %res +} + +define i32 @ashr_i32_23(i32 %a) { +; CHECK-LABEL: ashr_i32_23: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: mov r18, r19 +; CHECK-NEXT: mov r23, r19 +; CHECK-NEXT: mov r22, r25 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 23 + ret i32 %res +} + +define i32 @ashr_i32_30(i32 %a) { +; CHECK-LABEL: ashr_i32_30: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: mov r22, r19 +; CHECK-NEXT: rol r22 +; CHECK-NEXT: mov r18, r19 +; CHECK-NEXT: mov r23, r19 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 30 + ret i32 %res +} + +define i32 @ashr_i32_31(i32 %a) { +; CHECK-LABEL: ashr_i32_31: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: sbc r23, r23 +; CHECK-NEXT: mov r22, r23 +; CHECK-NEXT: movw r24, r22 +; CHECK-NEXT: ret + %res = ashr i32 %a, 31 + ret i32 %res +} -- 2.7.4
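The reversed-shift strategy itself is easy to sanity-check outside the
backend. The sketch below is an illustrative C++ model, not part of the
patch, and its function name and test harness are invented for
illustration. It mirrors the three steps used for a left shift by 7:
shift right by one while keeping the dropped bit, rotate that bit into a
fresh zero byte, then move every byte one position up.

// Model of "shl x, 7" as a reverse shift plus a byte move, the same per-byte
// strategy insertMultibyteShift emits with lsr/ror and mov on AVR.
#include <cassert>
#include <cstdint>

static uint32_t shiftLeft7ViaReverseShift(uint32_t X) {
  // Shift one to the right, keeping the least significant bit (the carry).
  uint8_t Carry = X & 1;
  uint32_t Shifted = X >> 1;
  // Rotate the carried-out bit into a new byte that starts out zero.
  uint8_t LowByte = static_cast<uint8_t>(Carry << 7);
  // Move every byte one position up and use the new byte as the low byte.
  return (Shifted << 8) | LowByte;
}

int main() {
  for (uint32_t X : {0u, 1u, 0x12345678u, 0xFFFFFFFFu})
    assert(shiftLeft7ViaReverseShift(X) == X << 7);
  return 0;
}

For a shift by 6 the same idea applies with two single-bit shifts before
the byte move; the shift32.ll tests above check the exact instruction
sequences the backend emits for both cases.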