From fad5e0cf50f119f083dfc82e08994825cae5001f Mon Sep 17 00:00:00 2001
From: Ayke van Laethem
Date: Tue, 6 Dec 2022 14:39:37 +0100
Subject: [PATCH] [AVR] Optimize 32-bit shifts: reverse shift + move

This optimization turns shifts by almost a multiple of 8 (shift amounts
of 6 or 7 modulo 8) into a short shift in the opposite direction
followed by a register move. Unfortunately it doesn't compose well with
the other optimizations (I've tried), so it is kept separate from them.

Differential Revision: https://reviews.llvm.org/D140572
---
 llvm/lib/Target/AVR/AVRISelLowering.cpp | 106 ++++++++++++++++
 llvm/test/CodeGen/AVR/shift32.ll        | 211 ++++++++++++++++++++++++++++++++
 2 files changed, 317 insertions(+)

diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 7690a5f..d2391b7 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -1873,6 +1873,112 @@ static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
   BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ZeroReg)
       .addReg(STI.getZeroRegister());
 
+  // Do a shift modulo 6 or 7. This is a bit more complicated than most shifts
+  // and is hard to compose with the rest, so these are special cased.
+  // The basic idea is to shift one or two bits in the opposite direction and
+  // then move registers around to get the correct end result.
+  if (ShiftLeft && (ShiftAmt % 8) >= 6) {
+    // Left shift modulo 6 or 7.
+
+    // Create a slice of the registers we're going to modify, to ease working
+    // with them.
+    size_t ShiftRegsOffset = ShiftAmt / 8;
+    size_t ShiftRegsSize = Regs.size() - ShiftRegsOffset;
+    MutableArrayRef<std::pair<Register, int>> ShiftRegs =
+        Regs.slice(ShiftRegsOffset, ShiftRegsSize);
+
+    // Shift one to the right, keeping the least significant bit as the carry
+    // bit.
+    insertMultibyteShift(MI, BB, ShiftRegs, ISD::SRL, 1);
+
+    // Rotate the least significant bit from the carry bit into a new register
+    // (that starts out zero).
+    Register LowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+    BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), LowByte).addReg(ZeroReg);
+
+    // Shift one more to the right if this is a modulo-6 shift.
+    if (ShiftAmt % 8 == 6) {
+      insertMultibyteShift(MI, BB, ShiftRegs, ISD::SRL, 1);
+      Register NewLowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), NewLowByte).addReg(LowByte);
+      LowByte = NewLowByte;
+    }
+
+    // Move all registers to the left, zeroing the bottom registers as needed.
+    for (size_t I = 0; I < Regs.size(); I++) {
+      int ShiftRegsIdx = I + 1;
+      if (ShiftRegsIdx < (int)ShiftRegs.size()) {
+        Regs[I] = ShiftRegs[ShiftRegsIdx];
+      } else if (ShiftRegsIdx == (int)ShiftRegs.size()) {
+        Regs[I] = std::pair(LowByte, 0);
+      } else {
+        Regs[I] = std::pair(ZeroReg, 0);
+      }
+    }
+
+    return;
+  }
+
+  // Right shift modulo 6 or 7.
+  if (!ShiftLeft && (ShiftAmt % 8) >= 6) {
+    // Create a view on the registers we're going to modify, to ease working
+    // with them.
+    size_t ShiftRegsSize = Regs.size() - (ShiftAmt / 8);
+    MutableArrayRef<std::pair<Register, int>> ShiftRegs =
+        Regs.slice(0, ShiftRegsSize);
+
+    // Shift one to the left.
+    insertMultibyteShift(MI, BB, ShiftRegs, ISD::SHL, 1);
+
+    // Sign or zero extend the most significant register into a new register.
+    // The HighByte is the byte that still has one (or two) bits from the
+    // original value. The ExtByte is purely a zero/sign extend byte (all bits
+    // are either 0 or 1).
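+    // For example, in a logical right shift by 7 the HighByte ends up holding
+    // only the original bit 31 (now in bit position 0) and the ExtByte is the
+    // zero register; in an arithmetic shift by 7 the single remaining bit is
+    // the sign bit itself, so HighByte and ExtByte can be the same register.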
+ Register HighByte = MRI.createVirtualRegister(&AVR::GPR8RegClass); + Register ExtByte = 0; + if (ArithmeticShift) { + // Sign-extend bit that was shifted out last. + BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), HighByte) + .addReg(HighByte, RegState::Undef) + .addReg(HighByte, RegState::Undef); + ExtByte = HighByte; + // The highest bit of the original value is the same as the zero-extend + // byte, so HighByte and ExtByte are the same. + } else { + // Use the zero register for zero extending. + ExtByte = ZeroReg; + // Rotate most significant bit into a new register (that starts out zero). + BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), HighByte) + .addReg(ExtByte) + .addReg(ExtByte); + } + + // Shift one more to the left for modulo 6 shifts. + if (ShiftAmt % 8 == 6) { + insertMultibyteShift(MI, BB, ShiftRegs, ISD::SHL, 1); + // Shift the topmost bit into the HighByte. + Register NewExt = MRI.createVirtualRegister(&AVR::GPR8RegClass); + BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), NewExt) + .addReg(HighByte) + .addReg(HighByte); + HighByte = NewExt; + } + + // Move all to the right, while sign or zero extending. + for (int I = Regs.size() - 1; I >= 0; I--) { + int ShiftRegsIdx = I - (Regs.size() - ShiftRegs.size()) - 1; + if (ShiftRegsIdx >= 0) { + Regs[I] = ShiftRegs[ShiftRegsIdx]; + } else if (ShiftRegsIdx == -1) { + Regs[I] = std::pair(HighByte, 0); + } else { + Regs[I] = std::pair(ExtByte, 0); + } + } + + return; + } + // For shift amounts of at least one register, simply rename the registers and // zero the bottom registers. while (ShiftLeft && ShiftAmt >= 8) { diff --git a/llvm/test/CodeGen/AVR/shift32.ll b/llvm/test/CodeGen/AVR/shift32.ll index cc83810..3b58c1e 100644 --- a/llvm/test/CodeGen/AVR/shift32.ll +++ b/llvm/test/CodeGen/AVR/shift32.ll @@ -78,6 +78,50 @@ define i32 @shl_i32_5(i32 %a) { ret i32 %res } +; shift two to the right and move the registers around +define i32 @shl_i32_6(i32 %a) { +; CHECK-LABEL: shl_i32_6: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: mov r18, r1 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: mov r19, r22 +; CHECK-NEXT: movw r22, r18 +; CHECK-NEXT: ret + %res = shl i32 %a, 6 + ret i32 %res +} + + +; shift one to the right and move registers around +define i32 @shl_i32_7(i32 %a) { +; CHECK-LABEL: shl_i32_7: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r25 +; CHECK-NEXT: ror r24 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: ror r22 +; CHECK-NEXT: mov r18, r1 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: mov r19, r22 +; CHECK-NEXT: movw r22, r18 +; CHECK-NEXT: ret + %res = shl i32 %a, 7 + ret i32 %res +} + define i32 @shl_i32_8(i32 %a) { ; CHECK-LABEL: shl_i32_8: ; CHECK: ; %bb.0: @@ -128,6 +172,22 @@ define i32 @shl_i32_12(i32 %a) { ret i32 %res } +define i32 @shl_i32_15(i32 %a) { +; CHECK-LABEL: shl_i32_15: +; CHECK: ; %bb.0: +; CHECK-NEXT: movw r18, r22 +; CHECK-NEXT: lsr r24 +; CHECK-NEXT: ror r19 +; CHECK-NEXT: ror r18 +; CHECK-NEXT: mov r23, r1 +; CHECK-NEXT: ror r23 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = shl i32 %a, 15 + ret i32 %res +} + ; This is a special case: this shift is performed directly inside SelectionDAG ; instead of as a custom lowering like the other shift operations. 
define i32 @shl_i32_16(i32 %a) { @@ -175,6 +235,21 @@ define i32 @shl_i32_28(i32 %a) { ret i32 %res } +; move the rightmost bit to the leftmost bit and clear the rest +define i32 @shl_i32_31(i32 %a) { +; CHECK-LABEL: shl_i32_31: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr r22 +; CHECK-NEXT: mov r25, r1 +; CHECK-NEXT: ror r25 +; CHECK-NEXT: mov r24, r1 +; CHECK-NEXT: mov r23, r1 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: ret + %res = shl i32 %a, 31 + ret i32 %res +} + define i32 @lshr_i32_1(i32 %a) { ; CHECK-LABEL: lshr_i32_1: ; CHECK: ; %bb.0: @@ -225,6 +300,49 @@ define i32 @lshr_i32_4(i32 %a) { ret i32 %res } +define i32 @lshr_i32_6(i32 %a) { +; CHECK-LABEL: lshr_i32_6: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: mov r19, r1 +; CHECK-NEXT: rol r19 +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: rol r19 +; CHECK-NEXT: mov r18, r25 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: movw r22, r24 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = lshr i32 %a, 6 + ret i32 %res +} + +define i32 @lshr_i32_7(i32 %a) { +; CHECK-LABEL: lshr_i32_7: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: mov r19, r1 +; CHECK-NEXT: rol r19 +; CHECK-NEXT: mov r18, r25 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: movw r22, r24 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = lshr i32 %a, 7 + ret i32 %res +} + define i32 @lshr_i32_8(i32 %a) { ; CHECK-LABEL: lshr_i32_8: ; CHECK: ; %bb.0: @@ -280,6 +398,20 @@ define i32 @lshr_i32_24(i32 %a) { ret i32 %res } +define i32 @lshr_i32_31(i32 %a) { +; CHECK-LABEL: lshr_i32_31: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: mov r22, r1 +; CHECK-NEXT: rol r22 +; CHECK-NEXT: mov r25, r1 +; CHECK-NEXT: mov r24, r1 +; CHECK-NEXT: mov r23, r1 +; CHECK-NEXT: ret + %res = lshr i32 %a, 31 + ret i32 %res +} + define i32 @ashr_i32_1(i32 %a) { ; CHECK-LABEL: ashr_i32_1: ; CHECK: ; %bb.0: @@ -333,6 +465,24 @@ define i32 @ashr_i32_4(i32 %a) { ret i32 %res } +define i32 @ashr_i32_7(i32 %a) { +; CHECK-LABEL: ashr_i32_7: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r22 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: rol r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: mov r18, r25 +; CHECK-NEXT: mov r25, r24 +; CHECK-NEXT: mov r24, r23 +; CHECK-NEXT: movw r22, r24 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 7 + ret i32 %res +} + ; TODO: this could be optimized to 4 movs, instead of 6. 
define i32 @ashr_i32_8(i32 %a) { ; CHECK-LABEL: ashr_i32_8: @@ -375,3 +525,64 @@ define i32 @ashr_i32_17(i32 %a) { %res = ashr i32 %a, 17 ret i32 %res } + +define i32 @ashr_i32_22(i32 %a) { +; CHECK-LABEL: ashr_i32_22: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: lsl r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: mov r23, r19 +; CHECK-NEXT: rol r23 +; CHECK-NEXT: mov r18, r19 +; CHECK-NEXT: mov r22, r25 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 22 + ret i32 %res +} + +define i32 @ashr_i32_23(i32 %a) { +; CHECK-LABEL: ashr_i32_23: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r24 +; CHECK-NEXT: rol r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: mov r18, r19 +; CHECK-NEXT: mov r23, r19 +; CHECK-NEXT: mov r22, r25 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 23 + ret i32 %res +} + +define i32 @ashr_i32_30(i32 %a) { +; CHECK-LABEL: ashr_i32_30: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: sbc r19, r19 +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: mov r22, r19 +; CHECK-NEXT: rol r22 +; CHECK-NEXT: mov r18, r19 +; CHECK-NEXT: mov r23, r19 +; CHECK-NEXT: movw r24, r18 +; CHECK-NEXT: ret + %res = ashr i32 %a, 30 + ret i32 %res +} + +define i32 @ashr_i32_31(i32 %a) { +; CHECK-LABEL: ashr_i32_31: +; CHECK: ; %bb.0: +; CHECK-NEXT: lsl r25 +; CHECK-NEXT: sbc r23, r23 +; CHECK-NEXT: mov r22, r23 +; CHECK-NEXT: movw r24, r22 +; CHECK-NEXT: ret + %res = ashr i32 %a, 31 + ret i32 %res +} -- 2.7.4
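The reversed-shift strategy itself is easy to sanity-check outside the
backend. The sketch below is an illustrative C++ model, not part of the
patch, and its function name and test harness are invented for
illustration. It mirrors the three steps used for a left shift by 7:
shift right by one while keeping the dropped bit, rotate that bit into a
fresh zero byte, then move every byte one position up.

// Model of "shl x, 7" as a reverse shift plus a byte move, the same per-byte
// strategy insertMultibyteShift emits with lsr/ror and mov on AVR.
#include <cassert>
#include <cstdint>

static uint32_t shiftLeft7ViaReverseShift(uint32_t X) {
  // Shift one to the right, keeping the least significant bit (the carry).
  uint8_t Carry = X & 1;
  uint32_t Shifted = X >> 1;
  // Rotate the carried-out bit into a new byte that starts out zero.
  uint8_t LowByte = static_cast<uint8_t>(Carry << 7);
  // Move every byte one position up and use the new byte as the low byte.
  return (Shifted << 8) | LowByte;
}

int main() {
  for (uint32_t X : {0u, 1u, 0x12345678u, 0xFFFFFFFFu})
    assert(shiftLeft7ViaReverseShift(X) == X << 7);
  return 0;
}

For a shift by 6 the same idea applies with two single-bit shifts before
the byte move; the shift32.ll tests above check the exact instruction
sequences the backend emits for both cases.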