From: Ayke van Laethem
Date: Tue, 6 Dec 2022 12:58:08 +0000 (+0100)
Subject: [AVR] Optimize 32-bit shift: move bytes around
X-Git-Tag: upstream/17.0.6~21777
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8f8afabd32092590a81e10e11e0a2c8b24e09b76;p=platform%2Fupstream%2Fllvm.git

[AVR] Optimize 32-bit shift: move bytes around

This patch optimizes 32-bit constant shifts by renaming registers. This is
very effective, as the compiler would otherwise need to emit a long sequence
of single-bit shift instructions. Instead, the registers are renamed at the
SSA level, which means the register allocator will insert the necessary mov
instructions.

Unfortunately, with the current code the register allocator will insert some
unnecessary movs. This will be fixed in a later patch.

Differential Revision: https://reviews.llvm.org/D140570
---

diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 1770cdb..cfc35d8 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -1850,16 +1850,79 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
 // registers in the Regs array.
 // Because AVR does not have a normal shift instruction (only a single bit shift
 // instruction), we have to emulate this behavior with other instructions.
+// It first tries large steps (moving registers around) and then smaller steps
+// like single bit shifts.
+// Large shifts actually reduce the number of shifted registers, so the below
+// algorithms have to work independently of the number of registers that are
+// shifted.
+// For more information and background, see this blogpost:
+// https://aykevl.nl/2021/02/avr-bitshift
 static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
                                  MutableArrayRef<std::pair<Register, int>> Regs,
                                  ISD::NodeType Opc, int64_t ShiftAmt) {
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+  const AVRSubtarget &STI = BB->getParent()->getSubtarget<AVRSubtarget>();
   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
   const DebugLoc &dl = MI.getDebugLoc();
 
   const bool ShiftLeft = Opc == ISD::SHL;
   const bool ArithmeticShift = Opc == ISD::SRA;
 
+  // Zero a register, for use in later operations.
+  Register ZeroReg = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+  BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ZeroReg)
+      .addReg(STI.getZeroRegister());
+
+  // For shift amounts of at least one register, simply rename the registers and
+  // zero the bottom registers.
+  while (ShiftLeft && ShiftAmt >= 8) {
+    // Move all registers one to the left.
+    for (size_t I = 0; I < Regs.size() - 1; I++) {
+      Regs[I] = Regs[I + 1];
+    }
+
+    // Zero the least significant register.
+    Regs[Regs.size() - 1] = std::pair(ZeroReg, 0);
+
+    // Continue shifts with the leftover registers.
+    Regs = Regs.drop_back(1);
+
+    ShiftAmt -= 8;
+  }
+
+  // And again, the same for right shifts.
+  Register ShrExtendReg = 0;
+  if (!ShiftLeft && ShiftAmt >= 8) {
+    if (ArithmeticShift) {
+      // Sign extend the most significant register into ShrExtendReg.
+      ShrExtendReg = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      Register Tmp = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+      BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Tmp)
+          .addReg(Regs[0].first, 0, Regs[0].second)
+          .addReg(Regs[0].first, 0, Regs[0].second);
+      BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), ShrExtendReg)
+          .addReg(Tmp)
+          .addReg(Tmp);
+    } else {
+      ShrExtendReg = ZeroReg;
+    }
+    for (; ShiftAmt >= 8; ShiftAmt -= 8) {
+      // Move all registers one to the right.
+      for (size_t I = Regs.size() - 1; I != 0; I--) {
+        Regs[I] = Regs[I - 1];
+      }
+
+      // Zero or sign extend the most significant register.
+      Regs[0] = std::pair(ShrExtendReg, 0);
+
+      // Continue shifts with the leftover registers.
+      Regs = Regs.drop_front(1);
+    }
+  }
+
+  // The bigger shifts are already handled above.
+  assert((ShiftAmt < 8) && "Unexpected shift amount");
+
   // Shift by one. This is the fallback that always works, and the shift
   // operation that is used for 1, 2, and 3 bit shifts.
   while (ShiftLeft && ShiftAmt) {
diff --git a/llvm/test/CodeGen/AVR/shift32.ll b/llvm/test/CodeGen/AVR/shift32.ll
index bc25e40..d3066fb 100644
--- a/llvm/test/CodeGen/AVR/shift32.ll
+++ b/llvm/test/CodeGen/AVR/shift32.ll
@@ -29,6 +29,33 @@ define i32 @shl_i32_2(i32 %a) {
   ret i32 %res
 }
 
+define i32 @shl_i32_8(i32 %a) {
+; CHECK-LABEL: shl_i32_8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    mov r23, r22
+; CHECK-NEXT:    mov r22, r1
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 8
+  ret i32 %res
+}
+
+define i32 @shl_i32_9(i32 %a) {
+; CHECK-LABEL: shl_i32_9:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    lsl r22
+; CHECK-NEXT:    rol r23
+; CHECK-NEXT:    rol r24
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    mov r23, r22
+; CHECK-NEXT:    mov r22, r1
+; CHECK-NEXT:    ret
+  %res = shl i32 %a, 9
+  ret i32 %res
+}
+
 ; This is a special case: this shift is performed directly inside SelectionDAG
 ; instead of as a custom lowering like the other shift operations.
 define i32 @shl_i32_16(i32 %a) {
@@ -89,6 +116,37 @@ define i32 @lshr_i32_2(i32 %a) {
   ret i32 %res
 }
 
+define i32 @lshr_i32_8(i32 %a) {
+; CHECK-LABEL: lshr_i32_8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov r19, r1
+; CHECK-NEXT:    mov r18, r25
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    movw r24, r18
+; CHECK-NEXT:    ret
+  %res = lshr i32 %a, 8
+  ret i32 %res
+}
+
+define i32 @lshr_i32_9(i32 %a) {
+; CHECK-LABEL: lshr_i32_9:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    lsr r25
+; CHECK-NEXT:    ror r24
+; CHECK-NEXT:    ror r23
+; CHECK-NEXT:    mov r19, r1
+; CHECK-NEXT:    mov r18, r25
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    movw r24, r18
+; CHECK-NEXT:    ret
+  %res = lshr i32 %a, 9
+  ret i32 %res
+}
+
 define i32 @lshr_i32_16(i32 %a) {
 ; CHECK-LABEL: lshr_i32_16:
 ; CHECK:       ; %bb.0:
@@ -100,6 +158,19 @@ define i32 @lshr_i32_16(i32 %a) {
   ret i32 %res
 }
 
+define i32 @lshr_i32_24(i32 %a) {
+; CHECK-LABEL: lshr_i32_24:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov r19, r1
+; CHECK-NEXT:    mov r18, r1
+; CHECK-NEXT:    mov r23, r1
+; CHECK-NEXT:    mov r22, r25
+; CHECK-NEXT:    movw r24, r18
+; CHECK-NEXT:    ret
+  %res = lshr i32 %a, 24
+  ret i32 %res
+}
+
 define i32 @ashr_i32_1(i32 %a) {
 ; CHECK-LABEL: ashr_i32_1:
 ; CHECK:       ; %bb.0:
@@ -127,3 +198,46 @@ define i32 @ashr_i32_2(i32 %a) {
   %res = ashr i32 %a, 2
   ret i32 %res
 }
+
+; TODO: this could be optimized to 4 movs, instead of 6.
+define i32 @ashr_i32_8(i32 %a) {
+; CHECK-LABEL: ashr_i32_8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov r19, r25
+; CHECK-NEXT:    lsl r19
+; CHECK-NEXT:    sbc r19, r19
+; CHECK-NEXT:    mov r18, r25
+; CHECK-NEXT:    mov r25, r24
+; CHECK-NEXT:    mov r24, r23
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    movw r24, r18
+; CHECK-NEXT:    ret
+  %res = ashr i32 %a, 8
+  ret i32 %res
+}
+
+define i32 @ashr_i32_16(i32 %a) {
+; CHECK-LABEL: ashr_i32_16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    lsl r25
+; CHECK-NEXT:    sbc r25, r25
+; CHECK-NEXT:    mov r24, r25
+; CHECK-NEXT:    ret
+  %res = ashr i32 %a, 16
+  ret i32 %res
+}
+
+define i32 @ashr_i32_17(i32 %a) {
+; CHECK-LABEL: ashr_i32_17:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    movw r22, r24
+; CHECK-NEXT:    lsl r25
+; CHECK-NEXT:    sbc r25, r25
+; CHECK-NEXT:    asr r23
+; CHECK-NEXT:    ror r22
+; CHECK-NEXT:    mov r24, r25
+; CHECK-NEXT:    ret
+  %res = ashr i32 %a, 17
+  ret i32 %res
+}
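
For illustration only, separate from the patch above: a minimal standalone C++ sketch of the byte-renaming idea behind insertMultibyteShift. The lshrByBytes helper is hypothetical (not LLVM API); a 32-bit value is modeled as four 8-bit "registers" with index 0 as the most significant byte, the same ordering the Regs array uses. A right shift by a whole number of bytes then reduces to moving bytes towards the least significant end and zero-filling the vacated top byte, with no shift instructions at all; only a residual 0-7 bit shift would need the single-bit fallback loop.

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Bytes[0] is the most significant byte, mirroring the Regs ordering in
// insertMultibyteShift.
using Bytes = std::array<uint8_t, 4>;

// Logical shift right by a multiple of 8 bits: each 8-bit step renames every
// byte one position to the right and zero-fills the vacated top byte (an
// arithmetic shift would fill with the sign byte instead).
static Bytes lshrByBytes(Bytes B, int ShiftAmt) {
  assert(ShiftAmt >= 0 && ShiftAmt < 32 && ShiftAmt % 8 == 0);
  for (; ShiftAmt >= 8; ShiftAmt -= 8) {
    for (std::size_t I = B.size() - 1; I != 0; I--)
      B[I] = B[I - 1]; // a rename, not a shift instruction
    B[0] = 0;          // zero-extend the most significant byte
  }
  return B;
}

int main() {
  // 0x12345678 >> 8 == 0x00123456, obtained purely by moving bytes around.
  Bytes In = {0x12, 0x34, 0x56, 0x78};
  assert((lshrByBytes(In, 8) == Bytes{0x00, 0x12, 0x34, 0x56}));
  // 0x12345678 >> 24 == 0x00000012.
  assert((lshrByBytes(In, 24) == Bytes{0x00, 0x00, 0x00, 0x12}));
  return 0;
}

In the actual lowering these byte moves are expressed as virtual-register renames at the SSA level, so the register allocator decides which mov instructions to materialize, which is what the shl_i32_8, lshr_i32_8, and lshr_i32_24 test cases above show.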