// registers in the Regs array.
// Because AVR does not have a normal shift instruction (only a single bit shift
// instruction), we have to emulate this behavior with other instructions.
+// It first tries large steps (moving registers around) and then smaller steps
+// like single bit shifts.
+// Large shifts actually reduce the number of shifted registers, so the below
+// algorithms have to work independently of the number of registers that are
+// shifted.
+// For more information and background, see this blogpost:
+// https://aykevl.nl/2021/02/avr-bitshift
static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
MutableArrayRef<std::pair<Register, int>> Regs,
ISD::NodeType Opc, int64_t ShiftAmt) {
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+ const AVRSubtarget &STI = BB->getParent()->getSubtarget<AVRSubtarget>();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
const DebugLoc &dl = MI.getDebugLoc();
const bool ShiftLeft = Opc == ISD::SHL;
const bool ArithmeticShift = Opc == ISD::SRA;
+ // Zero a register, for use in later operations.
+ Register ZeroReg = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ZeroReg)
+ .addReg(STI.getZeroRegister());
+
+ // For shift amounts of at least one register, simply rename the registers and
+ // zero the bottom registers.
+ while (ShiftLeft && ShiftAmt >= 8) {
+ // Move all registers one to the left.
+ for (size_t I = 0; I < Regs.size() - 1; I++) {
+ Regs[I] = Regs[I + 1];
+ }
+
+ // Zero the least significant register.
+ Regs[Regs.size() - 1] = std::pair(ZeroReg, 0);
+
+ // Continue shifts with the leftover registers.
+ Regs = Regs.drop_back(1);
+
+ ShiftAmt -= 8;
+ }
+
+ // And again, the same for right shifts.
+ Register ShrExtendReg = 0;
+ if (!ShiftLeft && ShiftAmt >= 8) {
+ if (ArithmeticShift) {
+ // Sign extend the most significant register into ShrExtendReg.
+ ShrExtendReg = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ Register Tmp = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Tmp)
+ .addReg(Regs[0].first, 0, Regs[0].second)
+ .addReg(Regs[0].first, 0, Regs[0].second);
+ BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), ShrExtendReg)
+ .addReg(Tmp)
+ .addReg(Tmp);
+ } else {
+ ShrExtendReg = ZeroReg;
+ }
+ for (; ShiftAmt >= 8; ShiftAmt -= 8) {
+ // Move all registers one to the right.
+ for (size_t I = Regs.size() - 1; I != 0; I--) {
+ Regs[I] = Regs[I - 1];
+ }
+
+ // Zero or sign extend the most significant register.
+ Regs[0] = std::pair(ShrExtendReg, 0);
+
+ // Continue shifts with the leftover registers.
+ Regs = Regs.drop_front(1);
+ }
+ }
+
+ // The bigger shifts are already handled above.
+ assert((ShiftAmt < 8) && "Unexpect shift amount");
+
// Shift by one. This is the fallback that always works, and the shift
// operation that is used for 1, 2, and 3 bit shifts.
while (ShiftLeft && ShiftAmt) {
ret i32 %res
}
+; Shift left by exactly one byte: no bit-shift instructions are needed, only
+; register renaming (each byte moves up one register) with the low byte
+; zeroed by copying from r1 (presumably the AVR zero register — see ABI).
+define i32 @shl_i32_8(i32 %a) {
+; CHECK-LABEL: shl_i32_8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r23, r22
+; CHECK-NEXT: mov r22, r1
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 8
+ ret i32 %res
+}
+
+; Shift left by 9 = one whole-byte register move plus a single-bit shift.
+; Note the single-bit lsl/rol chain only touches the three bytes that
+; survive the byte move, not all four.
+define i32 @shl_i32_9(i32 %a) {
+; CHECK-LABEL: shl_i32_9:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsl r22
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: rol r24
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r23, r22
+; CHECK-NEXT: mov r22, r1
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 9
+ ret i32 %res
+}
+
; This is a special case: this shift is performed directly inside SelectionDAG
; instead of as a custom lowering like the other shift operations.
define i32 @shl_i32_16(i32 %a) {
ret i32 %res
}
+; Logical shift right by one byte: bytes move down one register and the top
+; byte is zeroed from r1; the movw pairs then shuffle the bytes into the
+; r22..r25 return registers.
+define i32 @lshr_i32_8(i32 %a) {
+; CHECK-LABEL: lshr_i32_8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov r19, r1
+; CHECK-NEXT: mov r18, r25
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = lshr i32 %a, 8
+ ret i32 %res
+}
+
+; Logical shift right by 9 = a whole-byte move (with zero extension from r1)
+; plus one single-bit lsr/ror chain over the three surviving bytes.
+define i32 @lshr_i32_9(i32 %a) {
+; CHECK-LABEL: lshr_i32_9:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: mov r19, r1
+; CHECK-NEXT: mov r18, r25
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = lshr i32 %a, 9
+ ret i32 %res
+}
+
define i32 @lshr_i32_16(i32 %a) {
; CHECK-LABEL: lshr_i32_16:
; CHECK: ; %bb.0:
ret i32 %res
}
+; Logical shift right by three whole bytes: only the most significant input
+; byte (r25) survives, moved to the low result byte; the upper three result
+; bytes are zeroed from r1 — no bit-shift instructions at all.
+define i32 @lshr_i32_24(i32 %a) {
+; CHECK-LABEL: lshr_i32_24:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov r19, r1
+; CHECK-NEXT: mov r18, r1
+; CHECK-NEXT: mov r23, r1
+; CHECK-NEXT: mov r22, r25
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = lshr i32 %a, 24
+ ret i32 %res
+}
+
define i32 @ashr_i32_1(i32 %a) {
; CHECK-LABEL: ashr_i32_1:
; CHECK: ; %bb.0:
%res = ashr i32 %a, 2
ret i32 %res
}
+
+; TODO: this could be optimized to 4 movs, instead of 6.
+; Arithmetic shift right by one byte: like lshr_i32_8, but the new top byte
+; is the sign extension of r25, materialized with the lsl + sbc-with-itself
+; trick (carry = old sign bit, so sbc yields 0x00 or 0xff).
+define i32 @ashr_i32_8(i32 %a) {
+; CHECK-LABEL: ashr_i32_8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov r19, r25
+; CHECK-NEXT: lsl r19
+; CHECK-NEXT: sbc r19, r19
+; CHECK-NEXT: mov r18, r25
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: movw r24, r18
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 8
+ ret i32 %res
+}
+
+; Arithmetic shift right by two bytes: one movw moves the high word down,
+; then lsl/sbc turns r25 into its own sign extension (0x00 or 0xff) to fill
+; both upper result bytes.
+define i32 @ashr_i32_16(i32 %a) {
+; CHECK-LABEL: ashr_i32_16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: lsl r25
+; CHECK-NEXT: sbc r25, r25
+; CHECK-NEXT: mov r24, r25
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 16
+ ret i32 %res
+}
+
+; Arithmetic shift right by 17 = the two-byte move with sign extension
+; (as in ashr_i32_16) followed by a single-bit asr/ror chain on the two
+; remaining data bytes.
+define i32 @ashr_i32_17(i32 %a) {
+; CHECK-LABEL: ashr_i32_17:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: movw r22, r24
+; CHECK-NEXT: lsl r25
+; CHECK-NEXT: sbc r25, r25
+; CHECK-NEXT: asr r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: mov r24, r25
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 17
+ ret i32 %res
+}