// The bigger shifts are already handled above.
assert((ShiftAmt < 8) && "Unexpected shift amount");
+ // Shift by four bits, using a complicated swap/eor/andi/eor sequence.
+ // It only works for logical shifts because the bits shifted in are all
+ // zeroes.
+ // To shift a single byte right, it produces code like this:
+ // swap r0
+ // andi r0, 0x0f
+ // For a two-byte (16-bit) shift, it adds the following instructions to shift
+ // the upper byte into the lower byte:
+ // swap r1
+ // eor r0, r1
+ // andi r1, 0x0f
+ // eor r0, r1
+ // For bigger shifts, it repeats the above sequence. For example, for a 3-byte
+ // (24-bit) shift it adds:
+ // swap r2
+ // eor r1, r2
+ // andi r2, 0x0f
+ // eor r1, r2
+ if (!ArithmeticShift && ShiftAmt >= 4) {
+ // Intermediate value of the byte processed in the previous iteration, used
+ // while the nibble crossing the byte boundary is merged into it.
+ Register Prev = 0;
+ for (size_t I = 0; I < Regs.size(); I++) {
+ // Visit bytes from least to most significant for left shifts, and in the
+ // opposite order for right shifts, so each byte is processed before the
+ // byte whose nibble is shifted into it.
+ size_t Idx = ShiftLeft ? I : Regs.size() - I - 1;
+ Register SwapReg = MRI.createVirtualRegister(&AVR::LD8RegClass);
+ BuildMI(*BB, MI, dl, TII.get(AVR::SWAPRd), SwapReg)
+ .addReg(Regs[Idx].first, 0, Regs[Idx].second);
+ if (I != 0) {
+ // First EOR: fold the swapped byte into the neighbor byte processed in
+ // the previous iteration.
+ Register R = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ BuildMI(*BB, MI, dl, TII.get(AVR::EORRdRr), R)
+ .addReg(Prev)
+ .addReg(SwapReg);
+ Prev = R;
+ }
+ // Keep only the nibble that stays inside this byte; the mask depends on
+ // the shift direction.
+ Register AndReg = MRI.createVirtualRegister(&AVR::LD8RegClass);
+ BuildMI(*BB, MI, dl, TII.get(AVR::ANDIRdK), AndReg)
+ .addReg(SwapReg)
+ .addImm(ShiftLeft ? 0xf0 : 0x0f);
+ if (I != 0) {
+ // Second EOR: XORing with the masked value cancels the bits that stay
+ // in this byte, leaving the neighbor byte holding its own nibble plus
+ // the nibble shifted across from this byte — no scratch register needed.
+ Register R = MRI.createVirtualRegister(&AVR::GPR8RegClass);
+ BuildMI(*BB, MI, dl, TII.get(AVR::EORRdRr), R)
+ .addReg(Prev)
+ .addReg(AndReg);
+ size_t PrevIdx = ShiftLeft ? Idx - 1 : Idx + 1;
+ Regs[PrevIdx] = std::pair(R, 0);
+ }
+ Prev = AndReg;
+ Regs[Idx] = std::pair(AndReg, 0);
+ }
+ // Four bits are now shifted; the remaining 0-3 bits are handled by the
+ // single-bit loops below.
+ ShiftAmt -= 4;
+ }
+
// Shift by one. This is the fallback that always works, and the shift
// operation that is used for 1, 2, and 3 bit shifts.
while (ShiftLeft && ShiftAmt) {
ret i32 %res
}
+; shift left by exactly four bits: one swap/andi per byte, with an
+; eor/andi/eor sequence merging each nibble that crosses a byte boundary
+define i32 @shl_i32_4(i32 %a) {
+; CHECK-LABEL: shl_i32_4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: swap r25
+; CHECK-NEXT: andi r25, 240
+; CHECK-NEXT: swap r24
+; CHECK-NEXT: eor r25, r24
+; CHECK-NEXT: andi r24, 240
+; CHECK-NEXT: eor r25, r24
+; CHECK-NEXT: swap r23
+; CHECK-NEXT: eor r24, r23
+; CHECK-NEXT: andi r23, 240
+; CHECK-NEXT: eor r24, r23
+; CHECK-NEXT: swap r22
+; CHECK-NEXT: eor r23, r22
+; CHECK-NEXT: andi r22, 240
+; CHECK-NEXT: eor r23, r22
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 4
+ ret i32 %res
+}
+
+; shift four bits with the swap/andi/eor trick, then shift the remaining one
+; bit with the generic lsl/rol sequence
+define i32 @shl_i32_5(i32 %a) {
+; CHECK-LABEL: shl_i32_5:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: swap r25
+; CHECK-NEXT: andi r25, 240
+; CHECK-NEXT: swap r24
+; CHECK-NEXT: eor r25, r24
+; CHECK-NEXT: andi r24, 240
+; CHECK-NEXT: eor r25, r24
+; CHECK-NEXT: swap r23
+; CHECK-NEXT: eor r24, r23
+; CHECK-NEXT: andi r23, 240
+; CHECK-NEXT: eor r24, r23
+; CHECK-NEXT: swap r22
+; CHECK-NEXT: eor r23, r22
+; CHECK-NEXT: andi r22, 240
+; CHECK-NEXT: eor r23, r22
+; CHECK-NEXT: lsl r22
+; CHECK-NEXT: rol r23
+; CHECK-NEXT: rol r24
+; CHECK-NEXT: rol r25
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 5
+ ret i32 %res
+}
+
define i32 @shl_i32_8(i32 %a) {
; CHECK-LABEL: shl_i32_8:
; CHECK: ; %bb.0:
ret i32 %res
}
+; 12 = 4 + 8: shift 3 of the 4 registers by four bits, then do the byte shift
+; with whole-register moves (the vacated low byte is zero-filled from r1)
+define i32 @shl_i32_12(i32 %a) {
+; CHECK-LABEL: shl_i32_12:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: swap r24
+; CHECK-NEXT: andi r24, 240
+; CHECK-NEXT: swap r23
+; CHECK-NEXT: eor r24, r23
+; CHECK-NEXT: andi r23, 240
+; CHECK-NEXT: eor r24, r23
+; CHECK-NEXT: swap r22
+; CHECK-NEXT: eor r23, r22
+; CHECK-NEXT: andi r22, 240
+; CHECK-NEXT: eor r23, r22
+; CHECK-NEXT: mov r25, r24
+; CHECK-NEXT: mov r24, r23
+; CHECK-NEXT: mov r23, r22
+; CHECK-NEXT: mov r22, r1
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 12
+ ret i32 %res
+}
+
; This is a special case: this shift is performed directly inside SelectionDAG
; instead of as a custom lowering like the other shift operations.
define i32 @shl_i32_16(i32 %a) {
ret void
}
+; 28 = 4 + 24: only the least significant input byte survives, so shift just
+; that one byte by four bits, move it to the top, and zero-fill the rest
+define i32 @shl_i32_28(i32 %a) {
+; CHECK-LABEL: shl_i32_28:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: swap r22
+; CHECK-NEXT: andi r22, 240
+; CHECK-NEXT: mov r25, r22
+; CHECK-NEXT: mov r24, r1
+; CHECK-NEXT: mov r23, r1
+; CHECK-NEXT: mov r22, r1
+; CHECK-NEXT: ret
+ %res = shl i32 %a, 28
+ ret i32 %res
+}
+
define i32 @lshr_i32_1(i32 %a) {
; CHECK-LABEL: lshr_i32_1:
; CHECK: ; %bb.0:
ret i32 %res
}
+; logical shift right by four bits: the same swap/andi/eor trick as shl_i32_4,
+; but walking from the low byte to the high byte and masking with 0x0f
+define i32 @lshr_i32_4(i32 %a) {
+; CHECK-LABEL: lshr_i32_4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: swap r22
+; CHECK-NEXT: andi r22, 15
+; CHECK-NEXT: swap r23
+; CHECK-NEXT: eor r22, r23
+; CHECK-NEXT: andi r23, 15
+; CHECK-NEXT: eor r22, r23
+; CHECK-NEXT: swap r24
+; CHECK-NEXT: eor r23, r24
+; CHECK-NEXT: andi r24, 15
+; CHECK-NEXT: eor r23, r24
+; CHECK-NEXT: swap r25
+; CHECK-NEXT: eor r24, r25
+; CHECK-NEXT: andi r25, 15
+; CHECK-NEXT: eor r24, r25
+; CHECK-NEXT: ret
+ %res = lshr i32 %a, 4
+ ret i32 %res
+}
+
define i32 @lshr_i32_8(i32 %a) {
; CHECK-LABEL: lshr_i32_8:
; CHECK: ; %bb.0:
ret i32 %res
}
+; can't use the swap/andi/eor trick here: it only works when the bits shifted
+; in are zeroes, but an arithmetic shift fills with copies of the sign bit, so
+; this falls back to four single-bit asr/ror rounds
+define i32 @ashr_i32_4(i32 %a) {
+; CHECK-LABEL: ashr_i32_4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: asr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: asr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: asr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: asr r25
+; CHECK-NEXT: ror r24
+; CHECK-NEXT: ror r23
+; CHECK-NEXT: ror r22
+; CHECK-NEXT: ret
+ %res = ashr i32 %a, 4
+ ret i32 %res
+}
+
; TODO: this could be optimized to 4 movs, instead of 6.
define i32 @ashr_i32_8(i32 %a) {
; CHECK-LABEL: ashr_i32_8: