// Match the shift amount operands for a funnel/rotate pattern. This always
// matches a subtraction on the R operand.
- auto matchShiftAmount = [](Value *L, Value *R, unsigned Width) -> Value * {
+ auto matchShiftAmount = [&](Value *L, Value *R, unsigned Width) -> Value * {
// The shift amounts may add up to the narrow bit width:
// (shl ShVal0, L) | (lshr ShVal1, Width - L)
if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L)))))
return L;
+ // The following patterns currently only work for rotation patterns.
+ // TODO: Add more general funnel-shift compatible patterns.
+ if (ShVal0 != ShVal1)
+ return nullptr;
+
// The shift amount may be masked with negation:
// (shl ShVal0, (X & (Width - 1))) | (lshr ShVal1, ((-X) & (Width - 1)))
  Value *X;
  unsigned Mask = Width - 1;
  if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
      match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
    return X;

  return nullptr;
};
- // TODO: Add support for funnel shifts (ShVal0 != ShVal1).
- if (ShVal0 != ShVal1)
- return nullptr;
- Value *ShVal = ShVal0;
-
Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, NarrowWidth);
bool IsFshl = true; // Sub on LSHR.
if (!ShAmt) {
  ShAmt = matchShiftAmount(ShAmt1, ShAmt0, NarrowWidth);
  IsFshl = false; // Sub on SHL.
}
if (!ShAmt)
  return nullptr;

// The shifted value must have high zeros in the wide type. Typically, this
// will be a zext, but it could also be the result of an 'and' or 'shift'.
unsigned WideWidth = Trunc.getSrcTy()->getScalarSizeInBits();
APInt HiBitMask = APInt::getHighBitsSet(WideWidth, WideWidth - NarrowWidth);
- if (!MaskedValueIsZero(ShVal, HiBitMask, 0, &Trunc))
+ if (!MaskedValueIsZero(ShVal0, HiBitMask, 0, &Trunc) ||
+ !MaskedValueIsZero(ShVal1, HiBitMask, 0, &Trunc))
return nullptr;
// We have an unnecessarily wide rotate!
- // trunc (or (lshr ShVal, ShAmt), (shl ShVal, BitWidth - ShAmt))
+ // trunc (or (shl ShVal0, ShAmt), (lshr ShVal1, BitWidth - ShAmt))
// Narrow the inputs and convert to funnel shift intrinsic:
- // llvm.fshl.i8(trunc(ShVal), trunc(ShVal), trunc(ShAmt))
+ // llvm.fshl.i8(trunc(ShVal0), trunc(ShVal1), trunc(ShAmt))
Value *NarrowShAmt = Builder.CreateTrunc(ShAmt, DestTy);
- Value *X = Builder.CreateTrunc(ShVal, DestTy);
+ Value *X, *Y;
+ X = Y = Builder.CreateTrunc(ShVal0, DestTy);
+ if (ShVal0 != ShVal1)
+ Y = Builder.CreateTrunc(ShVal1, DestTy);
Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy);
- return IntrinsicInst::Create(F, { X, X, NarrowShAmt });
+ return IntrinsicInst::Create(F, {X, Y, NarrowShAmt});
}
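A note on the MaskedValueIsZero requirement above: once the two shifted values may differ, *both* need known-zero high bits, because the wide lshr would otherwise pull the right-shifted operand's high bits into the narrow result. A minimal sketch of a case that must not fold (illustrative only; the function name is invented and this is not from the patch's test file):

; %y is used at full i32 width, so its high 16 bits are unknown and the
; lshr can shift them into the low half. MaskedValueIsZero fails on the
; second operand, and no @llvm.fshl call is formed.
define i16 @no_fold_high_bits(i16 %x, i32 %y, i32 %shift) {
  %and = and i32 %shift, 15
  %convx = zext i16 %x to i32
  %shl = shl i32 %convx, %and
  %sub = sub i32 16, %and
  %shr = lshr i32 %y, %sub
  %or = or i32 %shl, %shr
  %conv = trunc i32 %or to i16
  ret i16 %conv
}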
/// Try to narrow the width of math or bitwise logic instructions by pulling a
/// truncate ahead of binary operators.
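As a quick illustration of that narrowing (a sketch, not part of this patch; the function name is invented), running `opt -passes=instcombine` on a wide add whose only use is a trunc is expected to rewrite the add in the narrow type:

; Input: the add is performed in i32 but only the low 8 bits are used.
define i8 @narrow_add_example(i32 %x, i32 %y) {
  %a = add i32 %x, %y
  %t = trunc i32 %a to i8
  ret i8 %t
}
; Expected result: trunc each operand and add in i8 instead:
;   %1 = trunc i32 %x to i8
;   %2 = trunc i32 %y to i8
;   %t = add i8 %1, %2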
define i16 @fshl_16bit(i16 %x, i16 %y, i32 %shift) {
; CHECK-LABEL: @fshl_16bit(
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHIFT:%.*]], 15
-; CHECK-NEXT: [[CONVX:%.*]] = zext i16 [[X:%.*]] to i32
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[CONVX]], [[AND]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 16, [[AND]]
-; CHECK-NEXT: [[CONVY:%.*]] = zext i16 [[Y:%.*]] to i32
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[CONVY]], [[SUB]]
-; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
-; CHECK-NEXT: [[CONV2:%.*]] = trunc i32 [[OR]] to i16
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SHIFT:%.*]] to i16
+; CHECK-NEXT: [[CONV2:%.*]] = call i16 @llvm.fshl.i16(i16 [[X:%.*]], i16 [[Y:%.*]], i16 [[TMP1]])
; CHECK-NEXT: ret i16 [[CONV2]]
;
%and = and i32 %shift, 15
define <2 x i16> @fshl_commute_16bit_vec(<2 x i16> %x, <2 x i16> %y, <2 x i32> %shift) {
; CHECK-LABEL: @fshl_commute_16bit_vec(
-; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[SHIFT:%.*]], <i32 15, i32 15>
-; CHECK-NEXT: [[CONVX:%.*]] = zext <2 x i16> [[X:%.*]] to <2 x i32>
-; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> [[CONVX]], [[AND]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw <2 x i32> <i32 16, i32 16>, [[AND]]
-; CHECK-NEXT: [[CONVY:%.*]] = zext <2 x i16> [[Y:%.*]] to <2 x i32>
-; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i32> [[CONVY]], [[SUB]]
-; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[SHL]], [[SHR]]
-; CHECK-NEXT: [[CONV2:%.*]] = trunc <2 x i32> [[OR]] to <2 x i16>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[SHIFT:%.*]] to <2 x i16>
+; CHECK-NEXT: [[CONV2:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[Y:%.*]], <2 x i16> [[TMP1]])
; CHECK-NEXT: ret <2 x i16> [[CONV2]]
;
%and = and <2 x i32> %shift, <i32 15, i32 15>
define i8 @fshr_8bit(i8 %x, i8 %y, i3 %shift) {
; CHECK-LABEL: @fshr_8bit(
-; CHECK-NEXT: [[AND:%.*]] = zext i3 [[SHIFT:%.*]] to i32
-; CHECK-NEXT: [[CONVX:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[CONVX]], [[AND]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 8, [[AND]]
-; CHECK-NEXT: [[CONVY:%.*]] = zext i8 [[Y:%.*]] to i32
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[CONVY]], [[SUB]]
-; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
-; CHECK-NEXT: [[CONV2:%.*]] = trunc i32 [[OR]] to i8
+; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[SHIFT:%.*]] to i8
+; CHECK-NEXT: [[CONV2:%.*]] = call i8 @llvm.fshr.i8(i8 [[Y:%.*]], i8 [[X:%.*]], i8 [[TMP1]])
; CHECK-NEXT: ret i8 [[CONV2]]
;
%and = zext i3 %shift to i32
define i8 @fshr_commute_8bit(i32 %x, i32 %y, i32 %shift) {
; CHECK-LABEL: @fshr_commute_8bit(
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHIFT:%.*]], 3
-; CHECK-NEXT: [[CONVX:%.*]] = and i32 [[X:%.*]], 255
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[CONVX]], [[AND]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 8, [[AND]]
-; CHECK-NEXT: [[CONVY:%.*]] = and i32 [[Y:%.*]], 255
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[CONVY]], [[SUB]]
-; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
-; CHECK-NEXT: [[CONV2:%.*]] = trunc i32 [[OR]] to i8
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SHIFT:%.*]] to i8
+; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[Y:%.*]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[X:%.*]] to i8
+; CHECK-NEXT: [[CONV2:%.*]] = call i8 @llvm.fshr.i8(i8 [[TMP3]], i8 [[TMP4]], i8 [[TMP2]])
; CHECK-NEXT: ret i8 [[CONV2]]
;
%and = and i32 %shift, 3
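Finally, an illustration of why the masked-negation shift-amount forms remain restricted to rotates (the TODO in the lambda above). This example is hypothetical, not from the test file: when %shift & 7 == 0, both shift amounts are zero and the pattern computes %x | %y in the low byte, whereas fshl(%x, %y, 0) would yield just %x, so no funnel shift is equivalent and the fold must not fire.

define i8 @masked_neg_distinct_not_folded(i8 %x, i8 %y, i32 %shift) {
  %mask = and i32 %shift, 7
  %neg = sub i32 0, %shift
  %negmask = and i32 %neg, 7
  %convx = zext i8 %x to i32
  %convy = zext i8 %y to i32
  %shl = shl i32 %convx, %mask
  %shr = lshr i32 %convy, %negmask
  %or = or i32 %shl, %shr
  %r = trunc i32 %or to i8
  ret i8 %r
}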