const SDLoc &dl(Op);
unsigned ElemWidth = ElemTy.getSizeInBits();
+ bool IsLeft = Opc == ISD::FSHL;
+
+ // Expanding into regular shifts produces worse code for i8 and for right
+ // shift of i32 on v65+; skip it for i8 and for every i32 shift on v65+.
+ bool UseShifts = ElemTy != MVT::i8;
+ if (Subtarget.useHVXV65Ops() && ElemTy == MVT::i32)
+ UseShifts = false;
+
+ if (SDValue SplatV = getSplatValue(S, DAG); SplatV && UseShifts) {
+ // If this is a funnel shift by a scalar, lower it into regular shifts.
+ SDValue Mask = DAG.getConstant(ElemWidth - 1, dl, MVT::i32);
+ SDValue ModS =
+ DAG.getNode(ISD::AND, dl, MVT::i32,
+ {DAG.getZExtOrTrunc(SplatV, dl, MVT::i32), Mask});
+ SDValue NegS =
+ DAG.getNode(ISD::SUB, dl, MVT::i32,
+ {DAG.getConstant(ElemWidth, dl, MVT::i32), ModS});
+ SDValue IsZero =
+ DAG.getSetCC(dl, MVT::i1, ModS, getZero(dl, MVT::i32, DAG), ISD::SETEQ);
+ // FSHL A, B => (A << ModS) | (B >> NegS)
+ // FSHR A, B => (A << NegS) | (B >> ModS)
+ SDValue Part1 =
+ DAG.getNode(HexagonISD::VASL, dl, InpTy, {A, IsLeft ? ModS : NegS});
+ SDValue Part2 =
+ DAG.getNode(HexagonISD::VLSR, dl, InpTy, {B, IsLeft ? NegS : ModS});
+ SDValue Or = DAG.getNode(ISD::OR, dl, InpTy, {Part1, Part2});
+ // If the shift amount was 0, pick A or B, depending on the direction: the
+ // opposite shift (by ElemWidth) is in effect also by 0, making "Or" wrong.
+ return DAG.getNode(ISD::SELECT, dl, InpTy, {IsZero, (IsLeft ? A : B), Or});
+ }
+
SDValue Mask = DAG.getSplatBuildVector(
InpTy, dl, DAG.getConstant(ElemWidth - 1, dl, ElemTy));
; V60-LABEL: f1:
; V60: // %bb.0:
; V60-NEXT: {
-; V60-NEXT: r0 = combine(r0.l,r0.l)
+; V60-NEXT: r1 = and(r0,#15)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: r1 = ##983055
+; V60-NEXT: p0 = bitsclr(r0,#15)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: r2 = ##1048592
+; V60-NEXT: v2.h = vasl(v0.h,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v4 = vxor(v4,v4)
+; V60-NEXT: r1 = sub(#16,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v2 = vsplat(r0)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v3 = vsplat(r1)
+; V60-NEXT: v1.uh = vlsr(v1.uh,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v2 = vand(v2,v3)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v30 = vsplat(r2)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v3.h = vsub(v30.h,v2.h)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: q0 = vcmp.eq(v2.h,v4.h)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v31.h = vasl(v0.h,v2.h)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v1.h = vlsr(v1.h,v3.h)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v1 = vor(v31,v1)
+; V60-NEXT: v1 = vor(v2,v1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v0 = vmux(q0,v0,v1)
+; V60-NEXT: if (!p0) v0 = v1
; V60-NEXT: }
; V60-NEXT: {
; V60-NEXT: jumpr r31
; V62-LABEL: f1:
; V62: // %bb.0:
; V62-NEXT: {
-; V62-NEXT: r3:2 = combine(#16,#15)
+; V62-NEXT: r1 = and(r0,#15)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v2.h = vsplat(r0)
+; V62-NEXT: p0 = bitsclr(r0,#15)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v3.h = vsplat(r2)
+; V62-NEXT: v2.h = vasl(v0.h,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v4.h = vsplat(r3)
+; V62-NEXT: r1 = sub(#16,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v2 = vand(v2,v3)
-; V62-NEXT: }
-; V62-NEXT: {
-; V62-NEXT: v3.h = vsub(v4.h,v2.h)
-; V62-NEXT: }
-; V62-NEXT: {
-; V62-NEXT: v0.h = vasl(v0.h,v2.h)
+; V62-NEXT: v1.uh = vlsr(v1.uh,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v1.h = vlsr(v1.h,v3.h)
+; V62-NEXT: v1 = vor(v2,v1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v0 = vor(v0,v1)
+; V62-NEXT: if (!p0) v0 = v1
; V62-NEXT: }
; V62-NEXT: {
; V62-NEXT: jumpr r31
; V66-LABEL: f1:
; V66: // %bb.0:
; V66-NEXT: {
-; V66-NEXT: r3:2 = combine(#16,#15)
-; V66-NEXT: }
-; V66-NEXT: {
-; V66-NEXT: v2.h = vsplat(r0)
+; V66-NEXT: r1 = and(r0,#15)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v3.h = vsplat(r2)
+; V66-NEXT: p0 = bitsclr(r0,#15)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v4.h = vsplat(r3)
+; V66-NEXT: v2.h = vasl(v0.h,r1)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v2 = vand(v2,v3)
+; V66-NEXT: r1 = sub(#16,r1)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v3.h = vsub(v4.h,v2.h)
+; V66-NEXT: v1.uh = vlsr(v1.uh,r1)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v0.h = vasl(v0.h,v2.h)
+; V66-NEXT: v1 = vor(v2,v1)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v1.h = vlsr(v1.h,v3.h)
-; V66-NEXT: }
-; V66-NEXT: {
-; V66-NEXT: v0 = vor(v0,v1)
+; V66-NEXT: if (!p0) v0 = v1
; V66-NEXT: }
; V66-NEXT: {
; V66-NEXT: jumpr r31
; V60-LABEL: f2:
; V60: // %bb.0:
; V60-NEXT: {
-; V60-NEXT: r3:2 = combine(#31,#32)
+; V60-NEXT: r1 = and(r0,#31)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: r0 = and(r0,#31)
+; V60-NEXT: p0 = bitsclr(r0,#31)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v2 = vxor(v2,v2)
+; V60-NEXT: v2.w = vasl(v0.w,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v3 = vsplat(r0)
+; V60-NEXT: r1 = sub(#32,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v4 = vsplat(r2)
+; V60-NEXT: v1.uw = vlsr(v1.uw,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v4.w = vsub(v4.w,v3.w)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v5 = vsplat(r3)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v31 = vand(v3,v5)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v3.w = vasl(v0.w,v3.w)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v1.w = vlsr(v1.w,v4.w)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: q0 = vcmp.eq(v31.w,v2.w)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v1 = vor(v3,v1)
+; V60-NEXT: v1 = vor(v2,v1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v0 = vmux(q0,v0,v1)
+; V60-NEXT: if (!p0) v0 = v1
; V60-NEXT: }
; V60-NEXT: {
; V60-NEXT: jumpr r31
; V62-LABEL: f2:
; V62: // %bb.0:
; V62-NEXT: {
-; V62-NEXT: r0 = and(r0,#31)
+; V62-NEXT: r1 = and(r0,#31)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: r1 = #32
+; V62-NEXT: p0 = bitsclr(r0,#31)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v2 = vsplat(r0)
+; V62-NEXT: v2.w = vasl(v0.w,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v3 = vsplat(r1)
+; V62-NEXT: r1 = sub(#32,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v3.w = vsub(v3.w,v2.w)
+; V62-NEXT: v1.uw = vlsr(v1.uw,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v0.w = vasl(v0.w,v2.w)
+; V62-NEXT: v1 = vor(v2,v1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v1.w = vlsr(v1.w,v3.w)
-; V62-NEXT: }
-; V62-NEXT: {
-; V62-NEXT: v0 = vor(v0,v1)
+; V62-NEXT: if (!p0) v0 = v1
; V62-NEXT: }
; V62-NEXT: {
; V62-NEXT: jumpr r31
; V60-LABEL: f4:
; V60: // %bb.0:
; V60-NEXT: {
-; V60-NEXT: r0 = combine(r0.l,r0.l)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: r1 = ##983055
+; V60-NEXT: r1 = and(r0,#15)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: r2 = ##1048592
+; V60-NEXT: p0 = bitsclr(r0,#15)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v4 = vxor(v4,v4)
+; V60-NEXT: v2.uh = vlsr(v1.uh,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v2 = vsplat(r0)
+; V60-NEXT: r1 = sub(#16,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v3 = vsplat(r1)
+; V60-NEXT: v0.h = vasl(v0.h,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v2 = vand(v2,v3)
+; V60-NEXT: v0 = vor(v0,v2)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v30 = vsplat(r2)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v3.h = vsub(v30.h,v2.h)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: q0 = vcmp.eq(v2.h,v4.h)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v31.h = vlsr(v1.h,v2.h)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v0.h = vasl(v0.h,v3.h)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v0 = vor(v0,v31)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v0 = vmux(q0,v1,v0)
+; V60-NEXT: if (p0) v0 = v1
; V60-NEXT: }
; V60-NEXT: {
; V60-NEXT: jumpr r31
; V62-LABEL: f4:
; V62: // %bb.0:
; V62-NEXT: {
-; V62-NEXT: r3:2 = combine(#16,#15)
+; V62-NEXT: r1 = and(r0,#15)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v2.h = vsplat(r0)
+; V62-NEXT: p0 = bitsclr(r0,#15)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v3.h = vsplat(r2)
+; V62-NEXT: v2.uh = vlsr(v1.uh,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v4.h = vsplat(r3)
+; V62-NEXT: r1 = sub(#16,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v2 = vand(v2,v3)
+; V62-NEXT: v0.h = vasl(v0.h,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v3.h = vsub(v2.h,v4.h)
+; V62-NEXT: v0 = vor(v0,v2)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v1.h = vlsr(v1.h,v2.h)
-; V62-NEXT: }
-; V62-NEXT: {
-; V62-NEXT: v0.h = vlsr(v0.h,v3.h)
-; V62-NEXT: }
-; V62-NEXT: {
-; V62-NEXT: v0 = vor(v0,v1)
+; V62-NEXT: if (p0) v0 = v1
; V62-NEXT: }
; V62-NEXT: {
; V62-NEXT: jumpr r31
; V66-LABEL: f4:
; V66: // %bb.0:
; V66-NEXT: {
-; V66-NEXT: r3:2 = combine(#16,#15)
-; V66-NEXT: }
-; V66-NEXT: {
-; V66-NEXT: v2.h = vsplat(r0)
+; V66-NEXT: r1 = and(r0,#15)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v3.h = vsplat(r2)
+; V66-NEXT: p0 = bitsclr(r0,#15)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v4.h = vsplat(r3)
-; V66-NEXT: }
-; V66-NEXT: {
-; V66-NEXT: v2 = vand(v2,v3)
+; V66-NEXT: v2.uh = vlsr(v1.uh,r1)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v3.h = vsub(v2.h,v4.h)
+; V66-NEXT: r1 = sub(#16,r1)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v1.h = vlsr(v1.h,v2.h)
+; V66-NEXT: v0.h = vasl(v0.h,r1)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v0.h = vlsr(v0.h,v3.h)
+; V66-NEXT: v0 = vor(v0,v2)
; V66-NEXT: }
; V66-NEXT: {
-; V66-NEXT: v0 = vor(v0,v1)
+; V66-NEXT: if (p0) v0 = v1
; V66-NEXT: }
; V66-NEXT: {
; V66-NEXT: jumpr r31
; V60-LABEL: f5:
; V60: // %bb.0:
; V60-NEXT: {
-; V60-NEXT: r0 = and(r0,#31)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: r1 = #32
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v2 = vxor(v2,v2)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v3 = vsplat(r0)
+; V60-NEXT: r1 = and(r0,#31)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v4 = vsplat(r1)
+; V60-NEXT: p0 = bitsclr(r0,#31)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v4.w = vsub(v4.w,v3.w)
+; V60-NEXT: v2.uw = vlsr(v1.uw,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: q0 = vcmp.eq(v3.w,v2.w)
+; V60-NEXT: r1 = sub(#32,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v31.w = vlsr(v1.w,v3.w)
-; V60-NEXT: }
-; V60-NEXT: {
-; V60-NEXT: v0.w = vasl(v0.w,v4.w)
+; V60-NEXT: v0.w = vasl(v0.w,r1)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v0 = vor(v0,v31)
+; V60-NEXT: v0 = vor(v0,v2)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v0 = vmux(q0,v1,v0)
+; V60-NEXT: if (p0) v0 = v1
; V60-NEXT: }
; V60-NEXT: {
; V60-NEXT: jumpr r31
; V62-LABEL: f5:
; V62: // %bb.0:
; V62-NEXT: {
-; V62-NEXT: r0 = and(r0,#31)
+; V62-NEXT: r1 = and(r0,#31)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: r1 = #32
+; V62-NEXT: p0 = bitsclr(r0,#31)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v2 = vsplat(r0)
+; V62-NEXT: v2.uw = vlsr(v1.uw,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v3 = vsplat(r1)
+; V62-NEXT: r1 = sub(#32,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v3.w = vsub(v2.w,v3.w)
+; V62-NEXT: v0.w = vasl(v0.w,r1)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v1.w = vlsr(v1.w,v2.w)
+; V62-NEXT: v0 = vor(v0,v2)
; V62-NEXT: }
; V62-NEXT: {
-; V62-NEXT: v0.w = vlsr(v0.w,v3.w)
-; V62-NEXT: }
-; V62-NEXT: {
-; V62-NEXT: v0 = vor(v0,v1)
+; V62-NEXT: if (p0) v0 = v1
; V62-NEXT: }
; V62-NEXT: {
; V62-NEXT: jumpr r31