return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
+ InputV = DAG.getBitcast(VT, InputV);
InputV = ShuffleOffset(InputV);
InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
DL, ExtVT, InputV, DAG);
}
assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
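+ // The remaining lowering operates on VT, so normalize the source type first.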
+ InputV = DAG.getBitcast(VT, InputV);
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
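+ // Attempt to lower the shuffle as a zero/any-extend of the equivalent
+ // integer shuffle and bitcast the result back to v4f32.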
+ if (Subtarget.hasSSE2())
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+ DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return DAG.getBitcast(MVT::v4f32, ZExt);
+
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
/// AVX vector shuffle types.
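+///
+/// If \p SimpleOnly is set, give up (returning an empty SDValue) rather than
+/// emit a half blend that would need the high half of either input.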
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG, bool SimpleOnly) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
std::tie(LoV2, HiV2) = SplitVector(V2);
// Now create two 4-way blends of these half-width vectors.
- auto HalfBlend = [&](ArrayRef<int> HalfMask) {
- bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
- SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
- SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
- SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
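+ // Determine which halves of V1 and V2 a blend for this half of the mask
+ // would need to read from.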
+ auto GetHalfBlendPiecesReq = [&](ArrayRef<int> HalfMask, bool &UseLoV1,
+ bool &UseHiV1, bool &UseLoV2,
+ bool &UseHiV2) {
+ UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
UseHiV2 = true;
else
UseLoV2 = true;
- V2BlendMask[i] = M - NumElements;
- BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
+ }
+ }
+ };
+
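+ // With SimpleOnly set, only allow half blends that avoid the high halves of
+ // both sources entirely.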
+ auto CheckHalfBlendUsable = [&](ArrayRef<int> HalfMask) -> bool {
+ if (!SimpleOnly)
+ return true;
+
+ bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
+ GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
+
+ return !(UseHiV1 || UseHiV2);
+ };
+
+ auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+ SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
+ for (int i = 0; i < SplitNumElements; ++i) {
+ int M = HalfMask[i];
+ if (M >= NumElements) {
+ V2BlendMask[i] = M - NumElements;
+ BlendMask[i] = SplitNumElements + i;
+ } else if (M >= 0) {
V1BlendMask[i] = M;
BlendMask[i] = i;
}
}
+ bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
+ GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
+
// Because the lowering happens after all combining takes place, we need to
// manually combine these blend masks as much as possible so that we create
// a minimal number of high-level vector shuffle nodes.
+ assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) &&
+ "Shuffle won't be simple");
+
// First try just blending the halves of V1 or V2.
if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
return DAG.getUNDEF(SplitVT);
SDValue V1Blend, V2Blend;
if (UseLoV1 && UseHiV1) {
- V1Blend =
- DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
} else {
// We only use half of V1 so map the usage down into the final blend mask.
V1Blend = UseLoV1 ? LoV1 : HiV1;
BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
}
if (UseLoV2 && UseHiV2) {
- V2Blend =
- DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+ V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
} else {
// We only use half of V2 so map the usage down into the final blend mask.
V2Blend = UseLoV2 ? LoV2 : HiV2;
}
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
+
+ if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
+ return SDValue();
+
SDValue Lo = HalfBlend(LoMask);
SDValue Hi = HalfBlend(HiMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ false);
// Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
// requires that the decomposed single-input shuffles don't end up here.
return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
}
+/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
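+/// For example, with Size = 8 and LaneSize = 4 the mask <0,1,2,3,0,1,2,3>
+/// becomes <0,1,2,3,12,13,14,15>: cross-lane elements are redirected into a
+/// second, lane-flipped operand (indices offset by Size), keeping their
+/// within-lane offset but reading from the destination element's lane.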
+static void computeInLaneShuffleMask(ArrayRef<int> Mask, int LaneSize,
+ SmallVector<int> &InLaneMask) {
+ int Size = Mask.size();
+ InLaneMask.assign(Mask.begin(), Mask.end());
+ for (int i = 0; i < Size; ++i) {
+ int &M = InLaneMask[i];
+ if (M < 0)
+ continue;
+ if (((M % Size) / LaneSize) != (i / LaneSize))
+ M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
+ }
+}
+
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
/// source with a lane permutation.
///
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");
- SmallVector<int, 32> InLaneMask(Mask);
- for (int i = 0; i < Size; ++i) {
- int &M = InLaneMask[i];
- if (M < 0)
- continue;
- if (((M % Size) / LaneSize) != (i / LaneSize))
- M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
- }
+ SmallVector<int> InLaneMask;
+ computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
+
assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
"In-lane shuffle mask expected");
// If we're not using both lanes in each lane and the inlane mask is not
// repeating, then we're better off splitting.
if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ false);
// Flip the lanes, and shuffle the results which should now be in-lane.
MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
Subtarget, DAG))
return Broadcast;
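+ // Without AVX2 there is no cross-lane vpermps, so if the in-lane mask does
+ // not repeat across lanes, prefer splitting into two 128-bit shuffles, but
+ // only when that split stays simple.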
+ if (!Subtarget.hasAVX2()) {
+ SmallVector<int> InLaneMask;
+ computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
+
+ if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
+ if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ true))
+ return R;
+ }
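+ // Attempt to lower the shuffle as a zero/any-extend of the equivalent
+ // v8i32 shuffle and bitcast the result back to v8f32.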
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return DAG.getBitcast(MVT::v8f32, ZExt);
+
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
Zeroable, Subtarget, DAG))
return Blend;
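+ // Attempt to lower the shuffle as a zero/any-extend of the equivalent
+ // v16i32 shuffle and bitcast the result back to v16f32.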
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return DAG.getBitcast(MVT::v16f32, ZExt);
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
if (Subtarget.hasVBMI())
return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
- return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ false);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}
if (VT == MVT::v32f16) {
;
; AVX2-LABEL: shuffle_v8f32_091b2d3f:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v8f32_091b2d3f:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
-; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
+; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
ret <8 x float> %shuffle
}
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, (%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2:
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2: