}
}
// FALLTHROUGH
+ case MVT::v16i8:
case MVT::v32i8: {
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
// Scale the blend by the number of bytes per element.
- int Scale = VT.getScalarSizeInBits() / 8;
- assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+ int Scale = VT.getScalarSizeInBits() / 8;
+
+ // This form of blend is always done on bytes. Compute the byte vector
+ // type.
+ MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
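+
+ // E.g. a blend of 16-bit elements arriving via the FALLTHROUGH above has
+ // Scale == 2, so each mask element below expands into two identical mask
+ // bytes (v16i16 gives BlendVT == v32i8).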
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
// generator that a boolean value in a vector element gets the relevant
// bit set; this is backwards and over constrained relative to x86's
// actual model, where pblendvb only tests the sign bit of each mask byte.
- SDValue VSELECTMask[32];
+ SmallVector<SDValue, 32> VSELECTMask;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
- VSELECTMask[Scale * i + j] =
+ VSELECTMask.push_back(
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
- : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
+ : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
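+ // E.g. with v16i8 inputs (Size == 16), Mask[i] == 3 emits a -1 mask byte
+ // selecting byte 3 of V1, while Mask[i] == 20 emits a 0 mask byte
+ // selecting byte 4 of V2.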
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+ V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
return DAG.getNode(
ISD::BITCAST, DL, VT,
- DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
+ DAG.getNode(ISD::VSELECT, DL, BlendVT,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
V1, V2));
}
}
}
+ // If both V1 and V2 are in use and we can use a direct blend, do so. This
+ // avoids using blends to handle blends-with-zero, which is important
+ // because a single pshufb is significantly faster for that case.
+ if (V1InUse && V2InUse && Subtarget->hasSSE41())
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
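+
+ // For instance, the mask <0,1,2,3,20,5,6,23,...> in the tests below used
+ // to lower to two pshufbs feeding a por; with this change it becomes a
+ // single pblendvb with a constant mask.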
+
if (V1InUse)
V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
;
; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; SSE41: # BB#0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
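+; Note: non-VEX pblendvb takes its mask implicitly in %xmm0, which is why the
+; first input is copied to %xmm2 and the constant mask is loaded into %xmm0.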
;
; AVX-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,3,u,3,u,u,u,u,u,u,u],zero,xmm3[u,u,u,u]
-; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1],zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
-; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2],zero,xmm5[4,5,6,7,8,9,10],zero,xmm5[12,13,14,15]
-; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[u,u,u,u,1,6,13,u,u],zero,xmm3[u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,3],zero,zero,zero,zero,xmm0[8,9,10],zero,zero,xmm0[13],zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,u,1,6],zero,zero,xmm2[0],zero,xmm2[11,u],zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
+; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,3,u,3,u,u,u,u,u,u,u],zero,xmm5[u,u,u,u]
+; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
-; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[4,5,6,7],zero,zero,zero,xmm1[11,12],zero,xmm1[14,15]
-; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u],zero,zero,xmm5[u,u,u,u,1,6,13,u,u],zero,xmm5[u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
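+; Note: the per-source pshufb results are now combined with vpblendvb and a
+; constant mask rather than being zeroed element-wise and merged with vpor.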
;
; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: