// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
// is a valid shuffle index.
- SDValue N0 = peekThroughBitcasts(N.getOperand(0));
- SDValue N1 = peekThroughBitcasts(N.getOperand(1));
+ SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
+ SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
true))
return false;
+ // Shuffle inputs must be the same size as the result.
+ if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
+ return VT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return false;
+ if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
+ return VT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return false;
+
size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
- for (int i = 0; i != (int)MaskSize; ++i) {
+ for (size_t i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
Mask.push_back(SM_SentinelZero);
else if (Mask1[i] == SM_SentinelZero)
- Mask.push_back(i);
+ Mask.push_back(Mask0[i]);
else if (Mask0[i] == SM_SentinelZero)
- Mask.push_back(i + MaskSize);
+ Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
else
return false;
}
- Ops.push_back(N0);
- Ops.push_back(N1);
+ Ops.append(SrcInputs0.begin(), SrcInputs0.end());
+ Ops.append(SrcInputs1.begin(), SrcInputs1.end());
return true;
}
case ISD::INSERT_SUBVECTOR: {
SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
bool IsUnary) {
- unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
}
}
- // Attempt to match against a OR if we're performing a blend shuffle and the
- // non-blended source element is zero in each case.
- if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
- (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
- bool IsBlend = true;
- unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
- unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
- unsigned Scale1 = NumV1Elts / NumMaskElts;
- unsigned Scale2 = NumV2Elts / NumMaskElts;
- APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
- APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
- for (unsigned i = 0; i != NumMaskElts; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
- continue;
- if (M == SM_SentinelZero) {
- DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
- DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
- continue;
- }
- if (M == i) {
- DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
- continue;
- }
- if (M == (i + NumMaskElts)) {
- DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
- continue;
- }
- IsBlend = false;
- break;
- }
- if (IsBlend &&
- DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
- DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
- Shuffle = ISD::OR;
- SrcVT = DstVT = EVT(MaskVT).changeTypeToInteger().getSimpleVT();
- return true;
- }
- }
-
return false;
}
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
; SSE3-NEXT: movdqa %xmm3, %xmm4
; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE3-NEXT: por %xmm4, %xmm0
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE3-NEXT: pand %xmm5, %xmm1
; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE3-NEXT: por %xmm3, %xmm1
+; SSE3-NEXT: pandn %xmm3, %xmm5
+; SSE3-NEXT: por %xmm5, %xmm1
; SSE3-NEXT: pand %xmm2, %xmm1
; SSE3-NEXT: por %xmm4, %xmm1
; SSE3-NEXT: retq
define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX1-LABEL: negative:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
;
; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u],zero,zero,zero,zero,zero,zero,xmm0[u,u]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[u,u,0,1,4,5,8,9,u,u]
+; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0