We can combine unary shuffles into either of SHUFPS's inputs and adjust the shuffle mask accordingly.
Unlike general shuffle combining, we can be more aggressive here and handle multi-use cases, because we are not going to accidentally create additional shuffles.
}
return SDValue();
}
+ case X86ISD::SHUFP: {
+ // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
+ // This is a more relaxed shuffle combiner that can ignore oneuse limits.
+ // TODO: Support types other than v4f32.
+ if (VT == MVT::v4f32) {
+ bool Updated = false;
+ SmallVector<int> Mask;
+ SmallVector<SDValue> Ops;
+ if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
+ Ops.size() == 2) {
+ for (int i = 0; i != 2; ++i) {
+ SmallVector<SDValue> SubOps;
+ SmallVector<int> SubMask, SubScaledMask;
+ SDValue Sub = peekThroughBitcasts(Ops[i]);
+ // TODO: Scaling might be easier if we specify the demanded elts.
+ if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
+ scaleShuffleElements(SubMask, 4, SubScaledMask) &&
+ SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
+ int Ofs = i * 2;
+ Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
+ Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
+ Ops[i] = DAG.getBitcast(VT, SubOps[0]);
+ Updated = true;
+ }
+ }
+ }
+ if (Updated) {
+ for (int &M : Mask)
+ M %= 4;
+ Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
+ }
+ }
+ return SDValue();
+ }
case X86ISD::VPERMI: {
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
-; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: simplify_select: