}
}
+ // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
+ // Only handle concatenation of subvector high halves, which is the case vperm2x128 handles best.
+ // TODO: This should go in combineX86ShufflesRecursively eventually.
+ if (VT.is256BitVector() && Ops.size() == 2) {
+ SDValue Src0 = peekThroughBitcasts(Ops[0]);
+ SDValue Src1 = peekThroughBitcasts(Ops[1]);
+ if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ EVT SrcVT0 = Src0.getOperand(0).getValueType();
+ EVT SrcVT1 = Src1.getOperand(0).getValueType();
+ unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
+ unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
+ if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
+ Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
+ Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
+ return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
+ DAG.getBitcast(VT, Src0.getOperand(0)),
+ DAG.getBitcast(VT, Src1.getOperand(0)),
+ DAG.getTargetConstant(0x31, DL, MVT::i8));
+ }
+ }
+ }
+
// Repeated opcode.
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
// but it currently struggles with different vector widths.
; CHECK: # %bb.0:
; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vandps {{\.LCPI.*}}, %ymm1, %ymm1
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2],ymm0[0,2],ymm2[4,6],ymm0[4,6]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; CHECK-NEXT: retl
%a = shufflevector <4 x i64> %y, <4 x i64> <i64 12345, i64 67890, i64 13579, i64 24680>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%b = and <8 x i64> %x, %a