if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
+ // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
+ // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
+ // iff the upper elements of the non-shifted arg are zero.
+ // KUNPCK requires 16+ bool vector elements.
+ if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfElts = NumElts / 2;
+ APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
+ if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
+ N1.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N0, 0, DAG, dl, HalfElts),
+ extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
+ N0.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N1, 0, DAG, dl, HalfElts),
+ extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ }
+
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k1, %k1
-; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k1
+; AVX512F-NEXT: kunpckbw %k1, %k0, %k1
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0
; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1
-; AVX512VL-NEXT: kshiftlw $8, %k0, %k0
-; AVX512VL-NEXT: korw %k1, %k0, %k1
+; AVX512VL-NEXT: kunpckbw %k1, %k0, %k1
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; VL_BW_DQ: # %bb.0: # %entry
; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0
; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1
-; VL_BW_DQ-NEXT: kshiftlw $8, %k0, %k0
-; VL_BW_DQ-NEXT: korw %k1, %k0, %k1
+; VL_BW_DQ-NEXT: kunpckbw %k1, %k0, %k1
; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; VL_BW_DQ-NEXT: vzeroupper
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k0
; AVX512F-NEXT: vcmpltps %zmm3, %zmm2, %k1
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k1, %k1
-; AVX512F-NEXT: kshiftrw $8, %k1, %k1
-; AVX512F-NEXT: korw %k0, %k1, %k1
+; AVX512F-NEXT: kunpckbw %k1, %k0, %k1
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %k0
; AVX512VL-NEXT: vcmpltps %ymm3, %ymm2, %k1
-; AVX512VL-NEXT: kshiftlw $8, %k0, %k0
-; AVX512VL-NEXT: korw %k0, %k1, %k1
+; AVX512VL-NEXT: kunpckbw %k1, %k0, %k1
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; VL_BW_DQ: # %bb.0: # %entry
; VL_BW_DQ-NEXT: vcmpltps %ymm1, %ymm0, %k0
; VL_BW_DQ-NEXT: vcmpltps %ymm3, %ymm2, %k1
-; VL_BW_DQ-NEXT: kshiftlw $8, %k0, %k0
-; VL_BW_DQ-NEXT: korw %k0, %k1, %k1
+; VL_BW_DQ-NEXT: kunpckbw %k1, %k0, %k1
; VL_BW_DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; VL_BW_DQ-NEXT: vzeroupper