Add support for shuffle combining (via combineEXTEND_VECTOR_INREG) to begin from SIGN_EXTEND_VECTOR_INREG nodes
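
Sign-extensions can only be modelled as shuffles when the source is already
all-signbits (ComputeNumSignBits equals the source element width), in which
case each extended element is just the source element repeated Scale times.
For example, a SIGN_EXTEND_VECTOR_INREG from an all-signbits v8i16 source to
v4i32 corresponds to the repeating shuffle mask <0,0,1,1,2,2,3,3> on the
v8i16 source.
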
Mask.append(NumElts, 0);
return true;
}
+ case ISD::SIGN_EXTEND_VECTOR_INREG: {
+ SDValue Src = N.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
+
+ // Extended source must be a simple vector.
+ if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
+ (NumBitsPerSrcElt % 8) != 0)
+ return false;
+
+ // We can only handle all-signbits extensions.
+ APInt DemandedSrcElts =
+ DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
+ return false;
+
+ assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
+ unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
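+ // An all-signbits extension just repeats each source element Scale times.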
+ for (unsigned I = 0; I != NumElts; ++I)
+ Mask.append(Scale, I);
+ Ops.push_back(Src);
+ return true;
+ }
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG:
}
// Attempt to combine as a shuffle on SSE41+ targets.
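+ // This now also covers SIGN_EXTEND_VECTOR_INREG nodes, which
+ // getFauxShuffleMask handles when the extension source is all-signbits.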
- if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
- Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
- Subtarget.hasSSE41()) {
+ if (Subtarget.hasSSE41()) {
SDValue Op(N, 0);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm2
; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
; AVX12-NEXT: # kill: def $al killed $al killed $eax
; AVX12-NEXT: retq
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
; CHECK-NEXT: negl %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%wide.load35 = load <4 x i1>, ptr %in, align 1
; CHECK-NEXT: vpmovsxdq %xmm1, %xmm2
; CHECK-NEXT: # implicit-def: $ymm0
; CHECK-NEXT: vmovaps %xmm2, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovsxdq %xmm1, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: # %bb.2: # %BB1
; CHECK-NEXT: vzeroupper
; SSE4-LABEL: PR45808:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm4
+; SSE4-NEXT: movdqa %xmm0, %xmm5
+; SSE4-NEXT: pcmpgtq %xmm2, %xmm5
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm4, %xmm5
-; SSE4-NEXT: pcmpgtq %xmm2, %xmm5
-; SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE4-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE4-NEXT: pxor %xmm5, %xmm6
-; SSE4-NEXT: psllq $63, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE4-NEXT: pmovsxdq %xmm6, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: movapd %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpsrad $31, %ymm2, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpmaxud %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vpacksswb %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0
-; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm1
-; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpminud %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
; AVX2-NEXT: retq
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
; AVX1-NEXT: negl %eax
; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; X86-SSE41-NEXT: pslld $31, %xmm0
; X86-SSE41-NEXT: psrad $31, %xmm0
; X86-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X86-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; X86-SSE41-NEXT: movdqa %xmm2, %xmm0
; X86-SSE41-NEXT: retl
%extmask = sext <4 x i1> %mask to <4 x i64>
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0