If a vXi64 bool sign_extend_inreg pattern has somehow been lowered to vector shifts (i.e. without PSRAQ support), try to canonicalize it to vXi32 shifts to improve the likelihood that value tracking can fold them away.
Using a PSLLQ followed by a bitcasted PSRAD node makes it very difficult for later folds to recover from this.
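A minimal before/after sketch of the v2i64 case, drawn from the SSE2 test diffs below (register choice and hex immediates here are only illustrative):

    # before: i64 shifts plus a cross-lane splat of the sign words
    psllq  $63, %xmm0
    pshufd $0xf5, %xmm0, %xmm0    # xmm0 = xmm0[1,1,3,3]
    psrad  $31, %xmm0
    # after: splat the low dword of each i64 first, then use i32 shifts
    pshufd $0xa0, %xmm0, %xmm0    # xmm0 = xmm0[0,0,2,2]
    pslld  $31, %xmm0
    psrad  $31, %xmm0

Both sequences splat the bool bit of each i64 element across its 32-bit halves, but the second keeps everything as vXi32 shifts that later combines understand.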
return Res;
}
+ // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
+ // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
+ // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
+ // pshufd(psrad(pslld(X,31),31),0,0,2,2).
+ if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
+ N0.getOpcode() == X86ISD::PSHUFD &&
+ N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
+ N0->hasOneUse()) {
+ SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
+ if (BC.getOpcode() == X86ISD::VSHLI &&
+ BC.getScalarValueSizeInBits() == 64 &&
+ BC.getConstantOperandVal(1) == 63) {
+ SDLoc DL(N);
+ SDValue Src = BC.getOperand(0);
+ Src = DAG.getBitcast(VT, Src);
+ Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
+ getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
+ Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
+ Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
+ return Src;
+ }
+ }
+
auto TryConstantFold = [&](SDValue V) {
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE41-NEXT: orps %xmm2, %xmm0
; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE2-NEXT: andps %xmm3, %xmm0
; SSE2-NEXT: orps %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,3,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: psllq $63, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: retq
%cmp1 = icmp eq <4 x i64> %x, <i64 129, i64 129, i64 129, i64 129>
; SSE41-NEXT: orps %xmm2, %xmm0
; SSE41-NEXT: xorps %xmm3, %xmm0
; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
; SSE2-NEXT: andps %xmm4, %xmm0
; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: xorps %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,3,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: psllq $63, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: retq
%cmp1 = icmp ne <4 x i64> %x, <i64 129, i64 129, i64 129, i64 129>
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm4
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,1,3,3]
-; SSE2-NEXT: psllq $63, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
-; SSE2-NEXT: psllq $63, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2]
+; SSE2-NEXT: pslld $31, %xmm8
; SSE2-NEXT: psrad $31, %xmm8
; SSE2-NEXT: movdqa %xmm8, %xmm10
; SSE2-NEXT: pandn %xmm7, %xmm10
; SSE2-NEXT: por %xmm10, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
-; SSE2-NEXT: psllq $63, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2]
+; SSE2-NEXT: pslld $31, %xmm7
; SSE2-NEXT: psrad $31, %xmm7
; SSE2-NEXT: movdqa %xmm7, %xmm10
; SSE2-NEXT: pandn %xmm6, %xmm10
; SSE2-NEXT: por %xmm10, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
-; SSE2-NEXT: psllq $63, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2]
+; SSE2-NEXT: pslld $31, %xmm6
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm10
; SSE2-NEXT: pandn %xmm5, %xmm10
; SSE2-NEXT: por %xmm10, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE2-NEXT: pslld $31, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm9
; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: shrb $3, %al
; SSE2-NEXT: movzbl %al, %eax
; SSE2-NEXT: pinsrw $6, %eax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
+; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE2-NEXT: psllq $63, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-NEXT: shrb $3, %al
; SSSE3-NEXT: movzbl %al, %eax
; SSSE3-NEXT: pinsrw $6, %eax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; SSSE3-NEXT: psllq $63, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
+; SSSE3-NEXT: pslld $31, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSSE3-NEXT: psllq $63, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-NEXT: shrb $2, %cl
; SSE41-NEXT: andb $1, %cl
; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pinsrb $8, %ecx, %xmm1
; SSE41-NEXT: shrb $3, %al
; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
; SSE41-NEXT: pinsrb $12, %eax, %xmm1
-; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: psllq $63, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: retq
;
; X86-SSE2-NEXT: shrb $3, %al
; X86-SSE2-NEXT: movzbl %al, %eax
; X86-SSE2-NEXT: pinsrw $6, %eax, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; X86-SSE2-NEXT: psllq $63, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
+; X86-SSE2-NEXT: pslld $31, %xmm0
; X86-SSE2-NEXT: psrad $31, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; X86-SSE2-NEXT: psllq $63, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; X86-SSE2-NEXT: pslld $31, %xmm1
; X86-SSE2-NEXT: psrad $31, %xmm1
; X86-SSE2-NEXT: retl
;
; X86-SSE41-NEXT: shrb $2, %cl
; X86-SSE41-NEXT: andb $1, %cl
; X86-SSE41-NEXT: movzbl %cl, %ecx
-; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; X86-SSE41-NEXT: pinsrb $8, %ecx, %xmm1
; X86-SSE41-NEXT: shrb $3, %al
; X86-SSE41-NEXT: movzbl %al, %eax
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
; X86-SSE41-NEXT: pinsrb $12, %eax, %xmm1
-; X86-SSE41-NEXT: psllq $63, %xmm0
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-SSE41-NEXT: pslld $31, %xmm0
; X86-SSE41-NEXT: psrad $31, %xmm0
; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; X86-SSE41-NEXT: psllq $63, %xmm1
-; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE41-NEXT: pslld $31, %xmm1
; X86-SSE41-NEXT: psrad $31, %xmm1
; X86-SSE41-NEXT: retl
entry:
define <2 x i64> @shrunkblend_2uses(<2 x i1> %cond, <2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-LABEL: shrunkblend_2uses:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pandn %xmm2, %xmm5
define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-LABEL: shrunkblend_nonvselectuse:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE2-NEXT: pslld $31, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm0
define void @t2(ptr %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly {
; CHECK-LABEL: t2:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %r9, %xmm1
-; CHECK-NEXT: movq %r8, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: movq %rdx, %xmm1
+; CHECK-NEXT: movq %r9, %xmm0
+; CHECK-NEXT: movq %r8, %xmm1
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: movq %rsi, %xmm2
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; CHECK-NEXT: movq %rcx, %xmm1
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; CHECK-NEXT: movq %rcx, %xmm0
; CHECK-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; CHECK-NEXT: pxor %xmm4, %xmm4
-; CHECK-NEXT: pcmpeqq %xmm4, %xmm1
+; CHECK-NEXT: pcmpeqq %xmm4, %xmm0
; CHECK-NEXT: pcmpeqq %xmm4, %xmm2
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
-; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2]
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: pcmpeqq %xmm4, %xmm3
-; CHECK-NEXT: pcmpeqq %xmm4, %xmm0
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; CHECK-NEXT: orps %xmm2, %xmm0
-; CHECK-NEXT: xorps %xmm1, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: psllq $63, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-NEXT: psrad $31, %xmm1
-; CHECK-NEXT: pmovsxdq %xmm0, %xmm0
-; CHECK-NEXT: movdqa %xmm0, (%rdi)
-; CHECK-NEXT: movq %xmm1, 16(%rdi)
+; CHECK-NEXT: pcmpeqq %xmm4, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; CHECK-NEXT: orps %xmm2, %xmm1
+; CHECK-NEXT: xorps %xmm0, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
+; CHECK-NEXT: pslld $31, %xmm0
+; CHECK-NEXT: psrad $31, %xmm0
+; CHECK-NEXT: pmovsxdq %xmm1, %xmm1
+; CHECK-NEXT: movdqa %xmm1, (%rdi)
+; CHECK-NEXT: movq %xmm0, 16(%rdi)
; CHECK-NEXT: retq
%cmp1 = icmp ne <3 x i64> %src1, zeroinitializer
%cmp2 = icmp ne <3 x i64> %src2, zeroinitializer