From: Simon Pilgrim
Date: Sat, 11 May 2019 21:35:50 +0000 (+0000)
Subject: [X86][SSE] SimplifyDemandedBits - call PEXTRB/PEXTRW SimplifyDemandedVectorElts as...
X-Git-Tag: llvmorg-9.0.0-rc1~5764
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fda6bffd3bf2a9b0bd5ecad862132b05f75ecf4d;p=platform%2Fupstream%2Fllvm.git

[X86][SSE] SimplifyDemandedBits - call PEXTRB/PEXTRW SimplifyDemandedVectorElts as well.

See if we can simplify the demanded vector elts from the extraction before
trying to simplify the demanded bits. This helps us with target shuffles and
hops in particular.

llvm-svn: 360535
---

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a428f3fac0c1..b538694b49b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33875,8 +33875,13 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
       if (DemandedVecBits == 0)
         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
 
-      KnownBits KnownVec;
+      APInt KnownUndef, KnownZero;
       APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
+      if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
+                                     KnownZero, TLO, Depth + 1))
+        return true;
+
+      KnownBits KnownVec;
       if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
                                KnownVec, TLO, Depth + 1))
         return true;
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index ed487ef8266f..0914f2bff9f6 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -63,12 +63,12 @@ define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind {
 ; AVX12:       # %bb.0:
 ; AVX12-NEXT:    vmovmskps %xmm0, %eax
 ; AVX12-NEXT:    movl %eax, %ecx
-; AVX12-NEXT:    andl $3, %ecx
-; AVX12-NEXT:    vmovq %rcx, %xmm0
-; AVX12-NEXT:    shrl $2, %eax
-; AVX12-NEXT:    vmovq %rax, %xmm1
-; AVX12-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX12-NEXT:    vpextrb $0, %xmm1, %eax
+; AVX12-NEXT:    shrl $2, %ecx
+; AVX12-NEXT:    vmovd %ecx, %xmm0
+; AVX12-NEXT:    andl $3, %eax
+; AVX12-NEXT:    vmovd %eax, %xmm1
+; AVX12-NEXT:    vpextrb $0, %xmm1, %ecx
+; AVX12-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX12-NEXT:    addb %cl, %al
 ; AVX12-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX12-NEXT:    retq
@@ -81,10 +81,9 @@ define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind {
 ; AVX512-NEXT:    movzbl %al, %ecx
 ; AVX512-NEXT:    shrl $2, %ecx
 ; AVX512-NEXT:    andl $3, %ecx
-; AVX512-NEXT:    vmovq %rcx, %xmm0
-; AVX512-NEXT:    movzwl %ax, %eax
+; AVX512-NEXT:    vmovd %ecx, %xmm0
 ; AVX512-NEXT:    andl $3, %eax
-; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vmovd %eax, %xmm1
 ; AVX512-NEXT:    vpextrb $0, %xmm1, %ecx
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    addb %cl, %al
@@ -120,9 +119,9 @@ define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind {
 ; AVX12-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX12-NEXT:    movzbl %al, %ecx
 ; AVX12-NEXT:    shrl $4, %ecx
-; AVX12-NEXT:    vmovq %rcx, %xmm0
+; AVX12-NEXT:    vmovd %ecx, %xmm0
 ; AVX12-NEXT:    andl $15, %eax
-; AVX12-NEXT:    vmovq %rax, %xmm1
+; AVX12-NEXT:    vmovd %eax, %xmm1
 ; AVX12-NEXT:    vpextrb $0, %xmm1, %ecx
 ; AVX12-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX12-NEXT:    addb %cl, %al
@@ -135,10 +134,9 @@ define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind {
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movzbl %al, %ecx
 ; AVX512-NEXT:    shrl $4, %ecx
-; AVX512-NEXT:    vmovq %rcx, %xmm0
-; AVX512-NEXT:    movzwl %ax, %eax
+; AVX512-NEXT:    vmovd %ecx, %xmm0
 ; AVX512-NEXT:    andl $15, %eax
-; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vmovd %eax, %xmm1
 ; AVX512-NEXT:    vpextrb $0, %xmm1, %ecx
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    addb %cl, %al
@@ -243,12 +241,12 @@ define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind {
 ; AVX12:       # %bb.0:
 ; AVX12-NEXT:    vmovmskpd %ymm0, %eax
 ; AVX12-NEXT:    movl %eax, %ecx
-; AVX12-NEXT:    andl $3, %ecx
-; AVX12-NEXT:    vmovq %rcx, %xmm0
-; AVX12-NEXT:    shrl $2, %eax
-; AVX12-NEXT:    vmovq %rax, %xmm1
-; AVX12-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX12-NEXT:    vpextrb $0, %xmm1, %eax
+; AVX12-NEXT:    shrl $2, %ecx
+; AVX12-NEXT:    vmovd %ecx, %xmm0
+; AVX12-NEXT:    andl $3, %eax
+; AVX12-NEXT:    vmovd %eax, %xmm1
+; AVX12-NEXT:    vpextrb $0, %xmm1, %ecx
+; AVX12-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX12-NEXT:    addb %cl, %al
 ; AVX12-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX12-NEXT:    vzeroupper
@@ -262,10 +260,9 @@ define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind {
 ; AVX512-NEXT:    movzbl %al, %ecx
 ; AVX512-NEXT:    shrl $2, %ecx
 ; AVX512-NEXT:    andl $3, %ecx
-; AVX512-NEXT:    vmovq %rcx, %xmm0
-; AVX512-NEXT:    movzwl %ax, %eax
+; AVX512-NEXT:    vmovd %ecx, %xmm0
 ; AVX512-NEXT:    andl $3, %eax
-; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vmovd %eax, %xmm1
 ; AVX512-NEXT:    vpextrb $0, %xmm1, %ecx
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    addb %cl, %al
@@ -301,12 +298,12 @@ define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind {
 ; AVX12:       # %bb.0:
 ; AVX12-NEXT:    vmovmskps %ymm0, %eax
 ; AVX12-NEXT:    movl %eax, %ecx
-; AVX12-NEXT:    andl $15, %ecx
-; AVX12-NEXT:    vmovq %rcx, %xmm0
-; AVX12-NEXT:    shrl $4, %eax
-; AVX12-NEXT:    vmovq %rax, %xmm1
-; AVX12-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX12-NEXT:    vpextrb $0, %xmm1, %eax
+; AVX12-NEXT:    shrl $4, %ecx
+; AVX12-NEXT:    vmovd %ecx, %xmm0
+; AVX12-NEXT:    andl $15, %eax
+; AVX12-NEXT:    vmovd %eax, %xmm1
+; AVX12-NEXT:    vpextrb $0, %xmm1, %ecx
+; AVX12-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX12-NEXT:    addb %cl, %al
 ; AVX12-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX12-NEXT:    vzeroupper
@@ -319,10 +316,9 @@ define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind {
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movzbl %al, %ecx
 ; AVX512-NEXT:    shrl $4, %ecx
-; AVX512-NEXT:    vmovq %rcx, %xmm0
-; AVX512-NEXT:    movzwl %ax, %eax
+; AVX512-NEXT:    vmovd %ecx, %xmm0
 ; AVX512-NEXT:    andl $15, %eax
-; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vmovd %eax, %xmm1
 ; AVX512-NEXT:    vpextrb $0, %xmm1, %ecx
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    addb %cl, %al
@@ -549,9 +545,9 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    movl %eax, %ecx
 ; AVX1-NEXT:    shrl $4, %ecx
-; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vmovd %ecx, %xmm0
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vmovd %eax, %xmm1
 ; AVX1-NEXT:    vpextrb $0, %xmm1, %ecx
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    addb %cl, %al
@@ -569,9 +565,9 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vmovmskps %ymm0, %eax
 ; AVX2-NEXT:    movl %eax, %ecx
 ; AVX2-NEXT:    shrl $4, %ecx
-; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vmovd %ecx, %xmm0
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    vpextrb $0, %xmm1, %ecx
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    addb %cl, %al
@@ -586,10 +582,9 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movzbl %al, %ecx
 ; AVX512-NEXT:    shrl $4, %ecx
-; AVX512-NEXT:    vmovq %rcx, %xmm0
-; AVX512-NEXT:    movzwl %ax, %eax
+; AVX512-NEXT:    vmovd %ecx, %xmm0
 ; AVX512-NEXT:    andl $15, %eax
-; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vmovd %eax, %xmm1
 ; AVX512-NEXT:    vpextrb $0, %xmm1, %ecx
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    addb %cl, %al
diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll
index 31d217a9d3cc..5b1482a221ec 100644
--- a/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll
+++ b/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll
@@ -130,15 +130,13 @@ define void @test_demanded_phaddw_128(<8 x i16> %a0, <8 x i16> %a1, i16 *%a2) no
 ; X86-LABEL: test_demanded_phaddw_128:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X86-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_demanded_phaddw_128:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X64-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; X64-NEXT:    retq
   %1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -152,15 +150,13 @@ define void @test_demanded_phsubw_128(<8 x i16> %a0, <8 x i16> %a1, i16 *%a2) no
 ; X86-LABEL: test_demanded_phsubw_128:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X86-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $2, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_demanded_phsubw_128:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X64-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vpextrw $2, %xmm0, (%rdi)
 ; X64-NEXT:    retq
   %1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -322,16 +318,16 @@ define void @test_demanded_phaddw_256(<16 x i16> %a0, <16 x i16> %a1, i16 *%a2)
 ; X86-LABEL: test_demanded_phaddw_256:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X86-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vpbroadcastw %xmm1, %xmm0
+; X86-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $4, %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_demanded_phaddw_256:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm1, %xmm1
-; X64-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vpbroadcastw %xmm1, %xmm0
+; X64-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vpextrw $4, %xmm0, (%rdi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -346,7 +342,6 @@ define void @test_demanded_phsubw_256(<16 x i16> %a0, <16 x i16> %a1, i16 *%a2)
 ; X86-LABEL: test_demanded_phsubw_256:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vpbroadcastw %xmm0, %xmm0
 ; X86-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $6, %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
@@ -354,7 +349,6 @@ define void @test_demanded_phsubw_256(<16 x i16> %a0, <16 x i16> %a1, i16 *%a2)
 ;
 ; X64-LABEL: test_demanded_phsubw_256:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
 ; X64-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpextrw $6, %xmm0, (%rdi)
 ; X64-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index fa70edbf121a..efbb1ef8cc66 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -37,25 +37,21 @@ define void @store_v1f64_v1i64(<1 x i64> %trigger, <1 x double>* %addr, <1 x dou
 define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %val) {
 ; SSE2-LABEL: store_v2f64_v2i64:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    movd %xmm4, %eax
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB1_2
 ; SSE2-NEXT:  ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movlpd %xmm1, (%rdi)
 ; SSE2-NEXT:  LBB1_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB1_4
@@ -124,20 +120,16 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou
 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    movd %xmm7, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB2_2
 ; SSE2-NEXT:  ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movlpd %xmm2, (%rdi)
 ; SSE2-NEXT:  LBB2_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB2_4
@@ -147,10 +139,9 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pextrw $0, %xmm0, %eax
@@ -902,25 +893,21 @@ define void @store_v16f32_v16i32(<16 x float> %x, <16 x float>* %ptr, <16 x floa
 define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %val) {
 ; SSE2-LABEL: store_v2i64_v2i64:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    movd %xmm4, %eax
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB7_2
 ; SSE2-NEXT:  ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movq %xmm1, (%rdi)
 ; SSE2-NEXT:  LBB7_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB7_4
@@ -995,20 +982,16 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    movd %xmm7, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB8_2
 ; SSE2-NEXT:  ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movq %xmm2, (%rdi)
 ; SSE2-NEXT:  LBB8_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB8_4
@@ -1019,10 +1002,9 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pextrw $0, %xmm0, %eax
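
For illustration only (not part of the committed patch): a minimal LLVM IR sketch of the kind of pattern the new SimplifyDemandedVectorElts call improves, modelled on the test_demanded_phaddw_128 case in horizontal-shuffle-demanded.ll above. Only lane 0 of the phaddw result is stored, so the lanes fed by the broadcast of %a1 are never demanded, and the broadcast can be dropped before the per-bit simplification runs. The function name below is made up for the sketch; the intrinsic declaration follows the upstream llvm.x86.ssse3.phadd.w.128 signature.

; Sketch: lane 0 of the phaddw result depends only on %a0, so the broadcast
; feeding the upper result lanes is not demanded and can be simplified away.
define void @demanded_phaddw_sketch(<8 x i16> %a0, <8 x i16> %a1, i16* %a2) nounwind {
  %b = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> zeroinitializer
  %h = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %b)
  %e = extractelement <8 x i16> %h, i32 0
  store i16 %e, i16* %a2
  ret void
}
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)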